[<prev] [next>] [day] [month] [year] [list]
Message-ID: <36b076dc17083f9edd9b100bd8fa57badde41158.camel@mailbox.org>
Date: Thu, 03 Apr 2025 16:40:19 +0200
From: Philipp Stanner <phasta@...lbox.org>
To: Christian König <christian.koenig@....com>,
phasta@...nel.org, Lyude Paul <lyude@...hat.com>, Danilo Krummrich
<dakr@...nel.org>, David Airlie <airlied@...il.com>, Simona Vetter
<simona@...ll.ch>, Sumit Semwal <sumit.semwal@...aro.org>
Cc: dri-devel@...ts.freedesktop.org, nouveau@...ts.freedesktop.org,
linux-kernel@...r.kernel.org, linux-media@...r.kernel.org,
linaro-mm-sig@...ts.linaro.org, stable@...r.kernel.org
Subject: Re: [PATCH v2] drm/nouveau: Prevent signalled fences in pending list
On Thu, 2025-04-03 at 15:10 +0200, Christian König wrote:
> Am 03.04.25 um 14:58 schrieb Philipp Stanner:
>
> >
> > On Thu, 2025-04-03 at 14:08 +0200, Christian König wrote:
> >
> > >
> > > Am 03.04.25 um 12:13 schrieb Philipp Stanner:
> > >
> > > >
> > > > Nouveau currently relies on the assumption that dma_fences will
> > > > only
> > > > ever get signalled through nouveau_fence_signal(), which takes
> > > > care
> > > > of
> > > > removing a signalled fence from the list
> > > > nouveau_fence_chan.pending.
> > > >
> > > > This self-imposed rule is violated in nouveau_fence_done(),
> > > > where
> > > > dma_fence_is_signaled() can signal the fence without removing
> > > > it
> > > > from
> > > > the list. This enables accesses to already signalled fences
> > > > through
> > > > the
> > > > list, which is a bug.
> > > >
> > > > Furthermore, it must always be possible to use standard
> > > > dma_fence
> > > > methods an a dma_fence and observe valid behavior. The
> > > > canonical
> > > > way of
> > > > ensuring that signalling a fence has additional effects is to
> > > > add
> > > > those
> > > > effects to a callback and register it on that fence.
> > > >
> > > > Move the code from nouveau_fence_signal() into a dma_fence
> > > > callback.
> > > > Register that callback when creating the fence.
> > > >
> > > > Cc: <stable@...r.kernel.org> # 4.10+
> > > > Signed-off-by: Philipp Stanner <phasta@...nel.org>
> > > > ---
> > > > Changes in v2:
> > > > - Remove Fixes: tag. (Danilo)
> > > > - Remove integer "drop" and call nvif_event_block() in the
> > > > fence
> > > > callback. (Danilo)
> > > > ---
> > > > drivers/gpu/drm/nouveau/nouveau_fence.c | 52 +++++++++++++----
> > > > ----
> > > > ----
> > > > drivers/gpu/drm/nouveau/nouveau_fence.h | 1 +
> > > > 2 files changed, 29 insertions(+), 24 deletions(-)
> > > >
> > > > diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c
> > > > b/drivers/gpu/drm/nouveau/nouveau_fence.c
> > > > index 7cc84472cece..cf510ef9641a 100644
> > > > --- a/drivers/gpu/drm/nouveau/nouveau_fence.c
> > > > +++ b/drivers/gpu/drm/nouveau/nouveau_fence.c
> > > > @@ -50,24 +50,24 @@ nouveau_fctx(struct nouveau_fence *fence)
> > > > return container_of(fence->base.lock, struct
> > > > nouveau_fence_chan, lock);
> > > > }
> > > >
> > > > -static int
> > > > -nouveau_fence_signal(struct nouveau_fence *fence)
> > > > +static void
> > > > +nouveau_fence_cleanup_cb(struct dma_fence *dfence, struct
> > > > dma_fence_cb *cb)
> > > > {
> > > > - int drop = 0;
> > > > + struct nouveau_fence_chan *fctx;
> > > > + struct nouveau_fence *fence;
> > > > +
> > > > + fence = container_of(dfence, struct nouveau_fence,
> > > > base);
> > > > + fctx = nouveau_fctx(fence);
> > > >
> > > > - dma_fence_signal_locked(&fence->base);
> > > > list_del(&fence->head);
> > > > rcu_assign_pointer(fence->channel, NULL);
> > > >
> > > > if (test_bit(DMA_FENCE_FLAG_USER_BITS, &fence-
> > > >
> > > > >
> > > > > base.flags)) {
> > > > >
> > > >
> > > > - struct nouveau_fence_chan *fctx =
> > > > nouveau_fctx(fence);
> > > > -
> > > > if (!--fctx->notify_ref)
> > > > - drop = 1;
> > > > + nvif_event_block(&fctx->event);
> > > > }
> > > >
> > > > dma_fence_put(&fence->base);
> > > > - return drop;
> > > > }
> > > >
> > > > static struct nouveau_fence *
> > > > @@ -93,8 +93,7 @@ nouveau_fence_context_kill(struct
> > > > nouveau_fence_chan *fctx, int error)
> > > > if (error)
> > > > dma_fence_set_error(&fence->base,
> > > > error);
> > > >
> > > > - if (nouveau_fence_signal(fence))
> > > > - nvif_event_block(&fctx->event);
> > > > + dma_fence_signal_locked(&fence->base);
> > > > }
> > > > fctx->killed = 1;
> > > > spin_unlock_irqrestore(&fctx->lock, flags);
> > > > @@ -127,11 +126,10 @@ nouveau_fence_context_free(struct
> > > > nouveau_fence_chan *fctx)
> > > > kref_put(&fctx->fence_ref, nouveau_fence_context_put);
> > > > }
> > > >
> > > > -static int
> > > > +static void
> > > > nouveau_fence_update(struct nouveau_channel *chan, struct
> > > > nouveau_fence_chan *fctx)
> > > > {
> > > > struct nouveau_fence *fence;
> > > > - int drop = 0;
> > > > u32 seq = fctx->read(chan);
> > > >
> > > > while (!list_empty(&fctx->pending)) {
> > > > @@ -140,10 +138,8 @@ nouveau_fence_update(struct
> > > > nouveau_channel
> > > > *chan, struct nouveau_fence_chan *fc
> > > > if ((int)(seq - fence->base.seqno) < 0)
> > > > break;
> > > >
> > > > - drop |= nouveau_fence_signal(fence);
> > > > + dma_fence_signal_locked(&fence->base);
> > > > }
> > > > -
> > > > - return drop;
> > > > }
> > > >
> > > > static void
> > > > @@ -152,7 +148,6 @@ nouveau_fence_uevent_work(struct
> > > > work_struct
> > > > *work)
> > > > struct nouveau_fence_chan *fctx = container_of(work,
> > > > struct nouveau_fence_chan,
> > > >
> > > > uevent_work);
> > > > unsigned long flags;
> > > > - int drop = 0;
> > > >
> > > > spin_lock_irqsave(&fctx->lock, flags);
> > > > if (!list_empty(&fctx->pending)) {
> > > > @@ -161,11 +156,8 @@ nouveau_fence_uevent_work(struct
> > > > work_struct
> > > > *work)
> > > >
> > > > fence = list_entry(fctx->pending.next,
> > > > typeof(*fence), head);
> > > > chan = rcu_dereference_protected(fence-
> > > > >channel,
> > > > lockdep_is_held(&fctx->lock));
> > > > - if (nouveau_fence_update(chan, fctx))
> > > > - drop = 1;
> > > > + nouveau_fence_update(chan, fctx);
> > > > }
> > > > - if (drop)
> > > > - nvif_event_block(&fctx->event);
> > > >
> > > > spin_unlock_irqrestore(&fctx->lock, flags);
> > > > }
> > > > @@ -235,6 +227,19 @@ nouveau_fence_emit(struct nouveau_fence
> > > > *fence)
> > > > &fctx->lock, fctx->context,
> > > > ++fctx-
> > > >
> > > > >
> > > > > sequence);
> > > > >
> > > >
> > > > kref_get(&fctx->fence_ref);
> > > >
> > > > + fence->cb.func = nouveau_fence_cleanup_cb;
> > > > + /* Adding a callback runs into
> > > > __dma_fence_enable_signaling(), which will
> > > > + * ultimately run into nouveau_fence_no_signaling(),
> > > > where
> > > > a WARN_ON
> > > > + * would fire because the refcount can be dropped
> > > > there.
> > > > + *
> > > > + * Increment the refcount here temporarily to work
> > > > around
> > > > that.
> > > > + */
> > > > + dma_fence_get(&fence->base);
> > > > + ret = dma_fence_add_callback(&fence->base, &fence->cb,
> > > > nouveau_fence_cleanup_cb);
> > > >
> > >
> > > That looks like a really really awkward approach. The driver
> > > basically uses a the DMA fence infrastructure as middle layer and
> > > callbacks into itself to cleanup it's own structures.
> > >
> >
> > What else are callbacks good for, if not to do something
> > automatically
> > when the fence gets signaled?
> >
>
> Well if you add a callback for a signal you issued yourself then
> that's kind of awkward.
>
> E.g. you call into the DMA fence code, just for the DMA fence code
> to call yourself back again.
Now we're entering CS-Philosophy, because it depends on who "you" and
"yourself" are. In case of the driver, yes, naturally it registers a
callback because at some other place (e.g., in the driver's interrupt
handler) the fence will be signaled and the driver wants the callback
stuff to be done.
If that's not dma_fences' callbacks' purpose, then I'd be interested in
knowing what their purpose is, because from my POV this discussion
seems to imply that we effectively must never use them for anything.
How could it ever be different? Who, for example, registers dma_fence
callbacks while not signaling them "himself"?
>
>
>
> >
> > >
> > > Additional to that we don't guarantee any callback order for the
> > > DMA
> > > fence and so it can be that mix cleaning up the callback with
> > > other
> > > work which is certainly not good when you want to guarantee that
> > > the
> > > cleanup happens under the same lock.
> > >
> >
> > Isn't my perception correct that the primary issue you have with
> > this
> > approach is that dma_fence_put() is called from within the
> > callback? Or
> > do you also take issue with deleting from the list?
> >
>
> Well kind of both. The issue is that the caller of
> dma_fence_signal() or dma_fence_signal_locked() must hold the
> reference until the function returns.
>
> When you do the list cleanup and the drop inside the callback it is
> perfectly possible that the fence pointer becomes stale before you
> return and that's really not a good idea.
In other words, you would prefer if this patch would have a function
with my callback's code in a function, and that function would be
called at every place where the driver signals a fence?
If that's your opinion, then, IOW, it would mean for us to go almost
back to status quo, with nouveau_fence_signal2.0, but with the
dma_fence_is_signaled() part fixed.
P.
>
>
> >
> >
> >
> > >
> > > Instead the call to dma_fence_signal_locked() should probably be
> > > removed from nouveau_fence_signal() and into
> > > nouveau_fence_context_kill() and nouveau_fence_update().
> > >
> > > This way nouveau_fence_is_signaled() can call this function as
> > > well.
> > >
> >
> > Which "this function"? dma_fence_signal_locked()
> >
>
> No the cleanup function for the list entry. Whatever you call that
> then, the name nouveau_fence_signal() is probably not appropriate any
> more.
>
>
> >
> >
> >
> > >
> > > BTW: nouveau_fence_no_signaling() looks completely broken as
> > > well. It
> > > calls nouveau_fence_is_signaled() and then list_del() on the
> > > fence
> > > head.
> > >
> >
> > I can assure you that a great many things in Nouveau look
> > completely
> > broken.
> >
> > The question for us is always the cost-benefit-ratio when fixing
> > bugs.
> > There are fixes that solve the bug with reasonable effort, and
> > there
> > are great reworks towards an ideal state.
> >
>
> I would just simply drop that function. As far as I can see it
> severs no purpose other than doing exactly what the common DMA fence
> code does anyway.
>
> Just one less thing which could fail.
>
> Christian.
>
>
> >
> >
> > P.
> >
> >
> >
> > >
> > > As far as I can see that is completely superfluous and should
> > > probably be dropped. IIRC I once had a patch to clean that up but
> > > it
> > > was dropped for some reason.
> > >
> > > Regards,
> > > Christian.
> > >
> > >
> > >
> > > >
> > > > + dma_fence_put(&fence->base);
> > > > + if (ret)
> > > > + return ret;
> > > > +
> > > > ret = fctx->emit(fence);
> > > > if (!ret) {
> > > > dma_fence_get(&fence->base);
> > > > @@ -246,8 +251,7 @@ nouveau_fence_emit(struct nouveau_fence
> > > > *fence)
> > > > return -ENODEV;
> > > > }
> > > >
> > > > - if (nouveau_fence_update(chan, fctx))
> > > > - nvif_event_block(&fctx->event);
> > > > + nouveau_fence_update(chan, fctx);
> > > >
> > > > list_add_tail(&fence->head, &fctx->pending);
> > > > spin_unlock_irq(&fctx->lock);
> > > > @@ -270,8 +274,8 @@ nouveau_fence_done(struct nouveau_fence
> > > > *fence)
> > > >
> > > > spin_lock_irqsave(&fctx->lock, flags);
> > > > chan = rcu_dereference_protected(fence-
> > > > >channel,
> > > > lockdep_is_held(&fctx->lock));
> > > > - if (chan && nouveau_fence_update(chan, fctx))
> > > > - nvif_event_block(&fctx->event);
> > > > + if (chan)
> > > > + nouveau_fence_update(chan, fctx);
> > > > spin_unlock_irqrestore(&fctx->lock, flags);
> > > > }
> > > > return dma_fence_is_signaled(&fence->base);
> > > > diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.h
> > > > b/drivers/gpu/drm/nouveau/nouveau_fence.h
> > > > index 8bc065acfe35..e6b2df7fdc42 100644
> > > > --- a/drivers/gpu/drm/nouveau/nouveau_fence.h
> > > > +++ b/drivers/gpu/drm/nouveau/nouveau_fence.h
> > > > @@ -10,6 +10,7 @@ struct nouveau_bo;
> > > >
> > > > struct nouveau_fence {
> > > > struct dma_fence base;
> > > > + struct dma_fence_cb cb;
> > > >
> > > > struct list_head head;
> > > >
> > > >
> > >
> >
>
>
Powered by blists - more mailing lists