[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <C890CCBB-76C0-4E70-A7B8-846E34DABECE@nvidia.com>
Date: Tue, 16 Dec 2025 15:39:30 +0000
From: Joel Fernandes <joelagnelf@...dia.com>
To: Alexandre Courbot <acourbot@...dia.com>
CC: Danilo Krummrich <dakr@...nel.org>, Alice Ryhl <aliceryhl@...gle.com>,
David Airlie <airlied@...il.com>, Simona Vetter <simona@...ll.ch>, Bjorn
Helgaas <bhelgaas@...gle.com>, Krzysztof Wilczyński
<kwilczynski@...nel.org>, Miguel Ojeda <ojeda@...nel.org>, Boqun Feng
<boqun.feng@...il.com>, Gary Guo <gary@...yguo.net>,
Björn Roy Baron <bjorn3_gh@...tonmail.com>, Benno Lossin
<lossin@...nel.org>, Andreas Hindborg <a.hindborg@...nel.org>, Trevor Gross
<tmgross@...ch.edu>, John Hubbard <jhubbard@...dia.com>, Alistair Popple
<apopple@...dia.com>, Timur Tabi <ttabi@...dia.com>, Edwin Peer
<epeer@...dia.com>, Eliot Courtney <ecourtney@...dia.com>,
"nouveau@...ts.freedesktop.org" <nouveau@...ts.freedesktop.org>,
"dri-devel@...ts.freedesktop.org" <dri-devel@...ts.freedesktop.org>,
"linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
"linux-pci@...r.kernel.org" <linux-pci@...r.kernel.org>,
"rust-for-linux@...r.kernel.org" <rust-for-linux@...r.kernel.org>, Alexandre
Courbot <acourbot@...dia.com>
Subject: Re: [PATCH 6/7] gpu: nova-core: send UNLOADING_GUEST_DRIVER GSP
command GSP upon unloading
Hi Alex,
Just did a quick pass, will do a proper review later:
> On Dec 16, 2025, at 12:13 AM, Alexandre Courbot <acourbot@...dia.com> wrote:
>
> Currently, the GSP is left running after the driver is unbound. This is
> not great for several reasons, notably that it can still access shared
> memory areas that the kernel will now reclaim (especially problematic on
> setups without an IOMMU).
>
> Fix this by sending the `UNLOADING_GUEST_DRIVER` GSP command when
> unbinding. This stops the GSP and let us proceed with the rest of the
> unbind sequence in the next patch.
>
> Signed-off-by: Alexandre Courbot <acourbot@...dia.com>
> ---
> drivers/gpu/nova-core/gpu.rs | 5 +++
> drivers/gpu/nova-core/gsp/boot.rs | 42 +++++++++++++++++++++++
> drivers/gpu/nova-core/gsp/commands.rs | 42 +++++++++++++++++++++++
> drivers/gpu/nova-core/gsp/fw.rs | 4 +++
> drivers/gpu/nova-core/gsp/fw/commands.rs | 27 +++++++++++++++
> drivers/gpu/nova-core/gsp/fw/r570_144/bindings.rs | 8 +++++
> 6 files changed, 128 insertions(+)
>
> diff --git a/drivers/gpu/nova-core/gpu.rs b/drivers/gpu/nova-core/gpu.rs
> index ef6ceced350e..b94784f57b36 100644
> --- a/drivers/gpu/nova-core/gpu.rs
> +++ b/drivers/gpu/nova-core/gpu.rs
> @@ -291,6 +291,9 @@ pub(crate) fn new<'a>(
>
> /// Called when the corresponding [`Device`](device::Device) is unbound.
> ///
> + /// Prepares the GPU for unbinding by shutting down the GSP and unregistering the sysmem flush
> + /// memory page.
> + ///
> /// Note: This method must only be called from `Driver::unbind`.
> pub(crate) fn unbind(self: Pin<&mut Self>, dev: &device::Device<device::Core>) {
> let this = self.project();
> @@ -299,6 +302,8 @@ pub(crate) fn unbind(self: Pin<&mut Self>, dev: &device::Device<device::Core>) {
> return;
> };
>
> + let _ = kernel::warn_on_err!(this.gsp.unload(dev, bar, this.gsp_falcon,));
> +
> this.sysmem_flush.unregister(bar);
> }
> }
> diff --git a/drivers/gpu/nova-core/gsp/boot.rs b/drivers/gpu/nova-core/gsp/boot.rs
> index ca7efdaa753b..e12e1d3fd53f 100644
> --- a/drivers/gpu/nova-core/gsp/boot.rs
> +++ b/drivers/gpu/nova-core/gsp/boot.rs
> @@ -32,6 +32,7 @@
> },
> gpu::Chipset,
> gsp::{
> + cmdq::Cmdq,
> commands,
> sequencer::{
> GspSequencer,
> @@ -231,4 +232,45 @@ pub(crate) fn boot(
>
> Ok(())
> }
> +
> + /// Shutdowns the GSP and wait until it is offline.
> + fn shutdown_gsp(
> + cmdq: &mut Cmdq,
> + bar: &Bar0,
> + gsp_falcon: &Falcon<Gsp>,
> + suspend: bool,
> + ) -> Result<()> {
> + // Send command to shutdown GSP and wait for response.
> + commands::unload_guest_driver(cmdq, bar, suspend)?;
> +
> + // Wait until GSP signals it is suspended.
> + const LIBOS_INTERRUPT_PROCESSOR_SUSPENDED: u32 = 0x8000_0000;
> + read_poll_timeout(
> + || Ok(gsp_falcon.read_mailbox0(bar)),
> + |&mb0| mb0 == LIBOS_INTERRUPT_PROCESSOR_SUSPENDED,
> + Delta::ZERO,
> + Delta::from_secs(5),
> + )
> + .map(|_| ())
> + }
> +
> + /// Attempts to unload the GSP firmware.
> + ///
> + /// This stops all activity on the GSP.
> + pub(crate) fn unload(
> + self: Pin<&mut Self>,
> + dev: &device::Device<device::Bound>,
> + bar: &Bar0,
> + gsp_falcon: &Falcon<Gsp>,
> + ) -> Result {
> + let this = self.project();
> +
> + /* Shutdown the GSP */
> +
> + Self::shutdown_gsp(this.cmdq, bar, gsp_falcon, false)
> + .inspect_err(|e| dev_err!(dev, "unload guest driver failed: {:?}", e))?;
> + dev_dbg!(dev, "GSP shut down\n");
> +
> + Ok(())
> + }
> }
> diff --git a/drivers/gpu/nova-core/gsp/commands.rs b/drivers/gpu/nova-core/gsp/commands.rs
> index 2050771f9b53..0bcfca8c7e42 100644
> --- a/drivers/gpu/nova-core/gsp/commands.rs
> +++ b/drivers/gpu/nova-core/gsp/commands.rs
> @@ -225,3 +225,45 @@ pub(crate) fn get_gsp_info(cmdq: &mut Cmdq, bar: &Bar0) -> Result<GetGspStaticIn
> }
> }
> }
> +
> +impl CommandToGsp for UnloadingGuestDriver {
> + const FUNCTION: MsgFunction = MsgFunction::UnloadingGuestDriver;
> + type Command = Self;
> + type InitError = Infallible;
> +
> + fn init(&self) -> impl Init<Self::Command, Self::InitError> {
> + *self
> + }
> +}
> +
> +pub(crate) struct UnloadingGuestDriverReply;
> +
> +impl MessageFromGsp for UnloadingGuestDriverReply {
> + const FUNCTION: MsgFunction = MsgFunction::UnloadingGuestDriver;
> + type InitError = Infallible;
> + type Message = ();
> +
> + fn read(
> + _msg: &Self::Message,
> + _sbuffer: &mut SBufferIter<array::IntoIter<&[u8], 2>>,
> + ) -> Result<Self, Self::InitError> {
> + Ok(UnloadingGuestDriverReply)
> + }
> +}
> +
> +/// Send the [`UnloadingGuestDriver`] command to the GSP and await the reply.
> +pub(crate) fn unload_guest_driver(
> + cmdq: &mut Cmdq,
> + bar: &Bar0,
> + suspend: bool,
> +) -> Result<UnloadingGuestDriverReply> {
> + cmdq.send_command(bar, UnloadingGuestDriver::new(suspend))?;
> +
> + loop {
> + match cmdq.receive_msg::<UnloadingGuestDriverReply>(Delta::from_secs(5)) {
> + Ok(resp) => return Ok(resp),
> + Err(ERANGE) => continue,
> + Err(e) => return Err(e),
> + }
This outer loop can go on infinitely, lets bound it?
Either outer timeout or bounded number of tries. Bounded tries better since it has inner timeout.
> + }
> +}
> diff --git a/drivers/gpu/nova-core/gsp/fw.rs b/drivers/gpu/nova-core/gsp/fw.rs
> index 09549f7db52d..228464fd47a0 100644
> --- a/drivers/gpu/nova-core/gsp/fw.rs
> +++ b/drivers/gpu/nova-core/gsp/fw.rs
> @@ -209,6 +209,7 @@ pub(crate) enum MsgFunction {
> GspInitPostObjGpu = bindings::NV_VGPU_MSG_FUNCTION_GSP_INIT_POST_OBJGPU,
> GspRmControl = bindings::NV_VGPU_MSG_FUNCTION_GSP_RM_CONTROL,
> GetStaticInfo = bindings::NV_VGPU_MSG_FUNCTION_GET_STATIC_INFO,
> + UnloadingGuestDriver = bindings::NV_VGPU_MSG_FUNCTION_UNLOADING_GUEST_DRIVER,
>
> // Event codes
> GspInitDone = bindings::NV_VGPU_MSG_EVENT_GSP_INIT_DONE,
> @@ -249,6 +250,9 @@ fn try_from(value: u32) -> Result<MsgFunction> {
> }
> bindings::NV_VGPU_MSG_FUNCTION_GSP_RM_CONTROL => Ok(MsgFunction::GspRmControl),
> bindings::NV_VGPU_MSG_FUNCTION_GET_STATIC_INFO => Ok(MsgFunction::GetStaticInfo),
> + bindings::NV_VGPU_MSG_FUNCTION_UNLOADING_GUEST_DRIVER => {
> + Ok(MsgFunction::UnloadingGuestDriver)
> + }
> bindings::NV_VGPU_MSG_EVENT_GSP_INIT_DONE => Ok(MsgFunction::GspInitDone),
> bindings::NV_VGPU_MSG_EVENT_GSP_RUN_CPU_SEQUENCER => {
> Ok(MsgFunction::GspRunCpuSequencer)
> diff --git a/drivers/gpu/nova-core/gsp/fw/commands.rs b/drivers/gpu/nova-core/gsp/fw/commands.rs
> index 85465521de32..c7df4783ad21 100644
> --- a/drivers/gpu/nova-core/gsp/fw/commands.rs
> +++ b/drivers/gpu/nova-core/gsp/fw/commands.rs
> @@ -125,3 +125,30 @@ unsafe impl AsBytes for GspStaticConfigInfo {}
> // SAFETY: This struct only contains integer types for which all bit patterns
> // are valid.
> unsafe impl FromBytes for GspStaticConfigInfo {}
> +
> +/// Payload of the `UnloadingGuestDriver` command and message.
> +#[repr(transparent)]
> +#[derive(Clone, Copy, Debug, Zeroable)]
> +pub(crate) struct UnloadingGuestDriver {
> + inner: bindings::rpc_unloading_guest_driver_v1F_07,
> +}
> +
> +impl UnloadingGuestDriver {
> + pub(crate) fn new(suspend: bool) -> Self {
> + Self {
> + inner: bindings::rpc_unloading_guest_driver_v1F_07 {
We should go through intermediate firmware representation than direct bindings access?
> + bInPMTransition: if suspend { 1 } else { 0 },
Then this can just be passed as a bool.
> + bGc6Entering: 0,
> + newLevel: if suspend { 3 } else { 0 },
> + ..Zeroable::zeroed()
> + },
> + }
> + }
> +}
> +
> +// SAFETY: Padding is explicit and will not contain uninitialized data.
> +unsafe impl AsBytes for UnloadingGuestDriver {}
> +
> +// SAFETY: This struct only contains integer types for which all bit patterns
> +// are valid.
> +unsafe impl FromBytes for UnloadingGuestDriver {}
> diff --git a/drivers/gpu/nova-core/gsp/fw/r570_144/bindings.rs b/drivers/gpu/nova-core/gsp/fw/r570_144/bindings.rs
> index 6d25fe0bffa9..212ccc13c0cc 100644
> --- a/drivers/gpu/nova-core/gsp/fw/r570_144/bindings.rs
> +++ b/drivers/gpu/nova-core/gsp/fw/r570_144/bindings.rs
> @@ -879,6 +879,14 @@ fn default() -> Self {
> }
> }
> #[repr(C)]
> +#[derive(Debug, Default, Copy, Clone, MaybeZeroable)]
> +pub struct rpc_unloading_guest_driver_v1F_07 {
> + pub bInPMTransition: u8_,
> + pub bGc6Entering: u8_,
> + pub __bindgen_padding_0: [u8; 2usize],
> + pub newLevel: u32_,
When these are intermediate represented, documentation would be nice on the fields.
thanks,
- Joel
> +}
> +#[repr(C)]
> #[derive(Debug, Default, MaybeZeroable)]
> pub struct rpc_run_cpu_sequencer_v17_00 {
> pub bufferSizeDWord: u32_,
>
> --
> 2.52.0
>
Powered by blists - more mailing lists