[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <mv3vim76p66yp5wj34icrtq5nslupbtwzygjonpkprmi5nddlz@mswwpnqs5m5y>
Date: Mon, 27 Oct 2025 13:02:29 +0100
From: Michał Winiarski <michal.winiarski@...el.com>
To: Michal Wajdeczko <michal.wajdeczko@...el.com>
CC: Alex Williamson <alex.williamson@...hat.com>, Lucas De Marchi
<lucas.demarchi@...el.com>, Thomas Hellström
<thomas.hellstrom@...ux.intel.com>, Rodrigo Vivi <rodrigo.vivi@...el.com>,
Jason Gunthorpe <jgg@...pe.ca>, Yishai Hadas <yishaih@...dia.com>, Kevin Tian
<kevin.tian@...el.com>, <intel-xe@...ts.freedesktop.org>,
<linux-kernel@...r.kernel.org>, <kvm@...r.kernel.org>, Matthew Brost
<matthew.brost@...el.com>, <dri-devel@...ts.freedesktop.org>, Jani Nikula
<jani.nikula@...ux.intel.com>, Joonas Lahtinen
<joonas.lahtinen@...ux.intel.com>, Tvrtko Ursulin <tursulin@...ulin.net>,
David Airlie <airlied@...il.com>, Simona Vetter <simona@...ll.ch>, "Lukasz
Laguna" <lukasz.laguna@...el.com>
Subject: Re: [PATCH v2 03/26] drm/xe/pf: Add save/restore control state stubs
and connect to debugfs
On Thu, Oct 23, 2025 at 12:31:47AM +0200, Michal Wajdeczko wrote:
>
>
> On 10/22/2025 12:41 AM, Michał Winiarski wrote:
> > The states will be used by upcoming changes to produce (in case of save)
> > > or consume (in case of restore) the VF migration data.
> >
> > Signed-off-by: Michał Winiarski <michal.winiarski@...el.com>
> > ---
> > drivers/gpu/drm/xe/xe_gt_sriov_pf_control.c | 248 ++++++++++++++++++
> > drivers/gpu/drm/xe/xe_gt_sriov_pf_control.h | 6 +
> > .../gpu/drm/xe/xe_gt_sriov_pf_control_types.h | 14 +
> > drivers/gpu/drm/xe/xe_sriov_pf_control.c | 96 +++++++
> > drivers/gpu/drm/xe/xe_sriov_pf_control.h | 4 +
> > drivers/gpu/drm/xe/xe_sriov_pf_debugfs.c | 38 +++
> > 6 files changed, 406 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_control.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf_control.c
> > index 2e6bd3d1fe1da..b770916e88e53 100644
> > --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_control.c
> > +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_control.c
> > @@ -184,6 +184,12 @@ static const char *control_bit_to_string(enum xe_gt_sriov_control_bits bit)
> > CASE2STR(PAUSE_SAVE_GUC);
> > CASE2STR(PAUSE_FAILED);
> > CASE2STR(PAUSED);
> > + CASE2STR(SAVE_WIP);
> > + CASE2STR(SAVE_FAILED);
> > + CASE2STR(SAVED);
> > + CASE2STR(RESTORE_WIP);
> > + CASE2STR(RESTORE_FAILED);
> > + CASE2STR(RESTORED);
> > CASE2STR(RESUME_WIP);
> > CASE2STR(RESUME_SEND_RESUME);
> > CASE2STR(RESUME_FAILED);
> > @@ -208,6 +214,8 @@ static unsigned long pf_get_default_timeout(enum xe_gt_sriov_control_bits bit)
> > case XE_GT_SRIOV_STATE_FLR_WIP:
> > case XE_GT_SRIOV_STATE_FLR_RESET_CONFIG:
> > return 5 * HZ;
> > + case XE_GT_SRIOV_STATE_RESTORE_WIP:
> > + return 20 * HZ;
> > default:
> > return HZ;
> > }
> > @@ -329,6 +337,8 @@ static void pf_exit_vf_mismatch(struct xe_gt *gt, unsigned int vfid)
> > pf_exit_vf_state(gt, vfid, XE_GT_SRIOV_STATE_PAUSE_FAILED);
> > pf_exit_vf_state(gt, vfid, XE_GT_SRIOV_STATE_RESUME_FAILED);
> > pf_exit_vf_state(gt, vfid, XE_GT_SRIOV_STATE_FLR_FAILED);
> > + pf_exit_vf_state(gt, vfid, XE_GT_SRIOV_STATE_SAVE_FAILED);
> > + pf_exit_vf_state(gt, vfid, XE_GT_SRIOV_STATE_RESTORE_FAILED);
> > }
> >
> > #define pf_enter_vf_state_machine_bug(gt, vfid) ({ \
> > @@ -359,6 +369,8 @@ static void pf_queue_vf(struct xe_gt *gt, unsigned int vfid)
> >
> > static void pf_exit_vf_flr_wip(struct xe_gt *gt, unsigned int vfid);
> > static void pf_exit_vf_stop_wip(struct xe_gt *gt, unsigned int vfid);
> > +static void pf_exit_vf_save_wip(struct xe_gt *gt, unsigned int vfid);
> > +static void pf_exit_vf_restore_wip(struct xe_gt *gt, unsigned int vfid);
> > static void pf_exit_vf_pause_wip(struct xe_gt *gt, unsigned int vfid);
> > static void pf_exit_vf_resume_wip(struct xe_gt *gt, unsigned int vfid);
> >
> > @@ -380,6 +392,8 @@ static void pf_exit_vf_wip(struct xe_gt *gt, unsigned int vfid)
> >
> > pf_exit_vf_flr_wip(gt, vfid);
> > pf_exit_vf_stop_wip(gt, vfid);
> > + pf_exit_vf_save_wip(gt, vfid);
> > + pf_exit_vf_restore_wip(gt, vfid);
> > pf_exit_vf_pause_wip(gt, vfid);
> > pf_exit_vf_resume_wip(gt, vfid);
> >
> > @@ -399,6 +413,8 @@ static void pf_enter_vf_ready(struct xe_gt *gt, unsigned int vfid)
> > pf_exit_vf_state(gt, vfid, XE_GT_SRIOV_STATE_PAUSED);
> > pf_exit_vf_state(gt, vfid, XE_GT_SRIOV_STATE_STOPPED);
> > pf_exit_vf_state(gt, vfid, XE_GT_SRIOV_STATE_RESUMED);
> > + pf_exit_vf_state(gt, vfid, XE_GT_SRIOV_STATE_SAVED);
> > + pf_exit_vf_state(gt, vfid, XE_GT_SRIOV_STATE_RESTORED);
> > pf_exit_vf_mismatch(gt, vfid);
> > pf_exit_vf_wip(gt, vfid);
> > }
> > @@ -675,6 +691,8 @@ static void pf_enter_vf_resumed(struct xe_gt *gt, unsigned int vfid)
> > {
> > pf_enter_vf_state(gt, vfid, XE_GT_SRIOV_STATE_RESUMED);
> > pf_exit_vf_state(gt, vfid, XE_GT_SRIOV_STATE_PAUSED);
> > + pf_exit_vf_state(gt, vfid, XE_GT_SRIOV_STATE_SAVED);
> > + pf_exit_vf_state(gt, vfid, XE_GT_SRIOV_STATE_RESTORED);
> > pf_exit_vf_mismatch(gt, vfid);
> > pf_exit_vf_wip(gt, vfid);
> > }
> > @@ -753,6 +771,16 @@ int xe_gt_sriov_pf_control_resume_vf(struct xe_gt *gt, unsigned int vfid)
> > return -EPERM;
> > }
> >
> > + if (pf_check_vf_state(gt, vfid, XE_GT_SRIOV_STATE_SAVE_WIP)) {
> > + xe_gt_sriov_dbg(gt, "VF%u save is in progress!\n", vfid);
> > + return -EBUSY;
> > + }
> > +
> > + if (pf_check_vf_state(gt, vfid, XE_GT_SRIOV_STATE_RESTORE_WIP)) {
> > + xe_gt_sriov_dbg(gt, "VF%u restore is in progress!\n", vfid);
> > + return -EBUSY;
> > + }
> > +
> > if (!pf_enter_vf_resume_wip(gt, vfid)) {
> > xe_gt_sriov_dbg(gt, "VF%u resume already in progress!\n", vfid);
> > return -EALREADY;
> > @@ -776,6 +804,218 @@ int xe_gt_sriov_pf_control_resume_vf(struct xe_gt *gt, unsigned int vfid)
> > return -ECANCELED;
> > }
> >
> > +static void pf_exit_vf_save_wip(struct xe_gt *gt, unsigned int vfid)
> > +{
> > + pf_exit_vf_state(gt, vfid, XE_GT_SRIOV_STATE_SAVE_WIP);
> > +}
> > +
> > +static void pf_enter_vf_saved(struct xe_gt *gt, unsigned int vfid)
> > +{
> > + if (!pf_enter_vf_state(gt, vfid, XE_GT_SRIOV_STATE_SAVED))
> > + pf_enter_vf_state_machine_bug(gt, vfid);
> > +
> > + xe_gt_sriov_dbg(gt, "VF%u saved!\n", vfid);
>
> nit: you can move expect(PAUSED) here
Ok.
>
> > +
> > + pf_exit_vf_mismatch(gt, vfid);
> > + pf_exit_vf_wip(gt, vfid);
> > + pf_expect_vf_state(gt, vfid, XE_GT_SRIOV_STATE_PAUSED);
> > +}
> > +
> > +static bool pf_handle_vf_save(struct xe_gt *gt, unsigned int vfid)
> > +{
> > + if (!pf_exit_vf_state(gt, vfid, XE_GT_SRIOV_STATE_SAVE_WIP))
> > + return false;
> > +
> > + pf_enter_vf_saved(gt, vfid);
> > +
> > + return true;
> > +}
> > +
> > +static bool pf_enter_vf_save_wip(struct xe_gt *gt, unsigned int vfid)
> > +{
> > + if (pf_enter_vf_state(gt, vfid, XE_GT_SRIOV_STATE_SAVE_WIP)) {
> > + pf_enter_vf_wip(gt, vfid);
> > + pf_queue_vf(gt, vfid);
> > + return true;
> > + }
> > +
> > + return false;
> > +}
> > +
> > +/**
> > + * xe_gt_sriov_pf_control_trigger_save_vf() - Start an SR-IOV VF migration data save sequence.
> > + * @gt: the &xe_gt
> > + * @vfid: the VF identifier
> > + *
> > + * This function is for PF only.
> > + *
> > + * Return: 0 on success or a negative error code on failure.
> > + */
> > +int xe_gt_sriov_pf_control_trigger_save_vf(struct xe_gt *gt, unsigned int vfid)
> > +{
> > + if (pf_check_vf_state(gt, vfid, XE_GT_SRIOV_STATE_STOPPED)) {
> > + xe_gt_sriov_dbg(gt, "VF%u is stopped!\n", vfid);
> > + return -EPERM;
> > + }
> > +
> > + if (!pf_check_vf_state(gt, vfid, XE_GT_SRIOV_STATE_PAUSED)) {
> > + xe_gt_sriov_dbg(gt, "VF%u is not paused!\n", vfid);
> > + return -EPERM;
> > + }
> > +
> > + if (pf_check_vf_state(gt, vfid, XE_GT_SRIOV_STATE_RESTORE_WIP)) {
> > + xe_gt_sriov_dbg(gt, "VF%u restore is in progress!\n", vfid);
> > + return -EBUSY;
> > + }
> > +
> > + if (!pf_enter_vf_save_wip(gt, vfid)) {
> > + xe_gt_sriov_dbg(gt, "VF%u save already in progress!\n", vfid);
> > + return -EALREADY;
> > + }
> > +
> > + return 0;
> > +}
> > +
> > +/**
> > + * xe_gt_sriov_pf_control_finish_save_vf() - Complete a VF migration data save sequence.
> > + * @gt: the &xe_gt
> > + * @vfid: the VF identifier
> > + *
> > + * This function is for PF only.
> > + *
> > + * Return: 0 on success or a negative error code on failure.
> > + */
> > +int xe_gt_sriov_pf_control_finish_save_vf(struct xe_gt *gt, unsigned int vfid)
> > +{
> > + if (!pf_expect_vf_state(gt, vfid, XE_GT_SRIOV_STATE_SAVED)) {
> > + pf_enter_vf_mismatch(gt, vfid);
> > + return -EIO;
> > + }
> > +
> > + pf_expect_vf_state(gt, vfid, XE_GT_SRIOV_STATE_PAUSED);
> > +
> > + return 0;
> > +}
> > +
> > +static void pf_exit_vf_restore_wip(struct xe_gt *gt, unsigned int vfid)
> > +{
> > + pf_exit_vf_state(gt, vfid, XE_GT_SRIOV_STATE_RESTORE_WIP);
> > +}
> > +
> > +static void pf_enter_vf_restored(struct xe_gt *gt, unsigned int vfid)
> > +{
> > + if (!pf_enter_vf_state(gt, vfid, XE_GT_SRIOV_STATE_RESTORED))
> > + pf_enter_vf_state_machine_bug(gt, vfid);
> > +
> > + xe_gt_sriov_dbg(gt, "VF%u restored!\n", vfid);
> > +
> > + pf_exit_vf_mismatch(gt, vfid);
> > + pf_exit_vf_wip(gt, vfid);
> > + pf_expect_vf_state(gt, vfid, XE_GT_SRIOV_STATE_PAUSED);
> > +}
> > +
> > +static bool pf_handle_vf_restore(struct xe_gt *gt, unsigned int vfid)
> > +{
> > + if (!pf_exit_vf_state(gt, vfid, XE_GT_SRIOV_STATE_RESTORE_WIP))
> > + return false;
> > +
> > + pf_enter_vf_restored(gt, vfid);
> > +
> > + return true;
> > +}
> > +
> > +static bool pf_enter_vf_restore_wip(struct xe_gt *gt, unsigned int vfid)
> > +{
> > + if (pf_enter_vf_state(gt, vfid, XE_GT_SRIOV_STATE_RESTORE_WIP)) {
> > + pf_enter_vf_wip(gt, vfid);
> > + pf_queue_vf(gt, vfid);
> > + return true;
> > + }
> > +
> > + return false;
> > +}
> > +
> > +/**
> > + * xe_gt_sriov_pf_control_trigger_restore_vf() - Start an SR-IOV VF migration data restore sequence.
> > + * @gt: the &xe_gt
> > + * @vfid: the VF identifier
> > + *
> > + * This function is for PF only.
> > + *
> > + * Return: 0 on success or a negative error code on failure.
> > + */
> > +int xe_gt_sriov_pf_control_trigger_restore_vf(struct xe_gt *gt, unsigned int vfid)
> > +{
> > + if (pf_check_vf_state(gt, vfid, XE_GT_SRIOV_STATE_STOPPED)) {
> > + xe_gt_sriov_dbg(gt, "VF%u is stopped!\n", vfid);
> > + return -EPERM;
> > + }
> > +
> > + if (!pf_check_vf_state(gt, vfid, XE_GT_SRIOV_STATE_PAUSED)) {
> > + xe_gt_sriov_dbg(gt, "VF%u is not paused!\n", vfid);
> > + return -EPERM;
> > + }
> > +
> > + if (pf_check_vf_state(gt, vfid, XE_GT_SRIOV_STATE_SAVE_WIP)) {
> > + xe_gt_sriov_dbg(gt, "VF%u save is in progress!\n", vfid);
> > + return -EBUSY;
> > + }
> > +
> > + if (!pf_enter_vf_restore_wip(gt, vfid)) {
> > + xe_gt_sriov_dbg(gt, "VF%u restore already in progress!\n", vfid);
> > + return -EALREADY;
> > + }
> > +
> > + return 0;
> > +}
> > +
> > +static int pf_wait_vf_restore_done(struct xe_gt *gt, unsigned int vfid)
> > +{
> > + unsigned long timeout = pf_get_default_timeout(XE_GT_SRIOV_STATE_RESTORE_WIP);
> > + int err;
> > +
> > + err = pf_wait_vf_wip_done(gt, vfid, timeout);
> > + if (err) {
> > + xe_gt_sriov_notice(gt, "VF%u RESTORE didn't finish in %u ms (%pe)\n",
> > + vfid, jiffies_to_msecs(timeout), ERR_PTR(err));
> > + return err;
> > + }
> > +
> > + if (!pf_expect_vf_not_state(gt, vfid, XE_GT_SRIOV_STATE_RESTORE_FAILED))
> > + return -EIO;
> > +
> > + return 0;
> > +}
> > +
> > +/**
> > + * xe_gt_sriov_pf_control_finish_restore_vf() - Complete a VF migration data restore sequence.
> > + * @gt: the &xe_gt
> > + * @vfid: the VF identifier
> > + *
> > + * This function is for PF only.
> > + *
> > + * Return: 0 on success or a negative error code on failure.
> > + */
> > +int xe_gt_sriov_pf_control_finish_restore_vf(struct xe_gt *gt, unsigned int vfid)
> > +{
> > + int ret;
> > +
> > + if (pf_check_vf_state(gt, vfid, XE_GT_SRIOV_STATE_RESTORE_WIP)) {
> > + ret = pf_wait_vf_restore_done(gt, vfid);
> > + if (ret)
> > + return ret;
> > + }
> > +
> > + if (!pf_expect_vf_state(gt, vfid, XE_GT_SRIOV_STATE_RESTORED)) {
> > + pf_enter_vf_mismatch(gt, vfid);
> > + return -EIO;
> > + }
> > +
> > + pf_expect_vf_state(gt, vfid, XE_GT_SRIOV_STATE_PAUSED);
> > +
> > + return 0;
> > +}
> > +
> > /**
> > * DOC: The VF STOP state machine
> > *
> > @@ -817,6 +1057,8 @@ static void pf_enter_vf_stopped(struct xe_gt *gt, unsigned int vfid)
> >
> > pf_exit_vf_state(gt, vfid, XE_GT_SRIOV_STATE_RESUMED);
> > pf_exit_vf_state(gt, vfid, XE_GT_SRIOV_STATE_PAUSED);
> > + pf_exit_vf_state(gt, vfid, XE_GT_SRIOV_STATE_SAVED);
> > + pf_exit_vf_state(gt, vfid, XE_GT_SRIOV_STATE_RESTORED);
> > pf_exit_vf_mismatch(gt, vfid);
> > pf_exit_vf_wip(gt, vfid);
> > }
> > @@ -1461,6 +1703,12 @@ static bool pf_process_vf_state_machine(struct xe_gt *gt, unsigned int vfid)
> > if (pf_exit_vf_pause_save_guc(gt, vfid))
> > return true;
> >
> > + if (pf_handle_vf_save(gt, vfid))
> > + return true;
> > +
> > + if (pf_handle_vf_restore(gt, vfid))
> > + return true;
> > +
> > if (pf_exit_vf_resume_send_resume(gt, vfid))
> > return true;
> >
> > diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_control.h b/drivers/gpu/drm/xe/xe_gt_sriov_pf_control.h
> > index 8a72ef3778d47..abc233f6302ed 100644
> > --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_control.h
> > +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_control.h
> > @@ -14,8 +14,14 @@ struct xe_gt;
> > int xe_gt_sriov_pf_control_init(struct xe_gt *gt);
> > void xe_gt_sriov_pf_control_restart(struct xe_gt *gt);
> >
> > +bool xe_gt_sriov_pf_control_check_vf_data_wip(struct xe_gt *gt, unsigned int vfid);
> > +
> > int xe_gt_sriov_pf_control_pause_vf(struct xe_gt *gt, unsigned int vfid);
> > int xe_gt_sriov_pf_control_resume_vf(struct xe_gt *gt, unsigned int vfid);
> > +int xe_gt_sriov_pf_control_trigger_save_vf(struct xe_gt *gt, unsigned int vfid);
> > +int xe_gt_sriov_pf_control_finish_save_vf(struct xe_gt *gt, unsigned int vfid);
> > +int xe_gt_sriov_pf_control_trigger_restore_vf(struct xe_gt *gt, unsigned int vfid);
> > +int xe_gt_sriov_pf_control_finish_restore_vf(struct xe_gt *gt, unsigned int vfid);
> > int xe_gt_sriov_pf_control_stop_vf(struct xe_gt *gt, unsigned int vfid);
> > int xe_gt_sriov_pf_control_trigger_flr(struct xe_gt *gt, unsigned int vfid);
> > int xe_gt_sriov_pf_control_sync_flr(struct xe_gt *gt, unsigned int vfid, bool sync);
> > diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_control_types.h b/drivers/gpu/drm/xe/xe_gt_sriov_pf_control_types.h
> > index c80b7e77f1ad2..e113dc98b33ce 100644
> > --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_control_types.h
> > +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_control_types.h
> > @@ -31,6 +31,12 @@
> > * @XE_GT_SRIOV_STATE_PAUSE_SAVE_GUC: indicates that the PF needs to save the VF GuC state.
> > * @XE_GT_SRIOV_STATE_PAUSE_FAILED: indicates that a VF pause operation has failed.
> > * @XE_GT_SRIOV_STATE_PAUSED: indicates that the VF is paused.
> > + * @XE_GT_SRIOV_STATE_SAVE_WIP: indicates that VF save operation is in progress.
> > + * @XE_GT_SRIOV_STATE_SAVE_FAILED: indicates that VF save operation has failed.
> > + * @XE_GT_SRIOV_STATE_SAVED: indicates that VF data is saved.
> > + * @XE_GT_SRIOV_STATE_RESTORE_WIP: indicates that VF restore operation is in progress.
> > + * @XE_GT_SRIOV_STATE_RESTORE_FAILED: indicates that VF restore operation has failed.
> > + * @XE_GT_SRIOV_STATE_RESTORED: indicates that VF data is restored.
> > * @XE_GT_SRIOV_STATE_RESUME_WIP: indicates that a VF resume operation is in progress.
> > * @XE_GT_SRIOV_STATE_RESUME_SEND_RESUME: indicates that the PF is about to send RESUME command.
> > * @XE_GT_SRIOV_STATE_RESUME_FAILED: indicates that a VF resume operation has failed.
> > @@ -63,6 +69,14 @@ enum xe_gt_sriov_control_bits {
> > XE_GT_SRIOV_STATE_PAUSE_FAILED,
> > XE_GT_SRIOV_STATE_PAUSED,
> >
> > + XE_GT_SRIOV_STATE_SAVE_WIP,
> > + XE_GT_SRIOV_STATE_SAVE_FAILED,
> > + XE_GT_SRIOV_STATE_SAVED,
> > +
> > + XE_GT_SRIOV_STATE_RESTORE_WIP,
> > + XE_GT_SRIOV_STATE_RESTORE_FAILED,
> > + XE_GT_SRIOV_STATE_RESTORED,
> > +
> > XE_GT_SRIOV_STATE_RESUME_WIP,
> > XE_GT_SRIOV_STATE_RESUME_SEND_RESUME,
> > XE_GT_SRIOV_STATE_RESUME_FAILED,
>
> it is easier to understand those states after patch 04/26 with diagrams,
> and while there are small and hard to avoid overlaps between 03/26 and 04/26
> the patch itself LGTM, so
>
> Reviewed-by: Michal Wajdeczko <michal.wajdeczko@...el.com>
Thanks,
-Michał
Powered by blists - more mailing lists