[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <d3c8f29c-22ca-4ece-8beb-ed14587bcaf0@intel.com>
Date: Mon, 17 Jun 2024 21:53:59 +0200
From: "Wysocki, Rafael J" <rafael.j.wysocki@...el.com>
To: Petr Machata <petrm@...dia.com>, "David S. Miller" <davem@...emloft.net>,
Eric Dumazet <edumazet@...gle.com>, Jakub Kicinski <kuba@...nel.org>, "Paolo
Abeni" <pabeni@...hat.com>, <netdev@...r.kernel.org>
CC: Ido Schimmel <idosch@...dia.com>, <mlxsw@...dia.com>, Lukasz Luba
<lukasz.luba@....com>, Daniel Lezcano <daniel.lezcano@...aro.org>, "Vadim
Pasternak" <vadimp@...dia.com>
Subject: Re: [PATCH net 2/3] mlxsw: core_thermal: Fix driver initialization
failure
On 6/17/2024 6:56 PM, Petr Machata wrote:
> From: Ido Schimmel <idosch@...dia.com>
>
> Commit 31a0fa0019b0 ("thermal/debugfs: Pass cooling device state to
> thermal_debug_cdev_add()") changed the thermal core to read the current
> state of the cooling device as part of the cooling device's
> registration. This is incompatible with the current implementation of
> the cooling device operations in mlxsw, leading to initialization
> failure with errors such as:
>
> mlxsw_spectrum 0000:01:00.0: Failed to register cooling device
> mlxsw_spectrum 0000:01:00.0: cannot register bus device
Is this still a problem after
https://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git/commit/?h=thermal&id=1af89dedc8a58006d8e385b1e0d2cd24df8a3b69
which has been merged into 6.10-rc4?
> The reason for the failure is that when the get current state operation
> is invoked the driver tries to derive the index of the cooling device by
> walking a per thermal zone array and looking for the matching cooling
> device pointer. However, the pointer is returned from the registration
> function and therefore only set in the array after the registration.
>
> Fix by passing to the registration function a per cooling device private
> data that already has the cooling device index populated.
>
> Decided to fix the issue in the driver since as far as I can tell other
> drivers do not suffer from this problem.
>
> Fixes: 31a0fa0019b0 ("thermal/debugfs: Pass cooling device state to thermal_debug_cdev_add()")
> Fixes: 755113d76786 ("thermal/debugfs: Add thermal cooling device debugfs information")
> Cc: "Rafael J. Wysocki" <rafael.j.wysocki@...el.com>
> Cc: Lukasz Luba <lukasz.luba@....com>
> Cc: Daniel Lezcano <daniel.lezcano@...aro.org>
> Signed-off-by: Ido Schimmel <idosch@...dia.com>
> Reviewed-by: Vadim Pasternak <vadimp@...dia.com>
> Signed-off-by: Petr Machata <petrm@...dia.com>
> ---
> .../ethernet/mellanox/mlxsw/core_thermal.c | 50 ++++++++++---------
> 1 file changed, 26 insertions(+), 24 deletions(-)
>
> diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
> index 5c511e1a8efa..eee3e37983ca 100644
> --- a/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
> +++ b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
> @@ -100,6 +100,12 @@ static const struct mlxsw_cooling_states default_cooling_states[] = {
>
> struct mlxsw_thermal;
>
> +struct mlxsw_thermal_cooling_device {
> + struct mlxsw_thermal *thermal;
> + struct thermal_cooling_device *cdev;
> + unsigned int idx;
> +};
> +
> struct mlxsw_thermal_module {
> struct mlxsw_thermal *parent;
> struct thermal_zone_device *tzdev;
> @@ -123,7 +129,7 @@ struct mlxsw_thermal {
> const struct mlxsw_bus_info *bus_info;
> struct thermal_zone_device *tzdev;
> int polling_delay;
> - struct thermal_cooling_device *cdevs[MLXSW_MFCR_PWMS_MAX];
> + struct mlxsw_thermal_cooling_device cdevs[MLXSW_MFCR_PWMS_MAX];
> struct thermal_trip trips[MLXSW_THERMAL_NUM_TRIPS];
> struct mlxsw_cooling_states cooling_states[MLXSW_THERMAL_NUM_TRIPS];
> struct mlxsw_thermal_area line_cards[];
> @@ -147,7 +153,7 @@ static int mlxsw_get_cooling_device_idx(struct mlxsw_thermal *thermal,
> int i;
>
> for (i = 0; i < MLXSW_MFCR_PWMS_MAX; i++)
> - if (thermal->cdevs[i] == cdev)
> + if (thermal->cdevs[i].cdev == cdev)
> return i;
>
> /* Allow mlxsw thermal zone binding to an external cooling device */
> @@ -352,17 +358,14 @@ static int mlxsw_thermal_get_cur_state(struct thermal_cooling_device *cdev,
> unsigned long *p_state)
>
> {
> - struct mlxsw_thermal *thermal = cdev->devdata;
> + struct mlxsw_thermal_cooling_device *mlxsw_cdev = cdev->devdata;
> + struct mlxsw_thermal *thermal = mlxsw_cdev->thermal;
> struct device *dev = thermal->bus_info->dev;
> char mfsc_pl[MLXSW_REG_MFSC_LEN];
> - int err, idx;
> u8 duty;
> + int err;
>
> - idx = mlxsw_get_cooling_device_idx(thermal, cdev);
> - if (idx < 0)
> - return idx;
> -
> - mlxsw_reg_mfsc_pack(mfsc_pl, idx, 0);
> + mlxsw_reg_mfsc_pack(mfsc_pl, mlxsw_cdev->idx, 0);
> err = mlxsw_reg_query(thermal->core, MLXSW_REG(mfsc), mfsc_pl);
> if (err) {
> dev_err(dev, "Failed to query PWM duty\n");
> @@ -378,22 +381,19 @@ static int mlxsw_thermal_set_cur_state(struct thermal_cooling_device *cdev,
> unsigned long state)
>
> {
> - struct mlxsw_thermal *thermal = cdev->devdata;
> + struct mlxsw_thermal_cooling_device *mlxsw_cdev = cdev->devdata;
> + struct mlxsw_thermal *thermal = mlxsw_cdev->thermal;
> struct device *dev = thermal->bus_info->dev;
> char mfsc_pl[MLXSW_REG_MFSC_LEN];
> - int idx;
> int err;
>
> if (state > MLXSW_THERMAL_MAX_STATE)
> return -EINVAL;
>
> - idx = mlxsw_get_cooling_device_idx(thermal, cdev);
> - if (idx < 0)
> - return idx;
> -
> /* Normalize the state to the valid speed range. */
> state = max_t(unsigned long, MLXSW_THERMAL_MIN_STATE, state);
> - mlxsw_reg_mfsc_pack(mfsc_pl, idx, mlxsw_state_to_duty(state));
> + mlxsw_reg_mfsc_pack(mfsc_pl, mlxsw_cdev->idx,
> + mlxsw_state_to_duty(state));
> err = mlxsw_reg_write(thermal->core, MLXSW_REG(mfsc), mfsc_pl);
> if (err) {
> dev_err(dev, "Failed to write PWM duty\n");
> @@ -753,17 +753,21 @@ int mlxsw_thermal_init(struct mlxsw_core *core,
> }
> for (i = 0; i < MLXSW_MFCR_PWMS_MAX; i++) {
> if (pwm_active & BIT(i)) {
> + struct mlxsw_thermal_cooling_device *mlxsw_cdev;
> struct thermal_cooling_device *cdev;
>
> + mlxsw_cdev = &thermal->cdevs[i];
> + mlxsw_cdev->thermal = thermal;
> + mlxsw_cdev->idx = i;
> cdev = thermal_cooling_device_register("mlxsw_fan",
> - thermal,
> + mlxsw_cdev,
> &mlxsw_cooling_ops);
> if (IS_ERR(cdev)) {
> err = PTR_ERR(cdev);
> dev_err(dev, "Failed to register cooling device\n");
> goto err_thermal_cooling_device_register;
> }
> - thermal->cdevs[i] = cdev;
> + mlxsw_cdev->cdev = cdev;
> }
> }
>
> @@ -824,8 +828,8 @@ int mlxsw_thermal_init(struct mlxsw_core *core,
> err_thermal_zone_device_register:
> err_thermal_cooling_device_register:
> for (i = 0; i < MLXSW_MFCR_PWMS_MAX; i++)
> - if (thermal->cdevs[i])
> - thermal_cooling_device_unregister(thermal->cdevs[i]);
> + if (thermal->cdevs[i].cdev)
> + thermal_cooling_device_unregister(thermal->cdevs[i].cdev);
> err_reg_write:
> err_reg_query:
> kfree(thermal);
> @@ -848,10 +852,8 @@ void mlxsw_thermal_fini(struct mlxsw_thermal *thermal)
> }
>
> for (i = 0; i < MLXSW_MFCR_PWMS_MAX; i++) {
> - if (thermal->cdevs[i]) {
> - thermal_cooling_device_unregister(thermal->cdevs[i]);
> - thermal->cdevs[i] = NULL;
> - }
> + if (thermal->cdevs[i].cdev)
> + thermal_cooling_device_unregister(thermal->cdevs[i].cdev);
> }
>
> kfree(thermal);
Powered by blists - more mailing lists