lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date: Mon, 17 Jun 2024 21:53:59 +0200
From: "Wysocki, Rafael J" <rafael.j.wysocki@...el.com>
To: Petr Machata <petrm@...dia.com>, "David S. Miller" <davem@...emloft.net>,
	Eric Dumazet <edumazet@...gle.com>, Jakub Kicinski <kuba@...nel.org>, "Paolo
 Abeni" <pabeni@...hat.com>, <netdev@...r.kernel.org>
CC: Ido Schimmel <idosch@...dia.com>, <mlxsw@...dia.com>, Lukasz Luba
	<lukasz.luba@....com>, Daniel Lezcano <daniel.lezcano@...aro.org>, "Vadim
 Pasternak" <vadimp@...dia.com>
Subject: Re: [PATCH net 2/3] mlxsw: core_thermal: Fix driver initialization
 failure

On 6/17/2024 6:56 PM, Petr Machata wrote:
> From: Ido Schimmel <idosch@...dia.com>
>
> Commit 31a0fa0019b0 ("thermal/debugfs: Pass cooling device state to
> thermal_debug_cdev_add()") changed the thermal core to read the current
> state of the cooling device as part of the cooling device's
> registration. This is incompatible with the current implementation of
> the cooling device operations in mlxsw, leading to initialization
> failure with errors such as:
>
> mlxsw_spectrum 0000:01:00.0: Failed to register cooling device
> mlxsw_spectrum 0000:01:00.0: cannot register bus device

Is this still a problem after

https://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git/commit/?h=thermal&id=1af89dedc8a58006d8e385b1e0d2cd24df8a3b69

which has been merged into 6.10-rc4?

> The reason for the failure is that when the get current state operation
> is invoked the driver tries to derive the index of the cooling device by
> walking a per thermal zone array and looking for the matching cooling
> device pointer. However, the pointer is returned from the registration
> function and therefore only set in the array after the registration.
>
> Fix by passing to the registration function a per cooling device private
> data that already has the cooling device index populated.
>
> Decided to fix the issue in the driver since as far as I can tell other
> drivers do not suffer from this problem.
>
> Fixes: 31a0fa0019b0 ("thermal/debugfs: Pass cooling device state to thermal_debug_cdev_add()")
> Fixes: 755113d76786 ("thermal/debugfs: Add thermal cooling device debugfs information")
> Cc: "Rafael J. Wysocki" <rafael.j.wysocki@...el.com>
> Cc: Lukasz Luba <lukasz.luba@....com>
> Cc: Daniel Lezcano <daniel.lezcano@...aro.org>
> Signed-off-by: Ido Schimmel <idosch@...dia.com>
> Reviewed-by: Vadim Pasternak <vadimp@...dia.com>
> Signed-off-by: Petr Machata <petrm@...dia.com>
> ---
>   .../ethernet/mellanox/mlxsw/core_thermal.c    | 50 ++++++++++---------
>   1 file changed, 26 insertions(+), 24 deletions(-)
>
> diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
> index 5c511e1a8efa..eee3e37983ca 100644
> --- a/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
> +++ b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
> @@ -100,6 +100,12 @@ static const struct mlxsw_cooling_states default_cooling_states[] = {
>   
>   struct mlxsw_thermal;
>   
> +struct mlxsw_thermal_cooling_device {
> +	struct mlxsw_thermal *thermal;
> +	struct thermal_cooling_device *cdev;
> +	unsigned int idx;
> +};
> +
>   struct mlxsw_thermal_module {
>   	struct mlxsw_thermal *parent;
>   	struct thermal_zone_device *tzdev;
> @@ -123,7 +129,7 @@ struct mlxsw_thermal {
>   	const struct mlxsw_bus_info *bus_info;
>   	struct thermal_zone_device *tzdev;
>   	int polling_delay;
> -	struct thermal_cooling_device *cdevs[MLXSW_MFCR_PWMS_MAX];
> +	struct mlxsw_thermal_cooling_device cdevs[MLXSW_MFCR_PWMS_MAX];
>   	struct thermal_trip trips[MLXSW_THERMAL_NUM_TRIPS];
>   	struct mlxsw_cooling_states cooling_states[MLXSW_THERMAL_NUM_TRIPS];
>   	struct mlxsw_thermal_area line_cards[];
> @@ -147,7 +153,7 @@ static int mlxsw_get_cooling_device_idx(struct mlxsw_thermal *thermal,
>   	int i;
>   
>   	for (i = 0; i < MLXSW_MFCR_PWMS_MAX; i++)
> -		if (thermal->cdevs[i] == cdev)
> +		if (thermal->cdevs[i].cdev == cdev)
>   			return i;
>   
>   	/* Allow mlxsw thermal zone binding to an external cooling device */
> @@ -352,17 +358,14 @@ static int mlxsw_thermal_get_cur_state(struct thermal_cooling_device *cdev,
>   				       unsigned long *p_state)
>   
>   {
> -	struct mlxsw_thermal *thermal = cdev->devdata;
> +	struct mlxsw_thermal_cooling_device *mlxsw_cdev = cdev->devdata;
> +	struct mlxsw_thermal *thermal = mlxsw_cdev->thermal;
>   	struct device *dev = thermal->bus_info->dev;
>   	char mfsc_pl[MLXSW_REG_MFSC_LEN];
> -	int err, idx;
>   	u8 duty;
> +	int err;
>   
> -	idx = mlxsw_get_cooling_device_idx(thermal, cdev);
> -	if (idx < 0)
> -		return idx;
> -
> -	mlxsw_reg_mfsc_pack(mfsc_pl, idx, 0);
> +	mlxsw_reg_mfsc_pack(mfsc_pl, mlxsw_cdev->idx, 0);
>   	err = mlxsw_reg_query(thermal->core, MLXSW_REG(mfsc), mfsc_pl);
>   	if (err) {
>   		dev_err(dev, "Failed to query PWM duty\n");
> @@ -378,22 +381,19 @@ static int mlxsw_thermal_set_cur_state(struct thermal_cooling_device *cdev,
>   				       unsigned long state)
>   
>   {
> -	struct mlxsw_thermal *thermal = cdev->devdata;
> +	struct mlxsw_thermal_cooling_device *mlxsw_cdev = cdev->devdata;
> +	struct mlxsw_thermal *thermal = mlxsw_cdev->thermal;
>   	struct device *dev = thermal->bus_info->dev;
>   	char mfsc_pl[MLXSW_REG_MFSC_LEN];
> -	int idx;
>   	int err;
>   
>   	if (state > MLXSW_THERMAL_MAX_STATE)
>   		return -EINVAL;
>   
> -	idx = mlxsw_get_cooling_device_idx(thermal, cdev);
> -	if (idx < 0)
> -		return idx;
> -
>   	/* Normalize the state to the valid speed range. */
>   	state = max_t(unsigned long, MLXSW_THERMAL_MIN_STATE, state);
> -	mlxsw_reg_mfsc_pack(mfsc_pl, idx, mlxsw_state_to_duty(state));
> +	mlxsw_reg_mfsc_pack(mfsc_pl, mlxsw_cdev->idx,
> +			    mlxsw_state_to_duty(state));
>   	err = mlxsw_reg_write(thermal->core, MLXSW_REG(mfsc), mfsc_pl);
>   	if (err) {
>   		dev_err(dev, "Failed to write PWM duty\n");
> @@ -753,17 +753,21 @@ int mlxsw_thermal_init(struct mlxsw_core *core,
>   	}
>   	for (i = 0; i < MLXSW_MFCR_PWMS_MAX; i++) {
>   		if (pwm_active & BIT(i)) {
> +			struct mlxsw_thermal_cooling_device *mlxsw_cdev;
>   			struct thermal_cooling_device *cdev;
>   
> +			mlxsw_cdev = &thermal->cdevs[i];
> +			mlxsw_cdev->thermal = thermal;
> +			mlxsw_cdev->idx = i;
>   			cdev = thermal_cooling_device_register("mlxsw_fan",
> -							       thermal,
> +							       mlxsw_cdev,
>   							       &mlxsw_cooling_ops);
>   			if (IS_ERR(cdev)) {
>   				err = PTR_ERR(cdev);
>   				dev_err(dev, "Failed to register cooling device\n");
>   				goto err_thermal_cooling_device_register;
>   			}
> -			thermal->cdevs[i] = cdev;
> +			mlxsw_cdev->cdev = cdev;
>   		}
>   	}
>   
> @@ -824,8 +828,8 @@ int mlxsw_thermal_init(struct mlxsw_core *core,
>   err_thermal_zone_device_register:
>   err_thermal_cooling_device_register:
>   	for (i = 0; i < MLXSW_MFCR_PWMS_MAX; i++)
> -		if (thermal->cdevs[i])
> -			thermal_cooling_device_unregister(thermal->cdevs[i]);
> +		if (thermal->cdevs[i].cdev)
> +			thermal_cooling_device_unregister(thermal->cdevs[i].cdev);
>   err_reg_write:
>   err_reg_query:
>   	kfree(thermal);
> @@ -848,10 +852,8 @@ void mlxsw_thermal_fini(struct mlxsw_thermal *thermal)
>   	}
>   
>   	for (i = 0; i < MLXSW_MFCR_PWMS_MAX; i++) {
> -		if (thermal->cdevs[i]) {
> -			thermal_cooling_device_unregister(thermal->cdevs[i]);
> -			thermal->cdevs[i] = NULL;
> -		}
> +		if (thermal->cdevs[i].cdev)
> +			thermal_cooling_device_unregister(thermal->cdevs[i].cdev);
>   	}
>   
>   	kfree(thermal);

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ