lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <26681360-45a5-cfcc-e1f0-1c2e16fafca5@linaro.org>
Date:   Sat, 1 Apr 2023 22:28:52 +0200
From:   Daniel Lezcano <daniel.lezcano@...aro.org>
To:     Petr Machata <petrm@...dia.com>,
        "David S. Miller" <davem@...emloft.net>,
        Eric Dumazet <edumazet@...gle.com>,
        Jakub Kicinski <kuba@...nel.org>,
        Paolo Abeni <pabeni@...hat.com>, netdev@...r.kernel.org
Cc:     "Rafael J. Wysocki" <rafael@...nel.org>,
        Ido Schimmel <idosch@...dia.com>,
        Vadim Pasternak <vadimp@...dia.com>, mlxsw@...dia.com
Subject: Re: [PATCH net-next 1/3] mlxsw: core_thermal: Use static trip points
 for transceiver modules

On 31/03/2023 16:17, Petr Machata wrote:
> From: Ido Schimmel <idosch@...dia.com>
> 
> The driver registers a thermal zone for each transceiver module and
> tries to set the trip point temperatures according to the thresholds
> read from the transceiver. If a threshold cannot be read or if a
> transceiver is unplugged, the trip point temperature is set to zero,
> which means that it is disabled as far as the thermal subsystem is
> concerned.
> 
> A recent change in the thermal core made it so that such trip points are
> no longer marked as disabled, which led the thermal subsystem to
> incorrectly set the associated cooling devices to their maximum
> state [1]. A fix to restore this behavior was merged in commit
> f1b80a3878b2 ("thermal: core: Restore behavior regarding invalid trip
> points"). However, the thermal maintainer suggested to not rely on this
> behavior and instead always register a valid array of trip points [2].
> 
> Therefore, create a static array of trip points with sane defaults
> (suggested by Vadim) and register it with the thermal zone of each
> transceiver module. User space can choose to override these defaults
> using the thermal zone sysfs interface since these files are writeable.
> 
> Before:
> 
>   $ cat /sys/class/thermal/thermal_zone11/type
>   mlxsw-module11
>   $ cat /sys/class/thermal/thermal_zone11/trip_point_*_temp
>   65000
>   75000
>   80000
> 
> After:
> 
>   $ cat /sys/class/thermal/thermal_zone11/type
>   mlxsw-module11
>   $ cat /sys/class/thermal/thermal_zone11/trip_point_*_temp
>   55000
>   65000
>   80000
> 
> Also tested by reverting commit f1b80a3878b2 ("thermal: core: Restore
> behavior regarding invalid trip points") and making sure that the
> associated cooling devices are not set to their maximum state.
> 
> [1] https://lore.kernel.org/linux-pm/ZA3CFNhU4AbtsP4G@shredder/
> [2] https://lore.kernel.org/linux-pm/f78e6b70-a963-c0ca-a4b2-0d4c6aeef1fb@linaro.org/
> 
> Signed-off-by: Ido Schimmel <idosch@...dia.com>
> Reviewed-by: Vadim Pasternak <vadimp@...dia.com>
> Signed-off-by: Petr Machata <petrm@...dia.com>
> ---

Sounds like the changes result in a nice cleanup :)

Thanks for taking care of these changes.

   -- Daniel

>   .../ethernet/mellanox/mlxsw/core_thermal.c    | 110 ++++--------------
>   1 file changed, 25 insertions(+), 85 deletions(-)
> 
> diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
> index 09ed6e5fa6c3..ece5075b7dbf 100644
> --- a/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
> +++ b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
> @@ -19,6 +19,9 @@
>   #define MLXSW_THERMAL_ASIC_TEMP_NORM	75000	/* 75C */
>   #define MLXSW_THERMAL_ASIC_TEMP_HIGH	85000	/* 85C */
>   #define MLXSW_THERMAL_ASIC_TEMP_HOT	105000	/* 105C */
> +#define MLXSW_THERMAL_MODULE_TEMP_NORM	55000	/* 55C */
> +#define MLXSW_THERMAL_MODULE_TEMP_HIGH	65000	/* 65C */
> +#define MLXSW_THERMAL_MODULE_TEMP_HOT	80000	/* 80C */
>   #define MLXSW_THERMAL_HYSTERESIS_TEMP	5000	/* 5C */
>   #define MLXSW_THERMAL_MODULE_TEMP_SHIFT	(MLXSW_THERMAL_HYSTERESIS_TEMP * 2)
>   #define MLXSW_THERMAL_MAX_STATE	10
> @@ -30,12 +33,6 @@ static char * const mlxsw_thermal_external_allowed_cdev[] = {
>   	"mlxreg_fan",
>   };
>   
> -enum mlxsw_thermal_trips {
> -	MLXSW_THERMAL_TEMP_TRIP_NORM,
> -	MLXSW_THERMAL_TEMP_TRIP_HIGH,
> -	MLXSW_THERMAL_TEMP_TRIP_HOT,
> -};
> -
>   struct mlxsw_cooling_states {
>   	int	min_state;
>   	int	max_state;
> @@ -59,6 +56,24 @@ static const struct thermal_trip default_thermal_trips[] = {
>   	},
>   };
>   
> +static const struct thermal_trip default_thermal_module_trips[] = {
> +	{	/* In range - 0-40% PWM */
> +		.type		= THERMAL_TRIP_ACTIVE,
> +		.temperature	= MLXSW_THERMAL_MODULE_TEMP_NORM,
> +		.hysteresis	= MLXSW_THERMAL_HYSTERESIS_TEMP,
> +	},
> +	{
> +		/* In range - 40-100% PWM */
> +		.type		= THERMAL_TRIP_ACTIVE,
> +		.temperature	= MLXSW_THERMAL_MODULE_TEMP_HIGH,
> +		.hysteresis	= MLXSW_THERMAL_HYSTERESIS_TEMP,
> +	},
> +	{	/* Warning */
> +		.type		= THERMAL_TRIP_HOT,
> +		.temperature	= MLXSW_THERMAL_MODULE_TEMP_HOT,
> +	},
> +};
> +
>   static const struct mlxsw_cooling_states default_cooling_states[] = {
>   	{
>   		.min_state	= 0,
> @@ -140,63 +155,6 @@ static int mlxsw_get_cooling_device_idx(struct mlxsw_thermal *thermal,
>   	return -ENODEV;
>   }
>   
> -static void
> -mlxsw_thermal_module_trips_reset(struct mlxsw_thermal_module *tz)
> -{
> -	tz->trips[MLXSW_THERMAL_TEMP_TRIP_NORM].temperature = 0;
> -	tz->trips[MLXSW_THERMAL_TEMP_TRIP_HIGH].temperature = 0;
> -	tz->trips[MLXSW_THERMAL_TEMP_TRIP_HOT].temperature = 0;
> -}
> -
> -static int
> -mlxsw_thermal_module_trips_update(struct device *dev, struct mlxsw_core *core,
> -				  struct mlxsw_thermal_module *tz,
> -				  int crit_temp, int emerg_temp)
> -{
> -	int err;
> -
> -	/* Do not try to query temperature thresholds directly from the module's
> -	 * EEPROM if we got valid thresholds from MTMP.
> -	 */
> -	if (!emerg_temp || !crit_temp) {
> -		err = mlxsw_env_module_temp_thresholds_get(core, tz->slot_index,
> -							   tz->module,
> -							   SFP_TEMP_HIGH_WARN,
> -							   &crit_temp);
> -		if (err)
> -			return err;
> -
> -		err = mlxsw_env_module_temp_thresholds_get(core, tz->slot_index,
> -							   tz->module,
> -							   SFP_TEMP_HIGH_ALARM,
> -							   &emerg_temp);
> -		if (err)
> -			return err;
> -	}
> -
> -	if (crit_temp > emerg_temp) {
> -		dev_warn(dev, "%s : Critical threshold %d is above emergency threshold %d\n",
> -			 tz->tzdev->type, crit_temp, emerg_temp);
> -		return 0;
> -	}
> -
> -	/* According to the system thermal requirements, the thermal zones are
> -	 * defined with three trip points. The critical and emergency
> -	 * temperature thresholds, provided by QSFP module are set as "active"
> -	 * and "hot" trip points, "normal" trip point is derived from "active"
> -	 * by subtracting double hysteresis value.
> -	 */
> -	if (crit_temp >= MLXSW_THERMAL_MODULE_TEMP_SHIFT)
> -		tz->trips[MLXSW_THERMAL_TEMP_TRIP_NORM].temperature = crit_temp -
> -					MLXSW_THERMAL_MODULE_TEMP_SHIFT;
> -	else
> -		tz->trips[MLXSW_THERMAL_TEMP_TRIP_NORM].temperature = crit_temp;
> -	tz->trips[MLXSW_THERMAL_TEMP_TRIP_HIGH].temperature = crit_temp;
> -	tz->trips[MLXSW_THERMAL_TEMP_TRIP_HOT].temperature = emerg_temp;
> -
> -	return 0;
> -}
> -
>   static int mlxsw_thermal_bind(struct thermal_zone_device *tzdev,
>   			      struct thermal_cooling_device *cdev)
>   {
> @@ -358,10 +316,8 @@ static int mlxsw_thermal_module_temp_get(struct thermal_zone_device *tzdev,
>   	struct mlxsw_thermal_module *tz = tzdev->devdata;
>   	struct mlxsw_thermal *thermal = tz->parent;
>   	int temp, crit_temp, emerg_temp;
> -	struct device *dev;
>   	u16 sensor_index;
>   
> -	dev = thermal->bus_info->dev;
>   	sensor_index = MLXSW_REG_MTMP_MODULE_INDEX_MIN + tz->module;
>   
>   	/* Read module temperature and thresholds. */
> @@ -371,13 +327,6 @@ static int mlxsw_thermal_module_temp_get(struct thermal_zone_device *tzdev,
>   						     &crit_temp, &emerg_temp);
>   	*p_temp = temp;
>   
> -	if (!temp)
> -		return 0;
> -
> -	/* Update trip points. */
> -	mlxsw_thermal_module_trips_update(dev, thermal->core, tz,
> -					  crit_temp, emerg_temp);
> -
>   	return 0;
>   }
>   
> @@ -527,10 +476,7 @@ mlxsw_thermal_module_init(struct device *dev, struct mlxsw_core *core,
>   			  struct mlxsw_thermal_area *area, u8 module)
>   {
>   	struct mlxsw_thermal_module *module_tz;
> -	int dummy_temp, crit_temp, emerg_temp;
> -	u16 sensor_index;
>   
> -	sensor_index = MLXSW_REG_MTMP_MODULE_INDEX_MIN + module;
>   	module_tz = &area->tz_module_arr[module];
>   	/* Skip if parent is already set (case of port split). */
>   	if (module_tz->parent)
> @@ -538,19 +484,13 @@ mlxsw_thermal_module_init(struct device *dev, struct mlxsw_core *core,
>   	module_tz->module = module;
>   	module_tz->slot_index = area->slot_index;
>   	module_tz->parent = thermal;
> -	memcpy(module_tz->trips, default_thermal_trips,
> +	BUILD_BUG_ON(ARRAY_SIZE(default_thermal_module_trips) !=
> +		     MLXSW_THERMAL_NUM_TRIPS);
> +	memcpy(module_tz->trips, default_thermal_module_trips,
>   	       sizeof(thermal->trips));
>   	memcpy(module_tz->cooling_states, default_cooling_states,
>   	       sizeof(thermal->cooling_states));
> -	/* Initialize all trip point. */
> -	mlxsw_thermal_module_trips_reset(module_tz);
> -	/* Read module temperature and thresholds. */
> -	mlxsw_thermal_module_temp_and_thresholds_get(core, area->slot_index,
> -						     sensor_index, &dummy_temp,
> -						     &crit_temp, &emerg_temp);
> -	/* Update trip point according to the module data. */
> -	return mlxsw_thermal_module_trips_update(dev, core, module_tz,
> -						 crit_temp, emerg_temp);
> +	return 0;
>   }
>   
>   static void mlxsw_thermal_module_fini(struct mlxsw_thermal_module *module_tz)

-- 
<http://www.linaro.org/> Linaro.org │ Open source software for ARM SoCs

Follow Linaro:  <http://www.facebook.com/pages/Linaro> Facebook |
<http://twitter.com/#!/linaroorg> Twitter |
<http://www.linaro.org/linaro-blog/> Blog

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ