lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [day] [month] [year] [list]
Message-ID: <1759906638-867747-1-git-send-email-tariqt@nvidia.com>
Date: Wed, 8 Oct 2025 09:57:18 +0300
From: Tariq Toukan <tariqt@...dia.com>
To: Stephen Hemminger <stephen@...workplumber.org>, David Ahern
	<dsahern@...il.com>, Jiri Pirko <jiri@...nulli.us>, Eric Dumazet
	<edumazet@...gle.com>, Jakub Kicinski <kuba@...nel.org>, Paolo Abeni
	<pabeni@...hat.com>, Andrew Lunn <andrew+netdev@...n.ch>, "David S. Miller"
	<davem@...emloft.net>
CC: Saeed Mahameed <saeedm@...dia.com>, Leon Romanovsky <leon@...nel.org>,
	Tariq Toukan <tariqt@...dia.com>, Mark Bloch <mbloch@...dia.com>,
	<netdev@...r.kernel.org>, <linux-rdma@...r.kernel.org>,
	<linux-kernel@...r.kernel.org>, Gal Pressman <gal@...dia.com>, Moshe Shemesh
	<moshe@...dia.com>, Jiri Pirko <jiri@...dia.com>, Carolina Jubran
	<cjubran@...dia.com>, Shahar Shitrit <shshitrit@...dia.com>
Subject: [PATCH iproute2-next] devlink: Introduce burst period for health reporter

From: Shahar Shitrit <shshitrit@...dia.com>

Add a new "burst_period" option to "devlink health set" to configure
the health reporter's burst period, in milliseconds. The burst period
defines a time window during which recovery attempts for reported
errors are allowed. Once this period expires, the configured grace
period begins.

This feature addresses cases where multiple errors occur
simultaneously due to a common root cause. Without a burst period,
the grace period starts immediately after the first error recovery
attempt finishes. This means that only the first error might be
recovered, while subsequent errors are blocked during the grace period.
With the burst period, the reporter initiates a recovery attempt for
every error reported within this time window before the grace period
starts.

Example (set a burst period of 500 msec on the "tx" reporter):
$ devlink health set pci/0000:00:09.0 reporter tx burst_period 500

Signed-off-by: Shahar Shitrit <shshitrit@...dia.com>
Reviewed-by: Carolina Jubran <cjubran@...dia.com>
Reviewed-by: Jiri Pirko <jiri@...dia.com>
Signed-off-by: Tariq Toukan <tariqt@...dia.com>
---
 bash-completion/devlink   |  4 ++--
 devlink/devlink.c         | 19 +++++++++++++++++++
 man/man8/devlink-health.8 | 20 ++++++++++++++++++++
 3 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/bash-completion/devlink b/bash-completion/devlink
index 52dc82b37ca5..c053d3d08009 100644
--- a/bash-completion/devlink
+++ b/bash-completion/devlink
@@ -792,12 +792,12 @@ _devlink_health()
             if [[ $command == "set" ]]; then
                 case $cword in
                     6)
-                        COMPREPLY=( $( compgen -W "grace_period auto_recover" \
+                        COMPREPLY=( $( compgen -W "grace_period burst_period auto_recover" \
                                    -- "$cur" ) )
                         ;;
                     7)
                         case $prev in
-                            grace_period)
+                            grace_period|burst_period)
                                 # Integer argument- msec
                                 ;;
                             auto_recover)
diff --git a/devlink/devlink.c b/devlink/devlink.c
index 171b85327be3..f77b4449e8c5 100644
--- a/devlink/devlink.c
+++ b/devlink/devlink.c
@@ -311,6 +311,7 @@ static int ifname_map_update(struct ifname_map *ifname_map, const char *ifname)
 #define DL_OPT_PORT_FN_CAPS	BIT(57)
 #define DL_OPT_PORT_FN_MAX_IO_EQS	BIT(58)
 #define DL_OPT_PORT_FN_RATE_TC_BWS	BIT(59)
+#define DL_OPT_HEALTH_REPORTER_BURST_PERIOD	BIT(60)
 
 struct dl_opts {
 	uint64_t present; /* flags of present items */
@@ -346,6 +347,7 @@ struct dl_opts {
 	const char *flash_component;
 	const char *reporter_name;
 	__u64 reporter_graceful_period;
+	__u64 reporter_burst_period;
 	bool reporter_auto_recover;
 	bool reporter_auto_dump;
 	const char *trap_name;
@@ -697,6 +699,7 @@ static const enum mnl_attr_data_type devlink_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_HEALTH_REPORTER_RECOVER_COUNT] = MNL_TYPE_U64,
 	[DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS] = MNL_TYPE_U64,
 	[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] = MNL_TYPE_U64,
+	[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD] = MNL_TYPE_U64,
 	[DEVLINK_ATTR_FLASH_UPDATE_COMPONENT] = MNL_TYPE_STRING,
 	[DEVLINK_ATTR_FLASH_UPDATE_STATUS_MSG] = MNL_TYPE_STRING,
 	[DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE] = MNL_TYPE_U64,
@@ -2101,6 +2104,13 @@ static int dl_argv_parse(struct dl *dl, uint64_t o_required,
 			if (err)
 				return err;
 			o_found |= DL_OPT_HEALTH_REPORTER_GRACEFUL_PERIOD;
+		} else if (dl_argv_match(dl, "burst_period") &&
+			   (o_all & DL_OPT_HEALTH_REPORTER_BURST_PERIOD)) {
+			dl_arg_inc(dl);
+			err = dl_argv_uint64_t(dl, &opts->reporter_burst_period);
+			if (err)
+				return err;
+			o_found |= DL_OPT_HEALTH_REPORTER_BURST_PERIOD;
 		} else if (dl_argv_match(dl, "auto_recover") &&
 			(o_all & DL_OPT_HEALTH_REPORTER_AUTO_RECOVER)) {
 			dl_arg_inc(dl);
@@ -2701,6 +2711,10 @@ static void dl_opts_put(struct nlmsghdr *nlh, struct dl *dl)
 		mnl_attr_put_u64(nlh,
 				 DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD,
 				 opts->reporter_graceful_period);
+	if (opts->present & DL_OPT_HEALTH_REPORTER_BURST_PERIOD)
+		mnl_attr_put_u64(nlh,
+				 DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD,
+				 opts->reporter_burst_period);
 	if (opts->present & DL_OPT_HEALTH_REPORTER_AUTO_RECOVER)
 		mnl_attr_put_u8(nlh, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER,
 				opts->reporter_auto_recover);
@@ -9309,6 +9323,7 @@ static int cmd_health_set_params(struct dl *dl)
 			       NLM_F_REQUEST | NLM_F_ACK);
 	err = dl_argv_parse(dl, DL_OPT_HANDLE | DL_OPT_HANDLEP | DL_OPT_HEALTH_REPORTER_NAME,
 			    DL_OPT_HEALTH_REPORTER_GRACEFUL_PERIOD |
+			    DL_OPT_HEALTH_REPORTER_BURST_PERIOD |
 			    DL_OPT_HEALTH_REPORTER_AUTO_RECOVER |
 			    DL_OPT_HEALTH_REPORTER_AUTO_DUMP);
 	if (err)
@@ -9753,6 +9768,9 @@ static void pr_out_health(struct dl *dl, struct nlattr **tb_health,
 	if (tb[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD])
 		pr_out_u64(dl, "grace_period",
 			   mnl_attr_get_u64(tb[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]));
+	if (tb[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD])
+		pr_out_u64(dl, "burst_period",
+			   mnl_attr_get_u64(tb[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD]));
 	if (tb[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER])
 		print_bool(PRINT_ANY, "auto_recover", " auto_recover %s",
 			   mnl_attr_get_u8(tb[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]));
@@ -9827,6 +9845,7 @@ static void cmd_health_help(void)
 	pr_err("       devlink health dump clear { DEV | DEV/PORT_INDEX } reporter REPORTER_NAME\n");
 	pr_err("       devlink health set { DEV | DEV/PORT_INDEX } reporter REPORTER_NAME\n");
 	pr_err("                          [ grace_period MSEC ]\n");
+	pr_err("                          [ burst_period MSEC ]\n");
 	pr_err("                          [ auto_recover { true | false } ]\n");
 	pr_err("                          [ auto_dump    { true | false } ]\n");
 }
diff --git a/man/man8/devlink-health.8 b/man/man8/devlink-health.8
index 975b8c75d798..fd6818dfadaa 100644
--- a/man/man8/devlink-health.8
+++ b/man/man8/devlink-health.8
@@ -61,6 +61,8 @@ devlink-health \- devlink health reporting and recovery
 [
 .BI "grace_period " MSEC "
 ] [
+.BI "burst_period " MSEC "
+] [
 .BR auto_recover " { " true " | " false " } "
 ] [
 .BR auto_dump " { " true " | " false " } "
@@ -182,6 +184,11 @@ doesn't support a recovery or dump method.
 .BI grace_period " MSEC "
 Time interval between consecutive auto recoveries.
 
+.TP
+.BI burst_period " MSEC "
+Time window for error recoveries before starting the grace period.
+Configuring burst_period is invalid when the grace period is disabled.
+
 .TP
 .BR auto_recover " { " true " | " false " } "
 Indicates whether the devlink should execute automatic recover on error.
@@ -242,6 +249,19 @@ the specified port and reporter.
 devlink health set pci/0000:00:09.0 reporter fw_fatal auto_recover false
 .RS 4
 Turn off auto recovery on the specified device and reporter.
+.RE
+.PP
+devlink health set pci/0000:00:09.0 reporter tx burst_period 5000
+.RS 4
+Set the burst period to 5000 milliseconds on the specified
+device and reporter, prior to initiating the grace period.
+.RE
+.PP
+devlink health set pci/0000:00:09.0 reporter tx grace_period 0
+.RS 4
+Disable grace period on the specified device and reporter. Disabling the grace
+period also deactivates the burst period.
+.RE
 
 .RE
 .SH SEE ALSO

base-commit: 1f7924938884235daa5594f1d0f18c5b07fa9d74
-- 
2.31.1


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ