Message-ID: <20240628204139.458075-1-rushilg@google.com>
Date: Fri, 28 Jun 2024 20:41:39 +0000
From: Rushil Gupta <rushilg@...gle.com>
To: netdev@...r.kernel.org
Cc: jeroendb@...gle.com, pkaligineedi@...gle.com, davem@...emloft.net,
kuba@...nel.org, edumazet@...gle.com, pabeni@...hat.com, willemb@...gle.com,
hramamurthy@...gle.com, Rushil Gupta <rushilg@...gle.com>,
Shailend Chand <shailend@...gle.com>, Ziwei Xiao <ziweixiao@...gle.com>
Subject: [PATCH net-next] gve: Add retry logic for recoverable adminq errors
From: Jeroen de Borst <jeroendb@...gle.com>
An adminq command is retried if it fails with an ETIME error code,
which corresponds to the device's deadline-exceeded error.
The create and destroy adminq commands are now managed via a common
method that keeps track of a return code per queue and retries the
commands only for the queues that failed with ETIME.
Other adminq commands, which do not require queue-level granularity,
are simply retried in gve_adminq_execute_cmd.
Signed-off-by: Rushil Gupta <rushilg@...gle.com>
Signed-off-by: Jeroen de Borst <jeroendb@...gle.com>
Reviewed-by: Shailend Chand <shailend@...gle.com>
Reviewed-by: Ziwei Xiao <ziweixiao@...gle.com>
---
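Note for reviewers: the batched retry scheme used on the create/destroy paths
boils down to issuing a batch of per-queue commands, recording a return code
per command, and retrying only the entries that failed with a transient
deadline-exceeded error, up to a bounded number of attempts. The standalone,
userspace-only sketch below illustrates just that pattern; all names in it are
illustrative stand-ins, not the driver's API.

/* Illustrative userspace sketch only; these names are not part of gve. */
#include <errno.h>
#include <stdio.h>

#define SKETCH_RETRY_COUNT 3
#define SKETCH_MAX_QUEUES 8

/* Stand-in for issuing one per-queue admin command to the device. */
static int sketch_issue_cmd(int queue_id, int attempt)
{
	/* Pretend queue 2 hits a device deadline on its first attempt. */
	return (queue_id == 2 && attempt == 0) ? -ETIME : 0;
}

static int sketch_manage_queues(int num_queues)
{
	int done[SKETCH_MAX_QUEUES] = {0};	/* 1 once a queue's command succeeded */
	int attempt, i, rc, pending = 0;

	for (attempt = 0; attempt < SKETCH_RETRY_COUNT; attempt++) {
		pending = 0;
		for (i = 0; i < num_queues; i++) {
			if (done[i])
				continue;
			rc = sketch_issue_cmd(i, attempt);
			if (rc == 0)
				done[i] = 1;	/* succeeded, drop from next pass */
			else if (rc == -ETIME)
				pending++;	/* transient, retry on the next pass */
			else
				return rc;	/* hard failure, give up immediately */
		}
		if (!pending)
			return 0;
	}
	return -ETIME;	/* still timing out after all retries */
}

int main(void)
{
	/* Expect 0: queue 2 times out once, then succeeds on the retry. */
	printf("sketch_manage_queues: %d\n", sketch_manage_queues(4));
	return 0;
}

The patch's gve_adminq_manage_queues() additionally packs each pass's commands
into adminq batches bounded by the ring size and flushes each batch with
gve_adminq_kick_and_wait(), but the per-queue retry decision is the same.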
drivers/net/ethernet/google/gve/gve_adminq.c | 197 ++++++++++++-------
drivers/net/ethernet/google/gve/gve_adminq.h | 5 +
2 files changed, 129 insertions(+), 73 deletions(-)
diff --git a/drivers/net/ethernet/google/gve/gve_adminq.c b/drivers/net/ethernet/google/gve/gve_adminq.c
index c5bbc1b7524e..74c61b90ea45 100644
--- a/drivers/net/ethernet/google/gve/gve_adminq.c
+++ b/drivers/net/ethernet/google/gve/gve_adminq.c
@@ -12,7 +12,7 @@
#define GVE_MAX_ADMINQ_RELEASE_CHECK 500
#define GVE_ADMINQ_SLEEP_LEN 20
-#define GVE_MAX_ADMINQ_EVENT_COUNTER_CHECK 100
+#define GVE_MAX_ADMINQ_EVENT_COUNTER_CHECK 1000
#define GVE_DEVICE_OPTION_ERROR_FMT "%s option error:\n" \
"Expected: length=%d, feature_mask=%x.\n" \
@@ -415,14 +415,17 @@ static int gve_adminq_parse_err(struct gve_priv *priv, u32 status)
/* Flushes all AQ commands currently queued and waits for them to complete.
* If there are failures, it will return the first error.
*/
-static int gve_adminq_kick_and_wait(struct gve_priv *priv)
+static int gve_adminq_kick_and_wait(struct gve_priv *priv, int ret_cnt, int *ret_codes)
{
int tail, head;
- int i;
+ int i, j;
tail = ioread32be(&priv->reg_bar0->adminq_event_counter);
head = priv->adminq_prod_cnt;
+ if ((head - tail) > ret_cnt)
+ return -EINVAL;
+
gve_adminq_kick_cmd(priv, head);
if (!gve_adminq_wait_for_cmd(priv, head)) {
dev_err(&priv->pdev->dev, "AQ commands timed out, need to reset AQ\n");
@@ -430,16 +433,13 @@ static int gve_adminq_kick_and_wait(struct gve_priv *priv)
return -ENOTRECOVERABLE;
}
- for (i = tail; i < head; i++) {
+ for (i = tail, j = 0; i < head; i++, j++) {
union gve_adminq_command *cmd;
- u32 status, err;
+ u32 status;
cmd = &priv->adminq[i & priv->adminq_mask];
status = be32_to_cpu(READ_ONCE(cmd->status));
- err = gve_adminq_parse_err(priv, status);
- if (err)
- // Return the first error if we failed.
- return err;
+ ret_codes[j] = gve_adminq_parse_err(priv, status);
}
return 0;
@@ -458,24 +458,8 @@ static int gve_adminq_issue_cmd(struct gve_priv *priv,
tail = ioread32be(&priv->reg_bar0->adminq_event_counter);
// Check if next command will overflow the buffer.
- if (((priv->adminq_prod_cnt + 1) & priv->adminq_mask) ==
- (tail & priv->adminq_mask)) {
- int err;
-
- // Flush existing commands to make room.
- err = gve_adminq_kick_and_wait(priv);
- if (err)
- return err;
-
- // Retry.
- tail = ioread32be(&priv->reg_bar0->adminq_event_counter);
- if (((priv->adminq_prod_cnt + 1) & priv->adminq_mask) ==
- (tail & priv->adminq_mask)) {
- // This should never happen. We just flushed the
- // command queue so there should be enough space.
- return -ENOMEM;
- }
- }
+ if ((priv->adminq_prod_cnt - tail) > priv->adminq_mask)
+ return -ENOMEM;
cmd = &priv->adminq[priv->adminq_prod_cnt & priv->adminq_mask];
priv->adminq_prod_cnt++;
@@ -544,8 +528,9 @@ static int gve_adminq_issue_cmd(struct gve_priv *priv,
static int gve_adminq_execute_cmd(struct gve_priv *priv,
union gve_adminq_command *cmd_orig)
{
+ int retry_cnt = 0;
u32 tail, head;
- int err;
+ int err, ret;
mutex_lock(&priv->adminq_lock);
tail = ioread32be(&priv->reg_bar0->adminq_event_counter);
@@ -555,15 +540,21 @@ static int gve_adminq_execute_cmd(struct gve_priv *priv,
goto out;
}
- err = gve_adminq_issue_cmd(priv, cmd_orig);
- if (err)
- goto out;
+ do {
+ err = gve_adminq_issue_cmd(priv, cmd_orig);
+ if (err)
+ goto out;
- err = gve_adminq_kick_and_wait(priv);
+ err = gve_adminq_kick_and_wait(priv, 1, &ret);
+ if (err)
+ goto out;
+ } while (ret == -ETIME && ++retry_cnt < GVE_ADMINQ_RETRY_COUNT);
out:
mutex_unlock(&priv->adminq_lock);
- return err;
+ if (err)
+ return err;
+ return ret;
}
static int gve_adminq_execute_extended_cmd(struct gve_priv *priv, u32 opcode,
@@ -638,6 +629,98 @@ int gve_adminq_deconfigure_device_resources(struct gve_priv *priv)
return gve_adminq_execute_cmd(priv, &cmd);
}
+typedef int (gve_adminq_queue_cmd) (struct gve_priv *priv, u32 queue_index);
+
+static int gve_adminq_manage_queues(struct gve_priv *priv,
+ gve_adminq_queue_cmd *cmd,
+ u32 start_id, u32 num_queues)
+{
+ u32 cmd_idx, queue_idx, ret_code_idx;
+ int queue_done = -1;
+ int *queues_waiting;
+ int retry_cnt = 0;
+ int retry_needed;
+ int *ret_codes;
+ int *commands;
+ int err = 0;
+ int ret = 0;
+
+ queues_waiting = kvcalloc(num_queues, sizeof(int), GFP_KERNEL);
+ if (!queues_waiting)
+ return -ENOMEM;
+ ret_codes = kvcalloc(num_queues, sizeof(int), GFP_KERNEL);
+ if (!ret_codes) {
+ err = -ENOMEM;
+ goto free_with_queues_waiting;
+ }
+ commands = kvcalloc(num_queues, sizeof(int), GFP_KERNEL);
+ if (!commands) {
+ err = -ENOMEM;
+ goto free_with_ret_codes;
+ }
+
+ for (queue_idx = 0; queue_idx < num_queues; queue_idx++)
+ queues_waiting[queue_idx] = start_id + queue_idx;
+ do {
+ retry_needed = 0;
+ queue_idx = 0;
+ while (queue_idx < num_queues) {
+ cmd_idx = 0;
+ while (queue_idx < num_queues) {
+ if (queues_waiting[queue_idx] != queue_done) {
+ err = cmd(priv, queues_waiting[queue_idx]);
+ if (err == -ENOMEM)
+ break;
+ if (err)
+ goto free_with_commands;
+ commands[cmd_idx++] = queue_idx;
+ }
+ queue_idx++;
+ }
+
+ if (queue_idx < num_queues)
+ dev_dbg(&priv->pdev->dev,
+ "Issued %d of %d batched commands\n",
+ queue_idx, num_queues);
+
+ err = gve_adminq_kick_and_wait(priv, cmd_idx, ret_codes);
+ if (err)
+ goto free_with_commands;
+
+ for (ret_code_idx = 0; ret_code_idx < cmd_idx; ret_code_idx++) {
+ if (ret_codes[ret_code_idx] == 0) {
+ queues_waiting[commands[ret_code_idx]] = queue_done;
+ } else if (ret_codes[ret_code_idx] != -ETIME) {
+ ret = ret_codes[ret_code_idx];
+ goto free_with_commands;
+ } else {
+ retry_needed++;
+ }
+ }
+
+ if (retry_needed)
+ dev_dbg(&priv->pdev->dev,
+ "Issued %d batched commands, %d needed a retry\n",
+ cmd_idx, retry_needed);
+ }
+ } while (retry_needed && ++retry_cnt < GVE_ADMINQ_RETRY_COUNT);
+
+ ret = retry_needed ? -ETIME : 0;
+
+free_with_commands:
+ kvfree(commands);
+ commands = NULL;
+free_with_ret_codes:
+ kvfree(ret_codes);
+ ret_codes = NULL;
+free_with_queues_waiting:
+ kvfree(queues_waiting);
+ queues_waiting = NULL;
+ if (err)
+ return err;
+ return ret;
+}
+
static int gve_adminq_create_tx_queue(struct gve_priv *priv, u32 queue_index)
{
struct gve_tx_ring *tx = &priv->tx[queue_index];
@@ -678,16 +761,8 @@ static int gve_adminq_create_tx_queue(struct gve_priv *priv, u32 queue_index)
int gve_adminq_create_tx_queues(struct gve_priv *priv, u32 start_id, u32 num_queues)
{
- int err;
- int i;
-
- for (i = start_id; i < start_id + num_queues; i++) {
- err = gve_adminq_create_tx_queue(priv, i);
- if (err)
- return err;
- }
-
- return gve_adminq_kick_and_wait(priv);
+ return gve_adminq_manage_queues(priv, &gve_adminq_create_tx_queue,
+ start_id, num_queues);
}
static void gve_adminq_get_create_rx_queue_cmd(struct gve_priv *priv,
@@ -759,16 +834,8 @@ int gve_adminq_create_single_rx_queue(struct gve_priv *priv, u32 queue_index)
int gve_adminq_create_rx_queues(struct gve_priv *priv, u32 num_queues)
{
- int err;
- int i;
-
- for (i = 0; i < num_queues; i++) {
- err = gve_adminq_create_rx_queue(priv, i);
- if (err)
- return err;
- }
-
- return gve_adminq_kick_and_wait(priv);
+ return gve_adminq_manage_queues(priv, &gve_adminq_create_rx_queue,
+ 0, num_queues);
}
static int gve_adminq_destroy_tx_queue(struct gve_priv *priv, u32 queue_index)
@@ -791,16 +858,8 @@ static int gve_adminq_destroy_tx_queue(struct gve_priv *priv, u32 queue_index)
int gve_adminq_destroy_tx_queues(struct gve_priv *priv, u32 start_id, u32 num_queues)
{
- int err;
- int i;
-
- for (i = start_id; i < start_id + num_queues; i++) {
- err = gve_adminq_destroy_tx_queue(priv, i);
- if (err)
- return err;
- }
-
- return gve_adminq_kick_and_wait(priv);
+ return gve_adminq_manage_queues(priv, &gve_adminq_destroy_tx_queue,
+ start_id, num_queues);
}
static void gve_adminq_make_destroy_rx_queue_cmd(union gve_adminq_command *cmd,
@@ -832,16 +891,8 @@ int gve_adminq_destroy_single_rx_queue(struct gve_priv *priv, u32 queue_index)
int gve_adminq_destroy_rx_queues(struct gve_priv *priv, u32 num_queues)
{
- int err;
- int i;
-
- for (i = 0; i < num_queues; i++) {
- err = gve_adminq_destroy_rx_queue(priv, i);
- if (err)
- return err;
- }
-
- return gve_adminq_kick_and_wait(priv);
+ return gve_adminq_manage_queues(priv, &gve_adminq_destroy_rx_queue,
+ 0, num_queues);
}
static void gve_set_default_desc_cnt(struct gve_priv *priv,
diff --git a/drivers/net/ethernet/google/gve/gve_adminq.h b/drivers/net/ethernet/google/gve/gve_adminq.h
index ed1370c9b197..96e98f65273c 100644
--- a/drivers/net/ethernet/google/gve/gve_adminq.h
+++ b/drivers/net/ethernet/google/gve/gve_adminq.h
@@ -62,6 +62,11 @@ enum gve_adminq_statuses {
GVE_ADMINQ_COMMAND_ERROR_UNKNOWN_ERROR = 0xFFFFFFFF,
};
+/* AdminQ commands (that aren't batched) will be retried if they encounter
+ * a recoverable error.
+ */
+#define GVE_ADMINQ_RETRY_COUNT 3
+
#define GVE_ADMINQ_DEVICE_DESCRIPTOR_VERSION 1
/* All AdminQ command structs should be naturally packed. The static_assert
--
2.45.2.803.g4e1b14247a-goog