lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20210624180632.3659809-11-bcf@google.com>
Date:   Thu, 24 Jun 2021 11:06:26 -0700
From:   Bailey Forrest <bcf@...gle.com>
To:     Bailey Forrest <bcf@...gle.com>,
        "David S . Miller" <davem@...emloft.net>
Cc:     netdev@...r.kernel.org, Willem de Bruijn <willemb@...gle.com>,
        Catherine Sullivan <csully@...gle.com>
Subject: [PATCH net-next 10/16] gve: Add DQO fields for core data structures

- Add new DQO datapath structures:
  - `gve_rx_buf_queue_dqo`
  - `gve_rx_compl_queue_dqo`
  - `gve_rx_buf_state_dqo`
  - `gve_tx_desc_dqo`
  - `gve_tx_pending_packet_dqo`

- Incorporate these into the existing ring data structures:
  - `gve_rx_ring`
  - `gve_tx_ring`

Noteworthy mentions:

- `gve_rx_buf_state` represents an RX buffer which was posted to HW.
  Each RX queue has an array of these objects and the index into the
  array is used as the buffer_id when posted to HW.

- `gve_tx_pending_packet_dqo` is treated similarly for TX queues. The
  completion_tag is the index into the array.

- These two structures have links for linked lists which are represented
  by 16b indexes into a contiguous array of these structures.
  This reduces memory footprint compared to 64b pointers.

- We use unions for the writeable datapath structures to reduce cache
  footprint. GQI specific members will renamed like DQO members in a
  future patch.

Signed-off-by: Bailey Forrest <bcf@...gle.com>
Reviewed-by: Willem de Bruijn <willemb@...gle.com>
Reviewed-by: Catherine Sullivan <csully@...gle.com>
---
 drivers/net/ethernet/google/gve/gve.h | 262 ++++++++++++++++++++++++--
 1 file changed, 251 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/google/gve/gve.h b/drivers/net/ethernet/google/gve/gve.h
index e32730f50bf9..5bfab1ac20d1 100644
--- a/drivers/net/ethernet/google/gve/gve.h
+++ b/drivers/net/ethernet/google/gve/gve.h
@@ -13,6 +13,7 @@
 #include <linux/u64_stats_sync.h>
 
 #include "gve_desc.h"
+#include "gve_desc_dqo.h"
 
 #ifndef PCI_VENDOR_ID_GOOGLE
 #define PCI_VENDOR_ID_GOOGLE	0x1ae0
@@ -80,17 +81,117 @@ struct gve_rx_data_queue {
 
 struct gve_priv;
 
-/* An RX ring that contains a power-of-two sized desc and data ring. */
+/* RX buffer queue for posting buffers to HW.
+ * Each RX (completion) queue has a corresponding buffer queue.
+ */
+struct gve_rx_buf_queue_dqo {
+	struct gve_rx_desc_dqo *desc_ring;
+	dma_addr_t bus;
+	u32 head; /* Pointer to start cleaning buffers at. */
+	u32 tail; /* Last posted buffer index + 1 */
+	u32 mask; /* Mask for indices to the size of the ring */
+};
+
+/* RX completion queue to receive packets from HW. */
+struct gve_rx_compl_queue_dqo {
+	struct gve_rx_compl_desc_dqo *desc_ring;
+	dma_addr_t bus;
+
+	/* Number of slots which did not have a buffer posted yet. We should not
+	 * post more buffers than the queue size to avoid HW overrunning the
+	 * queue.
+	 */
+	int num_free_slots;
+
+	/* HW uses a "generation bit" to notify SW of new descriptors. When a
+	 * descriptor's generation bit is different from the current generation,
+	 * that descriptor is ready to be consumed by SW.
+	 */
+	u8 cur_gen_bit;
+
+	/* Pointer into desc_ring where the next completion descriptor will be
+	 * received.
+	 */
+	u32 head;
+	u32 mask; /* Mask for indices to the size of the ring */
+};
+
+/* Stores state for tracking buffers posted to HW */
+struct gve_rx_buf_state_dqo {
+	/* The page posted to HW. */
+	struct gve_rx_slot_page_info page_info;
+
+	/* The DMA address corresponding to `page_info`. */
+	dma_addr_t addr;
+
+	/* Last offset into the page when it only had a single reference, at
+	 * which point every other offset is free to be reused.
+	 */
+	u32 last_single_ref_offset;
+
+	/* Linked list index to next element in the list, or -1 if none */
+	s16 next;
+};
+
+/* `head` and `tail` are indices into an array, or -1 if empty. */
+struct gve_index_list {
+	s16 head;
+	s16 tail;
+};
+
+/* Contains datapath state used to represent an RX queue. */
 struct gve_rx_ring {
 	struct gve_priv *gve;
-	struct gve_rx_desc_queue desc;
-	struct gve_rx_data_queue data;
+	union {
+		/* GQI fields */
+		struct {
+			struct gve_rx_desc_queue desc;
+			struct gve_rx_data_queue data;
+
+			/* threshold for posting new buffs and descs */
+			u32 db_threshold;
+		};
+
+		/* DQO fields. */
+		struct {
+			struct gve_rx_buf_queue_dqo bufq;
+			struct gve_rx_compl_queue_dqo complq;
+
+			struct gve_rx_buf_state_dqo *buf_states;
+			u16 num_buf_states;
+
+			/* Linked list of gve_rx_buf_state_dqo. Index into
+			 * buf_states, or -1 if empty.
+			 */
+			s16 free_buf_states;
+
+			/* Linked list of gve_rx_buf_state_dqo. Indexes into
+			 * buf_states, or -1 if empty.
+			 *
+			 * This list contains buf_states which are pointing to
+			 * valid buffers.
+			 *
+			 * We use a FIFO here in order to increase the
+			 * probability that buffers can be reused by increasing
+			 * the time between usages.
+			 */
+			struct gve_index_list recycled_buf_states;
+
+			/* Linked list of gve_rx_buf_state_dqo. Indexes into
+			 * buf_states, or -1 if empty.
+			 *
+			 * This list contains buf_states which have buffers
+			 * which cannot be reused yet.
+			 */
+			struct gve_index_list used_buf_states;
+		} dqo;
+	};
+
 	u64 rbytes; /* free-running bytes received */
 	u64 rpackets; /* free-running packets received */
 	u32 cnt; /* free-running total number of completed packets */
 	u32 fill_cnt; /* free-running total number of descs and buffs posted */
 	u32 mask; /* masks the cnt and fill_cnt to the size of the ring */
-	u32 db_threshold; /* threshold for posting new buffs and descs */
 	u64 rx_copybreak_pkt; /* free-running count of copybreak packets */
 	u64 rx_copied_pkt; /* free-running total number of copied packets */
 	u64 rx_skb_alloc_fail; /* free-running count of skb alloc fails */
@@ -141,23 +242,161 @@ struct gve_tx_fifo {
 	struct gve_queue_page_list *qpl; /* QPL mapped into this FIFO */
 };
 
-/* A TX ring that contains a power-of-two sized desc ring and a FIFO buffer */
+/* TX descriptor for DQO format */
+union gve_tx_desc_dqo {
+	struct gve_tx_pkt_desc_dqo pkt;
+	struct gve_tx_tso_context_desc_dqo tso_ctx;
+	struct gve_tx_general_context_desc_dqo general_ctx;
+};
+
+enum gve_packet_state {
+	/* Packet is in free list, available to be allocated.
+	 * This should always be zero since state is not explicitly initialized.
+	 */
+	GVE_PACKET_STATE_UNALLOCATED,
+	/* Packet is expecting a regular data completion or miss completion */
+	GVE_PACKET_STATE_PENDING_DATA_COMPL,
+	/* Packet has received a miss completion and is expecting a
+	 * re-injection completion.
+	 */
+	GVE_PACKET_STATE_PENDING_REINJECT_COMPL,
+	/* No valid completion received within the specified timeout. */
+	GVE_PACKET_STATE_TIMED_OUT_COMPL,
+};
+
+struct gve_tx_pending_packet_dqo {
+	struct sk_buff *skb; /* skb for this packet */
+
+	/* 0th element corresponds to the linear portion of `skb`, should be
+	 * unmapped with `dma_unmap_single`.
+	 *
+	 * All others correspond to `skb`'s frags and should be unmapped with
+	 * `dma_unmap_page`.
+	 */
+	struct gve_tx_dma_buf bufs[MAX_SKB_FRAGS + 1];
+	u16 num_bufs;
+
+	/* Linked list index to next element in the list, or -1 if none */
+	s16 next;
+
+	/* Linked list index to prev element in the list, or -1 if none.
+	 * Used for tracking either outstanding miss completions or prematurely
+	 * freed packets.
+	 */
+	s16 prev;
+
+	/* Identifies the current state of the packet as defined in
+	 * `enum gve_packet_state`.
+	 */
+	u8 state;
+
+	/* If packet is an outstanding miss completion, then the packet is
+	 * freed if the corresponding re-injection completion is not received
+	 * before kernel jiffies exceeds timeout_jiffies.
+	 */
+	unsigned long timeout_jiffies;
+};
+
+/* Contains datapath state used to represent a TX queue. */
 struct gve_tx_ring {
 	/* Cacheline 0 -- Accessed & dirtied during transmit */
-	struct gve_tx_fifo tx_fifo;
-	u32 req; /* driver tracked head pointer */
-	u32 done; /* driver tracked tail pointer */
+	union {
+		/* GQI fields */
+		struct {
+			struct gve_tx_fifo tx_fifo;
+			u32 req; /* driver tracked head pointer */
+			u32 done; /* driver tracked tail pointer */
+		};
+
+		/* DQO fields. */
+		struct {
+			/* Linked list of gve_tx_pending_packet_dqo. Index into
+			 * pending_packets, or -1 if empty.
+			 *
+			 * This is a consumer list owned by the TX path. When it
+			 * runs out, the producer list is stolen from the
+			 * completion handling path
+			 * (dqo_compl.free_pending_packets).
+			 */
+			s16 free_pending_packets;
+
+			/* Cached value of `dqo_compl.hw_tx_head` */
+			u32 head;
+			u32 tail; /* Last posted buffer index + 1 */
+
+			/* Index of the last descriptor with "report event" bit
+			 * set.
+			 */
+			u32 last_re_idx;
+		} dqo_tx;
+	};
 
 	/* Cacheline 1 -- Accessed & dirtied during gve_clean_tx_done */
-	__be32 last_nic_done ____cacheline_aligned; /* NIC tail pointer */
+	union {
+		/* GQI fields */
+		struct {
+			/* NIC tail pointer */
+			__be32 last_nic_done;
+		};
+
+		/* DQO fields. */
+		struct {
+			u32 head; /* Last read on compl_desc */
+
+			/* Tracks the current gen bit of compl_q */
+			u8 cur_gen_bit;
+
+			/* Linked list of gve_tx_pending_packet_dqo. Index into
+			 * pending_packets, or -1 if empty.
+			 *
+			 * This is the producer list, owned by the completion
+			 * handling path. When the consumer list
+			 * (dqo_tx.free_pending_packets) is runs out, this list
+			 * will be stolen.
+			 */
+			atomic_t free_pending_packets;
+
+			/* Last TX ring index fetched by HW */
+			atomic_t hw_tx_head;
+
+			/* List to track pending packets which received a miss
+			 * completion but not a corresponding reinjection.
+			 */
+			struct gve_index_list miss_completions;
+
+			/* List to track pending packets that were completed
+			 * before receiving a valid completion because they
+			 * reached a specified timeout.
+			 */
+			struct gve_index_list timed_out_completions;
+		} dqo_compl;
+	} ____cacheline_aligned;
 	u64 pkt_done; /* free-running - total packets completed */
 	u64 bytes_done; /* free-running - total bytes completed */
 	u64 dropped_pkt; /* free-running - total packets dropped */
 	u64 dma_mapping_error; /* count of dma mapping errors */
 
 	/* Cacheline 2 -- Read-mostly fields */
-	union gve_tx_desc *desc ____cacheline_aligned;
-	struct gve_tx_buffer_state *info; /* Maps 1:1 to a desc */
+	union {
+		/* GQI fields */
+		struct {
+			union gve_tx_desc *desc;
+
+			/* Maps 1:1 to a desc */
+			struct gve_tx_buffer_state *info;
+		};
+
+		/* DQO fields. */
+		struct {
+			union gve_tx_desc_dqo *tx_ring;
+			struct gve_tx_compl_desc *compl_ring;
+
+			struct gve_tx_pending_packet_dqo *pending_packets;
+			s16 num_pending_packets;
+
+			u32 complq_mask; /* complq size is complq_mask + 1 */
+		} dqo;
+	} ____cacheline_aligned;
 	struct netdev_queue *netdev_txq;
 	struct gve_queue_resources *q_resources; /* head and tail pointer idx */
 	struct device *dev;
@@ -171,6 +410,7 @@ struct gve_tx_ring {
 	u32 ntfy_id; /* notification block index */
 	dma_addr_t bus; /* dma address of the descr ring */
 	dma_addr_t q_resources_bus; /* dma address of the queue resources */
+	dma_addr_t complq_bus_dqo; /* dma address of the dqo.compl_ring */
 	struct u64_stats_sync statss; /* sync stats for 32bit archs */
 } ____cacheline_aligned;
 
-- 
2.32.0.288.g62a8d224e6-goog

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ