lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:   Tue, 10 Jan 2023 22:07:23 +0100
From:   Gerhard Engleder <gerhard@...leder-embedded.com>
To:     Alexander H Duyck <alexander.duyck@...il.com>,
        netdev@...r.kernel.org
Cc:     davem@...emloft.net, kuba@...nel.org, edumazet@...gle.com,
        pabeni@...hat.com
Subject: Re: [PATCH net-next v4 05/10] tsnep: Add XDP TX support

On 10.01.23 17:56, Alexander H Duyck wrote:
> nOn Mon, 2023-01-09 at 20:15 +0100, Gerhard Engleder wrote:
>> Implement ndo_xdp_xmit() for XDP TX support. Support for fragmented XDP
>> frames is included.
>>
>> Also some const, braces and logic clean ups are done in normal TX path
>> to keep both TX paths in sync.
>>
>> Signed-off-by: Gerhard Engleder <gerhard@...leder-embedded.com>
>> ---
>>   drivers/net/ethernet/engleder/tsnep.h      |  12 +-
>>   drivers/net/ethernet/engleder/tsnep_main.c | 211 +++++++++++++++++++--
>>   2 files changed, 210 insertions(+), 13 deletions(-)
>>
>> diff --git a/drivers/net/ethernet/engleder/tsnep.h b/drivers/net/ethernet/engleder/tsnep.h
>> index d658413ceb14..9cb267938794 100644
>> --- a/drivers/net/ethernet/engleder/tsnep.h
>> +++ b/drivers/net/ethernet/engleder/tsnep.h
>> @@ -57,6 +57,12 @@ struct tsnep_rxnfc_rule {
>>   	int location;
>>   };
>>   
>> +enum tsnep_tx_type {
>> +	TSNEP_TX_TYPE_SKB,
>> +	TSNEP_TX_TYPE_XDP_TX,
>> +	TSNEP_TX_TYPE_XDP_NDO,
>> +};
>> +
> 
> I'm more a fan of using a bitmap rather than an enum for these type of
> things since otherwise you are having to check for 3 possible values
> for everything.
> 
> Also you may want to reorder this so that SKB is the last option. With
> that 0/1 would be your two XDP values and 2 would be your SKB value. It
> should make it easier for this to be treated more as a decision tree.

I will try to use bitmap.

>>   struct tsnep_tx_entry {
>>   	struct tsnep_tx_desc *desc;
>>   	struct tsnep_tx_desc_wb *desc_wb;
>> @@ -65,7 +71,11 @@ struct tsnep_tx_entry {
>>   
>>   	u32 properties;
>>   
>> -	struct sk_buff *skb;
>> +	enum tsnep_tx_type type;
>> +	union {
>> +		struct sk_buff *skb;
>> +		struct xdp_frame *xdpf;
>> +	};
>>   	size_t len;
>>   	DEFINE_DMA_UNMAP_ADDR(dma);xdp_return_frame_bulk
>>   };
>> diff --git a/drivers/net/ethernet/engleder/tsnep_main.c b/drivers/net/ethernet/engleder/tsnep_main.c
>> index 943de5a09693..1ae73c706c9e 100644
>> --- a/drivers/net/ethernet/engleder/tsnep_main.c
>> +++ b/drivers/net/ethernet/engleder/tsnep_main.c
>> @@ -310,10 +310,12 @@ static void tsnep_tx_activate(struct tsnep_tx *tx, int index, int length,
>>   	struct tsnep_tx_entry *entry = &tx->entry[index];
>>   
>>   	entry->properties = 0;
>> +	/* xdpf is union with skb */
>>   	if (entry->skb) {
>>   		entry->properties = length & TSNEP_DESC_LENGTH_MASK;
>>   		entry->properties |= TSNEP_DESC_INTERRUPT_FLAG;
>> -		if (skb_shinfo(entry->skb)->tx_flags & SKBTX_IN_PROGRESS)
>> +		if (entry->type == Txdp_return_frame_bulkSNEP_TX_TYPE_SKB &&
>> +		    (skb_shinfo(entry->skb)->tx_flags & SKBTX_IN_PROGRESS))
>>   			entry->properties |= TSNEP_DESC_EXTENDED_WRITEBACK_FLAG;
>>   
>>   		/* toggle user flag to prevent false acknowledge
>> @@ -370,7 +372,8 @@ static int tsnep_tx_desc_available(struct tsnep_tx *tx)
>>   		return tx->read - tx->write - 1;
>>   }
>>   
>> -static int tsnep_tx_map(struct sk_buff *skb, struct tsnep_tx *tx, int count)
>> +static int tsnep_tx_map(const struct sk_buff *skb, struct tsnep_tx *tx,
>> +			int count)
>>   {
> 
> This change to const doesn't add anything since this is a static
> function. You could probably just skip making this change since the
> function will likely be inlined anyway.

const was requested for tsnep_xdp_tx_map() during last review round so I
added it also here to keep both function similar.

>>   	struct device *dmadev = tx->adapter->dmadev;
>>   	struct tsnep_tx_entry *entry;
>> @@ -382,7 +385,7 @@ static int tsnep_tx_map(struct sk_buff *skb, struct tsnep_tx *tx, int count)
>>   	for (i = 0; i < count; i++) {
>>   		entry = &tx->entry[(tx->write + i) % TSNEP_RING_SIZE];
>>   
>> -		if (i == 0) {
>> +		if (!i) {
>>   			len = skb_headlen(skb);
>>   			dma = dma_map_single(dmadev, skb->data, len,
>>   					     DMA_TO_DEVICE);
>> @@ -400,6 +403,8 @@ static int tsnep_tx_map(struct sk_buff *skb, struct tsnep_tx *tx, int count)
>>   
>>   		entry->desc->tx = __cpu_to_le64(dma);
>>   
>> +		entry->type = TSNEP_TX_TYPE_SKB;
>> +
>>   		map_len += len;
>>   	}
>>   
> 
> I wonder if it wouldn't be better to change this so that you have a 4th
> type for just "PAGE" or "FRAGMENT" since you only really need to
> identify this as SKB or XDP on the first buffer, and then all the rest
> are just going to be unmapped as page regardless of if it is XDP or
> SKB.

I will it try in combination with your bitmap suggestion.

>> @@ -417,12 +422,13 @@ static int tsnep_tx_unmap(struct tsnep_tx *tx, int index, int count)
>>   		entry = &tx->entry[(index + i) % TSNEP_RING_SIZE];
>>   
>>   		if (entry->len) {
>> -			if (i == 0)
>> +			if (!i && entry->type == TSNEP_TX_TYPE_SKB)
>>   				dma_unmap_single(dmadev,
>>   						 dma_unmap_addr(entry, dma),
>>   						 dma_unmap_len(entry, len),
>>   						 DMA_TO_DEVICE);
>> -			else
>> +			else if (entry->type == TSNEP_TX_TYPE_SKB ||
>> +				 entry->type == TSNEP_TX_TYPE_XDP_NDO)
>>   				dma_unmap_page(dmadev,
>>   					       dma_unmap_addr(entry, dma),
>>   					       dma_unmap_len(entry, len),
> 
> Rather than perform 2 checks here you could just verify type !=
> TYPE_XDP_TX which would save you a check.

Will be improved with your bitmap suggestion.

>> @@ -482,7 +488,7 @@ static netdev_tx_t tsnep_xmit_frame_ring(struct sk_buff *skb,
>>   
>>   	for (i = 0; i < count; i++)
>>   		tsnep_tx_activate(tx, (tx->write + i) % TSNEP_RING_SIZE, length,
>> -				  i == (count - 1));
>> +				  i == count - 1);
>>   	tx->write = (tx->write + count) % TSNEP_RING_SIZE;
>>   
>>   	skb_tx_timestamp(skb);
>> @@ -502,12 +508,133 @@ static netdev_tx_t tsnep_xmit_frame_ring(struct sk_buff *skb,
>>   	return NETDEV_TX_OK;
>>   }
>>   
>> +static int tsnep_xdp_tx_map(const struct xdp_frame *xdpf, struct tsnep_tx *tx,
>> +			    const struct skb_shared_info *shinfo, int count,
>> +			    enum tsnep_tx_type type)
> 
> Again the const here isn't adding any value since this is a static
> function and will likely be inlined into the function below which calls
> it.

const was requested here during last review round so I added it. It may
add some value by detecting some problems at compile time.

>> +{
>> +	struct device *dmadev = tx->adapter->dmadev;
>> +	struct tsnep_tx_entry *entry;
>> +	const skb_frag_t *frag;
>> +	struct page *page;
>> +	unsigned int len;
>> +	int map_len = 0;
>> +	dma_addr_t dma;
>> +	void *data;
>> +	int i;
>> +
>> +	frag = NULL;
>> +	len = xdpf->len;
>> +	for (i = 0; i < count; i++) {
>> +		entry = &tx->entry[(tx->write + i) % TSNEP_RING_SIZE];
>> +		if (type == TSNEP_TX_TYPE_XDP_NDO) {
>> +			data = unlikely(frag) ? skb_frag_address(frag) :
>> +						xdpf->data;
>> +			dma = dma_map_single(dmadev, data, len, DMA_TO_DEVICE);
>> +			if (dma_mapping_error(dmadev, dma))
>> +				return -ENOMEM;
>> +
>> +			entry->type = TSNEP_TX_TYPE_XDP_NDO;
>> +		} else {
>> +			page = unlikely(frag) ? skb_frag_page(frag) :
>> +						virt_to_page(xdpf->data);
>> +			dma = page_pool_get_dma_addr(page);
>> +			if (unlikely(frag))
>> +				dma += skb_frag_off(frag);
>> +			else
>> +				dma += sizeof(*xdpf) + xdpf->headroom;
>> +			dma_sync_single_for_device(dmadev, dma, len,
>> +						   DMA_BIDIRECTIONAL);
>> +
>> +			entry->type = TSNEP_TX_TYPE_XDP_TX;
>> +		}
>> +
>> +		entry->len = len;
>> +		dma_unmap_addr_set(entry, dma, dma);
>> +
>> +		entry->desc->tx = __cpu_to_le64(dma);
>> +
>> +		map_len += len;
>> +
>> +		if (i + 1 < count) {
>> +			frag = &shinfo->frags[i];
>> +			len = skb_frag_size(frag);
>> +		}
>> +	}
>> +
>> +	return map_len;
>> +}
>> +
>> +/* This function requires __netif_tx_lock is held by the caller. */
>> +static bool tsnep_xdp_xmit_frame_ring(struct xdp_frame *xdpf,
>> +				      struct tsnep_tx *tx,
>> +				      enum tsnep_tx_type type)
>> +{
>> +	const struct skb_shared_info *shinfo =
>> +		xdp_get_shared_info_from_frame(xdpf);
>> +	struct tsnep_tx_entry *entry;
>> +	int count, length, retval, i;
>> +
>> +	count = 1;
>> +	if (unlikely(xdp_frame_has_frags(xdpf)))
>> +		count += shinfo->nr_frags;
>> +
>> +	spin_lock_bh(&tx->lock);
>> +
>> +	/* ensure that TX ring is not filled up by XDP, always MAX_SKB_FRAGS
>> +	 * will be available for normal TX path and queue is stopped there if
>> +	 * necessary
>> +	 */
>> +	if (tsnep_tx_desc_available(tx) < (MAX_SKB_FRAGS + 1 + count)) {
>> +		spin_unlock_bh(&tx->lock);
>> +
>> +		return false;
>> +	}
>> +
>> +	entry = &tx->entry[tx->write];
>> +	entry->xdpf = xdpf;
>> +
>> +	retval = tsnep_xdp_tx_map(xdpf, tx, shinfo, count, type);
>> +	if (retval < 0) {
>> +		tsnep_tx_unmap(tx, tx->write, count);
>> +		entry->xdpf = NULL;
>> +
>> +		tx->dropped++;
>> +
>> +		spin_unlock_bh(&tx->lock);
>> +
>> +		return false;
>> +	}
>> +	length = retval;
>> +
>> +	for (i = 0; i < count; i++)
>> +		tsnep_tx_activate(tx, (tx->write + i) % TSNEP_RING_SIZE, length,
>> +				  i == count - 1);
>> +	tx->write = (tx->write + count) % TSNEP_RING_SIZE;
>> +
>> +	/* descriptor properties shall be valid before hardware is notified */
>> +	dma_wmb();
>> +
>> +	spin_unlock_bh(&tx->lock);
>> +
>> +	return true;
>> +}
>> +
>> +static void tsnep_xdp_xmit_flush(struct tsnep_tx *tx)
>> +{
>> +	iowrite32(TSNEP_CONTROL_TX_ENABLE, tx->addr + TSNEP_CONTROL);
>> +}
>> +
>>   static bool tsnep_tx_poll(struct tsnep_tx *tx, int napi_budget)
>>   {
>> -	int budget = 128;
>>   	struct tsnep_tx_entry *entry;
>> -	int count;
>> +	struct xdp_frame_bulk bq;
>> +	int budget = 128;
>>   	int length;
>> +	int count;
>> +
>> +	xdp_frame_bulk_init(&bq);
>> +
>> +	rcu_read_lock(); /* need for xdp_return_frame_bulk */
>>   
> 
> You should be able to get rid of both of these. See comments below.
> 
>>   	spin_lock_bh(&tx->lock);
>>   
>> @@ -527,12 +654,17 @@ static bool tsnep_tx_poll(struct tsnep_tx *tx, int napi_budget)
>>   		dma_rmb();
>>   
>>   		count = 1;
>> -		if (skb_shinfo(entry->skb)->nr_frags > 0)
>> +		if (entry->type == TSNEP_TX_TYPE_SKB &&
>> +		    skb_shinfo(entry->skb)->nr_frags > 0)
>>   			count += skb_shinfo(entry->skb)->nr_frags;
>> +		else if (entry->type != TSNEP_TX_TYPE_SKB &&
>> +			 xdp_frame_has_frags(entry->xdpf))
>> +			count += xdp_get_shared_info_from_frame(entry->xdpf)->nr_frags;
>>   
>>   		length = tsnep_tx_unmap(tx, tx->read, count);
>>   
>> -		if ((skb_shinfo(entry->skb)->tx_flags & SKBTX_IN_PROGRESS) &&
>> +		if (entry->type == TSNEP_TX_TYPE_SKB &&
>> +		    (skb_shinfo(entry->skb)->tx_flags & SKBTX_IN_PROGRESS) &&
>>   		    (__le32_to_cpu(entry->desc_wb->properties) &
>>   		     TSNEP_DESC_EXTENDED_WRITEBACK_FLAG)) {
>>   			struct skb_shared_hwtstamps hwtstamps;
>> @@ -552,7 +684,18 @@ static bool tsnep_tx_poll(struct tsnep_tx *tx, int napi_budget)
>>   			skb_tstamp_tx(entry->skb, &hwtstamps);
>>   		}
>>   
>> -		napi_consume_skb(entry->skb, napi_budget);
>> +		switch (entry->type) {
>> +		case TSNEP_TX_TYPE_SKB:
>> +			napi_consume_skb(entry->skb, napi_budget);
>> +			break;
>> +		case TSNEP_TX_TYPE_XDP_TX:
>> +			xdp_return_frame_rx_napi(entry->xdpf);
>> +			break;
>> +		case TSNEP_TX_TYPE_XDP_NDO:
>> +			xdp_return_frame_bulk(entry->xdpf, &bq);
>> +			break;
>> +		}
>> +		/* xdpf is union with skb */
>>   		entry->skb = NULL;
>>   
>>   		tx->read = (tx->read + count) % TSNEP_RING_SIZE;
> 
> If I am not mistaken I think you can use xdp_return_frame_rx_napi for
> both of these without having to resort to the bulk approach since the
> tsnep_tx_poll is operating in the NAPI polling context.
> 
> When you free the buffers outside of polling context you would then
> want to use the xdp_return_frame_bulk. So for example if you have any
> buffers that weren't transmitted when you get to the point of
> tsnep_tx_ring_cleanup then you would want to unmap and free them there
> using something like the xdp_return_frame or xdp_return_frame_bulk.

This is done similar in other drivers, so I thought it is good practice.
I will compare the performance of xdp_return_frame_rx_napi() and
xdp_return_frame_bulk().

>> @@ -570,7 +713,11 @@ static bool tsnep_tx_poll(struct tsnep_tx *tx, int napi_budget)
>>   
>>   	spin_unlock_bh(&tx->lock);
>>   
>> -	return (budget != 0);
>> +	xdp_flush_frame_bulk(&bq);
>> +
>> +	rcu_read_unlock();
>> +
>> +	return budget != 0;
>>   }
>>   
>>   static bool tsnep_tx_pending(struct tsnep_tx *tx)
>> @@ -1330,6 +1477,45 @@ static ktime_t tsnep_netdev_get_tstamp(struct net_device *netdev,
>>   	return ns_to_ktime(timestamp);
>>   }
>>   
>> +static int tsnep_netdev_xdp_xmit(struct net_device *dev, int n,
>> +				 struct xdp_frame **xdp, u32 flags)
>> +{
>> +	struct tsnep_adapter *adapter = netdev_priv(dev);
>> +	u32 cpu = smp_processor_id();
>> +	struct netdev_queue *nq;
>> +	int nxmit, queue;
>> +	bool xmit;
>> +
>> +	if (unlikely(test_bit(__TSNEP_DOWN, &adapter->state)))
>> +		return -ENETDOWN;
>> +
> 
> Again, here you are better off probably checking for netif_carrier_ok
> rather than adding a new flag.
> 
>> +	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
>> +		return -EINVAL;
>> +
>> +	queue = cpu % adapter->num_tx_queues;
> 
> Modulo math here unconditionally will be expensive. Take a look at
> using reciprocal_scale like we do in skb_tx_hash.

I will try to find a better solution.

>> +	nq = netdev_get_tx_queue(adapter->netdev, queue);
>> +
>> +	__netif_tx_lock(nq, cpu);
>> +
>> +	/* Avoid transmit queue timeout since we share it with the slow path */
>> +	txq_trans_cond_update(nq);
>> +
> 
> Doing this here may not be the right spot. It might make more sense to
> do this after the call to xmit_frame_ring after the break case in order
> to guarantee that you actually put something on the ring before
> updating the state otherwise you can stall the Tx ring without
> triggering the watchdog by just hammering this.
> 
>> +	for (nxmit = 0; nxmit < n; nxmit++) {
>> +		xmit = tsnep_xdp_xmit_frame_ring(xdp[nxmit],
>> +						 &adapter->tx[queue],
>> +						 TSNEP_TX_TYPE_XDP_NDO);
>> +		if (!xmit)
>> +			break;
> 
> So move txq_trans_cond_update(nq) here.

Will be fixed.

Gerhard

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ