lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date: Thu, 23 May 2024 10:27:54 +0300
From: Tariq Toukan <ttoukan.linux@...il.com>
To: Joe Damato <jdamato@...tly.com>, linux-kernel@...r.kernel.org,
 netdev@...r.kernel.org, zyjzyj2000@...il.com, nalramli@...tly.com,
 Saeed Mahameed <saeedm@...dia.com>, Leon Romanovsky <leon@...nel.org>,
 "David S. Miller" <davem@...emloft.net>, Eric Dumazet <edumazet@...gle.com>,
 Jakub Kicinski <kuba@...nel.org>, Paolo Abeni <pabeni@...hat.com>,
 Richard Cochran <richardcochran@...il.com>,
 "open list:MELLANOX MLX5 core VPI driver" <linux-rdma@...r.kernel.org>,
 Tariq Toukan <tariqt@...dia.com>
Subject: Re: [PATCH net-next v2 1/1] net/mlx5e: Add per queue netdev-genl
 stats



On 15/05/2024 10:19, Joe Damato wrote:
> On Tue, May 14, 2024 at 09:44:37PM +0300, Tariq Toukan wrote:
>>
>>
>> On 10/05/2024 7:17, Joe Damato wrote:
>>> Add functions to support the netdev-genl per queue stats API.
>>>
>>> ./cli.py --spec netlink/specs/netdev.yaml \
>>> --dump qstats-get --json '{"scope": "queue"}'
>>>
>>> ...snip
>>>
>>>    {'ifindex': 7,
>>>     'queue-id': 62,
>>>     'queue-type': 'rx',
>>>     'rx-alloc-fail': 0,
>>>     'rx-bytes': 105965251,
>>>     'rx-packets': 179790},
>>>    {'ifindex': 7,
>>>     'queue-id': 0,
>>>     'queue-type': 'tx',
>>>     'tx-bytes': 9402665,
>>>     'tx-packets': 17551},
>>>
>>> ...snip
>>>
>>> Also tested with the script tools/testing/selftests/drivers/net/stats.py
>>> in several scenarios to ensure stats tallying was correct:
>>>
>>> - on boot (default queue counts)
>>> - adjusting queue count up or down (ethtool -L eth0 combined ...)
>>> - adding mqprio TCs
>>>
>>> Signed-off-by: Joe Damato <jdamato@...tly.com>
>>> ---
>>>    .../net/ethernet/mellanox/mlx5/core/en_main.c | 144 ++++++++++++++++++
>>>    1 file changed, 144 insertions(+)
>>>
>>> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
>>> index ffe8919494d5..4a675d8b31b5 100644
>>> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
>>> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
>>> @@ -39,6 +39,7 @@
>>>    #include <linux/debugfs.h>
>>>    #include <linux/if_bridge.h>
>>>    #include <linux/filter.h>
>>> +#include <net/netdev_queues.h>
>>>    #include <net/page_pool/types.h>
>>>    #include <net/pkt_sched.h>
>>>    #include <net/xdp_sock_drv.h>
>>> @@ -5282,6 +5283,148 @@ static bool mlx5e_tunnel_any_tx_proto_supported(struct mlx5_core_dev *mdev)
>>>    	return (mlx5_vxlan_allowed(mdev->vxlan) || mlx5_geneve_tx_allowed(mdev));
>>>    }
>>> +static void mlx5e_get_queue_stats_rx(struct net_device *dev, int i,
>>> +				     struct netdev_queue_stats_rx *stats)
>>> +{
>>> +	struct mlx5e_priv *priv = netdev_priv(dev);
>>> +
>>> +	if (mlx5e_is_uplink_rep(priv))
>>> +		return;
>>> +
>>> +	struct mlx5e_channel_stats *channel_stats = priv->channel_stats[i];
>>> +	struct mlx5e_rq_stats *xskrq_stats = &channel_stats->xskrq;
>>> +	struct mlx5e_rq_stats *rq_stats = &channel_stats->rq;
>>> +
>>
>> Don't we allow variable declaration only at the beginning of a block?
>> Is this style accepted in the networking subsystem?
>>
>>> +	stats->packets = rq_stats->packets + xskrq_stats->packets;
>>> +	stats->bytes = rq_stats->bytes + xskrq_stats->bytes;
>>> +	stats->alloc_fail = rq_stats->buff_alloc_err +
>>> +			    xskrq_stats->buff_alloc_err;
>>> +}
>>> +
>>> +static void mlx5e_get_queue_stats_tx(struct net_device *dev, int i,
>>> +				     struct netdev_queue_stats_tx *stats)
>>> +{
>>> +	struct mlx5e_priv *priv = netdev_priv(dev);
>>> +	struct net_device *netdev = priv->netdev;
>>> +	struct mlx5e_txqsq *sq;
>>> +	int j;
>>> +
>>> +	if (mlx5e_is_uplink_rep(priv))
>>> +		return;
>>> +
>>> +	for (j = 0; j < netdev->num_tx_queues; j++) {
>>> +		sq = priv->txq2sq[j];
>>
>> No sq instance in case interface is down.
> 
> This seems easily fixable by checking:
> 
>    priv->channels.num > 0
> 

yes, or test_bit(MLX5E_STATE_OPENED, &priv->state).

>> This should be a simple arithmetic calculation.
> 
> I'm not sure why I can't use txq2sq? Please see below for my
> explanation about why I think txq2sq might be all I need.
> 
>> Need to expose the proper functions for this calculation, and use it here
>> and in the sq create flows.
> 
> I re-read the code several times and my apologies, but I am probably
> still missing something.
> 
> I don't think a calculation function is necessary (see below), but
> if one is really needed, I'd probably add something like:
> 
>    static inline int tc_to_txq_ix(struct mlx5e_channel *c,
>                                   struct mlx5e_params *params,
>                                   int tc)
>    {
>           return c->ix + tc * params->num_channels;

We need the opposite direction.

The goal is to calculate the proper pair of (ch_ix, tc) in order to 
access the correct sq_stats struct in the stats array:
priv->channel_stats[ch_ix]->sq[tc];

Given i in [0, real_num_tx_queues), we should extract the pair as follows:

ch_ix = i % params->num_channels;
tc = i / params->num_channels;

>    }
> 
> And call it from mlx5e_open_sqs.
> 
> But, I don't understand why any calculation is needed in
> mlx5e_get_queue_stats_tx. Please see below for explanation.
> 
>>
>> Here it seems that you need a very involved user, so he passes the correct
>> index i of the SQ that he's interested in..
>>
>>> +		if (sq->ch_ix == i) {
>>
>> So you're looking for the first SQ on channel i?
>> But there might be multiple SQs on channel i...
>> Also, this SQ might be already included in the base stats.
>> In addition, this i might be too large for a channel index (num_tx_queues
>> can be 8 * num_channels)
>>
>> The logic here (of mapping from i in num_tx_queues to SQ stats) needs
>> careful definition.
> 
> I read your comments a few times and read the mlx5 source and I am
> probably still missing something obvious here; my apologies.
> 
> In net/core/netdev-genl.c, calls to the driver's get_queue_stats_tx
> appear to pass [0, netdev->real_num_tx_queues) as i.
> 
> I think this means i is a txq_ix in mlx5, because mlx5 sets
> netdev->real_num_tx_queues in mlx5e_update_tx_netdev_queues, as:
> 
>    nch * ntc + qos_queues
> 
> which when expanded is
> 
>    priv->channels.params.num_channels * mlx5e_get_dcb_num_tc + qos_queues
> 
> So, net/core/netdev-genl.c will be using 0 up to that expression as
> i when calling mlx5e_get_queue_stats_tx.
> 

Right.

> In mlx5:
>    - mlx5e_activate_priv_channels calls mlx5e_build_txq_maps which
>      generates priv->txq2sq[txq_ix] = sq for every mlx5e_get_dcb_num_tc
>      of every priv->channels.num.
>   
> This seems to happen every time mlx5e_activate_priv_channels is
> called, which I think means that priv->txq2sq is always up to date
> and will give the right sq for a given txq_ix (assuming the device
> isn't down).
> 

Right.

> Putting all of this together, I think that mlx5e_get_queue_stats_tx
> might need to be something like:
> 
>    mutex_lock(&priv->state_lock);
>    if (priv->channels.num > 0) {
>            sq = priv->txq2sq[i];
>            stats->packets = sq->stats->packets;
>            stats->bytes = sq->stats->bytes;
>    }
>    mutex_unlock(&priv->state_lock);
> 

Right.
But you can also access the sq_stats directly without going through the 
sq. This is important as the channels might be down.


> Is this still incorrect somehow?
> 
> If so, could you explain a bit more about why a calculation is
> needed? It seems like txq2sq would provide the mapping from txq_ix's
> (which is what mlx5e_get_queue_stats_tx gets as 'i') and an sq,
> which seems like all I would need?
> 
> Sorry if I am still not following here.
> 

I see two possible solutions:

1.
a. in the base, sum all stats for entries that are no longer available 
in the current configuration (independently to if the netdev is up or 
down), like sqs for ch_ix >= params->num_channels.
b. in mlx5e_get_queue_stats_tx, access the sq_stats without going 
through the sq (as it might not exist), this will be done for all i in 0 
ti real_num_tx_queues.

2.
a. in the base, sum all stats for all non existing sqs. if interface is 
down, then no sqs exist, so you sum the whole array.
b. in mlx5e_get_queue_stats_tx, go through the txq2sq and the sq, if it 
exists, if interface is down just return empty stats.

I don't have strong preference, although #1 looks slightly better to me.


>>> +			stats->packets = sq->stats->packets;
>>> +			stats->bytes = sq->stats->bytes;
>>> +			return;
>>> +		}
>>> +	}
>>> +}
>>> +
>>> +static void mlx5e_get_base_stats(struct net_device *dev,
>>> +				 struct netdev_queue_stats_rx *rx,
>>> +				 struct netdev_queue_stats_tx *tx)
>>> +{
>>> +	struct mlx5e_priv *priv = netdev_priv(dev);
>>> +	int i, j;
>>> +
>>> +	if (!mlx5e_is_uplink_rep(priv)) {
>>> +		rx->packets = 0;
>>> +		rx->bytes = 0;
>>> +		rx->alloc_fail = 0;
>>> +
>>> +		/* compute stats for deactivated RX queues
>>> +		 *
>>> +		 * if priv->channels.num == 0 the device is down, so compute
>>> +		 * stats for every queue.
>>> +		 *
>>> +		 * otherwise, compute only the queues which have been deactivated.
>>> +		 */
>>> +		if (priv->channels.num == 0)
>>> +			i = 0;
>>> +		else
>>> +			i = priv->channels.params.num_channels;
>>> +
>>> +		for (; i < priv->stats_nch; i++) {
>>> +			struct mlx5e_channel_stats *channel_stats = priv->channel_stats[i];
>>> +			struct mlx5e_rq_stats *xskrq_stats = &channel_stats->xskrq;
>>> +			struct mlx5e_rq_stats *rq_stats = &channel_stats->rq;
>>> +
>>> +			rx->packets += rq_stats->packets + xskrq_stats->packets;
>>> +			rx->bytes += rq_stats->bytes + xskrq_stats->bytes;
>>> +			rx->alloc_fail += rq_stats->buff_alloc_err +
>>> +					  xskrq_stats->buff_alloc_err;
>>
>> Isn't this equivalent to mlx5e_get_queue_stats_rx(i) ?
>>
>>> +		}
>>> +
>>> +		if (priv->rx_ptp_opened) {
>>> +			struct mlx5e_rq_stats *rq_stats = &priv->ptp_stats.rq;
>>> +
>>> +			rx->packets += rq_stats->packets;
>>> +			rx->bytes += rq_stats->bytes;
>>> +		}
>>> +	}
>>> +
>>> +	tx->packets = 0;
>>> +	tx->bytes = 0;
>>> +
>>> +	/* three TX cases to handle:
>>> +	 *
>>> +	 * case 1: priv->channels.num == 0, get the stats for every TC
>>> +	 *         on every queue.
>>> +	 *
>>> +	 * case 2: priv->channel.num > 0, so get the stats for every TC on
>>> +	 *         every deactivated queue.
>>> +	 *
>>> +	 * case 3: the number of TCs has changed, so get the stats for the
>>> +	 *         inactive TCs on active TX queues (handled in the second loop
>>> +	 *         below).
>>> +	 */
>>> +	if (priv->channels.num == 0)
>>> +		i = 0;
>>> +	else
>>> +		i = priv->channels.params.num_channels;
>>> +
>>
>> All reads/writes to priv->channels must be under the priv->state_lock.
>>
>>> +	for (; i < priv->stats_nch; i++) {
>>> +		struct mlx5e_channel_stats *channel_stats = priv->channel_stats[i];
>>> +
>>> +		for (j = 0; j < priv->max_opened_tc; j++) {
>>> +			struct mlx5e_sq_stats *sq_stats = &channel_stats->sq[j];
>>> +
>>> +			tx->packets += sq_stats->packets;
>>> +			tx->bytes += sq_stats->bytes;
>>> +		}
>>> +	}
>>> +
>>> +	/* Handle case 3 described above. */
>>> +	for (i = 0; i < priv->channels.params.num_channels; i++) {
>>> +		struct mlx5e_channel_stats *channel_stats = priv->channel_stats[i];
>>> +		u8 dcb_num_tc = mlx5e_get_dcb_num_tc(&priv->channels.params);
>>> +
>>> +		for (j = dcb_num_tc; j < priv->max_opened_tc; j++) {
>>> +			struct mlx5e_sq_stats *sq_stats = &channel_stats->sq[j];
>>> +
>>> +			tx->packets += sq_stats->packets;
>>> +			tx->bytes += sq_stats->bytes;
>>> +		}
>>> +	}
>>> +
>>> +	if (priv->tx_ptp_opened) {
>>> +		for (j = 0; j < priv->max_opened_tc; j++) {
>>> +			struct mlx5e_sq_stats *sq_stats = &priv->ptp_stats.sq[j];
>>> +
>>> +			tx->packets    += sq_stats->packets;
>>> +			tx->bytes      += sq_stats->bytes;
>>> +		}
>>> +	}
>>> +}
>>> +
>>> +static const struct netdev_stat_ops mlx5e_stat_ops = {
>>> +	.get_queue_stats_rx     = mlx5e_get_queue_stats_rx,
>>> +	.get_queue_stats_tx     = mlx5e_get_queue_stats_tx,
>>> +	.get_base_stats         = mlx5e_get_base_stats,
>>> +};
>>> +
>>>    static void mlx5e_build_nic_netdev(struct net_device *netdev)
>>>    {
>>>    	struct mlx5e_priv *priv = netdev_priv(netdev);
>>> @@ -5299,6 +5442,7 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev)
>>>    	netdev->watchdog_timeo    = 15 * HZ;
>>> +	netdev->stat_ops          = &mlx5e_stat_ops;
>>>    	netdev->ethtool_ops	  = &mlx5e_ethtool_ops;
>>>    	netdev->vlan_features    |= NETIF_F_SG;

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ