lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <0a528fe1-59cf-4761-9827-5f12ca886884@nvidia.com>
Date: Mon, 17 Jun 2024 08:17:01 +0300
From: Tariq Toukan <tariqt@...dia.com>
To: Joe Damato <jdamato@...tly.com>, Tariq Toukan <ttoukan.linux@...il.com>,
 netdev@...r.kernel.org, linux-kernel@...r.kernel.org, nalramli@...tly.com,
 Saeed Mahameed <saeedm@...dia.com>, Leon Romanovsky <leon@...nel.org>,
 "David S. Miller" <davem@...emloft.net>, Eric Dumazet <edumazet@...gle.com>,
 Jakub Kicinski <kuba@...nel.org>, Paolo Abeni <pabeni@...hat.com>,
 Richard Cochran <richardcochran@...il.com>,
 "open list:MELLANOX MLX5 core VPI driver" <linux-rdma@...r.kernel.org>
Subject: Re: [net-next v5 2/2] net/mlx5e: Add per queue netdev-genl stats



On 14/06/2024 1:06, Joe Damato wrote:
> On Thu, Jun 13, 2024 at 11:25:12PM +0300, Tariq Toukan wrote:
>>
>>
>> On 12/06/2024 23:08, Joe Damato wrote:
>>> ./cli.py --spec netlink/specs/netdev.yaml \
>>>            --dump qstats-get --json '{"scope": "queue"}'
>>>
>>> ...snip
>>>
>>>    {'ifindex': 7,
>>>     'queue-id': 62,
>>>     'queue-type': 'rx',
>>>     'rx-alloc-fail': 0,
>>>     'rx-bytes': 105965251,
>>>     'rx-packets': 179790},
>>>    {'ifindex': 7,
>>>     'queue-id': 0,
>>>     'queue-type': 'tx',
>>>     'tx-bytes': 9402665,
>>>     'tx-packets': 17551},
>>>
>>> ...snip
>>>
>>> Also tested with the script tools/testing/selftests/drivers/net/stats.py
>>> in several scenarios to ensure stats tallying was correct:
>>>
>>> - on boot (default queue counts)
>>> - adjusting queue count up or down (ethtool -L eth0 combined ...)
>>>
>>> The tools/testing/selftests/drivers/net/stats.py brings the device up,
>>> so to test with the device down, I did the following:
>>>
>>> $ ip link show eth4
>>> 7: eth4: <BROADCAST,MULTICAST> mtu 9000 qdisc mq state DOWN [..snip..]
>>>     [..snip..]
>>>
>>> $ cat /proc/net/dev | grep eth4
>>> eth4: 235710489  434811 [..snip rx..] 2878744 21227  [..snip tx..]
>>>
>>> $ ./cli.py --spec ../../../Documentation/netlink/specs/netdev.yaml \
>>>              --dump qstats-get --json '{"ifindex": 7}'
>>> [{'ifindex': 7,
>>>     'rx-alloc-fail': 0,
>>>     'rx-bytes': 235710489,
>>>     'rx-packets': 434811,
>>>     'tx-bytes': 2878744,
>>>     'tx-packets': 21227}]
>>>
>>> Compare the values in /proc/net/dev match the output of cli for the same
>>> device, even while the device is down.
>>>
>>> Note that while the device is down, per queue stats output nothing
>>> (because the device is down there are no queues):
>>
>> Yeah, the query doesn't reach the device driver...
> 
> Yes. Are you suggesting that I update the commit message? I can do
> so, if you think that is needed?
> 

No, that's fine.

>>>
>>> $ ./cli.py --spec ../../../Documentation/netlink/specs/netdev.yaml \
>>>              --dump qstats-get --json '{"scope": "queue", "ifindex": 7}'
>>> []
>>>
>>> Signed-off-by: Joe Damato <jdamato@...tly.com>
>>> ---
>>>    .../net/ethernet/mellanox/mlx5/core/en_main.c | 132 ++++++++++++++++++
>>>    1 file changed, 132 insertions(+)
>>>
>>> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
>>> index c548e2fdc58f..d3f38b4b18eb 100644
>>> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
>>> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
>>> @@ -39,6 +39,7 @@
>>>    #include <linux/debugfs.h>
>>>    #include <linux/if_bridge.h>
>>>    #include <linux/filter.h>
>>> +#include <net/netdev_queues.h>
>>>    #include <net/page_pool/types.h>
>>>    #include <net/pkt_sched.h>
>>>    #include <net/xdp_sock_drv.h>
>>> @@ -5299,6 +5300,136 @@ static bool mlx5e_tunnel_any_tx_proto_supported(struct mlx5_core_dev *mdev)
>>>    	return (mlx5_vxlan_allowed(mdev->vxlan) || mlx5_geneve_tx_allowed(mdev));
>>>    }
>>> +static void mlx5e_get_queue_stats_rx(struct net_device *dev, int i,
>>> +				     struct netdev_queue_stats_rx *stats)
>>> +{
>>> +	struct mlx5e_priv *priv = netdev_priv(dev);
>>> +	struct mlx5e_channel_stats *channel_stats;
>>> +	struct mlx5e_rq_stats *xskrq_stats;
>>> +	struct mlx5e_rq_stats *rq_stats;
>>> +
>>> +	ASSERT_RTNL();
>>> +	if (mlx5e_is_uplink_rep(priv))
>>> +		return;
>>> +
>>> +	channel_stats = priv->channel_stats[i];
>>> +	xskrq_stats = &channel_stats->xskrq;
>>> +	rq_stats = &channel_stats->rq;
>>> +
>>> +	stats->packets = rq_stats->packets + xskrq_stats->packets;
>>> +	stats->bytes = rq_stats->bytes + xskrq_stats->bytes;
>>> +	stats->alloc_fail = rq_stats->buff_alloc_err +
>>> +			    xskrq_stats->buff_alloc_err;
>>> +}
>>> +
>>> +static void mlx5e_get_queue_stats_tx(struct net_device *dev, int i,
>>> +				     struct netdev_queue_stats_tx *stats)
>>> +{
>>> +	struct mlx5e_priv *priv = netdev_priv(dev);
>>> +	struct mlx5e_sq_stats *sq_stats;
>>> +
>>> +	ASSERT_RTNL();
>>> +	/* no special case needed for ptp htb etc since txq2sq_stats is kept up
>>> +	 * to date for active sq_stats, otherwise get_base_stats takes care of
>>> +	 * inactive sqs.
>>> +	 */
>>> +	sq_stats = priv->txq2sq_stats[i];
>>> +	stats->packets = sq_stats->packets;
>>> +	stats->bytes = sq_stats->bytes;
>>> +}
>>> +
>>> +static void mlx5e_get_base_stats(struct net_device *dev,
>>> +				 struct netdev_queue_stats_rx *rx,
>>> +				 struct netdev_queue_stats_tx *tx)
>>> +{
>>> +	struct mlx5e_priv *priv = netdev_priv(dev);
>>> +	struct mlx5e_ptp *ptp_channel;
>>> +	int i, tc;
>>> +
>>> +	ASSERT_RTNL();
>>> +	if (!mlx5e_is_uplink_rep(priv)) {
>>> +		rx->packets = 0;
>>> +		rx->bytes = 0;
>>> +		rx->alloc_fail = 0;
>>> +
>>> +		for (i = priv->channels.params.num_channels; i < priv->stats_nch; i++) {
>>
>> IIUC, per the current kernel implementation, the lower parts won't be
>> completed in a loop over [0..real_num_rx_queues-1], as that loop is
>> conditional, happening only if the queues are active.
> 
> Sorry, but I'm probably missing something -- you said:
> 
>> as that loop is conditional, happening only if the queues are active.
> 
> I don't think so? Please continue reading for an example with code.
> 
> Let me clarify one thing, please? When you say "the lower parts"
> here you mean [0...priv->channels.params.num_channels], is that
> right?
> 

Right.

> If yes, I don't understand why the code in this v5 is wrong. It looks correct
> to me, if my understanding of "lower parts" is right.
> 
> Here's an example:
> 
> 1. Machine boots with 32 queues by default.
> 2. User runs ethtool -L eth0 combined 4
> 
>  From mlx5/core/en_ethtool.c, mlx5e_ethtool_set_channels:
> 
>    new_params = *cur_params;
>    new_params.num_channels = count;
> 
> So, priv->channels.params.num_channels = 4, [0...4) are the active
> queues.
> 
> The above loop in mlx5e_get_base_stats sums [4...32), which were previously
> active but have since been deactivated by a call to ethtool:
> 
>     for (i = priv->channels.params.num_channels; i < priv->stats_nch; i++)
> 
> The (snipped) code for netdev-genl, net/core/netdev-genl.c
> netdev_nl_stats_by_netdev (which does NOT check IFF_UP) does this:

It does NOT check IFF_UP.. Now things make sense.

> 
>    /* ... */
>    ops->get_base_stats(netdev, &rx_sum, &tx_sum);
> 
>    /* ... */
>    for (i = 0; i < netdev->real_num_rx_queues; i++) {
>      memset(&rx, 0xff, sizeof(rx));
>      if (ops->get_queue_stats_rx)
>              ops->get_queue_stats_rx(netdev, i, &rx);
>      netdev_nl_stats_add(&rx_sum, &rx, sizeof(rx));
>    }
> 
>   /* ... */
>     ... same for netdev->real_num_tx_queues
> 
> The above code gets the base stats (which in my example is [4..32)) and then
> gets the stats for the active RX (and if you continue reading, TX) based on
> real_num_rx_queues and real_num_tx_queues (which would be [0..4)).
> 
> This is why in the commit message, my example:
> 
> $ ./cli.py --spec ../../../Documentation/netlink/specs/netdev.yaml \
>              --dump qstats-get --json '{"ifindex": 7}'
> 
> The numbers match /proc/net/dev even when the device is down because all queues
> active and deactivated are summed properly.
> 
> Do you agree with me so far?
> 

Yep.

> The other case is the per-queue case, which is expressed like this (note the
> different "scope"):
> 
> ./cli.py --spec netlink/specs/netdev.yaml \
>            --dump qstats-get --json '{"scope": "queue"}'
> 
> In this case the user is querying stats on a per queue basis, not overall
> across the device.
> 
> In this case:
>    1. If the device is marked as !IFF_UP (down), an empty set is returned.

Ok, so here is the main difference in comparison to the previous case.
The kernel does the check and doesn't propagate the query to the device 
driver.

I think the kernel can trust the device driver, but ok, that's a 
different discussion.

Thanks for the detailed walkthrough.

>    2. Otherwise, as seen in netdev_nl_stats_by_queue (from net/core/netdev-genl.c):
> 
>      while (ops->get_queue_stats_rx && i < netdev->real_num_rx_queues) {
>        err = netdev_nl_stats_queue(netdev, rsp, NETDEV_QUEUE_TYPE_RX, i, info);
>        /* ... */
>      
>      /* the same for real_num_tx_queues */	
> 
> And so the individual stats for the active queues are returned (as shown in the
> commit message example).
> 
> If you disagree, can you please provide a detailed example so that I can
> understand where I am going wrong?
> 
>> I would like the kernel to drop that condition, and stop forcing the device
>> driver to conditionally include this part in the base.
> 
> I personally don't think the condition should be dropped, but this is a
> question for the implementor, who I believe is Jakub.
> 
> CC: Jakub on Tariq's request/question above.
> 
>> Otherwise, the lower parts need to be added here.
> 
> My understanding is that get_base is only called for the summary stats for the
> entire device, not the per-queue stats, so I don't think the "lower parts"
> (which I take to mean [0...priv->channels.params.num_channels)) need to be added here.
> 

Right.

> The per-queue stats are only called for a specific queue number that is valid
> and will be returned by the other functions, not base.
> 
> Of course, I could be wrong and would appreciate insight from Jakub
> on this, if possible.
> 
>>> +			struct netdev_queue_stats_rx rx_i = {0};
>>> +
>>> +			mlx5e_get_queue_stats_rx(dev, i, &rx_i);
>>> +
>>> +			rx->packets += rx_i.packets;
>>> +			rx->bytes += rx_i.bytes;
>>> +			rx->alloc_fail += rx_i.alloc_fail;
>>> +		}
>>> +
>>> +		/* always report PTP RX stats from base as there is no
>>> +		 * corresponding channel to report them under in
>>> +		 * mlx5e_get_queue_stats_rx.
>>> +		 */
>>> +		if (priv->rx_ptp_opened) {
>>> +			struct mlx5e_rq_stats *rq_stats = &priv->ptp_stats.rq;
>>> +
>>> +			rx->packets += rq_stats->packets;
>>> +			rx->bytes += rq_stats->bytes;
>>> +		}
>>> +	}
>>> +
>>> +	tx->packets = 0;
>>> +	tx->bytes = 0;
>>> +
>>> +	for (i = 0; i < priv->stats_nch; i++) { > +		struct mlx5e_channel_stats *channel_stats = priv->channel_stats[i];
>>> +
>>> +		/* handle two cases:
>>> +		 *
>>> +		 *  1. channels which are active. In this case,
>>> +		 *     report only deactivated TCs on these channels.
>>> +		 *
>>> +		 *  2. channels which were deactivated
>>> +		 *     (i > priv->channels.params.num_channels)
>>> +		 *     must have all of their TCs [0 .. priv->max_opened_tc)
>>> +		 *     examined because deactivated channels will not be in the
>>> +		 *     range of [0..real_num_tx_queues) and will not have their
>>> +		 *     stats reported by mlx5e_get_queue_stats_tx.
>>> +		 */
>>> +		if (i < priv->channels.params.num_channels)
>>> +			tc = mlx5e_get_dcb_num_tc(&priv->channels.params);
>>> +		else
>>> +			tc = 0;
>>> +
>>> +		for (; tc < priv->max_opened_tc; tc++) {
>>> +			struct mlx5e_sq_stats *sq_stats = &channel_stats->sq[tc];
>>> +
>>> +			tx->packets += sq_stats->packets;
>>> +			tx->bytes += sq_stats->bytes;
>>> +		}
>>
>> Again, what about the lower part in case queues are not active?
> 
> I am not trying to be difficult here; I appreciate your time and energy, but I
> think there is still some misunderstanding here.
> 
> Probably on my side ;)
> 
> But, if the queues are not active then any queue above
> priv->channels.params.num_channels (the non active queues) will have all TCs
> summed.
> 

Thanks for your contribution.

Reviewed-by: Tariq Toukan <tariqt@...dia.com>

>>> +	}
>>> +
>>> +	/* if PTP TX was opened at some point and has since either:
>>> +	 *    -  been shutdown and set to NULL, or
>>> +	 *    -  simply disabled (bit unset)
>>> +	 *
>>> +	 * report stats directly from the ptp_stats structures as these queues
>>> +	 * are now unavailable and there is no txq index to retrieve these
>>> +	 * stats via calls to mlx5e_get_queue_stats_tx.
>>> +	 */
>>> +	ptp_channel = priv->channels.ptp;
>>> +	if (priv->tx_ptp_opened && (!ptp_channel || !test_bit(MLX5E_PTP_STATE_TX, ptp_channel->state))) {
>>> +		for (tc = 0; tc < priv->max_opened_tc; tc++) {
>>> +			struct mlx5e_sq_stats *sq_stats = &priv->ptp_stats.sq[tc];
>>> +
>>> +			tx->packets += sq_stats->packets;
>>> +			tx->bytes   += sq_stats->bytes;
>>> +		}
>>> +	}
>>> +}
>>> +
>>> +static const struct netdev_stat_ops mlx5e_stat_ops = {
>>> +	.get_queue_stats_rx  = mlx5e_get_queue_stats_rx,
>>> +	.get_queue_stats_tx  = mlx5e_get_queue_stats_tx,
>>> +	.get_base_stats      = mlx5e_get_base_stats,
>>> +};
>>> +
>>>    static void mlx5e_build_nic_netdev(struct net_device *netdev)
>>>    {
>>>    	struct mlx5e_priv *priv = netdev_priv(netdev);
>>> @@ -5316,6 +5447,7 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev)
>>>    	netdev->watchdog_timeo    = 15 * HZ;
>>> +	netdev->stat_ops	  = &mlx5e_stat_ops;
>>>    	netdev->ethtool_ops	  = &mlx5e_ethtool_ops;
>>>    	netdev->vlan_features    |= NETIF_F_SG;

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ