[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CO1PR11MB5089BDFAF1B498FE9FDDA3FCD6DE2@CO1PR11MB5089.namprd11.prod.outlook.com>
Date: Tue, 18 Mar 2025 22:05:23 +0000
From: "Keller, Jacob E" <jacob.e.keller@...el.com>
To: Jiri Pirko <jiri@...nulli.us>, "netdev@...r.kernel.org"
<netdev@...r.kernel.org>
CC: "davem@...emloft.net" <davem@...emloft.net>, "Dumazet, Eric"
<edumazet@...gle.com>, "kuba@...nel.org" <kuba@...nel.org>,
"pabeni@...hat.com" <pabeni@...hat.com>, "saeedm@...dia.com"
<saeedm@...dia.com>, "leon@...nel.org" <leon@...nel.org>, "tariqt@...dia.com"
<tariqt@...dia.com>, "andrew+netdev@...n.ch" <andrew+netdev@...n.ch>,
"dakr@...nel.org" <dakr@...nel.org>, "rafael@...nel.org" <rafael@...nel.org>,
"gregkh@...uxfoundation.org" <gregkh@...uxfoundation.org>, "Kitszel,
Przemyslaw" <przemyslaw.kitszel@...el.com>, "Nguyen, Anthony L"
<anthony.l.nguyen@...el.com>, "cratiu@...dia.com" <cratiu@...dia.com>,
"Knitter, Konrad" <konrad.knitter@...el.com>, "cjubran@...dia.com"
<cjubran@...dia.com>
Subject: RE: [PATCH net-next RFC 2/3] net/mlx5: Introduce shared devlink
instance for PFs on same chip
> -----Original Message-----
> From: Jiri Pirko <jiri@...nulli.us>
> Sent: Tuesday, March 18, 2025 5:47 AM
> To: netdev@...r.kernel.org
> Cc: davem@...emloft.net; Dumazet, Eric <edumazet@...gle.com>;
> kuba@...nel.org; pabeni@...hat.com; saeedm@...dia.com; leon@...nel.org;
> tariqt@...dia.com; andrew+netdev@...n.ch; dakr@...nel.org;
> rafael@...nel.org; gregkh@...uxfoundation.org; Kitszel, Przemyslaw
> <przemyslaw.kitszel@...el.com>; Nguyen, Anthony L
> <anthony.l.nguyen@...el.com>; cratiu@...dia.com; Keller, Jacob E
> <jacob.e.keller@...el.com>; Knitter, Konrad <konrad.knitter@...el.com>;
> cjubran@...dia.com
> Subject: [PATCH net-next RFC 2/3] net/mlx5: Introduce shared devlink instance for
> PFs on same chip
>
> From: Jiri Pirko <jiri@...dia.com>
>
> Multiple PFS may reside on the same physical chip, running a single
> firmware. Some of the resources and configurations may be shared among
> these PFs. Currently, there is not good object to pin the configuration
> knobs on.
>
> Introduce a shared devlink, instantiated upon probe of the first PF,
> removed during remove of the last PF. Back this shared devlink instance
> by faux device, as there is no PCI device related to it.
>
> Make the PF devlink instances nested in this shared devlink instance.
>
> Example:
>
> $ devlink dev
> pci/0000:08:00.0:
> nested_devlink:
> auxiliary/mlx5_core.eth.0
> faux/mlx5_core_83013c12b77faa1a30000c82a1045c91:
> nested_devlink:
> pci/0000:08:00.0
> pci/0000:08:00.1
> auxiliary/mlx5_core.eth.0
> pci/0000:08:00.1:
> nested_devlink:
> auxiliary/mlx5_core.eth.1
> auxiliary/mlx5_core.eth.1
>
> Signed-off-by: Jiri Pirko <jiri@...dia.com>
> ---
> .../net/ethernet/mellanox/mlx5/core/Makefile | 4 +-
> .../net/ethernet/mellanox/mlx5/core/main.c | 18 +++
> .../ethernet/mellanox/mlx5/core/sh_devlink.c | 150 ++++++++++++++++++
> .../ethernet/mellanox/mlx5/core/sh_devlink.h | 10 ++
> include/linux/mlx5/driver.h | 5 +
> 5 files changed, 185 insertions(+), 2 deletions(-)
> create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/sh_devlink.c
> create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/sh_devlink.h
>
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
> b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
> index 568bbe5f83f5..510850b6e6e2 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
> @@ -16,8 +16,8 @@ mlx5_core-y := main.o cmd.o debugfs.o fw.o eq.o uar.o
> pagealloc.o \
> transobj.o vport.o sriov.o fs_cmd.o fs_core.o pci_irq.o \
> fs_counters.o fs_ft_pool.o rl.o lag/debugfs.o lag/lag.o dev.o
> events.o wq.o lib/gid.o \
> lib/devcom.o lib/pci_vsc.o lib/dm.o lib/fs_ttc.o
> diag/fs_tracepoint.o \
> - diag/fw_tracer.o diag/crdump.o devlink.o diag/rsc_dump.o
> diag/reporter_vnic.o \
> - fw_reset.o qos.o lib/tout.o lib/aso.o wc.o fs_pool.o
> + diag/fw_tracer.o diag/crdump.o devlink.o sh_devlink.o
> diag/rsc_dump.o \
> + diag/reporter_vnic.o fw_reset.o qos.o lib/tout.o lib/aso.o wc.o
> fs_pool.o
>
> #
> # Netdev basic
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c
> b/drivers/net/ethernet/mellanox/mlx5/core/main.c
> index 710633d5fdbe..e1217a8bf4db 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
> @@ -74,6 +74,7 @@
> #include "mlx5_irq.h"
> #include "hwmon.h"
> #include "lag/lag.h"
> +#include "sh_devlink.h"
>
> MODULE_AUTHOR("Eli Cohen <eli@...lanox.com>");
> MODULE_DESCRIPTION("Mellanox 5th generation network adapters (ConnectX
> series) core driver");
> @@ -1554,10 +1555,17 @@ int mlx5_init_one(struct mlx5_core_dev *dev)
> int err;
>
> devl_lock(devlink);
> + if (dev->shd) {
> + err = devl_nested_devlink_set(priv_to_devlink(dev->shd),
> + devlink);
> + if (err)
> + goto unlock;
> + }
> devl_register(devlink);
> err = mlx5_init_one_devl_locked(dev);
> if (err)
> devl_unregister(devlink);
> +unlock:
> devl_unlock(devlink);
> return err;
> }
> @@ -1998,6 +2006,13 @@ static int probe_one(struct pci_dev *pdev, const struct
> pci_device_id *id)
> goto pci_init_err;
> }
>
> + err = mlx5_shd_init(dev);
> + if (err) {
> + mlx5_core_err(dev, "mlx5_shd_init failed with error code %d\n",
> + err);
> + goto shd_init_err;
> + }
> +
> err = mlx5_init_one(dev);
> if (err) {
> mlx5_core_err(dev, "mlx5_init_one failed with error code %d\n",
> @@ -2009,6 +2024,8 @@ static int probe_one(struct pci_dev *pdev, const struct
> pci_device_id *id)
> return 0;
>
> err_init_one:
> + mlx5_shd_uninit(dev);
> +shd_init_err:
> mlx5_pci_close(dev);
> pci_init_err:
> mlx5_mdev_uninit(dev);
> @@ -2030,6 +2047,7 @@ static void remove_one(struct pci_dev *pdev)
> mlx5_drain_health_wq(dev);
> mlx5_sriov_disable(pdev, false);
> mlx5_uninit_one(dev);
> + mlx5_shd_uninit(dev);
> mlx5_pci_close(dev);
> mlx5_mdev_uninit(dev);
> mlx5_adev_idx_free(dev->priv.adev_idx);
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sh_devlink.c
> b/drivers/net/ethernet/mellanox/mlx5/core/sh_devlink.c
> new file mode 100644
> index 000000000000..671a3442525b
> --- /dev/null
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/sh_devlink.c
> @@ -0,0 +1,150 @@
> +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
> +/* Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
> +
> +#include <linux/device/faux.h>
> +#include <linux/mlx5/driver.h>
> +#include <linux/mlx5/vport.h>
> +
> +#include "sh_devlink.h"
> +
> +static LIST_HEAD(shd_list);
> +static DEFINE_MUTEX(shd_mutex); /* Protects shd_list and shd->list */
> +
> +/* This structure represents a shared devlink instance,
> + * there is one created for PF group of the same chip.
> + */
> +struct mlx5_shd {
> + /* Node in shd list */
> + struct list_head list;
> + /* Serial number of the chip */
> + const char *sn;
> + /* List of per-PF dev instances. */
> + struct list_head dev_list;
> + /* Related faux device */
> + struct faux_device *faux_dev;
> +};
> +
For ice, the equivalent of this would essentially replace ice_adapter I imagine.
> +static const struct devlink_ops mlx5_shd_ops = {
> +};
> +
> +static int mlx5_shd_faux_probe(struct faux_device *faux_dev)
> +{
> + struct devlink *devlink;
> + struct mlx5_shd *shd;
> +
> + devlink = devlink_alloc(&mlx5_shd_ops, sizeof(struct mlx5_shd),
> &faux_dev->dev);
> + if (!devlink)
> + return -ENOMEM;
> + shd = devlink_priv(devlink);
> + faux_device_set_drvdata(faux_dev, shd);
> +
> + devl_lock(devlink);
> + devl_register(devlink);
> + devl_unlock(devlink);
> + return 0;
> +}
> +
> +static void mlx5_shd_faux_remove(struct faux_device *faux_dev)
> +{
> + struct mlx5_shd *shd = faux_device_get_drvdata(faux_dev);
> + struct devlink *devlink = priv_to_devlink(shd);
> +
> + devl_lock(devlink);
> + devl_unregister(devlink);
> + devl_unlock(devlink);
> + devlink_free(devlink);
> +}
> +
> +static const struct faux_device_ops mlx5_shd_faux_ops = {
> + .probe = mlx5_shd_faux_probe,
> + .remove = mlx5_shd_faux_remove,
> +};
> +
> +static struct mlx5_shd *mlx5_shd_create(const char *sn)
> +{
> + struct faux_device *faux_dev;
> + struct mlx5_shd *shd;
> +
> + faux_dev = faux_device_create(THIS_MODULE, sn, NULL,
> &mlx5_shd_faux_ops);
> + if (!faux_dev)
> + return NULL;
> + shd = faux_device_get_drvdata(faux_dev);
> + if (!shd)
> + return NULL;
> + list_add_tail(&shd->list, &shd_list);
> + shd->sn = sn;
> + INIT_LIST_HEAD(&shd->dev_list);
> + shd->faux_dev = faux_dev;
> + return shd;
> +}
> +
> +static void mlx5_shd_destroy(struct mlx5_shd *shd)
> +{
> + list_del(&shd->list);
> + kfree(shd->sn);
> + faux_device_destroy(shd->faux_dev);
> +}
> +
> +int mlx5_shd_init(struct mlx5_core_dev *dev)
> +{
> + u8 *vpd_data __free(kfree) = NULL;
> + struct pci_dev *pdev = dev->pdev;
> + unsigned int vpd_size, kw_len;
> + struct mlx5_shd *shd;
> + const char *sn;
> + char *end;
> + int start;
> + int err;
> +
> + if (!mlx5_core_is_pf(dev))
> + return 0;
> +
> + vpd_data = pci_vpd_alloc(pdev, &vpd_size);
> + if (IS_ERR(vpd_data)) {
> + err = PTR_ERR(vpd_data);
> + return err == -ENODEV ? 0 : err;
> + }
> + start = pci_vpd_find_ro_info_keyword(vpd_data, vpd_size, "V3",
> &kw_len);
> + if (start < 0) {
> + /* Fall-back to SN for older devices. */
> + start = pci_vpd_find_ro_info_keyword(vpd_data, vpd_size,
> +
> PCI_VPD_RO_KEYWORD_SERIALNO, &kw_len);
> + if (start < 0)
> + return -ENOENT;
> + }
> + sn = kstrndup(vpd_data + start, kw_len, GFP_KERNEL);
> + if (!sn)
> + return -ENOMEM;
> + end = strchrnul(sn, ' ');
> + *end = '\0';
> +
> + guard(mutex)(&shd_mutex);
> + list_for_each_entry(shd, &shd_list, list) {
> + if (!strcmp(shd->sn, sn)) {
> + kfree(sn);
> + goto found;
> + }
> + }
> + shd = mlx5_shd_create(sn);
> + if (!shd) {
> + kfree(sn);
> + return -ENOMEM;
> + }
How is the faux device kept in memory? I guess its reference counted somewhere? But I don't see that reference being incremented in the list_for_each.
> +found:
> + list_add_tail(&dev->shd_list, &shd->dev_list);
> + dev->shd = shd;
> + return 0;
> +}
> +
> +void mlx5_shd_uninit(struct mlx5_core_dev *dev)
> +{
> + struct mlx5_shd *shd = dev->shd;
> +
> + if (!dev->shd)
> + return;
> +
> + guard(mutex)(&shd_mutex);
> + list_del(&dev->shd_list);
> + if (list_empty(&shd->dev_list))
> + mlx5_shd_destroy(shd);
> +}
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sh_devlink.h
> b/drivers/net/ethernet/mellanox/mlx5/core/sh_devlink.h
> new file mode 100644
> index 000000000000..67df03e3c72e
> --- /dev/null
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/sh_devlink.h
> @@ -0,0 +1,10 @@
> +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
> +/* Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
> +
> +#ifndef __MLX5_SH_DEVLINK_H__
> +#define __MLX5_SH_DEVLINK_H__
> +
> +int mlx5_shd_init(struct mlx5_core_dev *dev);
> +void mlx5_shd_uninit(struct mlx5_core_dev *dev);
> +
> +#endif /* __MLX5_SH_DEVLINK_H__ */
> diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
> index 46bd7550adf8..78f1f034568f 100644
> --- a/include/linux/mlx5/driver.h
> +++ b/include/linux/mlx5/driver.h
> @@ -721,6 +721,8 @@ enum mlx5_wc_state {
> MLX5_WC_STATE_SUPPORTED,
> };
>
> +struct mlx5_shd;
> +
> struct mlx5_core_dev {
> struct device *device;
> enum mlx5_coredev_type coredev_type;
> @@ -783,6 +785,9 @@ struct mlx5_core_dev {
> enum mlx5_wc_state wc_state;
> /* sync write combining state */
> struct mutex wc_state_lock;
> + /* node in shared devlink list */
> + struct list_head shd_list;
> + struct mlx5_shd *shd;
> };
>
> struct mlx5_db {
> --
> 2.48.1
Powered by blists - more mailing lists