lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CADEbmW1YeugjaPOUKyVLw+vDh6ahK+XTJjY7RF9qN2r=B8XoMg@mail.gmail.com>
Date: Fri, 8 Mar 2024 15:18:16 +0100
From: Michal Schmidt <mschmidt@...hat.com>
To: Przemek Kitszel <przemyslaw.kitszel@...el.com>
Cc: intel-wired-lan@...ts.osuosl.org, Jiri Pirko <jiri@...nulli.us>, 
	netdev@...r.kernel.org, Arkadiusz Kubalewski <arkadiusz.kubalewski@...el.com>, 
	Karol Kolacinski <karol.kolacinski@...el.com>, Jacob Keller <jacob.e.keller@...el.com>, 
	Jakub Kicinski <kuba@...nel.org>, "Temerkhanov, Sergey" <sergey.temerkhanov@...el.com>
Subject: Re: [Intel-wired-lan] [PATCH net-next v3 1/3] ice: add ice_adapter
 for shared data across PFs on the same NIC

On Fri, Mar 8, 2024 at 1:17 PM Przemek Kitszel
<przemyslaw.kitszel@...el.com> wrote:
> On 3/7/24 23:25, Michal Schmidt wrote:
> > There is a need for synchronization between ice PFs on the same physical
> > adapter.
> >
> > Add a "struct ice_adapter" for holding data shared between PFs of the
> > same multifunction PCI device. The struct is refcounted - each ice_pf
> > holds a reference to it.
> >
> > Its first use will be for PTP. I expect it will be useful also to
> > improve the ugliness that is ice_prot_id_tbl.
> >
> > Signed-off-by: Michal Schmidt <mschmidt@...hat.com>
>
> Thank you very much for this series, we have spotted the need for
> something like that very recently, I have already pinged our PTP folks
> to take a look. (+CC Sergey)
>
> Why not wipe ice_ptp_lock() entirely?
> (could be left for Intel folks though)

I am doing this too, just not yet in this series I posted, because I
did not want to expand the scope of the series between reviews. See my
gitlab branch I linked in the cover letter, specifically this patch:
https://gitlab.com/mschmidt2/linux/-/commit/89a1bd2904ac8b0614bcfc2fce464bf5f60b0f0c

> please find the usual code related feedback inline
> (again, I really appreciate and I am grateful for this series)
>
> > ---
> >   drivers/net/ethernet/intel/ice/Makefile      |   3 +-
> >   drivers/net/ethernet/intel/ice/ice.h         |   2 +
> >   drivers/net/ethernet/intel/ice/ice_adapter.c | 107 +++++++++++++++++++
> >   drivers/net/ethernet/intel/ice/ice_adapter.h |  22 ++++
> >   drivers/net/ethernet/intel/ice/ice_main.c    |   8 ++
> >   5 files changed, 141 insertions(+), 1 deletion(-)
> >   create mode 100644 drivers/net/ethernet/intel/ice/ice_adapter.c
> >   create mode 100644 drivers/net/ethernet/intel/ice/ice_adapter.h
> >
> > diff --git a/drivers/net/ethernet/intel/ice/Makefile b/drivers/net/ethernet/intel/ice/Makefile
> > index cddd82d4ca0f..4fa09c321440 100644
> > --- a/drivers/net/ethernet/intel/ice/Makefile
> > +++ b/drivers/net/ethernet/intel/ice/Makefile
> > @@ -36,7 +36,8 @@ ice-y := ice_main.o \
> >        ice_repr.o     \
> >        ice_tc_lib.o   \
> >        ice_fwlog.o    \
> > -      ice_debugfs.o
> > +      ice_debugfs.o  \
> > +      ice_adapter.o
> >   ice-$(CONFIG_PCI_IOV) +=    \
> >       ice_sriov.o             \
> >       ice_virtchnl.o          \
> > diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h
> > index 365c03d1c462..1ffecbdd361a 100644
> > --- a/drivers/net/ethernet/intel/ice/ice.h
> > +++ b/drivers/net/ethernet/intel/ice/ice.h
> > @@ -77,6 +77,7 @@
> >   #include "ice_gnss.h"
> >   #include "ice_irq.h"
> >   #include "ice_dpll.h"
> > +#include "ice_adapter.h"
> >
> >   #define ICE_BAR0            0
> >   #define ICE_REQ_DESC_MULTIPLE       32
> > @@ -544,6 +545,7 @@ struct ice_agg_node {
> >
> >   struct ice_pf {
> >       struct pci_dev *pdev;
> > +     struct ice_adapter *adapter;
> >
> >       struct devlink_region *nvm_region;
> >       struct devlink_region *sram_region;
> > diff --git a/drivers/net/ethernet/intel/ice/ice_adapter.c b/drivers/net/ethernet/intel/ice/ice_adapter.c
> > new file mode 100644
> > index 000000000000..6b9eeba6edf7
> > --- /dev/null
> > +++ b/drivers/net/ethernet/intel/ice/ice_adapter.c
> > @@ -0,0 +1,107 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +// SPDX-FileCopyrightText: Copyright Red Hat
> > +
> > +#include <linux/cleanup.h>
> > +#include <linux/mutex.h>
> > +#include <linux/pci.h>
> > +#include <linux/slab.h>
> > +#include <linux/xarray.h>
> > +#include "ice_adapter.h"
> > +
> > +static DEFINE_XARRAY(ice_adapters);
> > +
> > +static unsigned long ice_adapter_index(const struct pci_dev *pdev)
> > +{
> > +     unsigned int domain = pci_domain_nr(pdev->bus);
> > +
> > +     WARN_ON((unsigned long)domain >> (BITS_PER_LONG - 13));
> > +     return ((unsigned long)domain << 13) |
> > +            ((unsigned long)pdev->bus->number << 5) |
> > +            PCI_SLOT(pdev->devfn);
>
> xarray is free to use non-0-based indices, so this whole function could
> be simplified as:
>
> /* note the PCI_SLOT() call to clear function from devfn */
> return PCI_DEVID(pci_domain_nr(pdev->bus), PCI_SLOT(pdev->devfn));

This is not equivalent. My function encodes three PCI numbers into the
index: domain, bus, and slot.
Your version would encode only the domain and the slot. The bus number
would be lost. Also, any domain bits above the low 16 would be lost
unnecessarily.

> > +}
> > +
> > +static struct ice_adapter *ice_adapter_new(void)
> > +{
> > +     struct ice_adapter *adapter;
> > +
> > +     adapter = kzalloc(sizeof(*adapter), GFP_KERNEL);
> > +     if (!adapter)
> > +             return NULL;
> > +
> > +     refcount_set(&adapter->refcount, 1);
> > +
> > +     return adapter;
> > +}
> > +
> > +static void ice_adapter_free(struct ice_adapter *adapter)
> > +{
> > +     kfree(adapter);
> > +}
>
> I would say that this is too thin wrapper for "kernel interface" (memory
> ptr) to warrant it, IOW just place kfree in place of ice_adapter_free,
> that will also free us from the need to use DEFINE_FREE()

I anticipate that the struct will gain members that need to be destroyed
here. E.g., in order to replace ice_ptp_lock entirely, I will add a mutex,
which will require mutex_destroy to be called in ice_adapter_free.

>
> > +
> > +DEFINE_FREE(ice_adapter_free, struct ice_adapter*, if (_T) ice_adapter_free(_T))
> > +
> > +/**
> > + * ice_adapter_get - Get a shared ice_adapter structure.
> > + * @pdev: Pointer to the pci_dev whose driver is getting the ice_adapter.
> > + *
> > + * Gets a pointer to a shared ice_adapter structure. Physical functions (PFs)
> > + * of the same multi-function PCI device share one ice_adapter structure.
> > + * The ice_adapter is reference-counted. The PF driver must use ice_adapter_put
> > + * to release its reference.
> > + *
> > + * Context: Process, may sleep.
> > + * Return:  Pointer to ice_adapter on success.
> > + *          ERR_PTR() on error. -ENOMEM is the only possible error.
>
> that's inconvenient, to the point that it will be better to have a dummy
> static entry used for this purpose, but I see that this is something
> broader than this particular use case, so - feel free to ignore

Perhaps I don't get what you mean by a static entry. Maybe a static
singleton instance of struct ice_adapter? I don't want that, because I
can have multiple E810 NICs in the system and I don't want them to
share anything unnecessarily.

Besides, this allocates only a small amount of memory and the
allocation is GFP_KERNEL. It won't fail under any realistic scenario
(afaik, the "too small to fail" rule still holds in the kernel). This
is called only from ice_probe, which surely allocates much more
memory. I am not calling this every time I need to access the
ice_adapter. I'm keeping a pointer in ice_pf.

> > + */
> > +struct ice_adapter *ice_adapter_get(const struct pci_dev *pdev)
> > +{
> > +     struct ice_adapter *ret, __free(ice_adapter_free) *adapter = NULL;
> > +     unsigned long index = ice_adapter_index(pdev);
> > +
> > +     adapter = ice_adapter_new();
> > +     if (!adapter)
> > +             return ERR_PTR(-ENOMEM);
> > +
> > +     xa_lock(&ice_adapters);
> > +     ret = __xa_cmpxchg(&ice_adapters, index, NULL, adapter, GFP_KERNEL);
> > +     if (xa_is_err(ret)) {
> > +             ret = ERR_PTR(xa_err(ret));
> > +             goto unlock;
> > +     }
> > +     if (ret) {
> > +             refcount_inc(&ret->refcount);
> > +             goto unlock;
> > +     }
> > +     ret = no_free_ptr(adapter);
>
> nice solution, but this is an idiom that we want across the kernel
> instead of opting out of auto management in such cases as this one?
> (esp that you have open-coded locking anyway)

I will follow up by changing the xa_lock usage to a guard if my
recently proposed patch ("xarray: add guard definitions for xa_lock")
gets accepted:
https://lore.kernel.org/lkml/20240228135352.14444-1-mschmidt@redhat.com/

> I would expect to have explicit two stores (first to ensure index is
> present, second to overwrite entry if null) easier than cmpxchg
> + unneeded allocation (that could cause whole function to fail!)

For the reasons mentioned above, I am not worried about the allocation failing.
I am afraid moving away from the cmpxchg approach would force me to either:
 - Re-add the additional mutex I had in v1 of this series, which
Jiri Pirko asked me to remove in favor of relying on xa_lock; or
 - Allocate ice_adapter under xa_lock, i.e. with GFP_ATOMIC. That
would only make running into ENOMEM more likely.

> > +unlock:
> > +     xa_unlock(&ice_adapters);
> > +     return ret;
> > +}
> > +
> > +/**
> > + * ice_adapter_put - Release a reference to the shared ice_adapter structure.
> > + * @pdev: Pointer to the pci_dev whose driver is releasing the ice_adapter.
> > + *
> > + * Releases the reference to ice_adapter previously obtained with
> > + * ice_adapter_get.
> > + *
> > + * Context: Any.
> > + */
> > +void ice_adapter_put(const struct pci_dev *pdev)
> > +{
> > +     unsigned long index = ice_adapter_index(pdev);
> > +     struct ice_adapter *adapter;
> > +
> > +     xa_lock(&ice_adapters);
> > +     adapter = xa_load(&ice_adapters, index);
> > +     if (WARN_ON(!adapter))
> > +             goto unlock;
> > +
> > +     if (!refcount_dec_and_test(&adapter->refcount))
> > +             goto unlock;
> > +
> > +     WARN_ON(__xa_erase(&ice_adapters, index) != adapter);
> > +     ice_adapter_free(adapter);
> > +unlock:
> > +     xa_unlock(&ice_adapters);
> > +}
> > diff --git a/drivers/net/ethernet/intel/ice/ice_adapter.h b/drivers/net/ethernet/intel/ice/ice_adapter.h
> > new file mode 100644
> > index 000000000000..cb5a02eb24c1
> > --- /dev/null
> > +++ b/drivers/net/ethernet/intel/ice/ice_adapter.h
> > @@ -0,0 +1,22 @@
> > +/* SPDX-License-Identifier: GPL-2.0-only */
> > +/* SPDX-FileCopyrightText: Copyright Red Hat */
> > +
> > +#ifndef _ICE_ADAPTER_H_
> > +#define _ICE_ADAPTER_H_
> > +
> > +#include <linux/refcount_types.h>
> > +
> > +struct pci_dev;
> > +
> > +/**
> > + * struct ice_adapter - PCI adapter resources shared across PFs
> > + * @refcount: Reference count. struct ice_pf objects hold the references.
> > + */
> > +struct ice_adapter {
> > +     refcount_t refcount;
>
> this is refcounted always under a lock, so could be plain "int",
> but not a big deal

Yes, I know, but the over/under-flow checks provided by refcount_t
keep me warm and fuzzy :)

> > +};
> > +
> > +struct ice_adapter *ice_adapter_get(const struct pci_dev *pdev);
> > +void ice_adapter_put(const struct pci_dev *pdev);
> > +
> > +#endif /* _ICE_ADAPTER_H */
> > diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
> > index 8f73ba77e835..a3c545e56256 100644
> > --- a/drivers/net/ethernet/intel/ice/ice_main.c
> > +++ b/drivers/net/ethernet/intel/ice/ice_main.c
> > @@ -5093,6 +5093,7 @@ static int
> >   ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent)
> >   {
> >       struct device *dev = &pdev->dev;
> > +     struct ice_adapter *adapter;
> >       struct ice_pf *pf;
> >       struct ice_hw *hw;
> >       int err;
> > @@ -5145,7 +5146,12 @@ ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent)
> >
> >       pci_set_master(pdev);
> >
> > +     adapter = ice_adapter_get(pdev);
> > +     if (IS_ERR(adapter))
> > +             return PTR_ERR(adapter);
> > +
> >       pf->pdev = pdev;
> > +     pf->adapter = adapter;
> >       pci_set_drvdata(pdev, pf);
> >       set_bit(ICE_DOWN, pf->state);
> >       /* Disable service task until DOWN bit is cleared */
> > @@ -5196,6 +5202,7 @@ ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent)
> >   err_load:
> >       ice_deinit(pf);
> >   err_init:
> > +     ice_adapter_put(pdev);
> >       pci_disable_device(pdev);
> >       return err;
> >   }
> > @@ -5302,6 +5309,7 @@ static void ice_remove(struct pci_dev *pdev)
> >       ice_setup_mc_magic_wake(pf);
> >       ice_set_wake(pf);
> >
> > +     ice_adapter_put(pdev);
> >       pci_disable_device(pdev);
> >   }
> >
>


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ