Message-ID: <e64da89fdd2c72afaa62f02449db9b144e02b743.camel@codeconstruct.com.au>
Date: Fri, 11 Jul 2025 20:52:00 +0800
From: Jeremy Kerr <jk@...econstruct.com.au>
To: admiyo@...amperecomputing.com, Matt Johnston
<matt@...econstruct.com.au>, Andrew Lunn <andrew+netdev@...n.ch>, "David
S. Miller" <davem@...emloft.net>, Eric Dumazet <edumazet@...gle.com>, Jakub
Kicinski <kuba@...nel.org>, Paolo Abeni <pabeni@...hat.com>
Cc: netdev@...r.kernel.org, linux-kernel@...r.kernel.org, Sudeep Holla
<sudeep.holla@....com>, Jonathan Cameron <Jonathan.Cameron@...wei.com>,
Huisong Li <lihuisong@...wei.com>
Subject: Re: [PATCH net-next v22 2/2] mctp pcc: Implement MCTP over PCC
Transport
Hi Adam,
A few comments inline:
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * mctp-pcc.c - Driver for MCTP over PCC.
> + * Copyright (c) 2024, Ampere Computing LLC
It's 2025 now; I'd suggest a range:
Copyright (c) 2024-2025, Ampere Computing LLC
> + */
> +
> +/* Implementation of MCTP over PCC DMTF Specification DSP0256
> + * https://www.dmtf.org/sites/default/files/standards/documents/DSP0256_2.0.0WIP50.pdf
DSP0256 has been released now, if that's a more appropriate reference.
But it looks like DSP0292 is more specific to the PCC parts?
> + */
> +
> +#include <linux/acpi.h>
> +#include <linux/if_arp.h>
> +#include <linux/init.h>
> +#include <linux/kernel.h>
> +#include <linux/module.h>
> +#include <linux/netdevice.h>
> +#include <linux/platform_device.h>
> +#include <linux/string.h>
> +#include <linux/skbuff.h>
> +#include <linux/hrtimer.h>
> +
> +#include <acpi/acpi_bus.h>
> +#include <acpi/acpi_drivers.h>
> +#include <acpi/acrestyp.h>
> +#include <acpi/actbl.h>
> +#include <net/mctp.h>
> +#include <net/mctpdevice.h>
> +#include <acpi/pcc.h>
> +
> +#include "../../mailbox/mailbox.h"
> +
> +#define MCTP_PAYLOAD_LENGTH 256
> +#define MCTP_CMD_LENGTH 4
> +#define MCTP_PCC_VERSION 0x1 /* DSP0292 a single version: 1 */
> +#define MCTP_SIGNATURE "MCTP"
> +#define MCTP_SIGNATURE_LENGTH (sizeof(MCTP_SIGNATURE) - 1)
> +#define MCTP_MIN_MTU 68
> +#define PCC_DWORD_TYPE 0x0c
> +
> +struct mctp_pcc_mailbox {
> + u32 index;
> + struct pcc_mbox_chan *chan;
> + struct mbox_client client;
> + struct sk_buff_head packets;
> +};
> +
> +/* The netdev structure. One of these per PCC adapter. */
> +struct mctp_pcc_ndev {
> + /* spinlock to serialize access to PCC outbox buffer and registers
> + * Note that what PCC calls registers are memory locations, not CPU
> + * Registers. They include the fields used to synchronize access
> + * between the OS and remote endpoints.
> + *
> + * Only the Outbox needs a spinlock, to prevent multiple
> + * sent packets triggering multiple attempts to over write
> + * the outbox. The Inbox buffer is controlled by the remote
> + * service and a spinlock would have no effect.
> + */
> + spinlock_t lock;
> + struct mctp_dev mdev;
You are only ever using the mdev->dev pointer; just use a struct
net_device * here. The MCTP layer handles the creation of the MCTP
parts.
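ie., something like (sketch only - naming up to you):

        struct net_device *ndev;

in place of the struct mctp_dev member, with the mdev.dev references
updated to match.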
> + struct acpi_device *acpi_device;
> + struct mctp_pcc_mailbox inbox;
> + struct mctp_pcc_mailbox outbox;
> +};
> +
> +static void *mctp_pcc_rx_alloc(struct mbox_client *c, int size)
> +{
> + struct mctp_pcc_mailbox *box;
> + struct mctp_pcc_ndev *mctp_pcc_ndev;
> + struct sk_buff *skb;
> + void *skb_buf;
> +
> + box = container_of(c, struct mctp_pcc_mailbox, client);
> + mctp_pcc_ndev = container_of(c, struct mctp_pcc_ndev, inbox.client);
> + if (size > mctp_pcc_ndev->mdev.dev->mtu)
> + return NULL;
> + mctp_pcc_ndev = container_of(c, struct mctp_pcc_ndev, inbox.client);
You have already set mctp_pcc_ndev a few lines above. It's also a bit
unusual to do two container_of() operations here, when you can find the
ndev first and reference the mailbox from there.
The common pattern for this is to set up your context from the input
pointers early too, so something like:
static void *mctp_pcc_rx_alloc(struct mbox_client *c, int size)
{
        struct mctp_pcc_ndev *mctp_pcc_ndev =
                container_of(c, struct mctp_pcc_ndev, inbox.client);
        struct mctp_pcc_mailbox *box = &mctp_pcc_ndev->inbox;
        struct sk_buff *skb;

        /* ... */
(but you may not need a var for 'box' at all)
> + skb = netdev_alloc_skb(mctp_pcc_ndev->mdev.dev, size);
> + if (!skb)
> + return NULL;
> + skb_buf = skb_put(skb, size);
you don't use skb_buf anywhere?
(building with W=1 should catch this)
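ie., if you only need to extend the tail, the bare call should do
(untested):

        skb_put(skb, size);

- and the skb_buf local can go entirely.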
> + skb->protocol = htons(ETH_P_MCTP);
> +
> + skb_queue_head(&box->packets, skb);
> +
> + return skb->data;
> +}
> +
> +static void mctp_pcc_client_rx_callback(struct mbox_client *c, void *buffer)
> +{
> + struct mctp_pcc_ndev *mctp_pcc_ndev;
> + struct pcc_header pcc_header;
> + struct mctp_skb_cb *cb;
> + struct sk_buff *skb;
> +
> + mctp_pcc_ndev = container_of(c, struct mctp_pcc_ndev, inbox.client);
> + if (!buffer) {
> + dev_dstats_rx_dropped(mctp_pcc_ndev->mdev.dev);
> + return;
> + }
Mainly out of curiosity: how does this happen? How do we get a
completion where there is no original buffer?
> +
> + skb_queue_walk(&mctp_pcc_ndev->inbox.packets, skb) {
> + if (skb->data == buffer) {
> + skb_unlink(skb, &mctp_pcc_ndev->inbox.packets);
> + dev_dstats_rx_add(mctp_pcc_ndev->mdev.dev, skb->len);
> + skb_reset_mac_header(skb);
> + skb_pull(skb, sizeof(pcc_header));
> + skb_reset_network_header(skb);
> + cb = __mctp_cb(skb);
> + cb->halen = 0;
> + netif_rx(skb);
> + return;
> + }
You can save a bit of indent by flipping the logic here:
        skb_queue_walk(&mctp_pcc_ndev->inbox.packets, skb) {
                if (skb->data != buffer)
                        continue;

                skb_unlink(skb, &mctp_pcc_ndev->inbox.packets);
                dev_dstats_rx_add(mctp_pcc_ndev->mdev.dev, skb->len);
                skb_reset_mac_header(skb);
                skb_pull(skb, sizeof(pcc_header));
                skb_reset_network_header(skb);
                cb = __mctp_cb(skb);
                cb->halen = 0;
                netif_rx(skb);

                return;
        }
I figure we're restricted to what the mailbox API provides, but is there
any way we can access the skb through a pointer, rather than having to
dig through these lists?
I think the issue is that the mbox API is using the void * buffer as
both the data to transfer, and the callback context, so we can't stash
useful context across the completion?
> + }
> + pr_warn("Unmatched packet in mctp-pcc inbox packet list");
> +}
> +
> +static void mctp_pcc_tx_done(struct mbox_client *c, void *mssg, int r)
> +{
> + struct mctp_pcc_mailbox *box;
> + struct sk_buff *skb;
> +
> + box = container_of(c, struct mctp_pcc_mailbox, client);
> + skb_queue_walk(&box->packets, skb) {
> + if (skb->data == mssg) {
> + skb_unlink(skb, &box->packets);
> + dev_consume_skb_any(skb);
> + break;
> + }
> + }
> +}
> +
> +static netdev_tx_t mctp_pcc_tx(struct sk_buff *skb, struct net_device *ndev)
> +{
> + struct mctp_pcc_ndev *mpnd = netdev_priv(ndev);
> + struct pcc_header *pcc_header;
> + int len = skb->len;
> + int rc;
> +
> + rc = skb_cow_head(skb, sizeof(*pcc_header));
> + if (rc)
> + goto err_drop;
> +
> + pcc_header = skb_push(skb, sizeof(*pcc_header));
> + pcc_header->signature = cpu_to_le32(PCC_SIGNATURE | mpnd->outbox.index);
> + pcc_header->flags = cpu_to_le32(PCC_CMD_COMPLETION_NOTIFY);
> + memcpy(&pcc_header->command, MCTP_SIGNATURE, MCTP_SIGNATURE_LENGTH);
> + pcc_header->length = cpu_to_le32(len + MCTP_SIGNATURE_LENGTH);
> +
> + skb_queue_head(&mpnd->outbox.packets, skb);
> +
> + rc = mbox_send_message(mpnd->outbox.chan->mchan, skb->data);
> +
> + if (rc < 0) {
> + pr_info("%s fail, rc = %d", __func__, rc);
> + return NETDEV_TX_BUSY;
> + }
What happens on mbox_send_message failure? The skb will still be present
in the outbox.packets queue - I assume we don't see a completion
callback in that case, and so the skb will be in the outbox.packets
queue forever?
Are you sure you want to return NETDEV_TX_BUSY here?
Is there any situation where mbox_send_message() could continually
fail? Should we ratelimit the pr_info() message there? (And regardless,
better to use one of the netdev_info() / netdev_warn() / etc. functions,
since we are dealing with netdevs here.)
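One possible shape - just a sketch, untested, and assuming you'd rather
drop the packet than ask the core to retry:

        rc = mbox_send_message(mpnd->outbox.chan->mchan, skb->data);
        if (rc < 0) {
                /* no tx_done completion will fire, so don't leave the
                 * skb sitting in the outbox queue
                 */
                skb_unlink(skb, &mpnd->outbox.packets);
                if (net_ratelimit())
                        netdev_warn(ndev, "mbox send failed, rc = %d\n", rc);
                goto err_drop;
        }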
> + dev_dstats_tx_add(ndev, len);
> + return NETDEV_TX_OK;
> +err_drop:
> + dev_dstats_tx_dropped(ndev);
> + kfree_skb(skb);
> + return NETDEV_TX_OK;
> +}
> +
> +static const struct net_device_ops mctp_pcc_netdev_ops = {
> + .ndo_start_xmit = mctp_pcc_tx,
> +};
> +
> +static const struct mctp_netdev_ops mctp_netdev_ops = {
> + NULL
> +};
> +
> +static void mctp_pcc_setup(struct net_device *ndev)
> +{
> + ndev->type = ARPHRD_MCTP;
> + ndev->hard_header_len = 0;
> + ndev->tx_queue_len = 0;
> + ndev->flags = IFF_NOARP;
> + ndev->netdev_ops = &mctp_pcc_netdev_ops;
> + ndev->needs_free_netdev = true;
> + ndev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS;
> +}
> +
> +struct mctp_pcc_lookup_context {
> + int index;
> + u32 inbox_index;
> + u32 outbox_index;
> +};
> +
> +static acpi_status lookup_pcct_indices(struct acpi_resource *ares,
> + void *context)
> +{
> + struct mctp_pcc_lookup_context *luc = context;
> + struct acpi_resource_address32 *addr;
> +
> + if (ares->type != PCC_DWORD_TYPE)
> + return AE_OK;
> +
> + addr = ACPI_CAST_PTR(struct acpi_resource_address32, &ares->data);
> + switch (luc->index) {
> + case 0:
> + luc->outbox_index = addr[0].address.minimum;
> + break;
> + case 1:
> + luc->inbox_index = addr[0].address.minimum;
> + break;
> + }
> + luc->index++;
> + return AE_OK;
> +}
> +
> +static void drain_packets(struct sk_buff_head *list)
> +{
> + struct sk_buff *skb;
> +
> + while (!skb_queue_empty(list)) {
> + skb = skb_dequeue(list);
> + dev_consume_skb_any(skb);
> + }
> +}
> +
> +static void mctp_cleanup_netdev(void *data)
> +{
> + struct mctp_pcc_ndev *mctp_pcc_ndev;
> + struct net_device *ndev = data;
> +
> + mctp_pcc_ndev = netdev_priv(ndev);
> + drain_packets(&mctp_pcc_ndev->outbox.packets);
> + drain_packets(&mctp_pcc_ndev->inbox.packets);
> +
> + mctp_unregister_netdev(ndev);
> +}
> +
> +static void mctp_cleanup_channel(void *data)
> +{
> + struct pcc_mbox_chan *chan = data;
> +
> + pcc_mbox_free_channel(chan);
> +}
> +
> +static int mctp_pcc_initialize_mailbox(struct device *dev,
> + struct mctp_pcc_mailbox *box, u32 index)
> +{
> + box->index = index;
> + skb_queue_head_init(&box->packets);
> + box->chan = pcc_mbox_request_channel(&box->client, index);
> + box->chan->rx_alloc = mctp_pcc_rx_alloc;
> +
> + box->client.dev = dev;
> + if (IS_ERR(box->chan))
> + return PTR_ERR(box->chan);
> + return devm_add_action_or_reset(dev, mctp_cleanup_channel, box->chan);
> +}
> +
> +static int mctp_pcc_driver_add(struct acpi_device *acpi_dev)
> +{
> + struct mctp_pcc_lookup_context context = {0};
> + struct mctp_pcc_ndev *mctp_pcc_ndev;
> + struct device *dev = &acpi_dev->dev;
> + struct net_device *ndev;
> + acpi_handle dev_handle;
> + acpi_status status;
> + int mctp_pcc_mtu;
> + char name[32];
> + int rc;
> +
> + dev_dbg(dev, "Adding mctp_pcc device for HID %s\n",
> + acpi_device_hid(acpi_dev));
> + dev_handle = acpi_device_handle(acpi_dev);
> + status = acpi_walk_resources(dev_handle, "_CRS", lookup_pcct_indices,
> + &context);
> + if (!ACPI_SUCCESS(status)) {
> + dev_err(dev, "FAILURE to lookup PCC indexes from CRS\n");
> + return -EINVAL;
> + }
> +
> + /* inbox initialization */
the inbox initialization seems to be a bit further down.
> + snprintf(name, sizeof(name), "mctpipcc%d", context.inbox_index);
> + ndev = alloc_netdev(sizeof(*mctp_pcc_ndev), name, NET_NAME_PREDICTABLE,
> + mctp_pcc_setup);
> + if (!ndev)
> + return -ENOMEM;
> +
> + mctp_pcc_ndev = netdev_priv(ndev);
> + spin_lock_init(&mctp_pcc_ndev->lock);
> + rc = mctp_pcc_initialize_mailbox(dev, &mctp_pcc_ndev->inbox,
> + context.inbox_index);
> + if (rc)
> + goto free_netdev;
> + mctp_pcc_ndev->inbox.client.rx_callback = mctp_pcc_client_rx_callback;
> +
> + /* outbox initialization */
> + rc = mctp_pcc_initialize_mailbox(dev, &mctp_pcc_ndev->outbox,
> + context.outbox_index);
> + if (rc)
> + goto free_netdev;
> +
> + mctp_pcc_ndev->outbox.client.tx_done = mctp_pcc_tx_done;
> + mctp_pcc_ndev->acpi_device = acpi_dev;
> + mctp_pcc_ndev->mdev.dev = ndev;
> + acpi_dev->driver_data = mctp_pcc_ndev;
> +
> + /* There is no clean way to pass the MTU to the callback function
> + * used for registration, so set the values ahead of time.
> + */
> + mctp_pcc_mtu = mctp_pcc_ndev->outbox.chan->shmem_size -
> + sizeof(struct pcc_header);
> + ndev->mtu = MCTP_MIN_MTU;
> + ndev->max_mtu = mctp_pcc_mtu;
> + ndev->min_mtu = MCTP_MIN_MTU;
> +
> + /* ndev needs to be freed before the iomemory (mapped above) gets
> + * unmapped, devm resources get freed in reverse to the order they
> + * are added.
> + */
> + rc = mctp_register_netdev(ndev, &mctp_netdev_ops, MCTP_PHYS_BINDING_PCC);
> + if (rc)
> + goto free_netdev;
> + return devm_add_action_or_reset(dev, mctp_cleanup_netdev, ndev);
> +free_netdev:
> + free_netdev(ndev);
> + return rc;
> +}
Just a couple of nitpicky style things here (and above): try to keep the
return-value checks immediately after the appropriate call:
        rc = mbox_send_message(mpnd->outbox.chan->mchan, skb->data);
        if (rc < 0) {
                pr_info("%s fail, rc = %d", __func__, rc);
                return NETDEV_TX_BUSY;
        }
but give yourself some space after those checks, and around the
returns/goto labels:
        rc = mctp_register_netdev(ndev, &mctp_netdev_ops, MCTP_PHYS_BINDING_PCC);
        if (rc)
                goto free_netdev;

        return devm_add_action_or_reset(dev, mctp_cleanup_netdev, ndev);

free_netdev:
        free_netdev(ndev);
        return rc;
}
Cheers,
Jeremy