Message-ID: <9f8af2684b017cd5f5c6f8043ee3c7d3b72a044b.camel@codeconstruct.com.au>
Date: Wed, 27 Aug 2025 15:36:30 +0800
From: Jeremy Kerr <jk@...econstruct.com.au>
To: admiyo@...amperecomputing.com, Matt Johnston
<matt@...econstruct.com.au>, Andrew Lunn <andrew+netdev@...n.ch>, "David
S. Miller" <davem@...emloft.net>, Eric Dumazet <edumazet@...gle.com>, Jakub
Kicinski <kuba@...nel.org>, Paolo Abeni <pabeni@...hat.com>
Cc: netdev@...r.kernel.org, linux-kernel@...r.kernel.org, Sudeep Holla
<sudeep.holla@....com>, Jonathan Cameron <Jonathan.Cameron@...wei.com>,
Huisong Li <lihuisong@...wei.com>
Subject: Re: [PATCH net-next v25 1/1] mctp pcc: Implement MCTP over PCC
Transport
Hi Adam,
> +static void mctp_pcc_client_rx_callback(struct mbox_client *c, void *buffer)
> +{
> + struct mctp_pcc_ndev *mctp_pcc_ndev;
> + struct sk_buff *curr_skb = NULL;
> + struct pcc_header pcc_header;
> + struct sk_buff *skb = NULL;
> + struct mctp_skb_cb *cb;
> +
> + mctp_pcc_ndev = container_of(c, struct mctp_pcc_ndev, inbox.client);
> + if (!buffer) {
> + dev_dstats_rx_dropped(mctp_pcc_ndev->ndev);
> + return;
> + }
> +
> + spin_lock(&mctp_pcc_ndev->inbox.packets.lock);
> + skb_queue_walk(&mctp_pcc_ndev->inbox.packets, curr_skb) {
> + skb = curr_skb;
You're setting skb unconditionally here...
> + if (skb->data != buffer)
> + continue;
> + __skb_unlink(skb, &mctp_pcc_ndev->inbox.packets);
> + break;
> + }
> + spin_unlock(&mctp_pcc_ndev->inbox.packets.lock);
> +
> + if (skb) {
... so in the case where no skb matches, this will netif_rx() the last
skb in your list.
Only set the skb var on a match.
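Something like the following (untested sketch) would avoid that:

	skb_queue_walk(&mctp_pcc_ndev->inbox.packets, curr_skb) {
		if (curr_skb->data != buffer)
			continue;
		skb = curr_skb;
		__skb_unlink(skb, &mctp_pcc_ndev->inbox.packets);
		break;
	}

- skb stays NULL unless we actually find a match.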
> + dev_dstats_rx_add(mctp_pcc_ndev->ndev, skb->len);
> + skb_reset_mac_header(skb);
> + skb_pull(skb, sizeof(pcc_header));
> + skb_reset_network_header(skb);
> + cb = __mctp_cb(skb);
> + cb->halen = 0;
> + netif_rx(skb);
> + }
> +}
> +
> +static void mctp_pcc_tx_done(struct mbox_client *c, void *mssg, int r)
> +{
> + struct mctp_pcc_ndev *mctp_pcc_ndev;
> + struct mctp_pcc_mailbox *box;
> + struct sk_buff *skb = NULL;
> + struct sk_buff *curr_skb;
> +
> + mctp_pcc_ndev = container_of(c, struct mctp_pcc_ndev, outbox.client);
> + box = container_of(c, struct mctp_pcc_mailbox, client);
> + spin_lock(&box->packets.lock);
> + skb_queue_walk(&box->packets, curr_skb) {
> + skb = curr_skb;
> + if (skb->data == mssg) {
> + __skb_unlink(skb, &box->packets);
> + break;
> + }
> + }
> + spin_unlock(&box->packets.lock);
> +
> + if (skb)
> + dev_consume_skb_any(skb);
And the same with the loop logic here.
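The equivalent sketch here, matching on mssg:

	skb_queue_walk(&box->packets, curr_skb) {
		if (curr_skb->data == mssg) {
			skb = curr_skb;
			__skb_unlink(skb, &box->packets);
			break;
		}
	}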
> +}
> +
> +static netdev_tx_t mctp_pcc_tx(struct sk_buff *skb, struct net_device *ndev)
> +{
> + struct mctp_pcc_ndev *mpnd = netdev_priv(ndev);
> + struct pcc_header *pcc_header;
> + int len = skb->len;
> + int rc;
> +
> + rc = skb_cow_head(skb, sizeof(*pcc_header));
> + if (rc) {
> + dev_dstats_tx_dropped(ndev);
> + kfree_skb(skb);
> + return NETDEV_TX_OK;
> + }
> +
> + pcc_header = skb_push(skb, sizeof(*pcc_header));
> + pcc_header->signature = PCC_SIGNATURE | mpnd->outbox.index;
> + pcc_header->flags = PCC_CMD_COMPLETION_NOTIFY;
> + memcpy(&pcc_header->command, MCTP_SIGNATURE, MCTP_SIGNATURE_LENGTH);
> + pcc_header->length = len + MCTP_SIGNATURE_LENGTH;
> +
> + skb_queue_head(&mpnd->outbox.packets, skb);
> +
> + rc = mbox_send_message(mpnd->outbox.chan->mchan, skb->data);
> +
> + if (rc < 0) {
> + skb_unlink(skb, &mpnd->outbox.packets);
> + return NETDEV_TX_BUSY;
> + }
Merging your other email thread into this one:
> Which means the failed-to-send packet is unlinked. I guess I am unclear
> if this is sufficient to deal with the packet flow control issue or not.
The unlinking is fine - the issue is that you're not doing anything to
prevent an immediate retransmit attempt of that same skb.
> I have not yet had a setup up where I can flood the network with packets
> and see what happens if I fill up the ring buffer. I think that is the
> most likely failure case that will lead to flow control issues. If the
> remote side cannot handle packets as fast as they are sent, at some
> point we have to stop sending them. The mailbox abstraction makes that
> hard to detect;
If the mbox API has any facility for you to get a notification on
available space, you could stop the queues here, and restart them on
that notification.
(or even better, you could stop the queues when the skb that fills the
channel is sent)
Does the mbox API give you any facility for tx backpressure?
Otherwise, I guess we're OK, just inefficient.
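If there's no such notification, one common pattern (purely a sketch -
netif_stop_queue()/netif_wake_queue() are the generic netdev calls, not
anything mbox-specific) would be:

	if (rc < 0) {
		skb_unlink(skb, &mpnd->outbox.packets);
		/* stop further ndo_start_xmit calls until there's space */
		netif_stop_queue(ndev);
		return NETDEV_TX_BUSY;
	}

with a matching netif_wake_queue(mctp_pcc_ndev->ndev) in
mctp_pcc_tx_done(), once a slot is known to have drained.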
> +
> + dev_dstats_tx_add(ndev, len);
> + return NETDEV_TX_OK;
> +}
> +
> +static void drain_packets(struct sk_buff_head *list)
> +{
> + struct sk_buff *skb;
> +
> + while (!skb_queue_empty(list)) {
> + skb = skb_dequeue(list);
> + dev_consume_skb_any(skb);
> + }
> +}
Might be better to avoid TOCTOU issues between the skb_queue_empty() and
skb_dequeue() - something like:
for (;;) {
skb = skb_dequeue(list);
if (!skb)
break;
dev_consume_skb_any(skb);
}
(or some better incantation of the loop)
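For example, the common idiom:

	while ((skb = skb_dequeue(list)) != NULL)
		dev_consume_skb_any(skb);

skb_dequeue() takes the queue lock internally and returns NULL once the
list is empty, so there's no empty-check race.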
However, I assume doing these without any locking is okay, if you can
guarantee that no further mbox callbacks will happen after the
free_channel() returns - and that the free is synchronous.
(or if you can't guarantee that, you have other issues in the cleanup
path)
If you do change this to lockless, it would warrant some commenting
here, because the locking would differ from other sites.
> +static int mctp_pcc_ndo_open(struct net_device *ndev)
> +{
> + struct mctp_pcc_ndev *mctp_pcc_ndev =
> + netdev_priv(ndev);
> + struct mctp_pcc_mailbox *outbox =
> + &mctp_pcc_ndev->outbox;
> + struct mctp_pcc_mailbox *inbox =
> + &mctp_pcc_ndev->inbox;
> + int mctp_pcc_mtu;
Minor: as mentioned in the other thread, no need to RCT (reverse
christmas tree) this.
> +
> + outbox->chan = pcc_mbox_request_channel(&outbox->client, outbox->index);
> + if (IS_ERR(outbox->chan))
> + return PTR_ERR(outbox->chan);
> +
> + inbox->chan = pcc_mbox_request_channel(&inbox->client, inbox->index);
> + if (IS_ERR(inbox->chan)) {
> + pcc_mbox_free_channel(outbox->chan);
> + return PTR_ERR(inbox->chan);
> + }
> +
> + mctp_pcc_ndev->inbox.chan->rx_alloc = mctp_pcc_rx_alloc;
> + mctp_pcc_ndev->outbox.chan->manage_writes = true;
Minor: you have the convenience vars for ->inbox and ->outbox, may as
well use them.
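That is:

	inbox->chan->rx_alloc = mctp_pcc_rx_alloc;
	outbox->chan->manage_writes = true;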
> +
> + mctp_pcc_mtu = mctp_pcc_ndev->outbox.chan->shmem_size -
> + sizeof(struct pcc_header);
> + ndev->mtu = MCTP_MIN_MTU;
> + ndev->max_mtu = mctp_pcc_mtu;
> + ndev->min_mtu = MCTP_MIN_MTU;
Merging with your other thread:
> > For my own clarity, what's "the callback function used for
> > registration"?
>
> Actually, this is not longer true: we can do it in ndo_open, and it is
> clean. Removed the comment.
On second thought, why are these being set in ndo_open? It should be
possible to:
$ mctp link set mctpipcc0 mtu 252
$ mctp link set mctpipcc0 up
The latter would trigger ->ndo_open, so you'll lose the MTU set by the
former. You probably don't want to lose settings on a down->up
cycle.
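If the channel limits really are only known once the channels are
requested, one option (just a sketch) is to set only the bounds here,
and clamp - rather than reset - the current MTU:

	mctp_pcc_mtu = outbox->chan->shmem_size - sizeof(struct pcc_header);
	ndev->min_mtu = MCTP_MIN_MTU;
	ndev->max_mtu = mctp_pcc_mtu;
	ndev->mtu = clamp(ndev->mtu, ndev->min_mtu, ndev->max_mtu);

- that way an admin-configured MTU survives the down->up cycle.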
(also, what does the 'i' signify, in mctpipcc?)
> +
> + return 0;
> +}
> +
> +static int mctp_pcc_ndo_stop(struct net_device *ndev)
> +{
> + struct mctp_pcc_ndev *mctp_pcc_ndev =
> + netdev_priv(ndev);
> + struct mctp_pcc_mailbox *outbox =
> + &mctp_pcc_ndev->outbox;
> + struct mctp_pcc_mailbox *inbox =
> + &mctp_pcc_ndev->inbox;
> +
> + pcc_mbox_free_channel(outbox->chan);
> + pcc_mbox_free_channel(inbox->chan);
> +
> + drain_packets(&mctp_pcc_ndev->outbox.packets);
> + drain_packets(&mctp_pcc_ndev->inbox.packets);
> + return 0;
> +}
> +
> +static const struct net_device_ops mctp_pcc_netdev_ops = {
> + .ndo_open = mctp_pcc_ndo_open,
> + .ndo_stop = mctp_pcc_ndo_stop,
> + .ndo_start_xmit = mctp_pcc_tx,
> +
> +};
> +
> +static void mctp_pcc_setup(struct net_device *ndev)
> +{
> + ndev->type = ARPHRD_MCTP;
> + ndev->hard_header_len = 0;
> + ndev->tx_queue_len = 0;
> + ndev->flags = IFF_NOARP;
> + ndev->netdev_ops = &mctp_pcc_netdev_ops;
> + ndev->needs_free_netdev = true;
> + ndev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS;
> +}
> +
> +struct mctp_pcc_lookup_context {
> + int index;
> + u32 inbox_index;
> + u32 outbox_index;
> +};
> +
> +static acpi_status lookup_pcct_indices(struct acpi_resource *ares,
> + void *context)
> +{
> + struct mctp_pcc_lookup_context *luc = context;
> + struct acpi_resource_address32 *addr;
> +
> + if (ares->type != PCC_DWORD_TYPE)
> + return AE_OK;
> +
> + addr = ACPI_CAST_PTR(struct acpi_resource_address32, &ares->data);
> + switch (luc->index) {
> + case 0:
> + luc->outbox_index = addr[0].address.minimum;
> + break;
> + case 1:
> + luc->inbox_index = addr[0].address.minimum;
> + break;
> + }
> + luc->index++;
> + return AE_OK;
> +}
> +
> +static void mctp_cleanup_netdev(void *data)
> +{
> + struct net_device *ndev = data;
> +
> + mctp_unregister_netdev(ndev);
> +}
> +
> +static int mctp_pcc_initialize_mailbox(struct device *dev,
> + struct mctp_pcc_mailbox *box, u32 index)
> +{
> + box->index = index;
> + skb_queue_head_init(&box->packets);
> + box->client.dev = dev;
> + return 0;
> +}
> +
> +static int mctp_pcc_driver_add(struct acpi_device *acpi_dev)
> +{
> + struct mctp_pcc_lookup_context context = {0};
> + struct mctp_pcc_ndev *mctp_pcc_ndev;
> + struct device *dev = &acpi_dev->dev;
> + struct net_device *ndev;
> + acpi_handle dev_handle;
> + acpi_status status;
> + char name[32];
> + int rc;
> +
> + dev_dbg(dev, "Adding mctp_pcc device for HID %s\n",
> + acpi_device_hid(acpi_dev));
> + dev_handle = acpi_device_handle(acpi_dev);
> + status = acpi_walk_resources(dev_handle, "_CRS", lookup_pcct_indices,
> + &context);
> + if (!ACPI_SUCCESS(status)) {
> + dev_err(dev, "FAILURE to lookup PCC indexes from CRS\n");
> + return -EINVAL;
> + }
> +
> + snprintf(name, sizeof(name), "mctpipcc%d", context.inbox_index);
> + ndev = alloc_netdev(sizeof(*mctp_pcc_ndev), name, NET_NAME_PREDICTABLE,
> + mctp_pcc_setup);
> + if (!ndev)
> + return -ENOMEM;
> +
> + mctp_pcc_ndev = netdev_priv(ndev);
> +
> + /* inbox initialization */
> + rc = mctp_pcc_initialize_mailbox(dev, &mctp_pcc_ndev->inbox,
> + context.inbox_index);
> + if (rc)
> + goto free_netdev;
> +
> + mctp_pcc_ndev->inbox.client.rx_callback = mctp_pcc_client_rx_callback;
> +
> + /* outbox initialization */
> + rc = mctp_pcc_initialize_mailbox(dev, &mctp_pcc_ndev->outbox,
> + context.outbox_index);
> + if (rc)
> + goto free_netdev;
> +
> + mctp_pcc_ndev->outbox.client.tx_done = mctp_pcc_tx_done;
> + mctp_pcc_ndev->acpi_device = acpi_dev;
> + mctp_pcc_ndev->ndev = ndev;
> + acpi_dev->driver_data = mctp_pcc_ndev;
> +
> + /* ndev needs to be freed before the iomemory (mapped above) gets
> + * unmapped, devm resources get freed in reverse to the order they
> + * are added.
> + */
Minor: this comment seems stale; we're not doing any mapping or
unmapping here. What you care about is that the netdev is unregistered
(and hence ->ndo_stop() invoked, freeing the allocated channels) when
the struct acpi_device is destroyed.
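Something like:

	/* Ensure the netdev is unregistered - and so ->ndo_stop() has run,
	 * releasing the PCC channels - before the acpi_device is destroyed;
	 * devm resources are released in reverse order of addition.
	 */

would better match what the code is relying on.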
Cheers,
Jeremy