[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <aKjm7mBDQ6VR8kWl@mail.minyard.net>
Date: Fri, 22 Aug 2025 16:53:50 -0500
From: Corey Minyard <corey@...yard.net>
To: Tony Hutter <hutter2@...l.gov>
Cc: alok.a.tiwari@...cle.com, Bjorn Helgaas <helgaas@...nel.org>,
Lukas Wunner <lukas@...ner.de>, mariusz.tkaczyk@...ux.intel.com,
minyard@....org, linux-pci@...r.kernel.org,
openipmi-developer@...ts.sourceforge.net,
Linux Kernel Mailing List <linux-kernel@...r.kernel.org>
Subject: Re: [PATCH v4] PCI: Introduce Cray ClusterStor E1000 NVMe slot LED
driver
On Fri, Aug 22, 2025 at 02:36:34PM -0700, Tony Hutter wrote:
> Add driver to control the NVMe slot LEDs on the Cray ClusterStor E1000.
> The driver provides hotplug attention status callbacks for the 24 NVMe
> slots on the E1000. This allows users to access the E1000's locate and
> fault LEDs via the normal /sys/bus/pci/slots/<slot>/attention sysfs
> entries. This driver uses IPMI to communicate with the E1000 controller
> to toggle the LEDs.
>
> Signed-off-by: Tony Hutter <hutter2@...l.gov>
> ---
> Changes in v4:
> - Fix typo in Kconfig: "is it" -> "it is"
> - Rename some #defines to CRAYE1K_SUBCMD_*
> - Use IS_ERR() check in craye1k_debugfs_init()
> - Return -EIO instead of -EINVAL when LED value check fails
>
> Changes in v3:
> - Add 'attention' values in Documentation/ABI/testing/sysfs-bus-pci.
> - Remove ACPI_PCI_SLOT dependency.
> - Cleanup craye1k_do_message() error checking.
> - Skip unneeded memcpy() on failure in __craye1k_do_command().
> - Merge craye1k_do_command_and_netfn() code into craye1k_do_command().
> - Make craye1k_is_primary() return boolean.
> - Return negative error code on failure in craye1k_set_primary().
>
> Changes in v2:
> - Integrated E1000 code into the pciehp driver as an built-in
> extention rather than as a standalone module.
> - Moved debug tunables and counters to debugfs.
> - Removed forward declarations.
> - Kept the /sys/bus/pci/slots/<slot>/attention interface rather
> than using NPEM/_DSM or led_classdev as suggested. The "attention"
> interface is more beneficial for our site, since it allows us to
> control the NVMe slot LEDs agnostically across different enclosure
> vendors and kernel versions using the same scripts. It is also
> nice to use the same /sys/bus/pci/slots/<slot>/ sysfs directory for
> both slot LED toggling ("attention") and slot power control
> ("power").
> ---
> Documentation/ABI/testing/sysfs-bus-pci | 21 +
> MAINTAINERS | 5 +
> drivers/pci/hotplug/Kconfig | 10 +
> drivers/pci/hotplug/Makefile | 3 +
> drivers/pci/hotplug/pciehp.h | 7 +
> drivers/pci/hotplug/pciehp_core.c | 12 +
> drivers/pci/hotplug/pciehp_craye1k.c | 659 ++++++++++++++++++++++++
> 7 files changed, 717 insertions(+)
> create mode 100644 drivers/pci/hotplug/pciehp_craye1k.c
>
...snip
> diff --git a/drivers/pci/hotplug/pciehp_craye1k.c b/drivers/pci/hotplug/pciehp_craye1k.c
> new file mode 100644
> index 000000000000..72c636ceb976
> --- /dev/null
> +++ b/drivers/pci/hotplug/pciehp_craye1k.c
> @@ -0,0 +1,659 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright 2022-2024 Lawrence Livermore National Security, LLC
> + */
> +/*
> + * Cray ClusterStor E1000 hotplug slot LED driver extensions
> + *
> + * This driver controls the NVMe slot LEDs on the Cray ClusterStore E1000.
> + * It provides hotplug attention status callbacks for the 24 NVMe slots on
> + * the E1000. This allows users to access the E1000's locate and fault
> + * LEDs via the normal /sys/bus/pci/slots/<slot>/attention sysfs entries.
> + * This driver uses IPMI to communicate with the E1000 controller to toggle
> + * the LEDs.
> + *
> + * This driver is based off of ibmpex.c
> + */
> +
> +#include <linux/debugfs.h>
> +#include <linux/delay.h>
> +#include <linux/errno.h>
> +#include <linux/dmi.h>
> +#include <linux/ipmi.h>
> +#include <linux/ipmi_smi.h>
You shouldn't need to include ipmi_smi.h. If you do, that's a bug on my
part.
> +#include <linux/module.h>
> +#include <linux/pci.h>
> +#include <linux/pci_hotplug.h>
> +#include <linux/random.h>
> +#include "pciehp.h"
> +
...snip
> +/*
> + * craye1k_send_message() - Send the message already setup in 'craye1k'
> + *
> + * Context: craye1k->lock is already held.
> + * Return: 0 on success, non-zero on error.
> + */
> +static int craye1k_send_message(struct craye1k *craye1k)
> +{
> + int rc;
> +
> + rc = ipmi_validate_addr(&craye1k->address, sizeof(craye1k->address));
> + if (rc) {
> + dev_err_ratelimited(craye1k->dev, "validate_addr() = %d\n", rc);
> + return rc;
> + }
> +
> + craye1k->tx_msg_id++;
> +
> + rc = ipmi_request_settime(craye1k->user, &craye1k->address,
> + craye1k->tx_msg_id, &craye1k->tx_msg, craye1k,
> + 0, craye1k->ipmi_retries,
> + craye1k->ipmi_timeout_ms);
> +
> + if (rc) {
> + craye1k->request_failed++;
> + return rc;
> + }
> +
> + return 0;
> +}
> +
> +/*
> + * craye1k_do_message() - Send the message in 'craye1k' and wait for a response
> + *
> + * Context: craye1k->lock is already held.
> + * Return: 0 on success, non-zero on error.
> + */
> +static int craye1k_do_message(struct craye1k *craye1k)
> +{
> + int rc;
> + struct completion *read_complete = &craye1k->read_complete;
> + unsigned long tout = msecs_to_jiffies(craye1k->completion_timeout_ms);
I don't see anything that will prevent multiple messages from being sent
at one time. What happens if two things send a message at the same time
here?
-corey
> +
> + rc = craye1k_send_message(craye1k);
> + if (rc)
> + return rc;
> +
> + rc = wait_for_completion_killable_timeout(read_complete, tout);
> + if (rc == 0) {
> + /* timed out */
> + craye1k->completion_timeout++;
> + return -ETIME;
> + }
> +
> + return 0;
> +}
> +
> +/*
> + * __craye1k_do_command() - Do an IPMI command
> + *
> + * Send a command with optional data bytes, and read back response bytes.
> + * Context: craye1k->lock is already held.
> + * Returns: 0 on success, non-zero on error.
> + */
> +static int __craye1k_do_command(struct craye1k *craye1k, u8 netfn, u8 cmd,
> + u8 *send_data, u8 send_data_len, u8 *recv_data,
> + u8 recv_data_len)
> +{
> + int rc;
> +
> + craye1k->tx_msg.netfn = netfn;
> + craye1k->tx_msg.cmd = cmd;
> +
> + if (send_data) {
> + memcpy(&craye1k->tx_msg_data[0], send_data, send_data_len);
> + craye1k->tx_msg.data_len = send_data_len;
> + } else {
> + craye1k->tx_msg_data[0] = 0;
> + craye1k->tx_msg.data_len = 0;
> + }
> +
> + rc = craye1k_do_message(craye1k);
> + if (rc == 0)
> + memcpy(recv_data, craye1k->rx_msg_data, recv_data_len);
> +
> + return rc;
> +}
> +
> +/*
> + * craye1k_do_command() - Do a Cray E1000 specific IPMI command.
> + * @cmd: Cray E1000 specific command
> + * @send_data: Data to send after the command
> + * @send_data_len: Data length
> + *
> + * Context: craye1k->lock is already held.
> + * Returns: the last byte from the response or 0 if response had no response
> + * data bytes, else -1 on error.
> + */
> +static int craye1k_do_command(struct craye1k *craye1k, u8 cmd, u8 *send_data,
> + u8 send_data_len)
> +{
> + int rc;
> +
> + rc = __craye1k_do_command(craye1k, CRAYE1K_CMD_NETFN, cmd, send_data,
> + send_data_len, NULL, 0);
> + if (rc != 0) {
> + /* Error attempting command */
> + return -1;
> + }
> +
> + if (craye1k->tx_msg.data_len == 0)
> + return 0;
> +
> + /* Return last received byte value */
> + return craye1k->rx_msg_data[craye1k->rx_msg_len - 1];
> +}
> +
> +/*
> + * __craye1k_set_primary() - Tell the BMC we want to be the primary server
> + *
> + * An E1000 board has two physical servers on it. In order to set a slot
> + * NVMe LED, this server needs to first tell the BMC that it's the primary
> + * server.
> + *
> + * Returns: 0 on success, non-zero on error.
> + */
> +static int __craye1k_set_primary(struct craye1k *craye1k)
> +{
> + u8 bytes[2] = {CRAYE1K_SUBCMD_SET_PRIMARY, 1}; /* set primary to 1 */
> +
> + craye1k->set_primary++;
> + return craye1k_do_command(craye1k, CRAYE1K_CMD_PRIMARY, bytes, 2);
> +}
> +
> +/*
> + * craye1k_is_primary() - Are we the primary server?
> + *
> + * Returns: true if we are the primary server, false otherwise.
> + */
> +static bool craye1k_is_primary(struct craye1k *craye1k)
> +{
> + u8 byte = 0;
> + int rc;
> +
> + /* Response byte is 0x1 on success */
> + rc = craye1k_do_command(craye1k, CRAYE1K_CMD_PRIMARY, &byte, 1);
> + craye1k->check_primary++;
> + if (rc == 0x1)
> + return true; /* success */
> +
> + craye1k->check_primary_failed++;
> + return false; /* We are not the primary server node */
> +}
> +
> +/*
> + * craye1k_set_primary() - Attempt to set ourselves as the primary server
> + *
> + * Returns: 0 on success, -1 otherwise.
> + */
> +static int craye1k_set_primary(struct craye1k *craye1k)
> +{
> + int tries = 10;
> +
> + if (craye1k_is_primary(craye1k)) {
> + craye1k->was_already_primary++;
> + return 0;
> + }
> + craye1k->was_not_already_primary++;
> +
> + /* delay found through experimentation */
> + msleep(300);
> +
> + if (__craye1k_set_primary(craye1k) != 0) {
> + craye1k->set_initial_primary_failed++;
> + return -1; /* error */
> + }
> +
> + /*
> + * It can take 2 to 3 seconds after setting primary for the controller
> + * to report that it is the primary.
> + */
> + while (tries--) {
> + msleep(500);
> + if (craye1k_is_primary(craye1k))
> + break;
> + }
> +
> + if (tries == 0) {
> + craye1k->set_primary_failed++;
> + return -1; /* never reported that it's primary */
> + }
> +
> + /* Wait for primary switch to finish */
> + msleep(1500);
> +
> + return 0;
> +}
> +
> +/*
> + * craye1k_get_slot_led() - Get slot LED value
> + * @slot: Slot number (1-24)
> + * @is_locate_led: 0 = get fault LED value, 1 = get locate LED value
> + *
> + * Returns: slot value on success, -1 on failure.
> + */
> +static int craye1k_get_slot_led(struct craye1k *craye1k, unsigned char slot,
> + bool is_locate_led)
> +{
> + u8 bytes[2];
> + u8 cmd;
> +
> + bytes[0] = CRAYE1K_SUBCMD_GET_LED;
> + bytes[1] = slot;
> +
> + cmd = is_locate_led ? CRAYE1K_CMD_LOCATE_LED : CRAYE1K_CMD_FAULT_LED;
> +
> + return craye1k_do_command(craye1k, cmd, bytes, 2);
> +}
> +
> +/*
> + * craye1k_set_slot_led() - Attempt to set the locate/fault LED to a value
> + * @slot: Slot number (1-24)
> + * @is_locate_led: 0 = use fault LED, 1 = use locate LED
> + * @value: Value to set (0 or 1)
> + *
> + * Check the LED value after calling this function to ensure it has been set
> + * properly.
> + *
> + * Returns: 0 on success, non-zero on failure.
> + */
> +static int craye1k_set_slot_led(struct craye1k *craye1k, unsigned char slot,
> + unsigned char is_locate_led,
> + unsigned char value)
> +{
> + u8 bytes[3];
> + u8 cmd;
> +
> + bytes[0] = CRAYE1K_SUBCMD_SET_LED;
> + bytes[1] = slot;
> + bytes[2] = value;
> +
> + cmd = is_locate_led ? CRAYE1K_CMD_LOCATE_LED : CRAYE1K_CMD_FAULT_LED;
> +
> + return craye1k_do_command(craye1k, cmd, bytes, 3);
> +}
> +
> +static int __craye1k_get_attention_status(struct hotplug_slot *hotplug_slot,
> + u8 *status, bool set_primary)
> +{
> + unsigned char slot;
> + int locate, fault;
> + struct craye1k *craye1k;
> +
> + craye1k = craye1k_global;
> + slot = PSN(to_ctrl(hotplug_slot));
> +
> + if (set_primary) {
> + if (craye1k_set_primary(craye1k) != 0) {
> + craye1k->get_led_failed++;
> + return -EIO;
> + }
> + }
> +
> + locate = craye1k_get_slot_led(craye1k, slot, true);
> + if (locate == -1) {
> + craye1k->get_led_failed++;
> + return -EIO;
> + }
> + msleep(CRAYE1K_POST_CMD_WAIT_MS);
> +
> + fault = craye1k_get_slot_led(craye1k, slot, false);
> + if (fault == -1) {
> + craye1k->get_led_failed++;
> + return -EIO;
> + }
> + msleep(CRAYE1K_POST_CMD_WAIT_MS);
> +
> + *status = locate << 1 | fault;
> +
> + return 0;
> +}
> +
> +int craye1k_get_attention_status(struct hotplug_slot *hotplug_slot,
> + u8 *status)
> +{
> + int rc;
> + struct craye1k *craye1k;
> +
> + craye1k = craye1k_global;
> +
> + if (mutex_lock_interruptible(&craye1k->lock) != 0)
> + return -EINTR;
> +
> + rc = __craye1k_get_attention_status(hotplug_slot, status, true);
> +
> + mutex_unlock(&craye1k->lock);
> + return rc;
> +}
> +
> +int craye1k_set_attention_status(struct hotplug_slot *hotplug_slot,
> + u8 status)
> +{
> + unsigned char slot;
> + int tries = 4;
> + int rc;
> + u8 new_status;
> + struct craye1k *craye1k;
> + bool locate, fault;
> +
> + craye1k = craye1k_global;
> +
> + slot = PSN(to_ctrl(hotplug_slot));
> +
> + if (mutex_lock_interruptible(&craye1k->lock) != 0)
> + return -EINTR;
> +
> + /* Retry to ensure all LEDs are set */
> + while (tries--) {
> + /*
> + * The node must first set itself to be the primary node before
> + * setting the slot LEDs (each board has two nodes, or
> + * "servers" as they're called by the manufacturer). This can
> + * lead to contention if both nodes are trying to set the LEDs
> + * at the same time.
> + */
> + rc = craye1k_set_primary(craye1k);
> + if (rc != 0) {
> + /* Could not set as primary node. Just retry again. */
> + continue;
> + }
> +
> + /* Write value twice to increase success rate */
> + locate = (status & 0x2) >> 1;
> + craye1k_set_slot_led(craye1k, slot, 1, locate);
> + if (craye1k_set_slot_led(craye1k, slot, 1, locate) != 0) {
> + craye1k->set_led_locate_failed++;
> + continue; /* fail, retry */
> + }
> +
> + msleep(CRAYE1K_POST_CMD_WAIT_MS);
> +
> + fault = status & 0x1;
> + craye1k_set_slot_led(craye1k, slot, 0, fault);
> + if (craye1k_set_slot_led(craye1k, slot, 0, fault) != 0) {
> + craye1k->set_led_fault_failed++;
> + continue; /* fail, retry */
> + }
> +
> + msleep(CRAYE1K_POST_CMD_WAIT_MS);
> +
> + rc = __craye1k_get_attention_status(hotplug_slot, &new_status,
> + false);
> +
> + msleep(CRAYE1K_POST_CMD_WAIT_MS);
> +
> + if (rc == 0 && new_status == status)
> + break; /* success */
> +
> + craye1k->set_led_readback_failed++;
> +
> + /*
> + * At this point we weren't successful in setting the LED and
> + * need to try again.
> + *
> + * Do a random back-off to reduce contention with other server
> + * node in the unlikely case that both server nodes are trying to
> + * trying to set a LED at the same time.
> + *
> + * The 500ms minimum in the back-off reduced the chance of this
> + * whole retry loop failing from 1 in 700 to none in 10000.
> + */
> + msleep(500 + (get_random_long() % 500));
> + }
> + mutex_unlock(&craye1k->lock);
> + if (tries == 0) {
> + craye1k->set_led_failed++;
> + return -EIO;
> + }
> +
> + return 0;
> +}
> +
> +static bool is_craye1k_board(void)
> +{
> + return dmi_match(DMI_PRODUCT_NAME, "VSSEP1EC");
> +}
> +
> +bool is_craye1k_slot(struct controller *ctrl)
> +{
> + return (PSN(ctrl) >= 1 && PSN(ctrl) <= 24 && is_craye1k_board());
> +}
> +
> +int craye1k_init(void)
> +{
> + if (!is_craye1k_board())
> + return 0;
> +
> + return ipmi_smi_watcher_register(&craye1k_smi_watcher);
> +}
> +
> +MODULE_LICENSE("GPL");
> +MODULE_AUTHOR("Tony Hutter <hutter2@...l.gov>");
> +MODULE_DESCRIPTION("Cray E1000 NVMe Slot LED driver");
> --
> 2.43.7
Powered by blists - more mailing lists