[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250207143028.1865-2-shiju.jose@huawei.com>
Date: Fri, 7 Feb 2025 14:30:22 +0000
From: <shiju.jose@...wei.com>
To: <linux-edac@...r.kernel.org>, <linux-cxl@...r.kernel.org>,
<mchehab@...nel.org>, <dave.jiang@...el.com>, <dan.j.williams@...el.com>,
<bp@...en8.de>, <jonathan.cameron@...wei.com>, <alison.schofield@...el.com>,
<vishal.l.verma@...el.com>, <ira.weiny@...el.com>, <dave@...olabs.net>
CC: <linux-kernel@...r.kernel.org>, <linuxarm@...wei.com>,
<tanxiaofei@...wei.com>, <prime.zeng@...ilicon.com>, <shiju.jose@...wei.com>
Subject: [PATCH 1/4] rasdaemon: cxl: Add support for memory sparing operation
From: Shiju Jose <shiju.jose@...wei.com>
CXL spec 3.1, Section 8.2.9.2.1, Table 8-43, "Common Event Record Format",
defines the Event Record Flags 'Maintenance Needed' and 'Maintenance
Operation Subclass Valid Flag'. When these flags are set, they signal that
the memory device requires maintenance. When the device also sets the
maintenance operation class and maintenance operation subclass for memory
sparing, the CXL DRAM handler sets the attributes for memory sparing via
the EDAC memory repair sysfs interface, initiating the sparing operation
in the CXL memory device.
Add support for the memory sparing operation and enable for the CXL DRAM
event if auto repair is on.
Auto memory repair is disabled by default.
Signed-off-by: Shiju Jose <shiju.jose@...wei.com>
---
misc/rasdaemon.env | 4 +
ras-cxl-handler.c | 287 +++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 291 insertions(+)
diff --git a/misc/rasdaemon.env b/misc/rasdaemon.env
index 963aaa0..b3fdba7 100644
--- a/misc/rasdaemon.env
+++ b/misc/rasdaemon.env
@@ -88,3 +88,7 @@ TRIGGER_DIR=
# MC_UE_TRIGGER=mc_event_trigger
MC_CE_TRIGGER=
MC_UE_TRIGGER=
+
+# CXL memory auto repair control
+# Whether to enable CXL memory auto repair (yes|no).
+CXL_AUTO_REPAIR_ENABLE="no"
diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c
index cb95fa6..3311949 100644
--- a/ras-cxl-handler.c
+++ b/ras-cxl-handler.c
@@ -4,7 +4,9 @@
* Copyright (c) Huawei Technologies Co., Ltd. 2023. All rights reserved.
*/
+#include <dirent.h>
#include <endian.h>
+#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@@ -722,6 +724,140 @@ static int handle_ras_cxl_common_hdr(struct trace_seq *s,
return 0;
}
+/* memory repair */
+/*
+ * Maintenance operation class and subclass codes used to recognise a
+ * memory sparing request in an event record.
+ * Common Event Record Format
+ * CXL 3.1 section 8.2.9.2.1; Table 8-43
+ */
+#define CXL_MAINT_CLASS_SPARING 0x02
+#define CXL_MAINT_SUBCLASS_CACHE_SPARING 0x00
+#define CXL_MAINT_SUBCLASS_ROW_SPARING 0x01
+#define CXL_MAINT_SUBCLASS_BANK_SPARING 0x02
+#define CXL_MAINT_SUBCLASS_RANK_SPARING 0x03
+
+#define CXL_CMD_BUF_SIZE 256 /* size of the sysfs path and data buffers */
+
+enum cxl_mem_sparing_type { /* sparing granularity chosen from the event's maintenance subclass */
+ CXL_CACHE_SPARING,
+ CXL_ROW_SPARING,
+ CXL_BANK_SPARING,
+ CXL_RANK_SPARING,
+};
+
+static const char *edac_bus_path = "/sys/bus/edac/devices/";
+#define EDAC_CXL_DEV_PREFIX "cxl_"
+
+/*
+ * Auto repair is disabled by default.
+ * 'export CXL_AUTO_REPAIR_ENABLE=yes' to enable auto repair.
+ */
+static bool auto_repair;
+
+/*
+ * Set the auto_repair flag when the CXL_AUTO_REPAIR_ENABLE environment
+ * variable holds "yes" (compared case-insensitively). Any other value, or
+ * an unset variable, leaves the flag as it was; it is never cleared here.
+ */
+static void check_config_status(void)
+{
+	const char *setting = getenv("CXL_AUTO_REPAIR_ENABLE");
+
+	if (setting && strcasecmp(setting, "yes") == 0)
+		auto_repair = true;
+}
+
+/*
+ * Read the first whitespace-delimited token from the file <dir>/<file>.
+ *
+ * @dir:  directory holding the attribute
+ * @file: attribute file name
+ * @out:  receives the NUL-terminated token; the caller must provide at
+ *        least CXL_CMD_BUF_SIZE bytes
+ *
+ * Returns 0 on success, -1 if the file cannot be opened or read, or if it
+ * contains no token.
+ */
+static int get_sysfs_data_str(const char *dir, const char *file, char *out)
+{
+	char path[CXL_CMD_BUF_SIZE];
+	/* Zero-initialised: the old memset() sized by strlen() of an
+	 * uninitialised buffer was undefined behaviour. */
+	char buf[CXL_CMD_BUF_SIZE] = { 0 };
+	int rc = -1;
+	int fd;
+
+	snprintf(path, sizeof(path), "%s/%s", dir, file);
+	fd = open(path, O_RDONLY);
+	if (fd == -1) {
+		log(TERM, LOG_ERR, "[%s]:open file: %s failed\n", __func__, path);
+		return -1;
+	}
+
+	/*
+	 * Read at most sizeof(buf) - 1 bytes so the final byte stays NUL and
+	 * sscanf() never runs past the end of buf. This also bounds the token
+	 * copied into @out to CXL_CMD_BUF_SIZE bytes including the terminator.
+	 */
+	if (read(fd, buf, sizeof(buf) - 1) <= 0)
+		goto done;
+
+	if (sscanf(buf, "%s", out) <= 0)
+		goto done;
+
+	rc = 0;
+
+done:
+	close(fd);
+	return rc;
+}
+
+/*
+ * Write an unsigned 32-bit value, formatted as decimal, to <dir>/<file>.
+ *
+ * @dir:  directory holding the attribute
+ * @file: attribute file name
+ * @data: value to write
+ *
+ * Returns 0 on success, -1 if the file cannot be opened or written.
+ */
+static int set_sysfs_data_uint32(const char *dir, const char *file, uint32_t data)
+{
+	char path[CXL_CMD_BUF_SIZE];
+	int fd;
+
+	snprintf(path, sizeof(path), "%s/%s", dir, file);
+	fd = open(path, O_WRONLY);
+	if (fd == -1) {
+		log(TERM, LOG_ERR, "[%s]:open file: %s failed\n", __func__, path);
+		return -1;
+	}
+
+	/* %u, not %d: values >= 2^31 must not be emitted as negative numbers. */
+	if (dprintf(fd, "%u", (unsigned int)data) <= 0) {
+		log(TERM, LOG_ERR, "[%s]: write data to [%s] failed, errno:%d\n",
+		    __func__, path, errno);
+		close(fd);
+		return -1;
+	}
+	close(fd);
+
+	return 0;
+}
+
+/*
+ * Write an unsigned 64-bit value, formatted as 0x-prefixed hexadecimal, to
+ * <dir>/<file>.
+ *
+ * @dir:  directory holding the attribute
+ * @file: attribute file name
+ * @data: value to write
+ *
+ * Returns 0 on success, -1 if the file cannot be opened or written.
+ */
+static int set_sysfs_data_uint64(const char *dir, const char *file, uint64_t data)
+{
+	char path[CXL_CMD_BUF_SIZE];
+	int fd;
+
+	snprintf(path, sizeof(path), "%s/%s", dir, file);
+	fd = open(path, O_WRONLY);
+	if (fd == -1) {
+		log(TERM, LOG_ERR, "[%s]:open file: %s failed\n", __func__, path);
+		return -1;
+	}
+
+	/*
+	 * uint64_t is not 'unsigned long' on every ABI; cast to a type whose
+	 * conversion specifier is fixed so the full 64 bits are always printed.
+	 */
+	if (dprintf(fd, "0x%llx", (unsigned long long)data) <= 0) {
+		log(TERM, LOG_ERR, "[%s]: write data to [%s] failed, errno:%d\n",
+		    __func__, path, errno);
+		close(fd);
+		return -1;
+	}
+	close(fd);
+
+	return 0;
+}
+
+/*
+ * Locate the mem_repair instance of a memory device whose 'repair_type'
+ * attribute equals @repair_type.
+ *
+ * Instances are probed in order (mem_repair0, mem_repair1, ...) below
+ * <edac_bus_path><EDAC_CXL_DEV_PREFIX><repair_dev>/.
+ *
+ * Returns the index of the first matching instance, or -1 as soon as an
+ * instance's 'repair_type' cannot be read.
+ */
+static int cxl_find_spare(const char *repair_dev, const char *repair_type)
+{
+	char type_buf[CXL_CMD_BUF_SIZE];
+	char inst_dir[CXL_CMD_BUF_SIZE];
+	int idx;
+
+	for (idx = 0; ; idx++) {
+		snprintf(inst_dir, CXL_CMD_BUF_SIZE, "%s%s%s/mem_repair%d",
+			 edac_bus_path, EDAC_CXL_DEV_PREFIX, repair_dev, idx);
+
+		/* An unreadable instance ends the scan. */
+		if (get_sysfs_data_str(inst_dir, "repair_type", type_buf) != 0)
+			break;
+
+		if (strcmp(repair_type, type_buf) == 0)
+			return idx;
+	}
+
+	return -1;
+}
+
int ras_cxl_generic_event_handler(struct trace_seq *s,
struct tep_record *record,
struct tep_event *event, void *context)
@@ -1027,6 +1163,155 @@ static const char * const cxl_der_mem_event_type[] = {
"CKID Violation",
};
+/*
+ * Each type of sparing requires a superset of the info needed for
+ * coarser grained sparing.
+ */
+/*
+ * Write the attributes needed for rank sparing into @dir: dpa, channel and
+ * rank, plus nibble_mask when the event marks it valid.
+ *
+ * Returns 0 on success, -1 on the first failed write.
+ */
+static int fill_rank_sparing_attrs(struct ras_cxl_dram_event *ev,
+				   const char *dir)
+{
+	if (set_sysfs_data_uint64(dir, "dpa", ev->dpa) ||
+	    set_sysfs_data_uint32(dir, "channel", ev->channel) ||
+	    set_sysfs_data_uint32(dir, "rank", ev->rank))
+		return -1;
+
+	if (!(ev->validity_flags & CXL_DER_VALID_NIBBLE))
+		return 0;
+
+	return set_sysfs_data_uint32(dir, "nibble_mask", ev->nibble_mask) ? -1 : 0;
+}
+
+/*
+ * Write the attributes needed for bank sparing into @dir: everything rank
+ * sparing needs, followed by bank_group and bank.
+ *
+ * Returns 0 on success, -1 on the first failed write.
+ */
+static int fill_bank_sparing_attrs(struct ras_cxl_dram_event *ev,
+				   const char *dir)
+{
+	if (fill_rank_sparing_attrs(ev, dir) ||
+	    set_sysfs_data_uint32(dir, "bank_group", ev->bank_group) ||
+	    set_sysfs_data_uint32(dir, "bank", ev->bank))
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Write the attributes needed for row sparing into @dir: everything bank
+ * sparing needs, followed by row.
+ *
+ * Returns 0 on success, -1 on the first failed write.
+ */
+static int fill_row_sparing_attrs(struct ras_cxl_dram_event *ev,
+				  const char *dir)
+{
+	if (fill_bank_sparing_attrs(ev, dir) ||
+	    set_sysfs_data_uint32(dir, "row", ev->row))
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Write the attributes needed for cacheline sparing into @dir: everything
+ * row sparing needs, followed by column, plus sub_channel when the event
+ * marks it valid.
+ *
+ * Returns 0 on success, -1 on the first failed write.
+ */
+static int fill_cacheline_sparing_attrs(struct ras_cxl_dram_event *ev,
+					const char *dir)
+{
+	if (fill_row_sparing_attrs(ev, dir) ||
+	    set_sysfs_data_uint32(dir, "column", ev->column))
+		return -1;
+
+	if (!(ev->validity_flags & CXL_DER_VALID_SUB_CHANNEL))
+		return 0;
+
+	return set_sysfs_data_uint32(dir, "sub_channel", ev->sub_channel) ? -1 : 0;
+}
+
+/*
+ * Request a memory sparing operation for a CXL DRAM event, if it qualifies.
+ *
+ * The event qualifies only when auto repair is enabled
+ * (CXL_AUTO_REPAIR_ENABLE=yes), the 'Maintenance Needed' and 'Maintenance
+ * Operation Subclass Valid' record flags are set, the maintenance operation
+ * class is sparing, the DPA is not flagged as not repairable, and every
+ * field required by the reported sparing subclass is marked valid.
+ *
+ * The matching mem_repairN instance of the memory device is looked up under
+ * the EDAC bus, its attributes are written from the event record, and 1 is
+ * then written to its 'repair' attribute.
+ *
+ * Returns 0 if the repair request was written, -1 if the event does not
+ * qualify or any sysfs access failed.
+ */
+static int cxl_dram_sparing(struct ras_cxl_dram_event *ev)
+{
+	struct ras_cxl_event_common_hdr *hdr = &ev->hdr;
+	enum cxl_mem_sparing_type sparing_type;
+	char dir[CXL_CMD_BUF_SIZE];
+	const char *repair_type;
+	int idx;
+	int rc;
+
+	check_config_status();
+	if (!auto_repair)
+		return -1;
+
+	if (!(hdr->hdr_flags & CXL_EVENT_RECORD_FLAG_MAINT_NEEDED) ||
+	    !(hdr->hdr_flags & CXL_EVENT_RECORD_FLAG_MAINT_OP_SUB_CLASS_VALID) ||
+	    hdr->hdr_maint_op_class != CXL_MAINT_CLASS_SPARING ||
+	    (ev->dpa_flags & CXL_DPA_NOT_REPAIRABLE))
+		return -1;
+
+	/* Channel and rank are required by every sparing granularity. */
+	if (!(ev->validity_flags & CXL_DER_VALID_CHANNEL) ||
+	    !(ev->validity_flags & CXL_DER_VALID_RANK))
+		return -1;
+
+	/*
+	 * CXL device reports the type of the repair in the event record.
+	 * Each case checks the additional fields its granularity needs.
+	 */
+	switch (hdr->hdr_maint_op_sub_class) {
+	case CXL_MAINT_SUBCLASS_CACHE_SPARING:
+		if (!(ev->validity_flags & CXL_DER_VALID_BANK_GROUP) ||
+		    !(ev->validity_flags & CXL_DER_VALID_BANK) ||
+		    !(ev->validity_flags & CXL_DER_VALID_ROW) ||
+		    !(ev->validity_flags & CXL_DER_VALID_COLUMN))
+			return -1;
+		repair_type = "cacheline-sparing";
+		sparing_type = CXL_CACHE_SPARING;
+		break;
+	case CXL_MAINT_SUBCLASS_ROW_SPARING:
+		if (!(ev->validity_flags & CXL_DER_VALID_BANK_GROUP) ||
+		    !(ev->validity_flags & CXL_DER_VALID_BANK) ||
+		    !(ev->validity_flags & CXL_DER_VALID_ROW))
+			return -1;
+		repair_type = "row-sparing";
+		sparing_type = CXL_ROW_SPARING;
+		break;
+	case CXL_MAINT_SUBCLASS_BANK_SPARING:
+		if (!(ev->validity_flags & CXL_DER_VALID_BANK_GROUP) ||
+		    !(ev->validity_flags & CXL_DER_VALID_BANK))
+			return -1;
+		repair_type = "bank-sparing";
+		/* Was CXL_CACHE_SPARING, which wrote unvalidated row/column. */
+		sparing_type = CXL_BANK_SPARING;
+		break;
+	case CXL_MAINT_SUBCLASS_RANK_SPARING:
+		repair_type = "rank-sparing";
+		/* Was CXL_CACHE_SPARING, which wrote unvalidated bank/row/column. */
+		sparing_type = CXL_RANK_SPARING;
+		break;
+	default:
+		return -1;
+	}
+
+	idx = cxl_find_spare(hdr->memdev, repair_type);
+	if (idx < 0)
+		return -1;
+
+	snprintf(dir, sizeof(dir), "%s%s%s/mem_repair%d",
+		 edac_bus_path, EDAC_CXL_DEV_PREFIX, hdr->memdev, idx);
+
+	switch (sparing_type) {
+	case CXL_CACHE_SPARING:
+		rc = fill_cacheline_sparing_attrs(ev, dir);
+		break;
+	case CXL_ROW_SPARING:
+		rc = fill_row_sparing_attrs(ev, dir);
+		break;
+	case CXL_BANK_SPARING:
+		rc = fill_bank_sparing_attrs(ev, dir);
+		break;
+	case CXL_RANK_SPARING:
+		rc = fill_rank_sparing_attrs(ev, dir);
+		break;
+	default:
+		return -1;
+	}
+
+	/*
+	 * Do not start a repair with partially written attributes: the device
+	 * would act on whatever values were left from an earlier request.
+	 */
+	if (rc)
+		return -1;
+
+	return set_sysfs_data_uint32(dir, "repair", 1) ? -1 : 0;
+}
+
int ras_cxl_dram_event_handler(struct trace_seq *s,
struct tep_record *record,
struct tep_event *event, void *context)
@@ -1231,6 +1516,8 @@ int ras_cxl_dram_event_handler(struct trace_seq *s,
if (trace_seq_printf(s, "CVME Count:%u ", ev.cvme_count) <= 0)
return -1;
+ cxl_dram_sparing(&ev);
+
/* Insert data into the SGBD */
#ifdef HAVE_SQLITE3
ras_store_cxl_dram_event(ras, &ev);
--
2.43.0
Powered by blists - more mailing lists