[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250110122641.1668-9-shiju.jose@huawei.com>
Date: Fri, 10 Jan 2025 12:26:34 +0000
From: <shiju.jose@...wei.com>
To: <linux-edac@...r.kernel.org>, <linux-cxl@...r.kernel.org>,
<mchehab@...nel.org>, <dave.jiang@...el.com>, <dan.j.williams@...el.com>,
<jonathan.cameron@...wei.com>, <alison.schofield@...el.com>,
<nifan.cxl@...il.com>, <vishal.l.verma@...el.com>, <ira.weiny@...el.com>,
<dave@...olabs.net>
CC: <linux-kernel@...r.kernel.org>, <linuxarm@...wei.com>,
<tanxiaofei@...wei.com>, <prime.zeng@...ilicon.com>, <shiju.jose@...wei.com>
Subject: [PATCH v2 08/14] rasdaemon: cxl: Update CXL DRAM event to CXL spec rev 3.1
From: Shiju Jose <shiju.jose@...wei.com>
CXL spec 3.1 section 8.2.9.2.1.2 Table 8-46, DRAM Event Record has been updated
with the following new fields and new types for the Memory Event Type, Transaction
Type and Validity Flags fields.
1. Component Identifier
2. Sub-channel
3. Advanced Programmable Corrected Memory Error Threshold Event Flags
4. Corrected Memory Error Count at Event
5. Memory Event Sub-Type
Update the parsing, logging and recording of the DRAM event for the above
spec rev 3.1 changes.
Example rasdaemon log for a CXL DRAM event:
cxl_dram 2024-11-19 18:39:00 +0000 memdev:mem3 host:0000:0f:00.0 serial:0x3 \
log type:Informational hdr_uuid:601dcbb3-9c06-4eab-b8af-4e9bfb5c9624 \
hdr_handle:0x1 hdr_related_handle:0x0 hdr_timestamp:1970-01-01 00:05:21 +0000 \
hdr_length:128 hdr_maint_op_class:1 hdr_maint_op_sub_class:3 dpa:0x18680 \
dpa_flags:descriptor:'UNCORRECTABLE EVENT' 'THRESHOLD EVENT' \
memory_event_type:Data Path Error memory_event_sub_type:Media Link CRC Error \
transaction_type:Internal Media Scrub channel:3 rank:17 nibble_mask:3866802 \
bank_group:7 bank:11 row:2 column:77 correction_mask:21 00 00 00 00 00 00 00 \
2c 00 00 00 00 00 00 00 37 00 00 00 00 00 00 00 42 00 00 00 00 00 00 00 \
comp_id:01 74 c5 08 9a 1a 0b fc d2 7e 2f 31 9b 3c 81 4d \
comp_id_pldm_valid_flags:'PLDM Entity ID' PLDM Entity ID:74 c5 08 9a 1a 0b \
Advanced Programmable CME threshold Event Flags:'Corrected Memory Errors in \
Multiple Media Components' 'Exceeded Programmable Threshold' CVME Count:0x94
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@...wei.com>
Signed-off-by: Shiju Jose <shiju.jose@...wei.com>
---
ras-cxl-handler.c | 67 +++++++++++++++++++++++++++++++++++++++++++++--
ras-record.c | 18 +++++++++++++
ras-record.h | 7 +++++
ras-report.c | 12 +++++++--
4 files changed, 100 insertions(+), 4 deletions(-)
diff --git a/ras-cxl-handler.c b/ras-cxl-handler.c
index 225bde6..3dd795e 100644
--- a/ras-cxl-handler.c
+++ b/ras-cxl-handler.c
@@ -1003,7 +1003,7 @@ int ras_cxl_general_media_event_handler(struct trace_seq *s,
/*
* DRAM Event Record - DER
*
- * CXL rev 3.0 section 8.2.9.2.1.2; Table 8-44
+ * CXL rev 3.1 section 8.2.9.2.1.2; Table 8-46
*/
#define CXL_DER_VALID_CHANNEL BIT(0)
#define CXL_DER_VALID_RANK BIT(1)
@@ -1013,19 +1013,25 @@ int ras_cxl_general_media_event_handler(struct trace_seq *s,
#define CXL_DER_VALID_ROW BIT(5)
#define CXL_DER_VALID_COLUMN BIT(6)
#define CXL_DER_VALID_CORRECTION_MASK BIT(7)
+#define CXL_DER_VALID_COMPONENT_ID BIT(8)
+#define CXL_DER_VALID_COMPONENT_ID_FORMAT BIT(9)
+#define CXL_DER_VALID_SUB_CHANNEL BIT(10)
static const char * const cxl_der_mem_event_type[] = {
"Media ECC Error",
"Scrub Media ECC Error",
"Invalid Address",
"Data Path Error",
+ "TE State Violation",
+ "Advanced Programmable CME Counter Expiration",
+ "CKID Violation",
};
int ras_cxl_dram_event_handler(struct trace_seq *s,
struct tep_record *record,
struct tep_event *event, void *context)
{
- int len, i;
+ int len, i, rc;
unsigned long long val;
struct ras_events *ras = context;
struct ras_cxl_dram_event ev;
@@ -1067,6 +1073,15 @@ int ras_cxl_dram_event_handler(struct trace_seq *s,
ev.type)) <= 0)
return -1;
+ if (tep_get_field_val(s, event, "sub_type", record, &val, 1) < 0)
+ return -1;
+ ev.sub_type = val;
+ if (trace_seq_printf(s, "memory_event_sub_type:%s ",
+ get_cxl_type_str(cxl_mem_event_sub_type,
+ ARRAY_SIZE(cxl_mem_event_sub_type),
+ ev.sub_type)) <= 0)
+ return -1;
+
if (tep_get_field_val(s, event, "transaction_type", record, &val, 1) < 0)
return -1;
ev.transaction_type = val;
@@ -1108,6 +1123,14 @@ int ras_cxl_dram_event_handler(struct trace_seq *s,
return -1;
}
+ if (ev.validity_flags & CXL_DER_VALID_SUB_CHANNEL) {
+ if (tep_get_field_val(s, event, "sub_channel", record, &val, 1) < 0)
+ return -1;
+ ev.sub_channel = val;
+ if (trace_seq_printf(s, "sub_channel:%u ", ev.sub_channel) <= 0)
+ return -1;
+ }
+
if (ev.validity_flags & CXL_DER_VALID_RANK) {
if (tep_get_field_val(s, event, "rank", record, &val, 1) < 0)
return -1;
@@ -1168,6 +1191,46 @@ int ras_cxl_dram_event_handler(struct trace_seq *s,
}
}
+ if (ev.validity_flags & CXL_DER_VALID_COMPONENT_ID) {
+ ev.comp_id = tep_get_field_raw(s, event, "comp_id", record, &len, 1);
+ if (!ev.comp_id)
+ return -1;
+ if (trace_seq_printf(s, "comp_id:") <= 0)
+ return -1;
+ for (i = 0; i < CXL_EVENT_GEN_MED_COMP_ID_SIZE; i++) {
+ if (trace_seq_printf(s, "%02x ", ev.comp_id[i]) <= 0)
+ break;
+ }
+
+ if (ev.validity_flags & CXL_DER_VALID_COMPONENT_ID_FORMAT) {
+ if (trace_seq_printf(s, "comp_id_pldm_valid_flags:") <= 0)
+ return -1;
+ if (decode_cxl_event_flags(s, ev.comp_id[0], cxl_pldm_comp_id_flags,
+ ARRAY_SIZE(cxl_pldm_comp_id_flags)) < 0)
+ return -1;
+
+ rc = ras_cxl_print_component_id(s, ev.comp_id, ev.entity_id, ev.res_id);
+ if (rc)
+ return rc;
+ }
+ }
+
+ if (tep_get_field_val(s, event, "cme_threshold_ev_flags", record, &val, 1) < 0)
+ return -1;
+ ev.cme_threshold_ev_flags = val;
+ if (trace_seq_printf(s, "Advanced Programmable CME threshold Event Flags:") <= 0)
+ return -1;
+ if (decode_cxl_event_flags(s, ev.cme_threshold_ev_flags,
+ cxl_cme_threshold_ev_flags,
+ ARRAY_SIZE(cxl_cme_threshold_ev_flags)) < 0)
+ return -1;
+
+ if (tep_get_field_val(s, event, "cvme_count", record, &val, 1) < 0)
+ return -1;
+ ev.cvme_count = val;
+ if (trace_seq_printf(s, "CVME Count:%u ", ev.cvme_count) <= 0)
+ return -1;
+
/* Insert data into the SGBD */
#ifdef HAVE_SQLITE3
ras_store_cxl_dram_event(ras, &ev);
diff --git a/ras-record.c b/ras-record.c
index 1020c37..9799d7e 100644
--- a/ras-record.c
+++ b/ras-record.c
@@ -986,6 +986,13 @@ static const struct db_fields cxl_dram_event_fields[] = {
{ .name = "hpa", .type = "INTEGER" },
{ .name = "region", .type = "TEXT" },
{ .name = "region_uuid", .type = "TEXT" },
+ { .name = "comp_id", .type = "BLOB" },
+ { .name = "pldm_entity_id", .type = "BLOB" },
+ { .name = "pldm_resource_id", .type = "BLOB" },
+ { .name = "sub_type", .type = "INTEGER" },
+ { .name = "sub_channel", .type = "INTEGER" },
+ { .name = "cme_threshold_ev_flags", .type = "INTEGER" },
+ { .name = "cvme_count", .type = "INTEGER" },
};
static const struct db_table_descriptor cxl_dram_event_tab = {
@@ -1025,6 +1032,17 @@ int ras_store_cxl_dram_event(struct ras_events *ras, struct ras_cxl_dram_event *
sqlite3_bind_int64(priv->stmt_cxl_dram_event, idx++, ev->hpa);
sqlite3_bind_text(priv->stmt_cxl_dram_event, idx++, ev->region, -1, NULL);
sqlite3_bind_text(priv->stmt_cxl_dram_event, idx++, ev->region_uuid, -1, NULL);
+ sqlite3_bind_blob(priv->stmt_cxl_dram_event, idx++, ev->comp_id,
+ CXL_EVENT_GEN_MED_COMP_ID_SIZE, NULL);
+ sqlite3_bind_blob(priv->stmt_cxl_dram_event, idx++, ev->entity_id,
+ CXL_PLDM_ENTITY_ID_LEN, NULL);
+ sqlite3_bind_blob(priv->stmt_cxl_dram_event, idx++, ev->res_id,
+ CXL_PLDM_RES_ID_LEN, NULL);
+ sqlite3_bind_int(priv->stmt_cxl_dram_event, idx++, ev->sub_type);
+ sqlite3_bind_int(priv->stmt_cxl_dram_event, idx++, ev->sub_channel);
+ sqlite3_bind_int(priv->stmt_cxl_dram_event, idx++,
+ ev->cme_threshold_ev_flags);
+ sqlite3_bind_int(priv->stmt_cxl_dram_event, idx++, ev->cvme_count);
rc = sqlite3_step(priv->stmt_cxl_dram_event);
if (rc != SQLITE_OK && rc != SQLITE_DONE)
diff --git a/ras-record.h b/ras-record.h
index 12e693b..3aec063 100644
--- a/ras-record.h
+++ b/ras-record.h
@@ -218,8 +218,10 @@ struct ras_cxl_dram_event {
uint8_t dpa_flags;
uint8_t descriptor;
uint8_t type;
+ uint8_t sub_type;
uint8_t transaction_type;
uint8_t channel;
+ uint8_t sub_channel;
uint8_t rank;
uint32_t nibble_mask;
uint8_t bank_group;
@@ -231,6 +233,11 @@ struct ras_cxl_dram_event {
uint64_t hpa;
const char *region;
const char *region_uuid;
+ uint8_t *comp_id;
+ uint8_t entity_id[CXL_PLDM_ENTITY_ID_LEN];
+ uint8_t res_id[CXL_PLDM_RES_ID_LEN];
+ uint8_t cme_threshold_ev_flags;
+ uint32_t cvme_count;
};
struct ras_cxl_memory_module_event {
diff --git a/ras-report.c b/ras-report.c
index ed1f4b8..8e343fc 100644
--- a/ras-report.c
+++ b/ras-report.c
@@ -624,17 +624,21 @@ static int set_cxl_dram_event_backtrace(char *buf, struct ras_cxl_dram_event *ev
"dpa_flags=%u\n"
"descriptor=%u\n"
"type=%u\n"
+ "sub_type=0x%x\n"
"transaction_type=%u\n"
"hpa=0x%lx\n"
"region=%s\n"
"region_uuid=%s\n"
"channel=%u\n"
+ "sub_channel=%u\n"
"rank=%u\n"
"nibble_mask=%u\n"
"bank_group=%u\n"
"bank=%u\n"
"row=%u\n"
- "column=%u\n",
+ "column=%u\n"
+ "cme_threshold_ev_flags=0x%x\n"
+ "cvme_count=0x%x\n",
ev->hdr.timestamp,
ev->hdr.memdev,
ev->hdr.host,
@@ -651,17 +655,21 @@ static int set_cxl_dram_event_backtrace(char *buf, struct ras_cxl_dram_event *ev
ev->dpa_flags,
ev->descriptor,
ev->type,
+ ev->sub_type,
ev->transaction_type,
ev->hpa,
ev->region,
ev->region_uuid,
ev->channel,
+ ev->sub_channel,
ev->rank,
ev->nibble_mask,
ev->bank_group,
ev->bank,
ev->row,
- ev->column);
+ ev->column,
+ ev->cme_threshold_ev_flags,
+ ev->cvme_count);
return 0;
}
--
2.43.0
Powered by blists - more mailing lists