[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <6697fbef.250a0220.24877d.00a7@mx.google.com>
Date: Wed, 17 Jul 2024 10:13:24 -0700
From: nifan.cxl@...il.com
To: shiju.jose@...wei.com
Cc: linux-edac@...r.kernel.org, linux-cxl@...r.kernel.org,
linux-acpi@...r.kernel.org, linux-mm@...ck.org,
linux-kernel@...r.kernel.org, bp@...en8.de, tony.luck@...el.com,
rafael@...nel.org, lenb@...nel.org, mchehab@...nel.org,
dan.j.williams@...el.com, dave@...olabs.net,
jonathan.cameron@...wei.com, dave.jiang@...el.com,
alison.schofield@...el.com, vishal.l.verma@...el.com,
ira.weiny@...el.com, david@...hat.com, Vilas.Sridharan@....com,
leo.duran@....com, Yazen.Ghannam@....com, rientjes@...gle.com,
jiaqiyan@...gle.com, Jon.Grimm@....com, dave.hansen@...ux.intel.com,
naoya.horiguchi@....com, james.morse@....com, jthoughton@...gle.com,
somasundaram.a@....com, erdemaktas@...gle.com, pgonda@...gle.com,
duenwen@...gle.com, mike.malvestuto@...el.com, gthelen@...gle.com,
wschwartz@...erecomputing.com, dferguson@...erecomputing.com,
wbs@...amperecomputing.com, nifan.cxl@...il.com,
tanxiaofei@...wei.com, prime.zeng@...ilicon.com,
roberto.sassu@...wei.com, kangkang.shen@...urewei.com,
wanghuiqiang@...wei.com, linuxarm@...wei.com
Subject: Re: [RFC PATCH v9 03/11] EDAC: Add EDAC ECS control driver
On Tue, Jul 16, 2024 at 04:03:27PM +0100, shiju.jose@...wei.com wrote:
> From: Shiju Jose <shiju.jose@...wei.com>
>
> Add an EDAC ECS (Error Check Scrub) control driver that supports
> configuring the memory device's ECS feature.
>
> The Error Check Scrub (ECS) is a feature defined in JEDEC DDR5 SDRAM
> Specification (JESD79-5) and allows the DRAM to internally read, correct
> single-bit errors, and write back corrected data bits to the DRAM array
> while providing transparency to error counts.
>
> A DDR5 device contains a number of memory media FRUs. The DDR5 ECS
> feature, and thus the ECS control driver, supports configuring the
> ECS parameters per FRU.
>
> Memory devices that support the ECS feature register with the EDAC ECS
> driver, and thus with the generic EDAC RAS feature driver, which adds the
> sysfs ECS control interface. The ECS control attributes are exposed to
> userspace in /sys/bus/edac/devices/<dev-name>/ecs_fruX/.
>
> The generic EDAC ECS driver and the common sysfs ECS interface promote
> unambiguous control from userspace, irrespective of the underlying
> devices that support the ECS feature.
>
> The support for ECS feature is added separately because the DDR5 ECS
> feature's control attributes are dissimilar from those of the scrub
> feature.
>
> Note: Documentation can be added if necessary.
>
> Co-developed-by: Jonathan Cameron <Jonathan.Cameron@...wei.com>
> Signed-off-by: Jonathan Cameron <Jonathan.Cameron@...wei.com>
> Signed-off-by: Shiju Jose <shiju.jose@...wei.com>
> ---
> drivers/edac/Makefile | 2 +-
> drivers/edac/edac_ecs.c | 396 +++++++++++++++++++++++++++++++
> drivers/edac/edac_ras_feature.c | 5 +
> include/linux/edac_ras_feature.h | 36 +++
> 4 files changed, 438 insertions(+), 1 deletion(-)
> create mode 100755 drivers/edac/edac_ecs.c
>
> diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile
> index de56cbd039eb..c1412c7d3efb 100644
> --- a/drivers/edac/Makefile
> +++ b/drivers/edac/Makefile
> @@ -10,7 +10,7 @@ obj-$(CONFIG_EDAC) := edac_core.o
>
> edac_core-y := edac_mc.o edac_device.o edac_mc_sysfs.o
> edac_core-y += edac_module.o edac_device_sysfs.o wq.o
> -edac_core-y += edac_ras_feature.o edac_scrub.o
> +edac_core-y += edac_ras_feature.o edac_scrub.o edac_ecs.o
>
> edac_core-$(CONFIG_EDAC_DEBUG) += debugfs.o
>
> diff --git a/drivers/edac/edac_ecs.c b/drivers/edac/edac_ecs.c
> new file mode 100755
> index 000000000000..37dabd053c36
> --- /dev/null
> +++ b/drivers/edac/edac_ecs.c
> @@ -0,0 +1,396 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * ECS driver supporting control of on-die error check scrub
> + * (e.g. DDR5 ECS). The common sysfs ECS interface promotes
> + * unambiguous access from the userspace.
> + *
> + * Copyright (c) 2024 HiSilicon Limited.
> + */
> +
> +#define pr_fmt(fmt) "EDAC ECS: " fmt
> +
> +#include <linux/edac_ras_feature.h>
> +
> +#define EDAC_ECS_FRU_NAME "ecs_fru"
> +
> +enum edac_ecs_attributes {
> + ecs_log_entry_type,
> + ecs_log_entry_type_per_dram,
> + ecs_log_entry_type_per_memory_media,
> + ecs_mode,
> + ecs_mode_counts_rows,
> + ecs_mode_counts_codewords,
> + ecs_reset,
> + ecs_name,
> + ecs_threshold,
> + ecs_max_attrs
> +};
As mentioned in another review, use uppercase for the enum constants.
Fan
> +
> +struct edac_ecs_dev_attr {
> + struct device_attribute dev_attr;
> + int fru_id;
> +};
> +
> +struct edac_ecs_fru_context {
> + char name[EDAC_RAS_NAME_LEN];
> + struct edac_ecs_dev_attr ecs_dev_attr[ecs_max_attrs];
> + struct attribute *ecs_attrs[ecs_max_attrs + 1];
> + struct attribute_group group;
> +};
> +
> +struct edac_ecs_context {
> + u16 num_media_frus;
> + struct edac_ecs_fru_context *fru_ctxs;
> +};
> +
> +#define to_ecs_dev_attr(_dev_attr) \
> + container_of(_dev_attr, struct edac_ecs_dev_attr, dev_attr)
> +
> +static ssize_t log_entry_type_show(struct device *ras_feat_dev,
> + struct device_attribute *attr,
> + char *buf)
> +{
> + struct edac_ecs_dev_attr *ecs_dev_attr = to_ecs_dev_attr(attr);
> + struct edac_ras_feat_ctx *ctx = dev_get_drvdata(ras_feat_dev);
> + const struct edac_ecs_ops *ops = ctx->ecs.ops;
> + u64 val;
> + int ret;
> +
> + ret = ops->get_log_entry_type(ras_feat_dev->parent, ctx->ecs.private,
> + ecs_dev_attr->fru_id, &val);
> + if (ret)
> + return ret;
> +
> + return sysfs_emit(buf, "0x%llx\n", val);
> +}
> +
> +static ssize_t log_entry_type_store(struct device *ras_feat_dev,
> + struct device_attribute *attr,
> + const char *buf, size_t len)
> +{
> + struct edac_ecs_dev_attr *ecs_dev_attr = to_ecs_dev_attr(attr);
> + struct edac_ras_feat_ctx *ctx = dev_get_drvdata(ras_feat_dev);
> + const struct edac_ecs_ops *ops = ctx->ecs.ops;
> + long val;
> + int ret;
> +
> + ret = kstrtol(buf, 10, &val);
> + if (ret < 0)
> + return ret;
> +
> + ret = ops->set_log_entry_type(ras_feat_dev->parent, ctx->ecs.private,
> + ecs_dev_attr->fru_id, val);
> + if (ret)
> + return ret;
> +
> + return len;
> +}
> +
> +static ssize_t log_entry_type_per_dram_show(struct device *ras_feat_dev,
> + struct device_attribute *attr,
> + char *buf)
> +{
> + struct edac_ecs_dev_attr *ecs_dev_attr = to_ecs_dev_attr(attr);
> + struct edac_ras_feat_ctx *ctx = dev_get_drvdata(ras_feat_dev);
> + const struct edac_ecs_ops *ops = ctx->ecs.ops;
> + u64 val;
> + int ret;
> +
> + ret = ops->get_log_entry_type_per_dram(ras_feat_dev->parent, ctx->ecs.private,
> + ecs_dev_attr->fru_id, &val);
> + if (ret)
> + return ret;
> +
> + return sysfs_emit(buf, "0x%llx\n", val);
> +}
> +
> +static ssize_t log_entry_type_per_memory_media_show(struct device *ras_feat_dev,
> + struct device_attribute *attr,
> + char *buf)
> +{
> + struct edac_ecs_dev_attr *ecs_dev_attr = to_ecs_dev_attr(attr);
> + struct edac_ras_feat_ctx *ctx = dev_get_drvdata(ras_feat_dev);
> + const struct edac_ecs_ops *ops = ctx->ecs.ops;
> + u64 val;
> + int ret;
> +
> + ret = ops->get_log_entry_type_per_memory_media(ras_feat_dev->parent,
> + ctx->ecs.private,
> + ecs_dev_attr->fru_id, &val);
> + if (ret)
> + return ret;
> +
> + return sysfs_emit(buf, "0x%llx\n", val);
> +}
> +
> +static ssize_t mode_show(struct device *ras_feat_dev,
> + struct device_attribute *attr,
> + char *buf)
> +{
> + struct edac_ecs_dev_attr *ecs_dev_attr = to_ecs_dev_attr(attr);
> + struct edac_ras_feat_ctx *ctx = dev_get_drvdata(ras_feat_dev);
> + const struct edac_ecs_ops *ops = ctx->ecs.ops;
> + u64 val;
> + int ret;
> +
> + ret = ops->get_mode(ras_feat_dev->parent, ctx->ecs.private,
> + ecs_dev_attr->fru_id, &val);
> + if (ret)
> + return ret;
> +
> + return sysfs_emit(buf, "0x%llx\n", val);
> +}
> +
> +static ssize_t mode_store(struct device *ras_feat_dev,
> + struct device_attribute *attr,
> + const char *buf, size_t len)
> +{
> + struct edac_ecs_dev_attr *ecs_dev_attr = to_ecs_dev_attr(attr);
> + struct edac_ras_feat_ctx *ctx = dev_get_drvdata(ras_feat_dev);
> + const struct edac_ecs_ops *ops = ctx->ecs.ops;
> + long val;
> + int ret;
> +
> + ret = kstrtol(buf, 10, &val);
> + if (ret < 0)
> + return ret;
> +
> + ret = ops->set_mode(ras_feat_dev->parent, ctx->ecs.private,
> + ecs_dev_attr->fru_id, val);
> + if (ret)
> + return ret;
> +
> + return len;
> +}
> +
> +static ssize_t mode_counts_rows_show(struct device *ras_feat_dev,
> + struct device_attribute *attr,
> + char *buf)
> +{
> + struct edac_ecs_dev_attr *ecs_dev_attr = to_ecs_dev_attr(attr);
> + struct edac_ras_feat_ctx *ctx = dev_get_drvdata(ras_feat_dev);
> + const struct edac_ecs_ops *ops = ctx->ecs.ops;
> + u64 val;
> + int ret;
> +
> + ret = ops->get_mode_counts_rows(ras_feat_dev->parent, ctx->ecs.private,
> + ecs_dev_attr->fru_id, &val);
> + if (ret)
> + return ret;
> +
> + return sysfs_emit(buf, "0x%llx\n", val);
> +}
> +
> +static ssize_t mode_counts_codewords_show(struct device *ras_feat_dev,
> + struct device_attribute *attr,
> + char *buf)
> +{
> + struct edac_ecs_dev_attr *ecs_dev_attr = to_ecs_dev_attr(attr);
> + struct edac_ras_feat_ctx *ctx = dev_get_drvdata(ras_feat_dev);
> + const struct edac_ecs_ops *ops = ctx->ecs.ops;
> + u64 val;
> + int ret;
> +
> + ret = ops->get_mode_counts_codewords(ras_feat_dev->parent, ctx->ecs.private,
> + ecs_dev_attr->fru_id, &val);
> + if (ret)
> + return ret;
> +
> + return sysfs_emit(buf, "0x%llx\n", val);
> +}
> +
> +static ssize_t reset_store(struct device *ras_feat_dev,
> + struct device_attribute *attr,
> + const char *buf, size_t len)
> +{
> + struct edac_ecs_dev_attr *ecs_dev_attr = to_ecs_dev_attr(attr);
> + struct edac_ras_feat_ctx *ctx = dev_get_drvdata(ras_feat_dev);
> + const struct edac_ecs_ops *ops = ctx->ecs.ops;
> + long val;
> + int ret;
> +
> + ret = kstrtol(buf, 10, &val);
> + if (ret < 0)
> + return ret;
> +
> + ret = ops->reset(ras_feat_dev->parent, ctx->ecs.private,
> + ecs_dev_attr->fru_id, val);
> + if (ret)
> + return ret;
> +
> + return len;
> +}
> +
> +static ssize_t name_show(struct device *ras_feat_dev,
> + struct device_attribute *attr, char *buf)
> +{
> + struct edac_ecs_dev_attr *ecs_dev_attr = to_ecs_dev_attr(attr);
> + struct edac_ras_feat_ctx *ctx = dev_get_drvdata(ras_feat_dev);
> + const struct edac_ecs_ops *ops = ctx->ecs.ops;
> + int ret;
> +
> + ret = ops->get_name(ras_feat_dev->parent, ctx->ecs.private,
> + ecs_dev_attr->fru_id, buf);
> + if (ret)
> + return ret;
> +
> + return strlen(buf);
> +}
> +
> +static ssize_t threshold_show(struct device *ras_feat_dev,
> + struct device_attribute *attr, char *buf)
> +{
> + struct edac_ecs_dev_attr *ecs_dev_attr = to_ecs_dev_attr(attr);
> + struct edac_ras_feat_ctx *ctx = dev_get_drvdata(ras_feat_dev);
> + const struct edac_ecs_ops *ops = ctx->ecs.ops;
> + int ret;
> + u64 val;
> +
> + ret = ops->get_threshold(ras_feat_dev->parent, ctx->ecs.private,
> + ecs_dev_attr->fru_id, &val);
> + if (ret)
> + return ret;
> +
> + return sysfs_emit(buf, "0x%llx\n", val);
> +}
> +
> +static ssize_t threshold_store(struct device *ras_feat_dev,
> + struct device_attribute *attr,
> + const char *buf, size_t len)
> +{
> + struct edac_ecs_dev_attr *ecs_dev_attr = to_ecs_dev_attr(attr);
> + struct edac_ras_feat_ctx *ctx = dev_get_drvdata(ras_feat_dev);
> + const struct edac_ecs_ops *ops = ctx->ecs.ops;
> + long val;
> + int ret;
> +
> + ret = kstrtol(buf, 10, &val);
> + if (ret < 0)
> + return ret;
> +
> + ret = ops->set_threshold(ras_feat_dev->parent, ctx->ecs.private,
> + ecs_dev_attr->fru_id, val);
> + if (ret)
> + return ret;
> +
> + return len;
> +}
> +
> +static umode_t ecs_attr_visible(struct kobject *kobj,
> + struct attribute *a, int attr_id)
> +{
> + struct device *ras_feat_dev = kobj_to_dev(kobj);
> + struct edac_ras_feat_ctx *ctx = dev_get_drvdata(ras_feat_dev);
> + const struct edac_ecs_ops *ops = ctx->ecs.ops;
> +
> + switch (attr_id) {
> + case ecs_log_entry_type:
> + if (ops->get_log_entry_type && ops->set_log_entry_type)
> + return a->mode;
> + if (ops->get_log_entry_type)
> + return 0444;
> + return 0;
> + case ecs_log_entry_type_per_dram:
> + return ops->get_log_entry_type_per_dram ? a->mode : 0;
> + case ecs_log_entry_type_per_memory_media:
> + return ops->get_log_entry_type_per_memory_media ? a->mode : 0;
> + case ecs_mode:
> + if (ops->get_mode && ops->set_mode)
> + return a->mode;
> + if (ops->get_mode)
> + return 0444;
> + return 0;
> + case ecs_mode_counts_rows:
> + return ops->get_mode_counts_rows ? a->mode : 0;
> + case ecs_mode_counts_codewords:
> + return ops->get_mode_counts_codewords ? a->mode : 0;
> + case ecs_reset:
> + return ops->reset ? a->mode : 0;
> + case ecs_name:
> + return ops->get_name ? a->mode : 0;
> + case ecs_threshold:
> + if (ops->get_threshold && ops->set_threshold)
> + return a->mode;
> + if (ops->get_threshold)
> + return 0444;
> + return 0;
> + default:
> + return 0;
> + }
> +}
> +
> +#define EDAC_ECS_ATTR_RO(_name, _fru_id) \
> + ((struct edac_ecs_dev_attr) { .dev_attr = __ATTR_RO(_name), \
> + .fru_id = _fru_id })
> +
> +#define EDAC_ECS_ATTR_WO(_name, _fru_id) \
> + ((struct edac_ecs_dev_attr) { .dev_attr = __ATTR_WO(_name), \
> + .fru_id = _fru_id })
> +
> +#define EDAC_ECS_ATTR_RW(_name, _fru_id) \
> + ((struct edac_ecs_dev_attr) { .dev_attr = __ATTR_RW(_name), \
> + .fru_id = _fru_id })
> +
> +static int ecs_create_desc(struct device *ecs_dev,
> + const struct attribute_group **attr_groups,
> + u16 num_media_frus)
> +{
> + struct edac_ecs_context *ecs_ctx;
> + u32 fru;
> +
> + ecs_ctx = devm_kzalloc(ecs_dev, sizeof(*ecs_ctx), GFP_KERNEL);
> + if (!ecs_ctx)
> + return -ENOMEM;
> +
> + ecs_ctx->num_media_frus = num_media_frus;
> + ecs_ctx->fru_ctxs = devm_kcalloc(ecs_dev, num_media_frus,
> + sizeof(*ecs_ctx->fru_ctxs),
> + GFP_KERNEL);
> + if (!ecs_ctx->fru_ctxs)
> + return -ENOMEM;
> +
> + for (fru = 0; fru < num_media_frus; fru++) {
> + struct edac_ecs_fru_context *fru_ctx = &ecs_ctx->fru_ctxs[fru];
> + struct attribute_group *group = &fru_ctx->group;
> + int i;
> +
> + fru_ctx->ecs_dev_attr[0] = EDAC_ECS_ATTR_RW(log_entry_type, fru);
> + fru_ctx->ecs_dev_attr[1] = EDAC_ECS_ATTR_RO(log_entry_type_per_dram, fru);
> + fru_ctx->ecs_dev_attr[2] = EDAC_ECS_ATTR_RO(log_entry_type_per_memory_media, fru);
> + fru_ctx->ecs_dev_attr[3] = EDAC_ECS_ATTR_RW(mode, fru);
> + fru_ctx->ecs_dev_attr[4] = EDAC_ECS_ATTR_RO(mode_counts_rows, fru);
> + fru_ctx->ecs_dev_attr[5] = EDAC_ECS_ATTR_RO(mode_counts_codewords, fru);
> + fru_ctx->ecs_dev_attr[6] = EDAC_ECS_ATTR_WO(reset, fru);
> + fru_ctx->ecs_dev_attr[7] = EDAC_ECS_ATTR_RO(name, fru);
> + fru_ctx->ecs_dev_attr[8] = EDAC_ECS_ATTR_RW(threshold, fru);
> + for (i = 0; i < ecs_max_attrs; i++)
> + fru_ctx->ecs_attrs[i] = &fru_ctx->ecs_dev_attr[i].dev_attr.attr;
> +
> + sprintf(fru_ctx->name, "%s%d", EDAC_ECS_FRU_NAME, fru);
> + group->name = fru_ctx->name;
> + group->attrs = fru_ctx->ecs_attrs;
> + group->is_visible = ecs_attr_visible;
> +
> + attr_groups[fru] = group;
> + }
> +
> + return 0;
> +}
> +
> +/**
> + * edac_ecs_get_desc - get edac ecs descriptors
> + * @ecs_dev: client ecs device
> + * @attr_groups: pointer to attribute group container
> + * @num_media_frus: number of media FRUs in the device
> + *
> + * Returns 0 on success, error otherwise.
> + */
> +int edac_ecs_get_desc(struct device *ecs_dev,
> + const struct attribute_group **attr_groups,
> + u16 num_media_frus)
> +{
> + if (!ecs_dev || !attr_groups || !num_media_frus)
> + return -EINVAL;
> +
> + return ecs_create_desc(ecs_dev, attr_groups, num_media_frus);
> +}
> diff --git a/drivers/edac/edac_ras_feature.c b/drivers/edac/edac_ras_feature.c
> index 48927f868372..a02ffbcc1c1e 100755
> --- a/drivers/edac/edac_ras_feature.c
> +++ b/drivers/edac/edac_ras_feature.c
> @@ -47,10 +47,15 @@ static int edac_ras_feat_ecs_init(struct device *parent,
> const struct attribute_group **attr_groups)
> {
> int num = efeat->ecs_info.num_media_frus;
> + int ret;
>
> edata->ops = efeat->ecs_ops;
> edata->private = efeat->ecs_ctx;
>
> + ret = edac_ecs_get_desc(parent, attr_groups, num);
> + if (ret)
> + return ret;
> +
> return num;
> }
>
> diff --git a/include/linux/edac_ras_feature.h b/include/linux/edac_ras_feature.h
> index 462f9ecbf9d4..153f8a3557f1 100755
> --- a/include/linux/edac_ras_feature.h
> +++ b/include/linux/edac_ras_feature.h
> @@ -47,10 +47,46 @@ struct edac_scrub_ops {
>
> const struct attribute_group *edac_scrub_get_desc(void);
>
> +/**
> + * struct edac_ecs_ops - ECS device operations (all elements optional)
> + * @get_log_entry_type: read the log entry type value.
> + * @set_log_entry_type: set the log entry type value.
> + * @get_log_entry_type_per_dram: read the log entry type per dram value.
> + * @get_log_entry_type_per_memory_media: read the log entry type per memory media value.
> + * @get_mode: read the mode value.
> + * @set_mode: set the mode value.
> + * @get_mode_counts_rows: read the mode counts rows value.
> + * @get_mode_counts_codewords: read the mode counts codewords value.
> + * @reset: reset the ECS counter.
> + * @get_threshold: read the threshold value.
> + * @set_threshold: set the threshold value.
> + * @get_name: get the ECS's name.
> + */
> +struct edac_ecs_ops {
> + int (*get_log_entry_type)(struct device *dev, void *drv_data, int fru_id, u64 *val);
> + int (*set_log_entry_type)(struct device *dev, void *drv_data, int fru_id, u64 val);
> + int (*get_log_entry_type_per_dram)(struct device *dev, void *drv_data,
> + int fru_id, u64 *val);
> + int (*get_log_entry_type_per_memory_media)(struct device *dev, void *drv_data,
> + int fru_id, u64 *val);
> + int (*get_mode)(struct device *dev, void *drv_data, int fru_id, u64 *val);
> + int (*set_mode)(struct device *dev, void *drv_data, int fru_id, u64 val);
> + int (*get_mode_counts_rows)(struct device *dev, void *drv_data, int fru_id, u64 *val);
> + int (*get_mode_counts_codewords)(struct device *dev, void *drv_data, int fru_id, u64 *val);
> + int (*reset)(struct device *dev, void *drv_data, int fru_id, u64 val);
> + int (*get_threshold)(struct device *dev, void *drv_data, int fru_id, u64 *threshold);
> + int (*set_threshold)(struct device *dev, void *drv_data, int fru_id, u64 threshold);
> + int (*get_name)(struct device *dev, void *drv_data, int fru_id, char *buf);
> +};
> +
> struct edac_ecs_ex_info {
> u16 num_media_frus;
> };
>
> +int edac_ecs_get_desc(struct device *ecs_dev,
> + const struct attribute_group **attr_groups,
> + u16 num_media_frus);
> +
> /*
> * EDAC RAS feature information structure
> */
> --
> 2.34.1
>
Powered by blists - more mailing lists