[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <4c292a95-dec7-4407-98a2-194c73e19a2b@os.amperecomputing.com>
Date: Tue, 21 Jan 2025 15:01:28 -0800
From: Daniel Ferguson <danielf@...amperecomputing.com>
To: shiju.jose@...wei.com, linux-edac@...r.kernel.org,
linux-cxl@...r.kernel.org, linux-acpi@...r.kernel.org, linux-mm@...ck.org,
linux-kernel@...r.kernel.org
Cc: bp@...en8.de, tony.luck@...el.com, rafael@...nel.org, lenb@...nel.org,
mchehab@...nel.org, dan.j.williams@...el.com, dave@...olabs.net,
jonathan.cameron@...wei.com, dave.jiang@...el.com,
alison.schofield@...el.com, vishal.l.verma@...el.com, ira.weiny@...el.com,
david@...hat.com, Vilas.Sridharan@....com, leo.duran@....com,
Yazen.Ghannam@....com, rientjes@...gle.com, jiaqiyan@...gle.com,
Jon.Grimm@....com, dave.hansen@...ux.intel.com, naoya.horiguchi@....com,
james.morse@....com, jthoughton@...gle.com, somasundaram.a@....com,
erdemaktas@...gle.com, pgonda@...gle.com, duenwen@...gle.com,
gthelen@...gle.com, wschwartz@...erecomputing.com,
dferguson@...erecomputing.com, wbs@...amperecomputing.com,
nifan.cxl@...il.com, tanxiaofei@...wei.com, prime.zeng@...ilicon.com,
roberto.sassu@...wei.com, kangkang.shen@...urewei.com,
wanghuiqiang@...wei.com, linuxarm@...wei.com
Subject: Re: [PATCH v18 06/19] ras: mem: Add memory ACPI RAS2 driver
On 1/6/2025 4:10 AM, shiju.jose@...wei.com wrote:
> +#define pr_fmt(fmt) "MEMORY ACPI RAS2: " fmt
> +
> +#include <linux/bitfield.h>
> +#include <linux/edac.h>
> +#include <linux/platform_device.h>
> +#include <acpi/ras2_acpi.h>
> +
> +#define RAS2_DEV_NUM_RAS_FEATURES 1
> +
> +#define RAS2_SUPPORT_HW_PARTOL_SCRUB BIT(0)
> +#define RAS2_TYPE_PATROL_SCRUB 0x0000
> +
> +#define RAS2_GET_PATROL_PARAMETERS 0x01
> +#define RAS2_START_PATROL_SCRUBBER 0x02
> +#define RAS2_STOP_PATROL_SCRUBBER 0x03
> +
> +#define RAS2_PATROL_SCRUB_SCHRS_IN_MASK GENMASK(15, 8)
> +#define RAS2_PATROL_SCRUB_EN_BACKGROUND BIT(0)
> +#define RAS2_PATROL_SCRUB_SCHRS_OUT_MASK GENMASK(7, 0)
> +#define RAS2_PATROL_SCRUB_MIN_SCHRS_OUT_MASK GENMASK(15, 8)
> +#define RAS2_PATROL_SCRUB_MAX_SCHRS_OUT_MASK GENMASK(23, 16)
> +#define RAS2_PATROL_SCRUB_FLAG_SCRUBBER_RUNNING BIT(0)
> +
> +#define RAS2_SCRUB_NAME_LEN 128
> +#define RAS2_HOUR_IN_SECS 3600
> +
> +struct acpi_ras2_ps_shared_mem {
> + struct acpi_ras2_shared_memory common;
> + struct acpi_ras2_patrol_scrub_parameter params;
> +};
> +
If the ACPI change here [1] comes to fruition, then error checking
will (or at least may) need to be done by each individual feature.
To show how that might look, I've included a possible implementation
below.
/* Translate a RAS2 capability status code into a negative errno (0 on success). */
static int ras2_scrub_map_status_to_error(u32 cap_status)
{
	if (cap_status == ACPI_RAS2_BUSY)
		return -EBUSY;

	/* Feature absent or not usable on this platform. */
	if (cap_status == ACPI_RAS2_NOT_VALID ||
	    cap_status == ACPI_RAS2_NOT_SUPPORTED)
		return -EPERM;

	/* Command failed, was aborted, or carried invalid data. */
	if (cap_status == ACPI_RAS2_FAILED ||
	    cap_status == ACPI_RAS2_ABORTED ||
	    cap_status == ACPI_RAS2_INVALID_DATA)
		return -EINVAL;

	/* 0 or any other value: success. */
	return 0;
}
[1] https://github.com/tianocore/edk2/issues/10540
> +static int ras2_is_patrol_scrub_support(struct ras2_mem_ctx *ras2_ctx)
> +{
> + struct acpi_ras2_shared_memory __iomem *common = (void *)
> + ras2_ctx->pcc_comm_addr;
> +
> + guard(mutex)(&ras2_ctx->lock);
> + common->set_capabilities[0] = 0;
> +
> + return common->features[0] & RAS2_SUPPORT_HW_PARTOL_SCRUB;
> +}
> +
> +static int ras2_update_patrol_scrub_params_cache(struct ras2_mem_ctx *ras2_ctx)
> +{
> + struct acpi_ras2_ps_shared_mem __iomem *ps_sm = (void *)
> + ras2_ctx->pcc_comm_addr;
> + int ret;
> +
> + ps_sm->common.set_capabilities[0] = RAS2_SUPPORT_HW_PARTOL_SCRUB;
> + ps_sm->params.patrol_scrub_command = RAS2_GET_PATROL_PARAMETERS;
> +
> + ret = ras2_send_pcc_cmd(ras2_ctx, RAS2_PCC_CMD_EXEC);
> + if (ret) {
> + dev_err(ras2_ctx->dev, "failed to read parameters\n");
> + return ret;
> + }
ret = ras2_scrub_map_status_to_error(ps_sm->params.status);
if (ret != 0)
return ret;
> +
> + ras2_ctx->min_scrub_cycle = FIELD_GET(RAS2_PATROL_SCRUB_MIN_SCHRS_OUT_MASK,
> + ps_sm->params.scrub_params_out);
> + ras2_ctx->max_scrub_cycle = FIELD_GET(RAS2_PATROL_SCRUB_MAX_SCHRS_OUT_MASK,
> + ps_sm->params.scrub_params_out);
> + if (!ras2_ctx->bg) {
> + ras2_ctx->base = ps_sm->params.actual_address_range[0];
> + ras2_ctx->size = ps_sm->params.actual_address_range[1];
> + }
> + ras2_ctx->scrub_cycle_hrs = FIELD_GET(RAS2_PATROL_SCRUB_SCHRS_OUT_MASK,
> + ps_sm->params.scrub_params_out);
> +
> + return 0;
> +}
> +
> +/* Context - lock must be held */
> +static int ras2_get_patrol_scrub_running(struct ras2_mem_ctx *ras2_ctx,
> + bool *running)
> +{
> + struct acpi_ras2_ps_shared_mem __iomem *ps_sm = (void *)
> + ras2_ctx->pcc_comm_addr;
> + int ret;
> +
> + ps_sm->common.set_capabilities[0] = RAS2_SUPPORT_HW_PARTOL_SCRUB;
> + ps_sm->params.patrol_scrub_command = RAS2_GET_PATROL_PARAMETERS;
> +
> + ret = ras2_send_pcc_cmd(ras2_ctx, RAS2_PCC_CMD_EXEC);
> + if (ret) {
> + dev_err(ras2_ctx->dev, "failed to read parameters\n");
> + return ret;
> + }
ret = ras2_scrub_map_status_to_error(ps_sm->params.status);
if (ret != 0)
return ret;
> +
> + *running = ps_sm->params.flags & RAS2_PATROL_SCRUB_FLAG_SCRUBBER_RUNNING;
> +
> + return 0;
> +}
> +
> +static int ras2_hw_scrub_read_min_scrub_cycle(struct device *dev, void *drv_data,
> + u32 *min)
> +{
> + struct ras2_mem_ctx *ras2_ctx = drv_data;
> +
> + *min = ras2_ctx->min_scrub_cycle * RAS2_HOUR_IN_SECS;
> +
> + return 0;
> +}
> +
> +static int ras2_hw_scrub_read_max_scrub_cycle(struct device *dev, void *drv_data,
> + u32 *max)
> +{
> + struct ras2_mem_ctx *ras2_ctx = drv_data;
> +
> + *max = ras2_ctx->max_scrub_cycle * RAS2_HOUR_IN_SECS;
> +
> + return 0;
> +}
> +
> +static int ras2_hw_scrub_cycle_read(struct device *dev, void *drv_data,
> + u32 *scrub_cycle_secs)
> +{
> + struct ras2_mem_ctx *ras2_ctx = drv_data;
> +
> + *scrub_cycle_secs = ras2_ctx->scrub_cycle_hrs * RAS2_HOUR_IN_SECS;
> +
> + return 0;
> +}
> +
> +static int ras2_hw_scrub_cycle_write(struct device *dev, void *drv_data,
> + u32 scrub_cycle_secs)
> +{
> + u8 scrub_cycle_hrs = scrub_cycle_secs / RAS2_HOUR_IN_SECS;
> + struct ras2_mem_ctx *ras2_ctx = drv_data;
> + bool running;
> + int ret;
> +
> + guard(mutex)(&ras2_ctx->lock);
> + ret = ras2_get_patrol_scrub_running(ras2_ctx, &running);
> + if (ret)
> + return ret;
> +
> + if (running)
> + return -EBUSY;
> +
> + if (scrub_cycle_hrs < ras2_ctx->min_scrub_cycle ||
> + scrub_cycle_hrs > ras2_ctx->max_scrub_cycle)
> + return -EINVAL;
> +
> + ras2_ctx->scrub_cycle_hrs = scrub_cycle_hrs;
> +
> + return 0;
> +}
> +
> +static int ras2_hw_scrub_read_addr(struct device *dev, void *drv_data, u64 *base)
> +{
> + struct ras2_mem_ctx *ras2_ctx = drv_data;
> + int ret;
> +
> + /*
> + * When BG scrubbing is enabled the actual address range is not valid.
> + * Return -EBUSY now unless find out a method to retrieve actual full PA range.
> + */
> + if (ras2_ctx->bg)
> + return -EBUSY;
> +
> + /*
> + * When demand scrubbing is finished firmware must reset actual
> + * address range to 0. Otherwise userspace assumes demand scrubbing
> + * is in progress.
> + */
> + ret = ras2_update_patrol_scrub_params_cache(ras2_ctx);
> + if (ret)
> + return ret;
> + *base = ras2_ctx->base;
> +
> + return 0;
> +}
> +
> +static int ras2_hw_scrub_read_size(struct device *dev, void *drv_data, u64 *size)
> +{
> + struct ras2_mem_ctx *ras2_ctx = drv_data;
> + int ret;
> +
> + if (ras2_ctx->bg)
> + return -EBUSY;
> +
> + ret = ras2_update_patrol_scrub_params_cache(ras2_ctx);
> + if (ret)
> + return ret;
> + *size = ras2_ctx->size;
> +
> + return 0;
> +}
> +
> +static int ras2_hw_scrub_write_addr(struct device *dev, void *drv_data, u64 base)
> +{
> + struct ras2_mem_ctx *ras2_ctx = drv_data;
> + struct acpi_ras2_ps_shared_mem __iomem *ps_sm = (void *)
> + ras2_ctx->pcc_comm_addr;
> + bool running;
> + int ret;
> +
> + guard(mutex)(&ras2_ctx->lock);
> + ps_sm->common.set_capabilities[0] = RAS2_SUPPORT_HW_PARTOL_SCRUB;
> + if (ras2_ctx->bg)
> + return -EBUSY;
> +
> + if (!base || !ras2_ctx->size) {
> + dev_warn(ras2_ctx->dev,
> + "%s: Invalid address range, base=0x%llx "
> + "size=0x%llx\n", __func__,
> + base, ras2_ctx->size);
> + return -ERANGE;
> + }
> +
> + ret = ras2_get_patrol_scrub_running(ras2_ctx, &running);
> + if (ret)
> + return ret;
> +
> + if (running)
> + return -EBUSY;
> +
> + ps_sm->params.scrub_params_in &= ~RAS2_PATROL_SCRUB_SCHRS_IN_MASK;
> + ps_sm->params.scrub_params_in |= FIELD_PREP(RAS2_PATROL_SCRUB_SCHRS_IN_MASK,
> + ras2_ctx->scrub_cycle_hrs);
> + ps_sm->params.requested_address_range[0] = base;
> + ps_sm->params.requested_address_range[1] = ras2_ctx->size;
> + ps_sm->params.scrub_params_in &= ~RAS2_PATROL_SCRUB_EN_BACKGROUND;
> + ps_sm->params.patrol_scrub_command = RAS2_START_PATROL_SCRUBBER;
> +
> + ret = ras2_send_pcc_cmd(ras2_ctx, RAS2_PCC_CMD_EXEC);
> + if (ret) {
> + dev_err(ras2_ctx->dev, "Failed to start demand scrubbing\n");
> + return ret;
> + }
ret = ras2_scrub_map_status_to_error(ps_sm->params.status);
if (ret != 0)
return ret;
> +
> + return ras2_update_patrol_scrub_params_cache(ras2_ctx);
> +}
> +
> +static int ras2_hw_scrub_write_size(struct device *dev, void *drv_data, u64 size)
> +{
> + struct ras2_mem_ctx *ras2_ctx = drv_data;
> + bool running;
> + int ret;
> +
> + guard(mutex)(&ras2_ctx->lock);
> + ret = ras2_get_patrol_scrub_running(ras2_ctx, &running);
> + if (ret)
> + return ret;
> +
> + if (running)
> + return -EBUSY;
> +
> + if (!size) {
> + dev_warn(dev, "%s: Invalid address range size=0x%llx\n",
> + __func__, size);
> + return -EINVAL;
> + }
> +
> + ras2_ctx->size = size;
> +
> + return 0;
> +}
> +
> +static int ras2_hw_scrub_set_enabled_bg(struct device *dev, void *drv_data, bool enable)
> +{
> + struct ras2_mem_ctx *ras2_ctx = drv_data;
> + struct acpi_ras2_ps_shared_mem __iomem *ps_sm = (void *)
> + ras2_ctx->pcc_comm_addr;
> + bool running;
> + int ret;
> +
> + guard(mutex)(&ras2_ctx->lock);
> + ps_sm->common.set_capabilities[0] = RAS2_SUPPORT_HW_PARTOL_SCRUB;
> + ret = ras2_get_patrol_scrub_running(ras2_ctx, &running);
> + if (ret)
> + return ret;
> + if (enable) {
> + if (ras2_ctx->bg || running)
> + return -EBUSY;
> + ps_sm->params.requested_address_range[0] = 0;
> + ps_sm->params.requested_address_range[1] = 0;
> + ps_sm->params.scrub_params_in &= ~RAS2_PATROL_SCRUB_SCHRS_IN_MASK;
> + ps_sm->params.scrub_params_in |= FIELD_PREP(RAS2_PATROL_SCRUB_SCHRS_IN_MASK,
> + ras2_ctx->scrub_cycle_hrs);
> + ps_sm->params.patrol_scrub_command = RAS2_START_PATROL_SCRUBBER;
> + } else {
> + if (!ras2_ctx->bg)
> + return -EPERM;
> + if (!ras2_ctx->bg && running)
> + return -EBUSY;
> + ps_sm->params.patrol_scrub_command = RAS2_STOP_PATROL_SCRUBBER;
> + }
> + ps_sm->params.scrub_params_in &= ~RAS2_PATROL_SCRUB_EN_BACKGROUND;
> + ps_sm->params.scrub_params_in |= FIELD_PREP(RAS2_PATROL_SCRUB_EN_BACKGROUND,
> + enable);
> + ret = ras2_send_pcc_cmd(ras2_ctx, RAS2_PCC_CMD_EXEC);
> + if (ret) {
> + dev_err(ras2_ctx->dev, "Failed to %s background scrubbing\n",
> + enable ? "enable" : "disable");
> + return ret;
> + }
ret = ras2_scrub_map_status_to_error(ps_sm->params.status);
if (ret != 0)
return ret;
> + if (enable) {
> + ras2_ctx->bg = true;
> + /* Update the cache to account for rounding of supplied parameters and similar */
> + ret = ras2_update_patrol_scrub_params_cache(ras2_ctx);
> + } else {
> + ret = ras2_update_patrol_scrub_params_cache(ras2_ctx);
> + ras2_ctx->bg = false;
> + }
> +
> + return ret;
> +}
Powered by blists - more mailing lists