[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <50f5bc27-98da-ee3e-59dd-7252c3ed7a0a@amazon.com>
Date: Sun, 8 Sep 2019 10:58:31 +0300
From: "Hawa, Hanna" <hhhawa@...zon.com>
To: Robert Richter <rrichter@...vell.com>
CC: "bp@...en8.de" <bp@...en8.de>,
"mchehab@...nel.org" <mchehab@...nel.org>,
"james.morse@....com" <james.morse@....com>,
"linux-edac@...r.kernel.org" <linux-edac@...r.kernel.org>,
"linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
"dwmw@...zon.co.uk" <dwmw@...zon.co.uk>,
"benh@...zon.com" <benh@...zon.com>,
"ronenk@...zon.com" <ronenk@...zon.com>,
"talel@...zon.com" <talel@...zon.com>,
"jonnyc@...zon.com" <jonnyc@...zon.com>,
"hanochu@...zon.com" <hanochu@...zon.com>
Subject: Re: [PATCH 1/1] edac: Add an API for edac device to report for
multiple errors
On 9/5/2019 12:56 PM, Robert Richter wrote:
> Hi Hanna,
>
> thanks for the update. See below.
>
> On 05.09.19 09:37:45, Hanna Hawa wrote:
>> Add an API for edac device to report multiple errors with same type.
>>
>> Signed-off-by: Hanna Hawa <hhhawa@...zon.com>
>> ---
>> drivers/edac/edac_device.c | 66 +++++++++++++++++++++++++++++---------
>> drivers/edac/edac_device.h | 31 ++++++++++++++++--
>> 2 files changed, 79 insertions(+), 18 deletions(-)
>>
>> diff --git a/drivers/edac/edac_device.c b/drivers/edac/edac_device.c
>> index 65cf2b9355c4..bf6a4fd9831b 100644
>> --- a/drivers/edac/edac_device.c
>> +++ b/drivers/edac/edac_device.c
>> @@ -555,12 +555,15 @@ static inline int edac_device_get_panic_on_ue(struct edac_device_ctl_info
>> return edac_dev->panic_on_ue;
>> }
>>
>> -void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev,
>> - int inst_nr, int block_nr, const char *msg)
>> +static void __edac_device_handle_ce(struct edac_device_ctl_info *edac_dev,
>> + u16 error_count, int inst_nr, int block_nr,
>
> Just curious, why u16, some register mask size? Maybe just use unsigned int?
Wanted to be aligned with edac MC.
I can change it to be u32.
>
> I think the variable can be shortened to 'count', the meaning should
> still be clear.
I think more clear to include 'error'.
maybe shorter name 'err_count'?
>
>> + const char *msg)
>> {
>> struct edac_device_instance *instance;
>> struct edac_device_block *block = NULL;
>>
>> + WARN_ON(!error_count);
>
> Should return in this case.
>
> Better use WARN_ON_ONCE() to avoid flooding.
In case of two drivers using this function with wrong error count, only
the first WARN_ON_ONCE will catch in this case, and other will miss
other wrong usage of other edac device drivers.
>
> The check should be moved to edac_device_handle_ce_count().
I'll move it to edac_device_handle_ce_count.
>
>> +
>> if ((inst_nr >= edac_dev->nr_instances) || (inst_nr < 0)) {
>> edac_device_printk(edac_dev, KERN_ERR,
>> "INTERNAL ERROR: 'instance' out of range "
>> @@ -582,27 +585,44 @@ void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev,
>>
>> if (instance->nr_blocks > 0) {
>> block = instance->blocks + block_nr;
>> - block->counters.ce_count++;
>> + block->counters.ce_count += error_count;
>> }
>>
>> /* Propagate the count up the 'totals' tree */
>> - instance->counters.ce_count++;
>> - edac_dev->counters.ce_count++;
>> + instance->counters.ce_count += error_count;
>> + edac_dev->counters.ce_count += error_count;
>>
>> if (edac_device_get_log_ce(edac_dev))
>> edac_device_printk(edac_dev, KERN_WARNING,
>> - "CE: %s instance: %s block: %s '%s'\n",
>> + "CE: %s instance: %s block: %s count: %d '%s'\n",
>> edac_dev->ctl_name, instance->name,
>> - block ? block->name : "N/A", msg);
>> + block ? block->name : "N/A", error_count, msg);
>> +}
>> +
>> +void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev,
>> + int inst_nr, int block_nr, const char *msg)
>> +{
>> + __edac_device_handle_ce(edac_dev, 1, inst_nr, block_nr, msg);
>> }
>> EXPORT_SYMBOL_GPL(edac_device_handle_ce);
>
> We could just export the __*() version of those functions and make
> everything else inline in the header file? Though, better do this with
> two patches to avoid an ABI breakage in case someone wants to backport
> it. Let's see what others say here.
Waiting for other reviewers.
>
>>
>> -void edac_device_handle_ue(struct edac_device_ctl_info *edac_dev,
>> - int inst_nr, int block_nr, const char *msg)
>> +void edac_device_handle_ce_count(struct edac_device_ctl_info *edac_dev,
>> + u16 error_count, int inst_nr, int block_nr,
>> + const char *msg)
>> +{
>> + __edac_device_handle_ce(edac_dev, error_count, inst_nr, block_nr, msg);
>> +}
>> +EXPORT_SYMBOL_GPL(edac_device_handle_ce_count);
>> +
>> +static void __edac_device_handle_ue(struct edac_device_ctl_info *edac_dev,
>> + u16 error_count, int inst_nr, int block_nr,
>> + const char *msg)
>
> All the above applies for this function too.
>
>> {
>> struct edac_device_instance *instance;
>> struct edac_device_block *block = NULL;
>>
>> + WARN_ON(!error_count);
>> +
>> if ((inst_nr >= edac_dev->nr_instances) || (inst_nr < 0)) {
>> edac_device_printk(edac_dev, KERN_ERR,
>> "INTERNAL ERROR: 'instance' out of range "
>> @@ -624,22 +644,36 @@ void edac_device_handle_ue(struct edac_device_ctl_info *edac_dev,
>>
>> if (instance->nr_blocks > 0) {
>> block = instance->blocks + block_nr;
>> - block->counters.ue_count++;
>> + block->counters.ue_count += error_count;
>> }
>>
>> /* Propagate the count up the 'totals' tree */
>> - instance->counters.ue_count++;
>> - edac_dev->counters.ue_count++;
>> + instance->counters.ue_count += error_count;
>> + edac_dev->counters.ue_count += error_count;
>>
>> if (edac_device_get_log_ue(edac_dev))
>> edac_device_printk(edac_dev, KERN_EMERG,
>> - "UE: %s instance: %s block: %s '%s'\n",
>> + "UE: %s instance: %s block: %s count: %d '%s'\n",
>> edac_dev->ctl_name, instance->name,
>> - block ? block->name : "N/A", msg);
>> + block ? block->name : "N/A", error_count, msg);
>>
>> if (edac_device_get_panic_on_ue(edac_dev))
>> - panic("EDAC %s: UE instance: %s block %s '%s'\n",
>> + panic("EDAC %s: UE instance: %s block %s count: %d '%s'\n",
>> edac_dev->ctl_name, instance->name,
>> - block ? block->name : "N/A", msg);
>> + block ? block->name : "N/A", error_count, msg);
>> +}
>> +
>> +void edac_device_handle_ue(struct edac_device_ctl_info *edac_dev,
>> + int inst_nr, int block_nr, const char *msg)
>> +{
>> + __edac_device_handle_ue(edac_dev, 1, inst_nr, block_nr, msg);
>> }
>> EXPORT_SYMBOL_GPL(edac_device_handle_ue);
>> +
>> +void edac_device_handle_ue_count(struct edac_device_ctl_info *edac_dev,
>> + u16 error_count, int inst_nr, int block_nr,
>> + const char *msg)
>> +{
>> + __edac_device_handle_ue(edac_dev, error_count, inst_nr, block_nr, msg);
>> +}
>> +EXPORT_SYMBOL_GPL(edac_device_handle_ue_count);
>> diff --git a/drivers/edac/edac_device.h b/drivers/edac/edac_device.h
>> index 1aaba74ae411..c8dc83eda64f 100644
>> --- a/drivers/edac/edac_device.h
>> +++ b/drivers/edac/edac_device.h
>> @@ -287,7 +287,7 @@ extern struct edac_device_ctl_info *edac_device_del_device(struct device *dev);
>>
>> /**
>> * edac_device_handle_ue():
>> - * perform a common output and handling of an 'edac_dev' UE event
>> + * perform a common output and handling of an 'edac_dev' single UE event
>> *
>> * @edac_dev: pointer to struct &edac_device_ctl_info
>> * @inst_nr: number of the instance where the UE error happened
>> @@ -298,7 +298,7 @@ extern void edac_device_handle_ue(struct edac_device_ctl_info *edac_dev,
>> int inst_nr, int block_nr, const char *msg);
>> /**
>> * edac_device_handle_ce():
>> - * perform a common output and handling of an 'edac_dev' CE event
>> + * perform a common output and handling of an 'edac_dev' single CE event
>> *
>> * @edac_dev: pointer to struct &edac_device_ctl_info
>> * @inst_nr: number of the instance where the CE error happened
>> @@ -308,6 +308,33 @@ extern void edac_device_handle_ue(struct edac_device_ctl_info *edac_dev,
>> extern void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev,
>> int inst_nr, int block_nr, const char *msg);
>>
>> +/**
>> + * edac_device_handle_ue_count():
>> + * perform a common output and handling of an 'edac_dev'
>> + *
>> + * @edac_dev: pointer to struct &edac_device_ctl_info
>> + * @error_count: number of errors of the same type
>> + * @inst_nr: number of the instance where the UE error happened
>> + * @block_nr: number of the block where the UE error happened
>> + * @msg: message to be printed
>> + */
>> +extern void edac_device_handle_ue_count(struct edac_device_ctl_info *edac_dev,
>> + u16 error_count, int inst_nr,
>> + int block_nr, const char *msg);
>> +/**
>> + * edac_device_handle_ce_count():
>> + * perform a common output and handling of an 'edac_dev'
>> + *
>> + * @edac_dev: pointer to struct &edac_device_ctl_info
>> + * @error_count: number of errors of the same type
>> + * @inst_nr: number of the instance where the CE error happened
>> + * @block_nr: number of the block where the CE error happened
>> + * @msg: message to be printed
>> + */
>> +extern void edac_device_handle_ce_count(struct edac_device_ctl_info *edac_dev,
>> + u16 error_count, int inst_nr,
>> + int block_nr, const char *msg);
>> +
>
> Looks otherwise good to me.
Thanks!
Thanks,
Hanna
>
> Thanks,
>
> -Robert
>
>> /**
>> * edac_device_alloc_index: Allocate a unique device index number
>> *
>> --
>> 2.17.1
>>
Powered by blists - more mailing lists