linux-kernel - RE: [PATCH v5 1/5] x86/mce: Add wrapper for struct mce to export vendor specific info

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CY8PR11MB71346A3B771899EC335E444489442@CY8PR11MB7134.namprd11.prod.outlook.com>
Date: Mon, 14 Oct 2024 06:04:34 +0000
From: "Zhuo, Qiuxu" <qiuxu.zhuo@...el.com>
To: Avadhut Naik <avadhut.naik@....com>, "x86@...nel.org" <x86@...nel.org>,
	"linux-edac@...r.kernel.org" <linux-edac@...r.kernel.org>,
	"linux-trace-kernel@...r.kernel.org" <linux-trace-kernel@...r.kernel.org>,
	"linux-acpi@...r.kernel.org" <linux-acpi@...r.kernel.org>
CC: "linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
	"bp@...en8.de" <bp@...en8.de>, "Luck, Tony" <tony.luck@...el.com>,
	"rafael@...nel.org" <rafael@...nel.org>, "tglx@...utronix.de"
	<tglx@...utronix.de>, "mingo@...hat.com" <mingo@...hat.com>,
	"rostedt@...dmis.org" <rostedt@...dmis.org>, "lenb@...nel.org"
	<lenb@...nel.org>, "mchehab@...nel.org" <mchehab@...nel.org>,
	"james.morse@....com" <james.morse@....com>, "airlied@...il.com"
	<airlied@...il.com>, "yazen.ghannam@....com" <yazen.ghannam@....com>,
	"john.allen@....com" <john.allen@....com>, "avadnaik@....com"
	<avadnaik@....com>
Subject: RE: [PATCH v5 1/5] x86/mce: Add wrapper for struct mce to export
 vendor specific info

> From: Avadhut Naik <avadhut.naik@....com>
> [...]
> Subject: [PATCH v5 1/5] x86/mce: Add wrapper for struct mce to export vendor
> [...]
> --- a/arch/x86/include/asm/mce.h
> +++ b/arch/x86/include/asm/mce.h
> @@ -187,6 +187,14 @@ enum mce_notifier_prios {
>  	MCE_PRIO_HIGHEST = MCE_PRIO_CEC
>  };
> 
> +/**
> + * struct mce_hw_err - Hardware Error Record.
> + * @m:		Machine Check record.
> + */
> +struct mce_hw_err {
> +	struct mce m;
> +};
> +

Just some thoughts: 

I noticed that many places only care about the 'mce' structure instead of the new 'mce_hw_err', 
such as the EDAC drivers in the notifier chains. 

Could we still pass these users with the 'mce' structure instead of 'mce_hw_err', 
since they only care about 'mce'?  However, they still have the opportunity to
get 'mce_hw_err' from 'mce' on demand with the following macro. [1]

   #define  to_mce_hw_err(mce) container_of(mce, struct mce_hw_err, m) 

So, for those codes that need 'mce_hw_err', they can retrieve it from 'mce'
on demand by:

    struct mce_hw_err *err = to_mce_hw_err(mce); 

For those codes that don't need 'mce_hw_err', there are 
no changes to them. I.e., they do not need to perform the extra operations 
like below over and over:

   struct mce *mce = &err->m;

> [...]

>  static int mce_early_notifier(struct notifier_block *nb, unsigned long val,
>  			      void *data)
>  {
> -	struct mce *m = (struct mce *)data;
> +	struct mce_hw_err *err = (struct mce_hw_err *)data;

If using [1], then it can be:
  
    struct mce_hw_err *err = to_mce_hw_err(data);

> -	if (!m)
> +	if (!err)
>  		return NOTIFY_DONE;
> 
>  	/* Emit the trace record: */
> -	trace_mce_record(m);
> +	trace_mce_record(err);
> 
>  	set_bit(0, &mce_need_notify);
> 
> @@ -597,7 +606,8 @@ static struct notifier_block early_nb = {  static int
> uc_decode_notifier(struct notifier_block *nb, unsigned long val,
>  			      void *data)
>  {
> -	struct mce *mce = (struct mce *)data;
> +	struct mce_hw_err *err = (struct mce_hw_err *)data;
> +	struct mce *mce = &err->m;

If using [1], then there is no need for these changes.

> [...]
> diff --git a/arch/x86/kernel/cpu/mce/dev-mcelog.c
> b/arch/x86/kernel/cpu/mce/dev-mcelog.c
> index af44fd5dbd7c..d991ef370efd 100644
> --- a/arch/x86/kernel/cpu/mce/dev-mcelog.c
> +++ b/arch/x86/kernel/cpu/mce/dev-mcelog.c
> @@ -36,7 +36,7 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
>  static int dev_mce_log(struct notifier_block *nb, unsigned long val,
>  				void *data)
>  {
> -	struct mce *mce = (struct mce *)data;
> +	struct mce *mce = &((struct mce_hw_err *)data)->m;
>  	unsigned int entry;
> 
>  	if (mce->kflags & MCE_HANDLED_CEC)

If using [1], then there is no need for these changes.

> [...]
> @@ -83,8 +83,8 @@ void mce_gen_pool_process(struct work_struct
> *__unused)
> 
>  	head = llist_reverse_order(head);
>  	llist_for_each_entry_safe(node, tmp, head, llnode) {
> -		mce = &node->mce;
> -		blocking_notifier_call_chain(&x86_mce_decoder_chain, 0,
> mce);
> +		err = &node->err;
> +		blocking_notifier_call_chain(&x86_mce_decoder_chain, 0,
> err);

If using [1], then continue to pass 'mce' as before:

     mce = &node->err.m;
     blocking_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);

> [...]
> diff --git a/drivers/acpi/acpi_extlog.c b/drivers/acpi/acpi_extlog.c index
> ca87a0939135..4864191918db 100644
> --- a/drivers/acpi/acpi_extlog.c
> +++ b/drivers/acpi/acpi_extlog.c
> @@ -134,7 +134,7 @@ static int print_extlog_rcd(const char *pfx,  static int
> extlog_print(struct notifier_block *nb, unsigned long val,
>  			void *data)
>  {
> -	struct mce *mce = (struct mce *)data;
> +	struct mce *mce = &((struct mce_hw_err *)data)->m;
>  	int	bank = mce->bank;
>  	int	cpu = mce->extcpu;
>  	struct acpi_hest_generic_status *estatus, *tmp; diff --git

If using [1], then there is no need for these changes.

> a/drivers/acpi/nfit/mce.c b/drivers/acpi/nfit/mce.c index
> d48a388b796e..b917988db794 100644
> --- a/drivers/acpi/nfit/mce.c
> +++ b/drivers/acpi/nfit/mce.c
> @@ -13,7 +13,7 @@
>  static int nfit_handle_mce(struct notifier_block *nb, unsigned long val,
>  			void *data)
>  {
> -	struct mce *mce = (struct mce *)data;
> +	struct mce *mce = &((struct mce_hw_err *)data)->m;
>  	struct acpi_nfit_desc *acpi_desc;
>  	struct nfit_spa *nfit_spa;
> 

If using [1], then there is no need for these changes.

> diff --git a/drivers/edac/i7core_edac.c b/drivers/edac/i7core_edac.c index
> 91e0a88ef904..d1e47cba0ff2 100644
> --- a/drivers/edac/i7core_edac.c
> +++ b/drivers/edac/i7core_edac.c
> @@ -1810,7 +1810,7 @@ static void i7core_check_error(struct mem_ctl_info
> *mci, struct mce *m)  static int i7core_mce_check_error(struct notifier_block
> *nb, unsigned long val,
>  				  void *data)
>  {
> -	struct mce *mce = (struct mce *)data;
> +	struct mce *mce = &((struct mce_hw_err *)data)->m;
>  	struct i7core_dev *i7_dev;
>  	struct mem_ctl_info *mci;
> 

If using [1], then there is no need for these changes.

> diff --git a/drivers/edac/igen6_edac.c b/drivers/edac/igen6_edac.c index
> 189a2fc29e74..091126a42884 100644
> --- a/drivers/edac/igen6_edac.c
> +++ b/drivers/edac/igen6_edac.c
> @@ -919,7 +919,7 @@ static int ecclog_nmi_handler(unsigned int cmd, struct
> pt_regs *regs)  static int ecclog_mce_handler(struct notifier_block *nb,
> unsigned long val,
>  			      void *data)
>  {
> -	struct mce *mce = (struct mce *)data;
> +	struct mce *mce = &((struct mce_hw_err *)data)->m;
>  	char *type;
> 
>  	if (mce->kflags & MCE_HANDLED_CEC)

If using [1], then there is no need for these changes.

> diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index
> 8130c3dc64da..c5fae99de781 100644
> --- a/drivers/edac/mce_amd.c
> +++ b/drivers/edac/mce_amd.c
> @@ -792,7 +792,7 @@ static const char *decode_error_status(struct mce *m)
> static int  amd_decode_mce(struct notifier_block *nb, unsigned long val, void
> *data)  {
> -	struct mce *m = (struct mce *)data;
> +	struct mce *m = &((struct mce_hw_err *)data)->m;
>  	unsigned int fam = x86_family(m->cpuid);
>  	int ecc;
> 

If using [1], then there is no need for these changes.

> diff --git a/drivers/edac/pnd2_edac.c b/drivers/edac/pnd2_edac.c index
> f93f2f2b1cf2..a3008f6eb2b1 100644
> --- a/drivers/edac/pnd2_edac.c
> +++ b/drivers/edac/pnd2_edac.c
> @@ -1366,7 +1366,7 @@ static void pnd2_unregister_mci(struct mem_ctl_info
> *mci)
>   */
>  static int pnd2_mce_check_error(struct notifier_block *nb, unsigned long val,
> void *data)  {
> -	struct mce *mce = (struct mce *)data;
> +	struct mce *mce = &((struct mce_hw_err *)data)->m;
>  	struct mem_ctl_info *mci;
>  	struct dram_addr daddr;
>  	char *type;

If using [1], then there is no need for these changes.

> diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index
> d5f12219598a..f771c36540cf 100644
> --- a/drivers/edac/sb_edac.c
> +++ b/drivers/edac/sb_edac.c
> @@ -3256,7 +3256,7 @@ static void sbridge_mce_output_error(struct
> mem_ctl_info *mci,  static int sbridge_mce_check_error(struct notifier_block
> *nb, unsigned long val,
>  				   void *data)
>  {
> -	struct mce *mce = (struct mce *)data;
> +	struct mce *mce = &((struct mce_hw_err *)data)->m;
>  	struct mem_ctl_info *mci;
>  	char *type;
> 

If using [1], then there is no need for these changes.

> diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index
> 85713646957b..5fa3a9038a77 100644
> --- a/drivers/edac/skx_common.c
> +++ b/drivers/edac/skx_common.c
> @@ -644,7 +644,7 @@ static bool skx_error_in_mem(const struct mce *m)  int
> skx_mce_check_error(struct notifier_block *nb, unsigned long val,
>  			void *data)
>  {
> -	struct mce *mce = (struct mce *)data;
> +	struct mce *mce = &((struct mce_hw_err *)data)->m;
>  	struct decoded_addr res;
>  	struct mem_ctl_info *mci;
>  	char *type;

If using [1], then there is no need for these changes.

> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 1a1395c5fff1..88648a89fbdf 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -4160,7 +4160,7 @@ static struct amdgpu_device *find_adev(uint32_t
> node_id)  static int amdgpu_bad_page_notifier(struct notifier_block *nb,
>  				    unsigned long val, void *data)
>  {
> -	struct mce *m = (struct mce *)data;
> +	struct mce *m = &((struct mce_hw_err *)data)->m;
>  	struct amdgpu_device *adev = NULL;
>  	uint32_t gpu_id = 0;
>  	uint32_t umc_inst = 0, ch_inst = 0;

If using [1], then there is no need for these changes.

> diff --git a/drivers/ras/amd/fmpm.c b/drivers/ras/amd/fmpm.c index
> 90de737fbc90..78dd4b192992 100644
> --- a/drivers/ras/amd/fmpm.c
> +++ b/drivers/ras/amd/fmpm.c
> @@ -400,7 +400,7 @@ static void retire_dram_row(u64 addr, u64 id, u32 cpu)
> 
>  static int fru_handle_mem_poison(struct notifier_block *nb, unsigned long
> val, void *data)  {
> -	struct mce *m = (struct mce *)data;
> +	struct mce *m = &((struct mce_hw_err *)data)->m;
>  	struct fru_rec *rec;
> 
>  	if (!mce_is_memory_error(m))

If using [1], then there is no need for these changes.

> diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c index
> e440b15fbabc..be785746f587 100644
> --- a/drivers/ras/cec.c
> +++ b/drivers/ras/cec.c
> @@ -534,7 +534,7 @@ static int __init create_debugfs_nodes(void)  static int
> cec_notifier(struct notifier_block *nb, unsigned long val,
>  			void *data)
>  {
> -	struct mce *m = (struct mce *)data;
> +	struct mce *m = &((struct mce_hw_err *)data)->m;
> 
>  	if (!m)
>  		return NOTIFY_DONE;

If using [1], then there is no need for these changes.

> [...]