lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <7f08c03f-a618-4ea4-ab57-f7078afe49c9@amd.com>
Date: Wed, 16 Jul 2025 17:07:25 -0500
From: "Kalra, Ashish" <ashish.kalra@....com>
To: Vasant Hegde <vasant.hegde@....com>, joro@...tes.org,
 suravee.suthikulpanit@....com, thomas.lendacky@....com,
 Sairaj.ArunKodilkar@....com, herbert@...dor.apana.org.au
Cc: seanjc@...gle.com, pbonzini@...hat.com, will@...nel.org,
 robin.murphy@....com, john.allen@....com, davem@...emloft.net, bp@...en8.de,
 michael.roth@....com, iommu@...ts.linux.dev, linux-kernel@...r.kernel.org,
 linux-crypto@...r.kernel.org, kvm@...r.kernel.org
Subject: Re: [PATCH v3 2/4] iommu/amd: Reuse device table for kdump

Hello Vasant,

On 7/16/2025 4:42 AM, Vasant Hegde wrote:
> 
> 
> On 7/16/2025 12:57 AM, Ashish Kalra wrote:
>> From: Ashish Kalra <ashish.kalra@....com>
>>
>> After a panic if SNP is enabled in the previous kernel then the kdump
>> kernel boots with IOMMU SNP enforcement still enabled.
>>
>> IOMMU device table register is locked and exclusive to the previous
>> kernel. Attempts to copy old device table from the previous kernel
>> fails in kdump kernel as hardware ignores writes to the locked device
>> table base address register as per AMD IOMMU spec Section 2.12.2.1.
>>
>> This results in repeated "Completion-Wait loop timed out" errors and a
>> second kernel panic: "Kernel panic - not syncing: timer doesn't work
>> through Interrupt-remapped IO-APIC".
>>
>> Reuse device table instead of copying device table in case of kdump
>> boot and remove all copying device table code.
>>
>> Signed-off-by: Ashish Kalra <ashish.kalra@....com>
>> ---
>>  drivers/iommu/amd/init.c | 97 ++++++++++++----------------------------
>>  1 file changed, 28 insertions(+), 69 deletions(-)
>>
>> diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c
>> index 32295f26be1b..18bd869a82d9 100644
>> --- a/drivers/iommu/amd/init.c
>> +++ b/drivers/iommu/amd/init.c
>> @@ -406,6 +406,9 @@ static void iommu_set_device_table(struct amd_iommu *iommu)
>>  
>>  	BUG_ON(iommu->mmio_base == NULL);
>>  
>> +	if (is_kdump_kernel())
> 
> This is fine.. but its becoming too many places with kdump check! I don't know
> what is the better way here.
> Is it worth to keep it like this -OR- add say iommu ops that way during init we
> check is_kdump_kernel() and adjust the ops ?
> 
> @Joerg, any preference?
> 
> 
>> +		return;
>> +
>>  	entry = iommu_virt_to_phys(dev_table);
>>  	entry |= (dev_table_size >> 12) - 1;
>>  	memcpy_toio(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET,
>> @@ -646,7 +649,10 @@ static inline int __init alloc_dev_table(struct amd_iommu_pci_seg *pci_seg)
>>  
>>  static inline void free_dev_table(struct amd_iommu_pci_seg *pci_seg)
>>  {
>> -	iommu_free_pages(pci_seg->dev_table);
>> +	if (is_kdump_kernel())
>> +		memunmap((void *)pci_seg->dev_table);
>> +	else
>> +		iommu_free_pages(pci_seg->dev_table);
>>  	pci_seg->dev_table = NULL;
>>  }
>>  
>> @@ -1128,15 +1134,12 @@ static void set_dte_bit(struct dev_table_entry *dte, u8 bit)
>>  	dte->data[i] |= (1UL << _bit);
>>  }
>>  
>> -static bool __copy_device_table(struct amd_iommu *iommu)
>> +static bool __reuse_device_table(struct amd_iommu *iommu)
>>  {
>> -	u64 int_ctl, int_tab_len, entry = 0;
>>  	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
>> -	struct dev_table_entry *old_devtb = NULL;
>> -	u32 lo, hi, devid, old_devtb_size;
>> +	u32 lo, hi, old_devtb_size;
>>  	phys_addr_t old_devtb_phys;
>> -	u16 dom_id, dte_v, irq_v;
>> -	u64 tmp;
>> +	u64 entry;
>>  
>>  	/* Each IOMMU use separate device table with the same size */
>>  	lo = readl(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET);
>> @@ -1161,66 +1164,22 @@ static bool __copy_device_table(struct amd_iommu *iommu)
>>  		pr_err("The address of old device table is above 4G, not trustworthy!\n");
>>  		return false;
>>  	}
>> -	old_devtb = (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT) && is_kdump_kernel())
>> -		    ? (__force void *)ioremap_encrypted(old_devtb_phys,
>> -							pci_seg->dev_table_size)
>> -		    : memremap(old_devtb_phys, pci_seg->dev_table_size, MEMREMAP_WB);
>> -
>> -	if (!old_devtb)
>> -		return false;
>>  
>> -	pci_seg->old_dev_tbl_cpy = iommu_alloc_pages_sz(
>> -		GFP_KERNEL | GFP_DMA32, pci_seg->dev_table_size);
>> +	/*
>> +	 * IOMMU Device Table Base Address MMIO register is locked
>> +	 * if SNP is enabled during kdump, reuse the previous kernel's
>> +	 * device table.
>> +	 */
>> +	pci_seg->old_dev_tbl_cpy = iommu_memremap(old_devtb_phys, pci_seg->dev_table_size);
>>  	if (pci_seg->old_dev_tbl_cpy == NULL) {
>> -		pr_err("Failed to allocate memory for copying old device table!\n");
>> -		memunmap(old_devtb);
>> +		pr_err("Failed to remap memory for reusing old device table!\n");
>>  		return false;
>>  	}
>>  
>> -	for (devid = 0; devid <= pci_seg->last_bdf; ++devid) {
>> -		pci_seg->old_dev_tbl_cpy[devid] = old_devtb[devid];
>> -		dom_id = old_devtb[devid].data[1] & DEV_DOMID_MASK;
>> -		dte_v = old_devtb[devid].data[0] & DTE_FLAG_V;
>> -
>> -		if (dte_v && dom_id) {
>> -			pci_seg->old_dev_tbl_cpy[devid].data[0] = old_devtb[devid].data[0];
>> -			pci_seg->old_dev_tbl_cpy[devid].data[1] = old_devtb[devid].data[1];
>> -			/* Reserve the Domain IDs used by previous kernel */
>> -			if (ida_alloc_range(&pdom_ids, dom_id, dom_id, GFP_ATOMIC) != dom_id) {
>> -				pr_err("Failed to reserve domain ID 0x%x\n", dom_id);
>> -				memunmap(old_devtb);
>> -				return false;
>> -			}
>> -			/* If gcr3 table existed, mask it out */
>> -			if (old_devtb[devid].data[0] & DTE_FLAG_GV) {
>> -				tmp = (DTE_GCR3_30_15 | DTE_GCR3_51_31);
>> -				pci_seg->old_dev_tbl_cpy[devid].data[1] &= ~tmp;
>> -				tmp = (DTE_GCR3_14_12 | DTE_FLAG_GV);
>> -				pci_seg->old_dev_tbl_cpy[devid].data[0] &= ~tmp;
>> -			}
>> -		}
>> -
>> -		irq_v = old_devtb[devid].data[2] & DTE_IRQ_REMAP_ENABLE;
>> -		int_ctl = old_devtb[devid].data[2] & DTE_IRQ_REMAP_INTCTL_MASK;
>> -		int_tab_len = old_devtb[devid].data[2] & DTE_INTTABLEN_MASK;
>> -		if (irq_v && (int_ctl || int_tab_len)) {
>> -			if ((int_ctl != DTE_IRQ_REMAP_INTCTL) ||
>> -			    (int_tab_len != DTE_INTTABLEN_512 &&
>> -			     int_tab_len != DTE_INTTABLEN_2K)) {
>> -				pr_err("Wrong old irq remapping flag: %#x\n", devid);
>> -				memunmap(old_devtb);
>> -				return false;
>> -			}
>> -
>> -			pci_seg->old_dev_tbl_cpy[devid].data[2] = old_devtb[devid].data[2];
>> -		}
>> -	}
>> -	memunmap(old_devtb);
>> -
>>  	return true;
>>  }
>>  
>> -static bool copy_device_table(void)
>> +static bool reuse_device_table(void)
>>  {
>>  	struct amd_iommu *iommu;
>>  	struct amd_iommu_pci_seg *pci_seg;
>> @@ -1228,17 +1187,17 @@ static bool copy_device_table(void)
>>  	if (!amd_iommu_pre_enabled)
>>  		return false;
>>  
>> -	pr_warn("Translation is already enabled - trying to copy translation structures\n");
>> +	pr_warn("Translation is already enabled - trying to reuse translation structures\n");
>>  
>>  	/*
>>  	 * All IOMMUs within PCI segment shares common device table.
>> -	 * Hence copy device table only once per PCI segment.
>> +	 * Hence reuse device table only once per PCI segment.
>>  	 */
>>  	for_each_pci_segment(pci_seg) {
>>  		for_each_iommu(iommu) {
>>  			if (pci_seg->id != iommu->pci_seg->id)
>>  				continue;
>> -			if (!__copy_device_table(iommu))
>> +			if (!__reuse_device_table(iommu))
>>  				return false;
>>  			break;
>>  		}
>> @@ -2917,8 +2876,8 @@ static void early_enable_iommu(struct amd_iommu *iommu)
>>   * This function finally enables all IOMMUs found in the system after
>>   * they have been initialized.
>>   *
>> - * Or if in kdump kernel and IOMMUs are all pre-enabled, try to copy
>> - * the old content of device table entries. Not this case or copy failed,
>> + * Or if in kdump kernel and IOMMUs are all pre-enabled, try to reuse
>> + * the old content of device table entries. Not this case or reuse failed,
>>   * just continue as normal kernel does.
>>   */
>>  static void early_enable_iommus(void)
>> @@ -2926,18 +2885,18 @@ static void early_enable_iommus(void)
>>  	struct amd_iommu *iommu;
>>  	struct amd_iommu_pci_seg *pci_seg;
>>  
>> -	if (!copy_device_table()) {
>> +	if (!reuse_device_table()) {
> 
> Hmmm. What happens if SNP enabled and reuse_device_table() couldn't setup
> previous DTE?
> In non-SNP case it works fine as we can rebuild new DTE. But in SNP case we
> should fail the kdump right?
> 

Which will happen automatically, if we can't setup previous DTE for SNP case
then IOMMU commands will time-out and subsequenly cause a panic as IRQ remapping
won't be setup.

So this is as good as failing the kdump, which will have the same result.

Thanks,
Ashish


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ