lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <8869fcdb-5a4a-45c7-a1ff-1ade5b85097d@amd.com>
Date: Thu, 17 Jul 2025 11:08:58 +0530
From: Sairaj Kodilkar <sarunkod@....com>
To: "Kalra, Ashish" <ashish.kalra@....com>, Vasant Hegde
	<vasant.hegde@....com>, <joro@...tes.org>, <suravee.suthikulpanit@....com>,
	<thomas.lendacky@....com>, <Sairaj.ArunKodilkar@....com>,
	<herbert@...dor.apana.org.au>
CC: <seanjc@...gle.com>, <pbonzini@...hat.com>, <will@...nel.org>,
	<robin.murphy@....com>, <john.allen@....com>, <davem@...emloft.net>,
	<bp@...en8.de>, <michael.roth@....com>, <iommu@...ts.linux.dev>,
	<linux-kernel@...r.kernel.org>, <linux-crypto@...r.kernel.org>,
	<kvm@...r.kernel.org>
Subject: Re: [PATCH v3 2/4] iommu/amd: Reuse device table for kdump



On 7/17/2025 3:37 AM, Kalra, Ashish wrote:
> Hello Vasant,
> 
> On 7/16/2025 4:42 AM, Vasant Hegde wrote:
>>
>>
>> On 7/16/2025 12:57 AM, Ashish Kalra wrote:
>>> From: Ashish Kalra <ashish.kalra@....com>
>>>
>>> After a panic if SNP is enabled in the previous kernel then the kdump
>>> kernel boots with IOMMU SNP enforcement still enabled.
>>>
>>> IOMMU device table register is locked and exclusive to the previous
>>> kernel. Attempts to copy old device table from the previous kernel
>>> fails in kdump kernel as hardware ignores writes to the locked device
>>> table base address register as per AMD IOMMU spec Section 2.12.2.1.
>>>
>>> This results in repeated "Completion-Wait loop timed out" errors and a
>>> second kernel panic: "Kernel panic - not syncing: timer doesn't work
>>> through Interrupt-remapped IO-APIC".
>>>
>>> Reuse device table instead of copying device table in case of kdump
>>> boot and remove all copying device table code.
>>>
>>> Signed-off-by: Ashish Kalra <ashish.kalra@....com>
>>> ---
>>>   drivers/iommu/amd/init.c | 97 ++++++++++++----------------------------
>>>   1 file changed, 28 insertions(+), 69 deletions(-)
>>>
>>> diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c
>>> index 32295f26be1b..18bd869a82d9 100644
>>> --- a/drivers/iommu/amd/init.c
>>> +++ b/drivers/iommu/amd/init.c
>>> @@ -406,6 +406,9 @@ static void iommu_set_device_table(struct amd_iommu *iommu)
>>>   
>>>   	BUG_ON(iommu->mmio_base == NULL);
>>>   
>>> +	if (is_kdump_kernel())
>>
>> This is fine.. but its becoming too many places with kdump check! I don't know
>> what is the better way here.
>> Is it worth to keep it like this -OR- add say iommu ops that way during init we
>> check is_kdump_kernel() and adjust the ops ?
>>
>> @Joerg, any preference?
>>
>>
>>> +		return;
>>> +
>>>   	entry = iommu_virt_to_phys(dev_table);
>>>   	entry |= (dev_table_size >> 12) - 1;
>>>   	memcpy_toio(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET,
>>> @@ -646,7 +649,10 @@ static inline int __init alloc_dev_table(struct amd_iommu_pci_seg *pci_seg)
>>>   
>>>   static inline void free_dev_table(struct amd_iommu_pci_seg *pci_seg)
>>>   {
>>> -	iommu_free_pages(pci_seg->dev_table);
>>> +	if (is_kdump_kernel())
>>> +		memunmap((void *)pci_seg->dev_table);
>>> +	else
>>> +		iommu_free_pages(pci_seg->dev_table);
>>>   	pci_seg->dev_table = NULL;
>>>   }
>>>   
>>> @@ -1128,15 +1134,12 @@ static void set_dte_bit(struct dev_table_entry *dte, u8 bit)
>>>   	dte->data[i] |= (1UL << _bit);
>>>   }
>>>   
>>> -static bool __copy_device_table(struct amd_iommu *iommu)
>>> +static bool __reuse_device_table(struct amd_iommu *iommu)
>>>   {
>>> -	u64 int_ctl, int_tab_len, entry = 0;
>>>   	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
>>> -	struct dev_table_entry *old_devtb = NULL;
>>> -	u32 lo, hi, devid, old_devtb_size;
>>> +	u32 lo, hi, old_devtb_size;
>>>   	phys_addr_t old_devtb_phys;
>>> -	u16 dom_id, dte_v, irq_v;
>>> -	u64 tmp;
>>> +	u64 entry;
>>>   
>>>   	/* Each IOMMU use separate device table with the same size */
>>>   	lo = readl(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET);
>>> @@ -1161,66 +1164,22 @@ static bool __copy_device_table(struct amd_iommu *iommu)
>>>   		pr_err("The address of old device table is above 4G, not trustworthy!\n");
>>>   		return false;
>>>   	}
>>> -	old_devtb = (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT) && is_kdump_kernel())
>>> -		    ? (__force void *)ioremap_encrypted(old_devtb_phys,
>>> -							pci_seg->dev_table_size)
>>> -		    : memremap(old_devtb_phys, pci_seg->dev_table_size, MEMREMAP_WB);
>>> -
>>> -	if (!old_devtb)
>>> -		return false;
>>>   
>>> -	pci_seg->old_dev_tbl_cpy = iommu_alloc_pages_sz(
>>> -		GFP_KERNEL | GFP_DMA32, pci_seg->dev_table_size);
>>> +	/*
>>> +	 * IOMMU Device Table Base Address MMIO register is locked
>>> +	 * if SNP is enabled during kdump, reuse the previous kernel's
>>> +	 * device table.
>>> +	 */
>>> +	pci_seg->old_dev_tbl_cpy = iommu_memremap(old_devtb_phys, pci_seg->dev_table_size);
>>>   	if (pci_seg->old_dev_tbl_cpy == NULL) {
>>> -		pr_err("Failed to allocate memory for copying old device table!\n");
>>> -		memunmap(old_devtb);
>>> +		pr_err("Failed to remap memory for reusing old device table!\n");
>>>   		return false;
>>>   	}
>>>   
>>> -	for (devid = 0; devid <= pci_seg->last_bdf; ++devid) {
>>> -		pci_seg->old_dev_tbl_cpy[devid] = old_devtb[devid];
>>> -		dom_id = old_devtb[devid].data[1] & DEV_DOMID_MASK;
>>> -		dte_v = old_devtb[devid].data[0] & DTE_FLAG_V;
>>> -
>>> -		if (dte_v && dom_id) {
>>> -			pci_seg->old_dev_tbl_cpy[devid].data[0] = old_devtb[devid].data[0];
>>> -			pci_seg->old_dev_tbl_cpy[devid].data[1] = old_devtb[devid].data[1];
>>> -			/* Reserve the Domain IDs used by previous kernel */
>>> -			if (ida_alloc_range(&pdom_ids, dom_id, dom_id, GFP_ATOMIC) != dom_id) {
>>> -				pr_err("Failed to reserve domain ID 0x%x\n", dom_id);
>>> -				memunmap(old_devtb);
>>> -				return false;
>>> -			}
>>> -			/* If gcr3 table existed, mask it out */
>>> -			if (old_devtb[devid].data[0] & DTE_FLAG_GV) {
>>> -				tmp = (DTE_GCR3_30_15 | DTE_GCR3_51_31);
>>> -				pci_seg->old_dev_tbl_cpy[devid].data[1] &= ~tmp;
>>> -				tmp = (DTE_GCR3_14_12 | DTE_FLAG_GV);
>>> -				pci_seg->old_dev_tbl_cpy[devid].data[0] &= ~tmp;
>>> -			}
>>> -		}
>>> -
>>> -		irq_v = old_devtb[devid].data[2] & DTE_IRQ_REMAP_ENABLE;
>>> -		int_ctl = old_devtb[devid].data[2] & DTE_IRQ_REMAP_INTCTL_MASK;
>>> -		int_tab_len = old_devtb[devid].data[2] & DTE_INTTABLEN_MASK;
>>> -		if (irq_v && (int_ctl || int_tab_len)) {
>>> -			if ((int_ctl != DTE_IRQ_REMAP_INTCTL) ||
>>> -			    (int_tab_len != DTE_INTTABLEN_512 &&
>>> -			     int_tab_len != DTE_INTTABLEN_2K)) {
>>> -				pr_err("Wrong old irq remapping flag: %#x\n", devid);
>>> -				memunmap(old_devtb);
>>> -				return false;
>>> -			}
>>> -
>>> -			pci_seg->old_dev_tbl_cpy[devid].data[2] = old_devtb[devid].data[2];
>>> -		}
>>> -	}
>>> -	memunmap(old_devtb);
>>> -
>>>   	return true;
>>>   }
>>>   
>>> -static bool copy_device_table(void)
>>> +static bool reuse_device_table(void)
>>>   {
>>>   	struct amd_iommu *iommu;
>>>   	struct amd_iommu_pci_seg *pci_seg;
>>> @@ -1228,17 +1187,17 @@ static bool copy_device_table(void)
>>>   	if (!amd_iommu_pre_enabled)
>>>   		return false;
>>>   
>>> -	pr_warn("Translation is already enabled - trying to copy translation structures\n");
>>> +	pr_warn("Translation is already enabled - trying to reuse translation structures\n");
>>>   
>>>   	/*
>>>   	 * All IOMMUs within PCI segment shares common device table.
>>> -	 * Hence copy device table only once per PCI segment.
>>> +	 * Hence reuse device table only once per PCI segment.
>>>   	 */
>>>   	for_each_pci_segment(pci_seg) {
>>>   		for_each_iommu(iommu) {
>>>   			if (pci_seg->id != iommu->pci_seg->id)
>>>   				continue;
>>> -			if (!__copy_device_table(iommu))
>>> +			if (!__reuse_device_table(iommu))
>>>   				return false;
>>>   			break;
>>>   		}
>>> @@ -2917,8 +2876,8 @@ static void early_enable_iommu(struct amd_iommu *iommu)
>>>    * This function finally enables all IOMMUs found in the system after
>>>    * they have been initialized.
>>>    *
>>> - * Or if in kdump kernel and IOMMUs are all pre-enabled, try to copy
>>> - * the old content of device table entries. Not this case or copy failed,
>>> + * Or if in kdump kernel and IOMMUs are all pre-enabled, try to reuse
>>> + * the old content of device table entries. Not this case or reuse failed,
>>>    * just continue as normal kernel does.
>>>    */
>>>   static void early_enable_iommus(void)
>>> @@ -2926,18 +2885,18 @@ static void early_enable_iommus(void)
>>>   	struct amd_iommu *iommu;
>>>   	struct amd_iommu_pci_seg *pci_seg;
>>>   
>>> -	if (!copy_device_table()) {
>>> +	if (!reuse_device_table()) {
>>
>> Hmmm. What happens if SNP enabled and reuse_device_table() couldn't setup
>> previous DTE?
>> In non-SNP case it works fine as we can rebuild new DTE. But in SNP case we
>> should fail the kdump right?
>>
> 
> Which will happen automatically: if we can't set up the previous DTE in the SNP case,
> then IOMMU commands will time out and subsequently cause a panic, as IRQ remapping
> won't be set up.
> 
> So this is as good as failing the kdump, which will have the same result.
> 

Maybe we can have a BUG_ON() when it fails to remap the DTE in kdump
kernel and SNP is turned on ?

It's hard to understand why kdump has failed just by looking at a
completion-wait timeout. It will be easier to debug with a BUG_ON().

> Thanks,
> Ashish
> 


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ