[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <8869fcdb-5a4a-45c7-a1ff-1ade5b85097d@amd.com>
Date: Thu, 17 Jul 2025 11:08:58 +0530
From: Sairaj Kodilkar <sarunkod@....com>
To: "Kalra, Ashish" <ashish.kalra@....com>, Vasant Hegde
<vasant.hegde@....com>, <joro@...tes.org>, <suravee.suthikulpanit@....com>,
<thomas.lendacky@....com>, <Sairaj.ArunKodilkar@....com>,
<herbert@...dor.apana.org.au>
CC: <seanjc@...gle.com>, <pbonzini@...hat.com>, <will@...nel.org>,
<robin.murphy@....com>, <john.allen@....com>, <davem@...emloft.net>,
<bp@...en8.de>, <michael.roth@....com>, <iommu@...ts.linux.dev>,
<linux-kernel@...r.kernel.org>, <linux-crypto@...r.kernel.org>,
<kvm@...r.kernel.org>
Subject: Re: [PATCH v3 2/4] iommu/amd: Reuse device table for kdump
On 7/17/2025 3:37 AM, Kalra, Ashish wrote:
> Hello Vasant,
>
> On 7/16/2025 4:42 AM, Vasant Hegde wrote:
>>
>>
>> On 7/16/2025 12:57 AM, Ashish Kalra wrote:
>>> From: Ashish Kalra <ashish.kalra@....com>
>>>
>>> After a panic if SNP is enabled in the previous kernel then the kdump
>>> kernel boots with IOMMU SNP enforcement still enabled.
>>>
>>> IOMMU device table register is locked and exclusive to the previous
>>> kernel. Attempts to copy old device table from the previous kernel
>>> fails in kdump kernel as hardware ignores writes to the locked device
>>> table base address register as per AMD IOMMU spec Section 2.12.2.1.
>>>
>>> This results in repeated "Completion-Wait loop timed out" errors and a
>>> second kernel panic: "Kernel panic - not syncing: timer doesn't work
>>> through Interrupt-remapped IO-APIC".
>>>
>>> Reuse device table instead of copying device table in case of kdump
>>> boot and remove all copying device table code.
>>>
>>> Signed-off-by: Ashish Kalra <ashish.kalra@....com>
>>> ---
>>> drivers/iommu/amd/init.c | 97 ++++++++++++----------------------------
>>> 1 file changed, 28 insertions(+), 69 deletions(-)
>>>
>>> diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c
>>> index 32295f26be1b..18bd869a82d9 100644
>>> --- a/drivers/iommu/amd/init.c
>>> +++ b/drivers/iommu/amd/init.c
>>> @@ -406,6 +406,9 @@ static void iommu_set_device_table(struct amd_iommu *iommu)
>>>
>>> BUG_ON(iommu->mmio_base == NULL);
>>>
>>> + if (is_kdump_kernel())
>>
>> This is fine.. but its becoming too many places with kdump check! I don't know
>> what is the better way here.
>> Is it worth to keep it like this -OR- add say iommu ops that way during init we
>> check is_kdump_kernel() and adjust the ops ?
>>
>> @Joerg, any preference?
>>
>>
>>> + return;
>>> +
>>> entry = iommu_virt_to_phys(dev_table);
>>> entry |= (dev_table_size >> 12) - 1;
>>> memcpy_toio(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET,
>>> @@ -646,7 +649,10 @@ static inline int __init alloc_dev_table(struct amd_iommu_pci_seg *pci_seg)
>>>
>>> static inline void free_dev_table(struct amd_iommu_pci_seg *pci_seg)
>>> {
>>> - iommu_free_pages(pci_seg->dev_table);
>>> + if (is_kdump_kernel())
>>> + memunmap((void *)pci_seg->dev_table);
>>> + else
>>> + iommu_free_pages(pci_seg->dev_table);
>>> pci_seg->dev_table = NULL;
>>> }
>>>
>>> @@ -1128,15 +1134,12 @@ static void set_dte_bit(struct dev_table_entry *dte, u8 bit)
>>> dte->data[i] |= (1UL << _bit);
>>> }
>>>
>>> -static bool __copy_device_table(struct amd_iommu *iommu)
>>> +static bool __reuse_device_table(struct amd_iommu *iommu)
>>> {
>>> - u64 int_ctl, int_tab_len, entry = 0;
>>> struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
>>> - struct dev_table_entry *old_devtb = NULL;
>>> - u32 lo, hi, devid, old_devtb_size;
>>> + u32 lo, hi, old_devtb_size;
>>> phys_addr_t old_devtb_phys;
>>> - u16 dom_id, dte_v, irq_v;
>>> - u64 tmp;
>>> + u64 entry;
>>>
>>> /* Each IOMMU use separate device table with the same size */
>>> lo = readl(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET);
>>> @@ -1161,66 +1164,22 @@ static bool __copy_device_table(struct amd_iommu *iommu)
>>> pr_err("The address of old device table is above 4G, not trustworthy!\n");
>>> return false;
>>> }
>>> - old_devtb = (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT) && is_kdump_kernel())
>>> - ? (__force void *)ioremap_encrypted(old_devtb_phys,
>>> - pci_seg->dev_table_size)
>>> - : memremap(old_devtb_phys, pci_seg->dev_table_size, MEMREMAP_WB);
>>> -
>>> - if (!old_devtb)
>>> - return false;
>>>
>>> - pci_seg->old_dev_tbl_cpy = iommu_alloc_pages_sz(
>>> - GFP_KERNEL | GFP_DMA32, pci_seg->dev_table_size);
>>> + /*
>>> + * IOMMU Device Table Base Address MMIO register is locked
>>> + * if SNP is enabled during kdump, reuse the previous kernel's
>>> + * device table.
>>> + */
>>> + pci_seg->old_dev_tbl_cpy = iommu_memremap(old_devtb_phys, pci_seg->dev_table_size);
>>> if (pci_seg->old_dev_tbl_cpy == NULL) {
>>> - pr_err("Failed to allocate memory for copying old device table!\n");
>>> - memunmap(old_devtb);
>>> + pr_err("Failed to remap memory for reusing old device table!\n");
>>> return false;
>>> }
>>>
>>> - for (devid = 0; devid <= pci_seg->last_bdf; ++devid) {
>>> - pci_seg->old_dev_tbl_cpy[devid] = old_devtb[devid];
>>> - dom_id = old_devtb[devid].data[1] & DEV_DOMID_MASK;
>>> - dte_v = old_devtb[devid].data[0] & DTE_FLAG_V;
>>> -
>>> - if (dte_v && dom_id) {
>>> - pci_seg->old_dev_tbl_cpy[devid].data[0] = old_devtb[devid].data[0];
>>> - pci_seg->old_dev_tbl_cpy[devid].data[1] = old_devtb[devid].data[1];
>>> - /* Reserve the Domain IDs used by previous kernel */
>>> - if (ida_alloc_range(&pdom_ids, dom_id, dom_id, GFP_ATOMIC) != dom_id) {
>>> - pr_err("Failed to reserve domain ID 0x%x\n", dom_id);
>>> - memunmap(old_devtb);
>>> - return false;
>>> - }
>>> - /* If gcr3 table existed, mask it out */
>>> - if (old_devtb[devid].data[0] & DTE_FLAG_GV) {
>>> - tmp = (DTE_GCR3_30_15 | DTE_GCR3_51_31);
>>> - pci_seg->old_dev_tbl_cpy[devid].data[1] &= ~tmp;
>>> - tmp = (DTE_GCR3_14_12 | DTE_FLAG_GV);
>>> - pci_seg->old_dev_tbl_cpy[devid].data[0] &= ~tmp;
>>> - }
>>> - }
>>> -
>>> - irq_v = old_devtb[devid].data[2] & DTE_IRQ_REMAP_ENABLE;
>>> - int_ctl = old_devtb[devid].data[2] & DTE_IRQ_REMAP_INTCTL_MASK;
>>> - int_tab_len = old_devtb[devid].data[2] & DTE_INTTABLEN_MASK;
>>> - if (irq_v && (int_ctl || int_tab_len)) {
>>> - if ((int_ctl != DTE_IRQ_REMAP_INTCTL) ||
>>> - (int_tab_len != DTE_INTTABLEN_512 &&
>>> - int_tab_len != DTE_INTTABLEN_2K)) {
>>> - pr_err("Wrong old irq remapping flag: %#x\n", devid);
>>> - memunmap(old_devtb);
>>> - return false;
>>> - }
>>> -
>>> - pci_seg->old_dev_tbl_cpy[devid].data[2] = old_devtb[devid].data[2];
>>> - }
>>> - }
>>> - memunmap(old_devtb);
>>> -
>>> return true;
>>> }
>>>
>>> -static bool copy_device_table(void)
>>> +static bool reuse_device_table(void)
>>> {
>>> struct amd_iommu *iommu;
>>> struct amd_iommu_pci_seg *pci_seg;
>>> @@ -1228,17 +1187,17 @@ static bool copy_device_table(void)
>>> if (!amd_iommu_pre_enabled)
>>> return false;
>>>
>>> - pr_warn("Translation is already enabled - trying to copy translation structures\n");
>>> + pr_warn("Translation is already enabled - trying to reuse translation structures\n");
>>>
>>> /*
>>> * All IOMMUs within PCI segment shares common device table.
>>> - * Hence copy device table only once per PCI segment.
>>> + * Hence reuse device table only once per PCI segment.
>>> */
>>> for_each_pci_segment(pci_seg) {
>>> for_each_iommu(iommu) {
>>> if (pci_seg->id != iommu->pci_seg->id)
>>> continue;
>>> - if (!__copy_device_table(iommu))
>>> + if (!__reuse_device_table(iommu))
>>> return false;
>>> break;
>>> }
>>> @@ -2917,8 +2876,8 @@ static void early_enable_iommu(struct amd_iommu *iommu)
>>> * This function finally enables all IOMMUs found in the system after
>>> * they have been initialized.
>>> *
>>> - * Or if in kdump kernel and IOMMUs are all pre-enabled, try to copy
>>> - * the old content of device table entries. Not this case or copy failed,
>>> + * Or if in kdump kernel and IOMMUs are all pre-enabled, try to reuse
>>> + * the old content of device table entries. Not this case or reuse failed,
>>> * just continue as normal kernel does.
>>> */
>>> static void early_enable_iommus(void)
>>> @@ -2926,18 +2885,18 @@ static void early_enable_iommus(void)
>>> struct amd_iommu *iommu;
>>> struct amd_iommu_pci_seg *pci_seg;
>>>
>>> - if (!copy_device_table()) {
>>> + if (!reuse_device_table()) {
>>
>> Hmmm. What happens if SNP enabled and reuse_device_table() couldn't setup
>> previous DTE?
>> In non-SNP case it works fine as we can rebuild new DTE. But in SNP case we
>> should fail the kdump right?
>>
>
> Which will happen automatically, if we can't setup previous DTE for SNP case
> then IOMMU commands will time-out and subsequenly cause a panic as IRQ remapping
> won't be setup.
>
> So this is as good as failing the kdump, which will have the same result.
>
Maybe we can have a BUG_ON() when it fails to remap the DTE in kdump
kernel and SNP is turned on ?
Its hard to understand why kdump has failed just by looking at
completion wait timeout. Will be easier to debug with BUG_ON().
> Thanks,
> Ashish
>
Powered by blists - more mailing lists