Message-ID: <da25bef4-7e6e-8e4b-ad9e-96d1c8729095@amd.com>
Date: Thu, 7 Sep 2023 23:24:43 +0530
From: "Srivastava, Dheeraj Kumar" <dheerajkumar.srivastava@....com>
To: Robin Murphy <robin.murphy@....com>, joro@...tes.org
Cc: will@...nel.org, iommu@...ts.linux.dev,
linux-kernel@...r.kernel.org, zhangzekun11@...wei.com,
john.g.garry@...cle.com, jsnitsel@...hat.com,
"wyes.karny@....com vasant.hegde"@amd.com
Subject: Re: [PATCH v2 2/2] iommu/iova: Manage the depot list size
Hi Robin,
On 8/21/2023 11:52 PM, Robin Murphy wrote:
> Automatically scaling the depot up to suit the peak capacity of a
> workload is all well and good, but it would be nice to have a way to
> scale it back down again if the workload changes. To that end, add
> background reclaim that will gradually free surplus magazines if the
> depot size remains above a reasonable threshold for long enough.
>
> Signed-off-by: Robin Murphy <robin.murphy@....com>
> ---
>
> v2: fix reschedule delay typo
>
> drivers/iommu/iova.c | 29 +++++++++++++++++++++++++++++
> 1 file changed, 29 insertions(+)
>
> diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
> index dd2309e9a6c5..436f42855c29 100644
> --- a/drivers/iommu/iova.c
> +++ b/drivers/iommu/iova.c
> @@ -11,6 +11,7 @@
> #include <linux/smp.h>
> #include <linux/bitops.h>
> #include <linux/cpu.h>
> +#include <linux/workqueue.h>
>
> /* The anchor node sits above the top of the usable address space */
> #define IOVA_ANCHOR ~0UL
> @@ -627,6 +628,8 @@ EXPORT_SYMBOL_GPL(reserve_iova);
> */
> #define IOVA_MAG_SIZE 127
>
> +#define IOVA_DEPOT_DELAY msecs_to_jiffies(100)
> +
> struct iova_magazine {
> union {
> unsigned long size;
> @@ -644,8 +647,11 @@ struct iova_cpu_rcache {
>
> struct iova_rcache {
> spinlock_t lock;
> + unsigned int depot_size;
> struct iova_magazine *depot;
> struct iova_cpu_rcache __percpu *cpu_rcaches;
> + struct iova_domain *iovad;
> + struct delayed_work work;
> };
>
> static struct iova_magazine *iova_magazine_alloc(gfp_t flags)
> @@ -726,6 +732,7 @@ static struct iova_magazine *iova_depot_pop(struct iova_rcache *rcache)
>
> rcache->depot = mag->next;
> mag->size = IOVA_MAG_SIZE;
> + rcache->depot_size--;
> return mag;
> }
>
> @@ -733,6 +740,24 @@ static void iova_depot_push(struct iova_rcache *rcache, struct iova_magazine *ma
> {
> mag->next = rcache->depot;
> rcache->depot = mag;
> + rcache->depot_size++;
> +}
> +
> +static void iova_depot_work_func(struct work_struct *work)
> +{
> + struct iova_rcache *rcache = container_of(work, typeof(*rcache), work.work);
> + struct iova_magazine *mag = NULL;
> +
> + spin_lock(&rcache->lock);
> + if (rcache->depot_size > num_online_cpus())
> + mag = iova_depot_pop(rcache);
> + spin_unlock(&rcache->lock);
> +
Running an fio test on an NVMe disk causes a hard LOCKUP, with the
kernel logs below:
[ 7620.507689] watchdog: Watchdog detected hard LOCKUP on cpu 334
[ 7620.507694] Modules linked in: nvme_fabrics intel_rapl_msr
intel_rapl_common amd64_edac edac_mce_amd kvm_amd kvm rapl wmi_bmof
ipmi_ssif binfmt_misc nls_iso8859_1 input_leds joydev acpi_ipmi ipmi_si
ccp ipmi_devintf k10temp ipmi_msghandler mac_hid sch_fq_codel
dm_multipath scsi_dh_rdac scsi_dh_emc scsi_dh_alua msr ramoops
reed_solomon pstore_blk pstore_zone efi_pstore ip_tables x_tables
autofs4 btrfs blake2b_generic raid10 raid456 async_raid6_recov
async_memcpy async_pq async_xor async_tx xor raid6_pq libcrc32c raid1
raid0 multipath linear hid_generic usbhid ast dax_hmem i2c_algo_bit hid
drm_shmem_helper drm_kms_helper cxl_acpi crct10dif_pclmul crc32_pclmul
ghash_clmulni_intel sha512_ssse3 aesni_intel crypto_simd cryptd cxl_core
nvme ahci drm tg3 nvme_core xhci_pci libahci i2c_piix4 xhci_pci_renesas wmi
[ 7620.507795] CPU: 334 PID: 3524 Comm: kworker/334:1 Not tainted
6.5.0-woiovapatchfix #1
[ 7620.507800] Hardware name: AMD Corporation
[ 7620.507803] Workqueue: events iova_depot_work_func
[ 7620.507813] RIP: 0010:native_queued_spin_lock_slowpath+0x8b/0x300
[ 7620.507823] Code: 24 0f b6 d2 c1 e2 08 30 e4 09 d0 a9 00 01 ff ff 0f
85 f6 01 00 00 85 c0 74 14 41 0f b6 04 24 84 c0 74 0b f3 90 41 0f b6 04
24 <84> c0 75 f5 b8 01 00 00 00 66 41 89 04 24 5b 41 5c 41 5d 41 5e 41
[ 7620.507826] RSP: 0018:ffa000001e450c60 EFLAGS: 00000002
[ 7620.507830] RAX: 0000000000000001 RBX: 000000000000007f RCX:
0000000000000001
[ 7620.507832] RDX: 0000000000000000 RSI: 0000000000000000 RDI:
ff11000167b7c000
[ 7620.507834] RBP: ffa000001e450c88 R08: 0000000000000086 R09:
ff110055f3bf9800
[ 7620.507836] R10: 0000000000001000 R11: ffa000001e450ff8 R12:
ff11000167b7c000
[ 7620.507838] R13: ff110060639dce08 R14: 000000000007fc54 R15:
ff110055f3bf9800
[ 7620.507840] FS: 0000000000000000(0000) GS:ff11005ecf380000(0000)
knlGS:0000000000000000
[ 7620.507843] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 7620.507846] CR2: 00007f3678bc4010 CR3: 000000016e00c003 CR4:
0000000000771ee0
[ 7620.507848] PKRU: 55555554
[ 7620.507850] Call Trace:
[ 7620.507852] <NMI>
[ 7620.507857] ? show_regs+0x6e/0x80
[ 7620.507865] ? watchdog_hardlockup_check+0x16e/0x330
[ 7620.507872] ? srso_alias_return_thunk+0x5/0x7f
[ 7620.507877] ? watchdog_overflow_callback+0x70/0x80
[ 7620.507883] ? __perf_event_overflow+0x13b/0x2b0
[ 7620.507888] ? x86_perf_event_set_period+0xe7/0x1d0
[ 7620.507895] ? perf_event_overflow+0x1d/0x30
[ 7620.507901] ? amd_pmu_v2_handle_irq+0x158/0x2f0
[ 7620.507911] ? srso_alias_return_thunk+0x5/0x7f
[ 7620.507914] ? flush_tlb_one_kernel+0x12/0x30
[ 7620.507920] ? srso_alias_return_thunk+0x5/0x7f
[ 7620.507923] ? set_pte_vaddr_p4d+0x5c/0x70
[ 7620.507931] ? srso_alias_return_thunk+0x5/0x7f
[ 7620.507933] ? set_pte_vaddr+0x80/0xb0
[ 7620.507938] ? srso_alias_return_thunk+0x5/0x7f
[ 7620.507942] ? srso_alias_return_thunk+0x5/0x7f
[ 7620.507945] ? native_set_fixmap+0x6d/0x90
[ 7620.507949] ? srso_alias_return_thunk+0x5/0x7f
[ 7620.507952] ? ghes_copy_tofrom_phys+0x79/0x120
[ 7620.507960] ? srso_alias_return_thunk+0x5/0x7f
[ 7620.507962] ? __ghes_peek_estatus.isra.0+0x4e/0xc0
[ 7620.507968] ? srso_alias_return_thunk+0x5/0x7f
[ 7620.507973] ? perf_event_nmi_handler+0x31/0x60
[ 7620.507978] ? nmi_handle+0x68/0x180
[ 7620.507985] ? default_do_nmi+0x45/0x120
[ 7620.507991] ? exc_nmi+0x142/0x1c0
[ 7620.507996] ? end_repeat_nmi+0x16/0x67
[ 7620.508006] ? native_queued_spin_lock_slowpath+0x8b/0x300
[ 7620.508012] ? native_queued_spin_lock_slowpath+0x8b/0x300
[ 7620.508019] ? native_queued_spin_lock_slowpath+0x8b/0x300
[ 7620.508024] </NMI>
[ 7620.508026] <IRQ>
[ 7620.508029] _raw_spin_lock+0x2d/0x40
[ 7620.508034] free_iova_fast+0x12e/0x1b0
[ 7620.508041] fq_ring_free+0x9b/0x150
[ 7620.508045] ? amd_iommu_unmap_pages+0x49/0x130
[ 7620.508051] iommu_dma_free_iova+0x10c/0x300
[ 7620.508058] __iommu_dma_unmap+0xca/0x140
[ 7620.508067] iommu_dma_unmap_page+0x4f/0xb0
[ 7620.508073] dma_unmap_page_attrs+0x183/0x1c0
[ 7620.508081] nvme_unmap_data+0x105/0x150 [nvme]
[ 7620.508095] nvme_pci_complete_batch+0xaf/0xd0 [nvme]
[ 7620.508106] nvme_irq+0x7b/0x90 [nvme]
[ 7620.508116] ? __pfx_nvme_pci_complete_batch+0x10/0x10 [nvme]
[ 7620.508125] __handle_irq_event_percpu+0x50/0x1b0
[ 7620.508133] handle_irq_event+0x3d/0x80
[ 7620.508138] handle_edge_irq+0x90/0x240
[ 7620.508143] __common_interrupt+0x52/0x100
[ 7620.508149] ? srso_alias_return_thunk+0x5/0x7f
[ 7620.508153] common_interrupt+0x8d/0xa0
[ 7620.508157] </IRQ>
[ 7620.508158] <TASK>
[ 7620.508161] asm_common_interrupt+0x2b/0x40
[ 7620.508165] RIP: 0010:iova_depot_work_func+0x28/0xb0
[ 7620.508170] Code: 90 90 f3 0f 1e fa 0f 1f 44 00 00 55 48 89 e5 41 56
4c 8d 77 e0 41 55 41 54 49 89 fc 4c 89 f7 e8 2e 9d 5e 00 8b 05 58 d6 bb
01 <41> 39 44 24 e4 77 14 4c 89 f7 e8 f9 9d 5e 00 41 5c 41 5d 41 5e 5d
[ 7620.508173] RSP: 0018:ffa0000025513e30 EFLAGS: 00000246
[ 7620.508176] RAX: 0000000000000200 RBX: ff110001988f4540 RCX:
ff11005ecf3b3ca8
[ 7620.508178] RDX: 0000000000000001 RSI: ff110001000420b0 RDI:
ff11000167b7c000
[ 7620.508180] RBP: ffa0000025513e48 R08: ff110001000420b0 R09:
ff110001988f45c0
[ 7620.508181] R10: 0000000000000007 R11: 0000000000000007 R12:
ff11000167b7c020
[ 7620.508183] R13: ff110001001f1600 R14: ff11000167b7c000 R15:
ff110001001f1605
[ 7620.508193] process_one_work+0x178/0x350
[ 7620.508202] ? __pfx_worker_thread+0x10/0x10
[ 7620.508207] worker_thread+0x2f7/0x420
[ 7620.508215] ? __pfx_worker_thread+0x10/0x10
[ 7620.508220] kthread+0xf8/0x130
[ 7620.508226] ? __pfx_kthread+0x10/0x10
[ 7620.508231] ret_from_fork+0x3d/0x60
[ 7620.508235] ? __pfx_kthread+0x10/0x10
[ 7620.508240] ret_from_fork_asm+0x1b/0x30
[ 7620.508252] </TASK>
Looking into the trace and your patch, I figured out that the
iova_depot_work_func() workqueue function takes rcache->lock with a
plain spin_lock(), but the same lock is also taken from IRQ context
(e.g. via free_iova_fast() in the NVMe completion path above). So
interrupts must be disabled while rcache->lock is held there, i.e.
spin_lock_irqsave() should be used in place of the plain spin_lock().
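
To make the scenario concrete, here is a minimal sketch of the
problematic pattern (hypothetical demo_* names, not the actual iova
code): a worker takes a lock with plain spin_lock() while an interrupt
handler can take the same lock, so an IRQ arriving on that CPU while
the worker holds the lock spins forever and trips the hard-lockup
detector.

#include <linux/spinlock.h>
#include <linux/workqueue.h>
#include <linux/interrupt.h>

static DEFINE_SPINLOCK(demo_lock);

/* Process context: runs with interrupts enabled. */
static void demo_work_func(struct work_struct *work)
{
	spin_lock(&demo_lock);
	/* An IRQ arriving here on this CPU deadlocks in the handler below. */
	spin_unlock(&demo_lock);
}

/* IRQ context: spins on demo_lock held by the interrupted worker. */
static irqreturn_t demo_irq_handler(int irq, void *dev_id)
{
	spin_lock(&demo_lock);
	/* ... */
	spin_unlock(&demo_lock);
	return IRQ_HANDLED;
}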
The change below fixes the issue:
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 436f42855c29..d30e453d0fb4 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -747,11 +747,12 @@ static void iova_depot_work_func(struct work_struct *work)
 {
 	struct iova_rcache *rcache = container_of(work, typeof(*rcache), work.work);
 	struct iova_magazine *mag = NULL;
+	unsigned long flags;
 
-	spin_lock(&rcache->lock);
+	spin_lock_irqsave(&rcache->lock, flags);
 	if (rcache->depot_size > num_online_cpus())
 		mag = iova_depot_pop(rcache);
-	spin_unlock(&rcache->lock);
+	spin_unlock_irqrestore(&rcache->lock, flags);
 
 	if (mag) {
 		iova_magazine_free_pfns(mag, rcache->iovad);
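
For reference, with the change applied iova_depot_work_func() would
read roughly as follows (reconstructed from your quoted patch plus the
diff above, not taken from any final code):

static void iova_depot_work_func(struct work_struct *work)
{
	struct iova_rcache *rcache = container_of(work, typeof(*rcache), work.work);
	struct iova_magazine *mag = NULL;
	unsigned long flags;

	spin_lock_irqsave(&rcache->lock, flags);
	if (rcache->depot_size > num_online_cpus())
		mag = iova_depot_pop(rcache);
	spin_unlock_irqrestore(&rcache->lock, flags);

	if (mag) {
		iova_magazine_free_pfns(mag, rcache->iovad);
		iova_magazine_free(mag);
		schedule_delayed_work(&rcache->work, IOVA_DEPOT_DELAY);
	}
}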
> + if (mag) {
> + iova_magazine_free_pfns(mag, rcache->iovad);
> + iova_magazine_free(mag);
> + schedule_delayed_work(&rcache->work, IOVA_DEPOT_DELAY);
> + }
> }
>
> int iova_domain_init_rcaches(struct iova_domain *iovad)
> @@ -752,6 +777,8 @@ int iova_domain_init_rcaches(struct iova_domain *iovad)
>
> rcache = &iovad->rcaches[i];
> spin_lock_init(&rcache->lock);
> + rcache->iovad = iovad;
> + INIT_DELAYED_WORK(&rcache->work, iova_depot_work_func);
> rcache->cpu_rcaches = __alloc_percpu(sizeof(*cpu_rcache),
> cache_line_size());
> if (!rcache->cpu_rcaches) {
> @@ -812,6 +839,7 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
> spin_lock(&rcache->lock);
> iova_depot_push(rcache, cpu_rcache->loaded);
> spin_unlock(&rcache->lock);
> + schedule_delayed_work(&rcache->work, IOVA_DEPOT_DELAY);
>
> cpu_rcache->loaded = new_mag;
> can_insert = true;
> @@ -912,6 +940,7 @@ static void free_iova_rcaches(struct iova_domain *iovad)
> iova_magazine_free(cpu_rcache->prev);
> }
> free_percpu(rcache->cpu_rcaches);
> + cancel_delayed_work_sync(&rcache->work);
> while (rcache->depot)
> iova_magazine_free(iova_depot_pop(rcache));
> }