lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:   Thu, 7 Sep 2023 23:24:43 +0530
From:   "Srivastava, Dheeraj Kumar" <dheerajkumar.srivastava@....com>
To:     Robin Murphy <robin.murphy@....com>, joro@...tes.org
Cc:     will@...nel.org, iommu@...ts.linux.dev,
        linux-kernel@...r.kernel.org, zhangzekun11@...wei.com,
        john.g.garry@...cle.com, jsnitsel@...hat.com,
        "wyes.karny@....com vasant.hegde"@amd.com
Subject: Re: [PATCH v2 2/2] iommu/iova: Manage the depot list size

Hi Robin,

On 8/21/2023 11:52 PM, Robin Murphy wrote:
> Automatically scaling the depot up to suit the peak capacity of a
> workload is all well and good, but it would be nice to have a way to
> scale it back down again if the workload changes. To that end, add
> background reclaim that will gradually free surplus magazines if the
> depot size remains above a reasonable threshold for long enough.
> 
> Signed-off-by: Robin Murphy <robin.murphy@....com>
> ---
> 
> v2: fix reschedule delay typo
> 
>   drivers/iommu/iova.c | 29 +++++++++++++++++++++++++++++
>   1 file changed, 29 insertions(+)
> 
> diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
> index dd2309e9a6c5..436f42855c29 100644
> --- a/drivers/iommu/iova.c
> +++ b/drivers/iommu/iova.c
> @@ -11,6 +11,7 @@
>   #include <linux/smp.h>
>   #include <linux/bitops.h>
>   #include <linux/cpu.h>
> +#include <linux/workqueue.h>
>   
>   /* The anchor node sits above the top of the usable address space */
>   #define IOVA_ANCHOR	~0UL
> @@ -627,6 +628,8 @@ EXPORT_SYMBOL_GPL(reserve_iova);
>    */
>   #define IOVA_MAG_SIZE 127
>   
> +#define IOVA_DEPOT_DELAY msecs_to_jiffies(100)
> +
>   struct iova_magazine {
>   	union {
>   		unsigned long size;
> @@ -644,8 +647,11 @@ struct iova_cpu_rcache {
>   
>   struct iova_rcache {
>   	spinlock_t lock;
> +	unsigned int depot_size;
>   	struct iova_magazine *depot;
>   	struct iova_cpu_rcache __percpu *cpu_rcaches;
> +	struct iova_domain *iovad;
> +	struct delayed_work work;
>   };
>   
>   static struct iova_magazine *iova_magazine_alloc(gfp_t flags)
> @@ -726,6 +732,7 @@ static struct iova_magazine *iova_depot_pop(struct iova_rcache *rcache)
>   
>   	rcache->depot = mag->next;
>   	mag->size = IOVA_MAG_SIZE;
> +	rcache->depot_size--;
>   	return mag;
>   }
>   
> @@ -733,6 +740,24 @@ static void iova_depot_push(struct iova_rcache *rcache, struct iova_magazine *ma
>   {
>   	mag->next = rcache->depot;
>   	rcache->depot = mag;
> +	rcache->depot_size++;
> +}
> +
> +static void iova_depot_work_func(struct work_struct *work)
> +{
> +	struct iova_rcache *rcache = container_of(work, typeof(*rcache), work.work);
> +	struct iova_magazine *mag = NULL;
> +
> +	spin_lock(&rcache->lock);
> +	if (rcache->depot_size > num_online_cpus())
> +		mag = iova_depot_pop(rcache);
> +	spin_unlock(&rcache->lock);
> +

Running an fio test on an NVMe disk causes a hard LOCKUP issue, with the
kernel logs below

[ 7620.507689] watchdog: Watchdog detected hard LOCKUP on cpu 334
[ 7620.507694] Modules linked in: nvme_fabrics intel_rapl_msr 
intel_rapl_common amd64_edac edac_mce_amd kvm_amd kvm rapl wmi_bmof 
ipmi_ssif binfmt_misc nls_iso8859_1 input_leds joydev acpi_ipmi ipmi_si 
ccp ipmi_devintf k10temp ipmi_msghandler mac_hid sch_fq_codel 
dm_multipath scsi_dh_rdac scsi_dh_emc scsi_dh_alua msr ramoops 
reed_solomon pstore_blk pstore_zone efi_pstore ip_tables x_tables 
autofs4 btrfs blake2b_generic raid10 raid456 async_raid6_recov 
async_memcpy async_pq async_xor async_tx xor raid6_pq libcrc32c raid1 
raid0 multipath linear hid_generic usbhid ast dax_hmem i2c_algo_bit hid 
drm_shmem_helper drm_kms_helper cxl_acpi crct10dif_pclmul crc32_pclmul 
ghash_clmulni_intel sha512_ssse3 aesni_intel crypto_simd cryptd cxl_core 
nvme ahci drm tg3 nvme_core xhci_pci libahci i2c_piix4 xhci_pci_renesas wmi
[ 7620.507795] CPU: 334 PID: 3524 Comm: kworker/334:1 Not tainted 
6.5.0-woiovapatchfix #1
[ 7620.507800] Hardware name: AMD Corporation
[ 7620.507803] Workqueue: events iova_depot_work_func
[ 7620.507813] RIP: 0010:native_queued_spin_lock_slowpath+0x8b/0x300
[ 7620.507823] Code: 24 0f b6 d2 c1 e2 08 30 e4 09 d0 a9 00 01 ff ff 0f 
85 f6 01 00 00 85 c0 74 14 41 0f b6 04 24 84 c0 74 0b f3 90 41 0f b6 04 
24 <84> c0 75 f5 b8 01 00 00 00 66 41 89 04 24 5b 41 5c 41 5d 41 5e 41
[ 7620.507826] RSP: 0018:ffa000001e450c60 EFLAGS: 00000002
[ 7620.507830] RAX: 0000000000000001 RBX: 000000000000007f RCX: 
0000000000000001
[ 7620.507832] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 
ff11000167b7c000
[ 7620.507834] RBP: ffa000001e450c88 R08: 0000000000000086 R09: 
ff110055f3bf9800
[ 7620.507836] R10: 0000000000001000 R11: ffa000001e450ff8 R12: 
ff11000167b7c000
[ 7620.507838] R13: ff110060639dce08 R14: 000000000007fc54 R15: 
ff110055f3bf9800
[ 7620.507840] FS:  0000000000000000(0000) GS:ff11005ecf380000(0000) 
knlGS:0000000000000000
[ 7620.507843] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 7620.507846] CR2: 00007f3678bc4010 CR3: 000000016e00c003 CR4: 
0000000000771ee0
[ 7620.507848] PKRU: 55555554
[ 7620.507850] Call Trace:
[ 7620.507852]  <NMI>
[ 7620.507857]  ? show_regs+0x6e/0x80
[ 7620.507865]  ? watchdog_hardlockup_check+0x16e/0x330
[ 7620.507872]  ? srso_alias_return_thunk+0x5/0x7f
[ 7620.507877]  ? watchdog_overflow_callback+0x70/0x80
[ 7620.507883]  ? __perf_event_overflow+0x13b/0x2b0
[ 7620.507888]  ? x86_perf_event_set_period+0xe7/0x1d0
[ 7620.507895]  ? perf_event_overflow+0x1d/0x30
[ 7620.507901]  ? amd_pmu_v2_handle_irq+0x158/0x2f0
[ 7620.507911]  ? srso_alias_return_thunk+0x5/0x7f
[ 7620.507914]  ? flush_tlb_one_kernel+0x12/0x30
[ 7620.507920]  ? srso_alias_return_thunk+0x5/0x7f
[ 7620.507923]  ? set_pte_vaddr_p4d+0x5c/0x70
[ 7620.507931]  ? srso_alias_return_thunk+0x5/0x7f
[ 7620.507933]  ? set_pte_vaddr+0x80/0xb0
[ 7620.507938]  ? srso_alias_return_thunk+0x5/0x7f
[ 7620.507942]  ? srso_alias_return_thunk+0x5/0x7f
[ 7620.507945]  ? native_set_fixmap+0x6d/0x90
[ 7620.507949]  ? srso_alias_return_thunk+0x5/0x7f
[ 7620.507952]  ? ghes_copy_tofrom_phys+0x79/0x120
[ 7620.507960]  ? srso_alias_return_thunk+0x5/0x7f
[ 7620.507962]  ? __ghes_peek_estatus.isra.0+0x4e/0xc0
[ 7620.507968]  ? srso_alias_return_thunk+0x5/0x7f
[ 7620.507973]  ? perf_event_nmi_handler+0x31/0x60
[ 7620.507978]  ? nmi_handle+0x68/0x180
[ 7620.507985]  ? default_do_nmi+0x45/0x120
[ 7620.507991]  ? exc_nmi+0x142/0x1c0
[ 7620.507996]  ? end_repeat_nmi+0x16/0x67
[ 7620.508006]  ? native_queued_spin_lock_slowpath+0x8b/0x300
[ 7620.508012]  ? native_queued_spin_lock_slowpath+0x8b/0x300
[ 7620.508019]  ? native_queued_spin_lock_slowpath+0x8b/0x300
[ 7620.508024]  </NMI>
[ 7620.508026]  <IRQ>
[ 7620.508029]  _raw_spin_lock+0x2d/0x40
[ 7620.508034]  free_iova_fast+0x12e/0x1b0
[ 7620.508041]  fq_ring_free+0x9b/0x150
[ 7620.508045]  ? amd_iommu_unmap_pages+0x49/0x130
[ 7620.508051]  iommu_dma_free_iova+0x10c/0x300
[ 7620.508058]  __iommu_dma_unmap+0xca/0x140
[ 7620.508067]  iommu_dma_unmap_page+0x4f/0xb0
[ 7620.508073]  dma_unmap_page_attrs+0x183/0x1c0
[ 7620.508081]  nvme_unmap_data+0x105/0x150 [nvme]
[ 7620.508095]  nvme_pci_complete_batch+0xaf/0xd0 [nvme]
[ 7620.508106]  nvme_irq+0x7b/0x90 [nvme]
[ 7620.508116]  ? __pfx_nvme_pci_complete_batch+0x10/0x10 [nvme]
[ 7620.508125]  __handle_irq_event_percpu+0x50/0x1b0
[ 7620.508133]  handle_irq_event+0x3d/0x80
[ 7620.508138]  handle_edge_irq+0x90/0x240
[ 7620.508143]  __common_interrupt+0x52/0x100
[ 7620.508149]  ? srso_alias_return_thunk+0x5/0x7f
[ 7620.508153]  common_interrupt+0x8d/0xa0
[ 7620.508157]  </IRQ>
[ 7620.508158]  <TASK>
[ 7620.508161]  asm_common_interrupt+0x2b/0x40
[ 7620.508165] RIP: 0010:iova_depot_work_func+0x28/0xb0
[ 7620.508170] Code: 90 90 f3 0f 1e fa 0f 1f 44 00 00 55 48 89 e5 41 56 
4c 8d 77 e0 41 55 41 54 49 89 fc 4c 89 f7 e8 2e 9d 5e 00 8b 05 58 d6 bb 
01 <41> 39 44 24 e4 77 14 4c 89 f7 e8 f9 9d 5e 00 41 5c 41 5d 41 5e 5d
[ 7620.508173] RSP: 0018:ffa0000025513e30 EFLAGS: 00000246
[ 7620.508176] RAX: 0000000000000200 RBX: ff110001988f4540 RCX: 
ff11005ecf3b3ca8
[ 7620.508178] RDX: 0000000000000001 RSI: ff110001000420b0 RDI: 
ff11000167b7c000
[ 7620.508180] RBP: ffa0000025513e48 R08: ff110001000420b0 R09: 
ff110001988f45c0
[ 7620.508181] R10: 0000000000000007 R11: 0000000000000007 R12: 
ff11000167b7c020
[ 7620.508183] R13: ff110001001f1600 R14: ff11000167b7c000 R15: 
ff110001001f1605
[ 7620.508193]  process_one_work+0x178/0x350
[ 7620.508202]  ? __pfx_worker_thread+0x10/0x10
[ 7620.508207]  worker_thread+0x2f7/0x420
[ 7620.508215]  ? __pfx_worker_thread+0x10/0x10
[ 7620.508220]  kthread+0xf8/0x130
[ 7620.508226]  ? __pfx_kthread+0x10/0x10
[ 7620.508231]  ret_from_fork+0x3d/0x60
[ 7620.508235]  ? __pfx_kthread+0x10/0x10
[ 7620.508240]  ret_from_fork_asm+0x1b/0x30
[ 7620.508252]  </TASK>

Looking into the trace and your patch, I figured out that the
iova_depot_work_func workqueue function takes rcache->lock via plain
spin_lock. But the same lock is also taken from IRQ context.
So, to prevent an IRQ from interrupting while rcache->lock is held, we
should disable IRQs — i.e. use spin_lock_irqsave in place of plain spin_lock.

The change below fixes the issue

diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 436f42855c29..d30e453d0fb4 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -747,11 +747,12 @@ static void iova_depot_work_func(struct 
work_struct *work)
  {
	struct iova_rcache *rcache = container_of(work, typeof(*rcache), 
work.work);
	struct iova_magazine *mag = NULL;
+	unsigned long flags;

-	spin_lock(&rcache->lock);
+	spin_lock_irqsave(&rcache->lock, flags);
	if (rcache->depot_size > num_online_cpus())
		mag = iova_depot_pop(rcache);
-	spin_unlock(&rcache->lock);
+	spin_unlock_irqrestore(&rcache->lock, flags);

	if (mag) {
		iova_magazine_free_pfns(mag, rcache->iovad);


> +	if (mag) {
> +		iova_magazine_free_pfns(mag, rcache->iovad);
> +		iova_magazine_free(mag);
> +		schedule_delayed_work(&rcache->work, IOVA_DEPOT_DELAY);
> +	}
>   }
>   
>   int iova_domain_init_rcaches(struct iova_domain *iovad)
> @@ -752,6 +777,8 @@ int iova_domain_init_rcaches(struct iova_domain *iovad)
>   
>   		rcache = &iovad->rcaches[i];
>   		spin_lock_init(&rcache->lock);
> +		rcache->iovad = iovad;
> +		INIT_DELAYED_WORK(&rcache->work, iova_depot_work_func);
>   		rcache->cpu_rcaches = __alloc_percpu(sizeof(*cpu_rcache),
>   						     cache_line_size());
>   		if (!rcache->cpu_rcaches) {
> @@ -812,6 +839,7 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
>   			spin_lock(&rcache->lock);
>   			iova_depot_push(rcache, cpu_rcache->loaded);
>   			spin_unlock(&rcache->lock);
> +			schedule_delayed_work(&rcache->work, IOVA_DEPOT_DELAY);
>   
>   			cpu_rcache->loaded = new_mag;
>   			can_insert = true;
> @@ -912,6 +940,7 @@ static void free_iova_rcaches(struct iova_domain *iovad)
>   			iova_magazine_free(cpu_rcache->prev);
>   		}
>   		free_percpu(rcache->cpu_rcaches);
> +		cancel_delayed_work_sync(&rcache->work);
>   		while (rcache->depot)
>   			iova_magazine_free(iova_depot_pop(rcache));
>   	}

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ