[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <3689f036-5d48-0e4d-56fc-9d96823b9547@amd.com>
Date: Mon, 21 Aug 2023 13:41:58 +0530
From: "Srivastava, Dheeraj Kumar" <dheerajkumar.srivastava@....com>
To: Robin Murphy <robin.murphy@....com>, joro@...tes.org
Cc: will@...nel.org, iommu@...ts.linux.dev,
linux-kernel@...r.kernel.org, john.g.garry@...cle.com,
zhangzekun11@...wei.com
Subject: Re: [PATCH 1/2] iommu/iova: Make the rcache depot scale better
Hello Robin,
On 8/14/2023 11:23 PM, Robin Murphy wrote:
> The algorithm in the original paper specifies the storage of full
> magazines in the depot as an unbounded list rather than a fixed-size
> array. It turns out to be pretty straightforward to do this in our
> implementation with no significant loss of efficiency. This allows
> the depot to scale up to the working set sizes of larger systems,
> while also potentially saving some memory on smaller ones too.
>
> Signed-off-by: Robin Murphy <robin.murphy@....com>
> ---
> drivers/iommu/iova.c | 65 ++++++++++++++++++++++++--------------------
> 1 file changed, 36 insertions(+), 29 deletions(-)
>
> diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
> index 10b964600948..d2de6fb0e9f4 100644
> --- a/drivers/iommu/iova.c
> +++ b/drivers/iommu/iova.c
> @@ -625,10 +625,16 @@ EXPORT_SYMBOL_GPL(reserve_iova);
> * will be wasted.
> */
> #define IOVA_MAG_SIZE 127
> -#define MAX_GLOBAL_MAGS 32 /* magazines per bin */
>
> struct iova_magazine {
> - unsigned long size;
> + /*
> + * Only full magazines are inserted into the depot, so we can avoid
> + * a separate list head and preserve maximum space-efficiency.
> + */
> + union {
> + unsigned long size;
> + struct iova_magazine *next;
> + };
> unsigned long pfns[IOVA_MAG_SIZE];
> };
>
> @@ -640,8 +646,7 @@ struct iova_cpu_rcache {
>
> struct iova_rcache {
> spinlock_t lock;
> - unsigned long depot_size;
> - struct iova_magazine *depot[MAX_GLOBAL_MAGS];
> + struct iova_magazine *depot;
> struct iova_cpu_rcache __percpu *cpu_rcaches;
> };
>
> @@ -717,6 +722,21 @@ static void iova_magazine_push(struct iova_magazine *mag, unsigned long pfn)
> mag->pfns[mag->size++] = pfn;
> }
>
> +static struct iova_magazine *iova_depot_pop(struct iova_rcache *rcache)
> +{
> + struct iova_magazine *mag = rcache->depot;
> +
> + rcache->depot = mag->next;
While doing routine domain change test for a device ("unbind device from
driver -> change domain of the device -> bind device back to the
driver"), i ran into the following NULL pointer dereferencing issue.
[ 599.020261] BUG: kernel NULL pointer dereference, address:
0000000000000000
[ 599.020986] #PF: supervisor read access in kernel mode
[ 599.021588] #PF: error_code(0x0000) - not-present page
[ 599.022180] PGD 0 P4D 0
[ 599.022770] Oops: 0000 [#1] PREEMPT SMP NOPTI
[ 599.023365] CPU: 68 PID: 3122 Comm: avocado Not tainted
6.5.0-rc6-ChngDomainIssue #16
[ 599.023970] Hardware name: Dell Inc. PowerEdge R6515/07PXPY, BIOS
2.3.6 07/06/2021
[ 599.024571] RIP: 0010:free_iova_rcaches+0x9c/0x110
[ 599.025170] Code: d1 ff 39 05 36 d2 bc 01 48 89 c3 77 b4 49 8b 7f 10
e8 b8 69 93 ff 49 8d 7f 20 e8 6f e4 6b ff eb 05 e8 48 ba 93 ff 49 8b 7f
08 <48> 8b 07 49 89 47 08 48 c7 07 7f 00 00 00 41 83 6f 04 01 48 85 ff
[ 599.026436] RSP: 0018:ffffb78b4c9f7c68 EFLAGS: 00010296
[ 599.027075] RAX: ffffffff9fa8c100 RBX: 0000000000000080 RCX:
0000000000000005
[ 599.027719] RDX: 0000000000000000 RSI: 000000007fffffff RDI:
0000000000000000
[ 599.028359] RBP: ffffb78b4c9f7c98 R08: 0000000000000000 R09:
0000000000000006
[ 599.028995] R10: 0000000000000000 R11: 0000000000000000 R12:
0000000000000000
[ 599.029636] R13: ffff93910d9c6008 R14: ffffd78b3bde1000 R15:
ffff9391144ebc00
[ 599.030283] FS: 00007fa5c9e5c000(0000) GS:ffff93cf72700000(0000)
knlGS:0000000000000000
[ 599.030941] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 599.031588] CR2: 0000000000000000 CR3: 000000013f526006 CR4:
0000000000770ee0
[ 599.032237] PKRU: 55555554
[ 599.032878] Call Trace:
[ 599.033512] <TASK>
[ 599.034140] ? show_regs+0x6e/0x80
[ 599.034769] ? __die+0x29/0x70
[ 599.035393] ? page_fault_oops+0x154/0x4a0
[ 599.036021] ? __x86_return_thunk+0x9/0x10
[ 599.036647] ? do_user_addr_fault+0x318/0x6b0
[ 599.037258] ? __x86_return_thunk+0x9/0x10
[ 599.037866] ? __slab_free+0xc7/0x320
[ 599.038472] ? exc_page_fault+0x7d/0x190
[ 599.039074] ? asm_exc_page_fault+0x2b/0x30
[ 599.039683] ? free_iova_rcaches+0x9c/0x110
[ 599.040286] ? free_iova_rcaches+0x91/0x110
[ 599.040875] ? __x86_return_thunk+0x9/0x10
[ 599.041460] put_iova_domain+0x32/0xa0
[ 599.042041] iommu_put_dma_cookie+0x177/0x1b0
[ 599.042620] iommu_domain_free+0x1f/0x50
[ 599.043194] iommu_setup_default_domain+0x2fb/0x420
[ 599.043774] iommu_group_store_type+0xb6/0x210
[ 599.044362] iommu_group_attr_store+0x21/0x40
[ 599.044938] sysfs_kf_write+0x42/0x50
[ 599.045511] kernfs_fop_write_iter+0x143/0x1d0
[ 599.046084] vfs_write+0x2c2/0x3f0
[ 599.046653] ksys_write+0x6b/0xf0
[ 599.047219] __x64_sys_write+0x1d/0x30
[ 599.047782] do_syscall_64+0x60/0x90
[ 599.048346] ? syscall_exit_to_user_mode+0x2a/0x50
[ 599.048911] ? __x64_sys_lseek+0x1c/0x30
[ 599.049465] ? __x86_return_thunk+0x9/0x10
[ 599.050013] ? do_syscall_64+0x6d/0x90
[ 599.050562] ? do_syscall_64+0x6d/0x90
[ 599.051098] ? do_syscall_64+0x6d/0x90
[ 599.051625] ? __x86_return_thunk+0x9/0x10
[ 599.052149] ? exc_page_fault+0x8e/0x190
[ 599.052665] entry_SYSCALL_64_after_hwframe+0x73/0xdd
[ 599.053180] RIP: 0033:0x7fa5c9d14a6f
[ 599.053670] Code: 89 54 24 18 48 89 74 24 10 89 7c 24 08 e8 19 c0 f7
ff 48 8b 54 24 18 48 8b 74 24 10 41 89 c0 8b 7c 24 08 b8 01 00 00 00 0f
05 <48> 3d 00 f0 ff ff 77 31 44 89 c7 48 89 44 24 08 e8 5c c0 f7 ff 48
[ 599.054672] RSP: 002b:00007ffdeb05f540 EFLAGS: 00000293 ORIG_RAX:
0000000000000001
[ 599.055182] RAX: ffffffffffffffda RBX: 0000557de3275a80 RCX:
00007fa5c9d14a6f
[ 599.055692] RDX: 0000000000000003 RSI: 0000557de418ff60 RDI:
0000000000000011
[ 599.056203] RBP: 0000557de39a25e0 R08: 0000000000000000 R09:
0000000000000000
[ 599.056718] R10: 0000000000000000 R11: 0000000000000293 R12:
0000000000000003
[ 599.057227] R13: 00007fa5c9e5bf80 R14: 0000000000000011 R15:
0000557de418ff60
[ 599.057738] </TASK>
[ 599.058227] Modules linked in: xt_CHECKSUM xt_MASQUERADE xt_conntrack
ipt_REJECT nf_reject_ipv4 xt_tcpudp nft_compat nft_chain_nat nf_nat
nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 nf_tables nfnetlink bridge
stp llc ipmi_ssif binfmt_misc nls_iso8859_1 intel_rapl_msr
intel_rapl_common amd64_edac edac_mce_amd kvm_amd kvm dell_smbios dcdbas
rapl dell_wmi_descriptor wmi_bmof joydev input_leds ccp ptdma k10temp
acpi_ipmi ipmi_si acpi_power_meter mac_hid sch_fq_codel dm_multipath
scsi_dh_rdac ipmi_devintf scsi_dh_emc ipmi_msghandler scsi_dh_alua msr
ramoops reed_solomon pstore_blk pstore_zone efi_pstore ip_tables
x_tables autofs4 hid_generic usbhid hid btrfs blake2b_generic raid10
raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor
raid6_pq libcrc32c raid1 raid0 multipath linear mgag200 i2c_algo_bit
drm_shmem_helper drm_kms_helper crct10dif_pclmul crc32_pclmul
ghash_clmulni_intel sha512_ssse3 aesni_intel crypto_simd cryptd nvme drm
mpt3sas tg3 ahci nvme_core libahci xhci_pci raid_class i2c_piix4
[ 599.058423] xhci_pci_renesas scsi_transport_sas wmi
[ 599.064210] CR2: 0000000000000000
[ 599.064841] ---[ end trace 0000000000000000 ]---
--
Looking at the RIP: free_iova_rcaches+0x9c/0x110 pointed me to the above
line leading me to believe we are popping element from an empty stack.
Following hunk fixed the issue for me:
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 76a7d694708e..899f1c2ba62a 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -732,6 +732,9 @@ static struct iova_magazine *iova_depot_pop(struct
iova_rcache *rcache)
{
struct iova_magazine *mag = rcache->depot;
+ if (!mag)
+ return NULL;
+
rcache->depot = mag->next;
mag->size = IOVA_MAG_SIZE;
rcache->depot_size--;
--
> + mag->size = IOVA_MAG_SIZE;
> + return mag;
> +}
> +
--
Thanks and Regards
Dheeraj Kumar Srivastava
Powered by blists - more mailing lists