[<prev] [next>] [day] [month] [year] [list]
Message-Id: <20250814022810.2917-1-wangkanglong@loongson.cn>
Date: Thu, 14 Aug 2025 10:28:10 +0800
From: Kanglong Wang <wangkanglong@...ngson.cn>
To: Huacai Chen <chenhuacai@...nel.org>
Cc: loongarch@...ts.linux.dev,
linux-kernel@...r.kernel.org
Subject: [PATCH] LoongArch: module: Optimize module load time by optimizing PLT/GOT counting
When enabling CONFIG_KASAN, CONFIG_PREEMPT_VOLUNTARY_BUILD and
CONFIG_PREEMPT_VOLUNTARY at the same time, there will be soft deadlock,
the relevant logs are as follows:
rcu: INFO: rcu_sched self-detected stall on CPU
...
Call Trace:
[<900000000024f9e4>] show_stack+0x5c/0x180
[<90000000002482f4>] dump_stack_lvl+0x94/0xbc
[<9000000000224544>] rcu_dump_cpu_stacks+0x1fc/0x280
[<900000000037ac80>] rcu_sched_clock_irq+0x720/0xf88
[<9000000000396c34>] update_process_times+0xb4/0x150
[<90000000003b2474>] tick_nohz_handler+0xf4/0x250
[<9000000000397e28>] __hrtimer_run_queues+0x1d0/0x428
[<9000000000399b2c>] hrtimer_interrupt+0x214/0x538
[<9000000000253634>] constant_timer_interrupt+0x64/0x80
[<9000000000349938>] __handle_irq_event_percpu+0x78/0x1a0
[<9000000000349a78>] handle_irq_event_percpu+0x18/0x88
[<9000000000354c00>] handle_percpu_irq+0x90/0xf0
[<9000000000348c74>] handle_irq_desc+0x94/0xb8
[<9000000001012b28>] handle_cpu_irq+0x68/0xa0
[<9000000001def8c0>] handle_loongarch_irq+0x30/0x48
[<9000000001def958>] do_vint+0x80/0xd0
[<9000000000268a0c>] kasan_mem_to_shadow.part.0+0x2c/0x2a0
[<90000000006344f4>] __asan_load8+0x4c/0x120
[<900000000025c0d0>] module_frob_arch_sections+0x5c8/0x6b8
[<90000000003895f0>] load_module+0x9e0/0x2958
[<900000000038b770>] __do_sys_init_module+0x208/0x2d0
[<9000000001df0c34>] do_syscall+0x94/0x190
[<900000000024d6fc>] handle_syscall+0xbc/0x158
After analysis, because the slow speed of loading the amdgpu module
leads to the long time occupation of the cpu and the soft deadlock.
When loading a module, module_frob_arch_sections() tries to figure out
the number of PLTs/GOTs that'll be needed to handle all the RELAs. It
will call the count_max_entries() to find in an out-of-order date which
counting algorithm has O(n^2) complexity.
To make faster, it sorts the relocation list by info and addend. That
way, to check for a duplicate relocation, it just needs to compare with
the previous entry. This reduces the complexity of the algorithm to O(n
log n), as done in commit d4e0340919fb ("arm64/module: Optimize module
load time by optimizing PLT counting"). This gives sinificant reduction
in module load time for modules with large number of relocations.
After applying this patch, the soft deadlock problem has been solved,
and the kernel starts normally without "Call Trace".
Using the default configuration to test some modules, the results are as
follows:
Module Size
ip_tables 36K
fat 143K
radeon 2.5MB
amdgpu 16MB
Without this patch:
Module Module load time (ms) Count(PLTs/GOTs)
ip_tables 18 59/6
fat 0 162/14
radeon 54 1221/84
amdgpu 1411 4525/1098
With this patch:
Module Module load time (ms) Count(PLTs/GOTs)
ip_tables 18 59/6
fat 0 162/14
radeon 22 1221/84
amdgpu 45 4525/1098
Fixes: fcdfe9d22bed ("LoongArch: Add ELF and module support")
Signed-off-by: Kanglong Wang <wangkanglong@...ngson.cn>
---
arch/loongarch/kernel/module-sections.c | 34 ++++++++++++-------------
1 file changed, 16 insertions(+), 18 deletions(-)
diff --git a/arch/loongarch/kernel/module-sections.c b/arch/loongarch/kernel/module-sections.c
index e2f30ff9afde..3b22b3c1af28 100644
--- a/arch/loongarch/kernel/module-sections.c
+++ b/arch/loongarch/kernel/module-sections.c
@@ -8,6 +8,7 @@
#include <linux/module.h>
#include <linux/moduleloader.h>
#include <linux/ftrace.h>
+#include <linux/sort.h>
Elf_Addr module_emit_got_entry(struct module *mod, Elf_Shdr *sechdrs, Elf_Addr val)
{
@@ -61,39 +62,36 @@ Elf_Addr module_emit_plt_entry(struct module *mod, Elf_Shdr *sechdrs, Elf_Addr v
return (Elf_Addr)&plt[nr];
}
-static int is_rela_equal(const Elf_Rela *x, const Elf_Rela *y)
-{
- return x->r_info == y->r_info && x->r_addend == y->r_addend;
-}
+#define cmp_3way(a, b) ((a) < (b) ? -1 : (a) > (b))
-static bool duplicate_rela(const Elf_Rela *rela, int idx)
+static int compare_rela(const void *x, const void *y)
{
- int i;
+ int ret;
+ const Elf_Rela *rela_x = x, *rela_y = y;
- for (i = 0; i < idx; i++) {
- if (is_rela_equal(&rela[i], &rela[idx]))
- return true;
- }
+ ret = cmp_3way(rela_x->r_info, rela_y->r_info);
+ if (ret == 0)
+ ret = cmp_3way(rela_x->r_addend, rela_y->r_addend);
- return false;
+ return ret;
}
static void count_max_entries(Elf_Rela *relas, int num,
unsigned int *plts, unsigned int *gots)
{
- unsigned int i, type;
+ unsigned int i;
+ sort(relas, num, sizeof(Elf_Rela), compare_rela, NULL);
for (i = 0; i < num; i++) {
- type = ELF_R_TYPE(relas[i].r_info);
- switch (type) {
+ if (i > 0 && compare_rela(&relas[i-1], &relas[i]) == 0)
+ continue;
+ switch (ELF_R_TYPE(relas[i].r_info)) {
case R_LARCH_SOP_PUSH_PLT_PCREL:
case R_LARCH_B26:
- if (!duplicate_rela(relas, i))
- (*plts)++;
+ (*plts)++;
break;
case R_LARCH_GOT_PC_HI20:
- if (!duplicate_rela(relas, i))
- (*gots)++;
+ (*gots)++;
break;
default:
break; /* Do nothing. */
--
2.20.1
Powered by blists - more mailing lists