linux-kernel - Re: [PATCH] LoongArch: module: Optimize module load time by optimizing PLT/GOT counting

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <CAAhV-H6fMgs+OYc5Sosz7ii0o24NoqTGp37yfehuxVpo2ctAQQ@mail.gmail.com>
Date: Sun, 17 Aug 2025 11:42:45 +0800
From: Huacai Chen <chenhuacai@...nel.org>
To: Kanglong Wang <wangkanglong@...ngson.cn>
Cc: loongarch@...ts.linux.dev, linux-kernel@...r.kernel.org
Subject: Re: [PATCH] LoongArch: module: Optimize module load time by
 optimizing PLT/GOT counting

Applied, thanks.

Huacai

On Thu, Aug 14, 2025 at 10:28 AM Kanglong Wang <wangkanglong@...ngson.cn> wrote:
>
> When CONFIG_KASAN, CONFIG_PREEMPT_VOLUNTARY_BUILD and
> CONFIG_PREEMPT_VOLUNTARY are enabled at the same time, a soft deadlock
> occurs. The relevant logs are as follows:
>
> rcu: INFO: rcu_sched self-detected stall on CPU
> ...
> Call Trace:
> [<900000000024f9e4>] show_stack+0x5c/0x180
> [<90000000002482f4>] dump_stack_lvl+0x94/0xbc
> [<9000000000224544>] rcu_dump_cpu_stacks+0x1fc/0x280
> [<900000000037ac80>] rcu_sched_clock_irq+0x720/0xf88
> [<9000000000396c34>] update_process_times+0xb4/0x150
> [<90000000003b2474>] tick_nohz_handler+0xf4/0x250
> [<9000000000397e28>] __hrtimer_run_queues+0x1d0/0x428
> [<9000000000399b2c>] hrtimer_interrupt+0x214/0x538
> [<9000000000253634>] constant_timer_interrupt+0x64/0x80
> [<9000000000349938>] __handle_irq_event_percpu+0x78/0x1a0
> [<9000000000349a78>] handle_irq_event_percpu+0x18/0x88
> [<9000000000354c00>] handle_percpu_irq+0x90/0xf0
> [<9000000000348c74>] handle_irq_desc+0x94/0xb8
> [<9000000001012b28>] handle_cpu_irq+0x68/0xa0
> [<9000000001def8c0>] handle_loongarch_irq+0x30/0x48
> [<9000000001def958>] do_vint+0x80/0xd0
> [<9000000000268a0c>] kasan_mem_to_shadow.part.0+0x2c/0x2a0
> [<90000000006344f4>] __asan_load8+0x4c/0x120
> [<900000000025c0d0>] module_frob_arch_sections+0x5c8/0x6b8
> [<90000000003895f0>] load_module+0x9e0/0x2958
> [<900000000038b770>] __do_sys_init_module+0x208/0x2d0
> [<9000000001df0c34>] do_syscall+0x94/0x190
> [<900000000024d6fc>] handle_syscall+0xbc/0x158
>
> Analysis shows that the amdgpu module loads so slowly that it occupies
> the CPU for a long time, which leads to the soft deadlock.
>
> When loading a module, module_frob_arch_sections() tries to figure out
> the number of PLTs/GOTs that'll be needed to handle all the RELAs. It
> calls count_max_entries(), which searches for duplicates in unsorted
> data, so the counting algorithm has O(n^2) complexity.
>
> To make this faster, sort the relocation list by info and addend. That
> way, to check for a duplicate relocation, it just needs to compare with
> the previous entry. This reduces the complexity of the algorithm to O(n
> log n), as done in commit d4e0340919fb ("arm64/module: Optimize module
> load time by optimizing PLT counting"). This gives a significant
> reduction in module load time for modules with a large number of
> relocations.
>
> After applying this patch, the soft deadlock problem has been solved,
> and the kernel starts normally without "Call Trace".
>
> Using the default configuration to test some modules, the results are as
> follows:
>
> Module              Size
> ip_tables           36K
> fat                 143K
> radeon              2.5MB
> amdgpu              16MB
>
> Without this patch:
> Module              Module load time (ms)       Count(PLTs/GOTs)
> ip_tables           18                          59/6
> fat                 0                           162/14
> radeon              54                          1221/84
> amdgpu              1411                        4525/1098
>
> With this patch:
> Module              Module load time (ms)       Count(PLTs/GOTs)
> ip_tables           18                          59/6
> fat                 0                           162/14
> radeon              22                          1221/84
> amdgpu              45                          4525/1098
>
> Fixes: fcdfe9d22bed ("LoongArch: Add ELF and module support")
> Signed-off-by: Kanglong Wang <wangkanglong@...ngson.cn>
> ---
>  arch/loongarch/kernel/module-sections.c | 34 ++++++++++++-------------
>  1 file changed, 16 insertions(+), 18 deletions(-)
>
> diff --git a/arch/loongarch/kernel/module-sections.c b/arch/loongarch/kernel/module-sections.c
> index e2f30ff9afde..3b22b3c1af28 100644
> --- a/arch/loongarch/kernel/module-sections.c
> +++ b/arch/loongarch/kernel/module-sections.c
> @@ -8,6 +8,7 @@
>  #include <linux/module.h>
>  #include <linux/moduleloader.h>
>  #include <linux/ftrace.h>
> +#include <linux/sort.h>
>
>  Elf_Addr module_emit_got_entry(struct module *mod, Elf_Shdr *sechdrs, Elf_Addr val)
>  {
> @@ -61,39 +62,36 @@ Elf_Addr module_emit_plt_entry(struct module *mod, Elf_Shdr *sechdrs, Elf_Addr v
>         return (Elf_Addr)&plt[nr];
>  }
>
> -static int is_rela_equal(const Elf_Rela *x, const Elf_Rela *y)
> -{
> -       return x->r_info == y->r_info && x->r_addend == y->r_addend;
> -}
> +#define cmp_3way(a, b)  ((a) < (b) ? -1 : (a) > (b))
>
> -static bool duplicate_rela(const Elf_Rela *rela, int idx)
> +static int compare_rela(const void *x, const void *y)
>  {
> -       int i;
> +       int ret;
> +       const Elf_Rela *rela_x = x, *rela_y = y;
>
> -       for (i = 0; i < idx; i++) {
> -               if (is_rela_equal(&rela[i], &rela[idx]))
> -                       return true;
> -       }
> +       ret = cmp_3way(rela_x->r_info, rela_y->r_info);
> +       if (ret == 0)
> +               ret = cmp_3way(rela_x->r_addend, rela_y->r_addend);
>
> -       return false;
> +       return ret;
>  }
>
>  static void count_max_entries(Elf_Rela *relas, int num,
>                               unsigned int *plts, unsigned int *gots)
>  {
> -       unsigned int i, type;
> +       unsigned int i;
>
> +       sort(relas, num, sizeof(Elf_Rela), compare_rela, NULL);
>         for (i = 0; i < num; i++) {
> -               type = ELF_R_TYPE(relas[i].r_info);
> -               switch (type) {
> +               if (i > 0 && compare_rela(&relas[i-1], &relas[i]) == 0)
> +                       continue;
> +               switch (ELF_R_TYPE(relas[i].r_info)) {
>                 case R_LARCH_SOP_PUSH_PLT_PCREL:
>                 case R_LARCH_B26:
> -                       if (!duplicate_rela(relas, i))
> -                               (*plts)++;
> +                       (*plts)++;
>                         break;
>                 case R_LARCH_GOT_PC_HI20:
> -                       if (!duplicate_rela(relas, i))
> -                               (*gots)++;
> +                       (*gots)++;
>                         break;
>                 default:
>                         break; /* Do nothing. */
> --
> 2.20.1
>