lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAB=BE-S=cxewXu7nqJY1DC5w9Bapar_C0cTjpZOQ-Qd5GGwYyw@mail.gmail.com>
Date:   Tue, 30 May 2023 14:18:40 -0700
From:   Sandeep Dhavale <dhavale@...gle.com>
To:     Tejun Heo <tj@...nel.org>
Cc:     jiangshanlai@...il.com, torvalds@...ux-foundation.org,
        peterz@...radead.org, linux-kernel@...r.kernel.org,
        kernel-team@...a.com, joshdon@...gle.com, brho@...gle.com,
        briannorris@...omium.org, nhuck@...gle.com, agk@...hat.com,
        snitzer@...nel.org, void@...ifault.com, kernel-team@...roid.com
Subject: Re: [PATCH 14/24] workqueue: Generalize unbound CPU pods

Hi Tejun,

> @@ -6234,6 +6256,7 @@ static inline void wq_watchdog_init(void) { }
>   */
>  void __init workqueue_init_early(void)
>  {
> +       struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_SYSTEM];
>         int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
>         int i, cpu;
>
> @@ -6248,6 +6271,22 @@ void __init workqueue_init_early(void)
>         wq_update_pod_attrs_buf = alloc_workqueue_attrs();
>         BUG_ON(!wq_update_pod_attrs_buf);
>
> +       /* initialize WQ_AFFN_SYSTEM pods */
> +       pt->pod_cpus = kcalloc(1, sizeof(pt->pod_cpus[0]), GFP_KERNEL);
> +       pt->pod_node = kcalloc(1, sizeof(pt->pod_node[0]), GFP_KERNEL);
> +       pt->cpu_pod = kcalloc(nr_cpu_ids, sizeof(pt->cpu_pod[0]), GFP_KERNEL);
> +       BUG_ON(!pt->pod_cpus || !pt->pod_node || !pt->cpu_pod);
> +
> +       BUG_ON(!zalloc_cpumask_var_node(&pt->pod_cpus[0], GFP_KERNEL, NUMA_NO_NODE));
> +
> +       wq_update_pod_attrs_buf = alloc_workqueue_attrs();
> +       BUG_ON(!wq_update_pod_attrs_buf);
> +

It looks like wq_update_pod_attrs_buf is already allocated in the
preceding code block, so this second alloc_workqueue_attrs() call is
redundant and overwrites (leaks) the first buffer.

I am trying to evaluate this series to see if it helps with the
scheduling delays we have seen in EROFS.
In addition to the panic and fix reported by Prateek [0], I am seeing
stability issues that occur only with the series applied.
I am testing on a Pixel 6 with the android-mainline kernel [1].

The panic seems to occur in the context of a kworker for the events_unbound wq.
The only significant change made directly to the events_unbound wq was in patch [2]:

@@ -6399,7 +6335,7 @@ void __init workqueue_init_early(void)
  system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
  system_long_wq = alloc_workqueue("events_long", 0, 0);
  system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
-    WQ_UNBOUND_MAX_ACTIVE);
+    WQ_MAX_ACTIVE);
  system_freezable_wq = alloc_workqueue("events_freezable",
       WQ_FREEZABLE, 0);
  system_power_efficient_wq = alloc_workqueue("events_power_efficient",

Panic log:
[  316.386684][  T115] Unable to handle kernel paging request at virtual address ffffffd2745a0160
[  316.386936][  T115] Mem abort info:
[  316.387027][  T115]   ESR = 0x0000000096000007
[  316.387137][  T115]   EC = 0x25: DABT (current EL), IL = 32 bits
[  316.387284][  T115]   SET = 0, FnV = 0
[  316.387378][  T115]   EA = 0, S1PTW = 0
[  316.387475][  T115]   FSC = 0x07: level 3 translation fault
[  316.387606][  T115] Data abort info:
[  316.387694][  T115]   ISV = 0, ISS = 0x00000007
[  316.387804][  T115]   CM = 0, WnR = 0
[  316.387897][  T115] swapper pgtable: 4k pages, 39-bit VAs,
pgdp=0000000081dec000
[  316.388071][  T115] [ffffffd2745a0160] pgd=10000009d83ff003,
p4d=10000009d83ff003, pud=10000009d83ff003, pmd=10000009d83fb003,
pte=0000000000000000
[  316.388491][  T115] Internal error: Oops: 0000000096000007 [#1] PREEMPT SMP
[  316.388765][  T115] debug-snapshot dss: core register saved(CPU:2)
[  316.388993][  T115] debug-snapshot dss: ECC error check erridr_el1.num = 0x2
[  316.389260][  T115] debug-snapshot dss: ERRSELR_EL1.SEL = 0, NOT
Error, ERXSTATUS_EL1 = 0x0
[  316.389578][  T115] debug-snapshot dss: ERRSELR_EL1.SEL = 1, NOT
Error, ERXSTATUS_EL1 = 0x0
[  316.389898][  T115] debug-snapshot dss: context saved(CPU:2)
[  316.390112][  T115] item - log_kevents is disabled
[  316.390300][  T115] Modules linked in: sec_touch(OE) ftm5(OE)
bcmdhd4389(OE) goog_touch_interface(OE) snd_soc_cs40l2x(OE)
haptics_cs40l2x(OE) google_dock(OE) lwis(OE) panel_boe_nt37290(OE)
panel_samsung_s6e3hc4(OE) panel_samsung_s6e3hc3_c10(OE)
panel_samsung_s6e3fc3_p10(OE) stmvl53l1(OE) slg51000_core(OE)
slg51000_regulator(OE) pinctrl_slg51000(OE) nfc mac802154
ieee802154_socket ieee802154_6lowpan ieee802154 nhc_udp nhc_routing
nhc_mobility nhc_ipv6 nhc_hop nhc_fragment nhc_dest 6lowpan diag tipc
mac80211 l2tp_ppp l2tp_core hidp rfcomm can_gw can_bcm can_raw can
cfg80211 8021q btsdio hci_uart btqca btbcm bluetooth ftdi_sio
usbserial cdc_acm r8153_ecm aqc111 cdc_ncm cdc_eem cdc_ether
ax88179_178a asix usbnet r8152 rtl8150 pptp pppox ppp_mppe ppp_deflate
bsd_comp ppp_generic slhc slcan vcan can_dev mii libarc4 bigocean(OE)
st33spi(OE) st54spi(OE) st21nfc(OE) nitrous(OE) rfkill
exynos_reboot(OE) heatmap(OE) touch_bus_negotiator(OE)
touch_offload(OE) aoc_alsa_dev(OE) aoc_alsa_dev_util(OE)
aoc_uwb_platform_drv(OE)
[  316.390708][  T115]  aoc_uwb_service_dev(OE) aoc_channel_dev(OE)
aoc_control_dev(OE) aoc_char_dev(OE) aoc_core(OE) mailbox_wc(OE)
audiometrics(OE) snd_soc_cs35l41_i2c(OE) snd_soc_cs35l41_spi(OE)
snd_soc_cs35l41(OE) snd_soc_wm_adsp(OE) max20339(OE) pca9468(OE)
p9221(OE) max77759_charger(OE) max77729_charger(OE) max77729_uic(OE)
max77729_pmic(OE) max1720x_battery(OE) overheat_mitigation(OE)
google_cpm(OE) google_dual_batt_gauge(OE) google_charger(OE)
google_battery(OE) google_bms(OE) abrolhos(OE) mali_kbase(OE)
mali_pixel(OE) panel_samsung_s6e3hc3(OE) panel_samsung_sofef01(OE)
panel_samsung_s6e3fc3(OE) panel_samsung_s6e3hc2(OE)
panel_samsung_emul(OE) panel_samsung_drv(OE) exynos_drm(OE)
arm_memlat_mon(OE) governor_memlat(OE) memlat_devfreq(OE)
exynos_acme(OE) s3c2410_wdt(OE) trusty_virtio(OE) trusty_test(OE)
trusty_log(OE) trusty_irq(OE) gs101_spmic_thermal(OE) gpu_cooling(OE)
debug_reboot(OE) smfc(OE) exynos_mfc(OE) i2c_exynos5(OE)
rtc_s2mpg10(OE) keycombo(OE) goodixfp(OE) usbc_cooling_dev(OE)
tcpci_max77759(OE)
[  316.393987][  T115]  max77759_contaminant(OE) bc_max77759(OE)
max77759_helper(OE) tcpci_fusb307(OE) slg46826(OE) usb_psy(OE)
usb_f_dm1(OE) usb_f_dm(OE) xhci_exynos(OE) ufs_exynos_gs(OE)
s2mpg1x_gpio(OE) bcm47765(OE) sscoredump(OE) sbb_mux(OE) gsc_spi(OE)
g2d(OE) samsung_iommu(OE) samsung_iommu_group(OE) exyswd_rng(OE)
exynos_tty(OE) max77826_gs_regulator(OE) boot_control_sysfs(OE)
exynos_seclog(OE) dbgcore_dump(OE) pixel_stat_mm(OE)
pixel_stat_sysfs(OE) sysrq_hook(OE) hardlockup_debug(OE) eh(OE)
cp_thermal_zone(OE) cpif(OE) bts(OE) exynos_dit(OE) cpif_page(OE)
boot_device_spi(OE) bcm_dbg(OE) exynos_bcm_dbg_dump(OE) gsa_gsc(OE)
slc_acpm(OE) slc_pmon(OE) slc_dummy(OE) acpm_mbox_test(OE)
exynos_devfreq(OE) exynos_dm(OE) slc_pt(OE) power_stats(OE)
exynos_pd_dbg(OE) pixel_em(OE) gs_thermal(OE) google_bcl(OE)
i2c_acpm(OE) s2mpg11_regulator(OE) s2mpg10_regulator(OE) odpm(OE)
s2mpg10_powermeter(OE) s2mpg10_mfd(OE) s2mpg11_powermeter(OE)
pmic_class(OE) s2mpg11_mfd(OE) exynos_cpuhp(OE) pixel_boot_metrics(OE)
exynos_adv_tracer_s2d(OE)
[  316.397483][  T115]  keydebug(OE) exynos_coresight_etm(OE)
exynos_ecc_handler(OE) exynos_coresight(OE) exynos_debug_test(OE)
pixel_debug_test(OE) ehld(OE) sjtag_driver(OE) exynos_adv_tracer(OE)
gsa(OE) trusty_ipc(OE) samsung_dma_heap(OE) trusty_core(OE)
samsung_secure_iova(OE) deferred_free_helper(OE) page_pool(OE)
hardlockup_watchdog(OE) debug_snapshot_debug_kinfo(OE)
debug_snapshot_qd(OE) debug_snapshot_sfrdump(OE) exynos_pd(OE)
dwc3_exynos_usb(OE) gvotable(OE) clk_exynos_gs(OE) pcie_exynos_gs(OE)
exynos_pm(OE) acpm_flexpmu_dbg(OE) pcie_exynos_gs101_rc_cal(OE)
shm_ipc(OE) spi_s3c64xx(OE) samsung_dma(OE) pl330(OE) s2mpu(OE)
logbuffer(OE) itmon(OE) exynos_cpupm(OE) exynos_mct(OE) cmupmucal(OE)
exynos_pm_qos(OE) gs_acpm(OE) kernel_top(OE) dss(OE)
pixel_suspend_diag(OE) systrace(OE) ect_parser(OE) gs_chipid(OE)
pinctrl_exynos_gs(OE) phy_exynos_mipi(OE) phy_exynos_mipi_dsim(OE)
exynos_pmu_if(OE) phy_exynos_usbdrd_super(OE) exynos_pd_el3(OE)
arm_dsu_pmu(E) softdog(E) pps_gpio(E) i2c_dev(E) spidev(E) sg(E)
at24(E) zram zsmalloc
[  316.404101][  T115] CPU: 2 PID: 115 Comm: kworker/u24:2 Tainted: G
      W  OE      6.3.0-mainline-maybe-dirty #1
[  316.404491][  T115] Hardware name: Oriole DVT (DT)
[  316.404678][  T115] Workqueue: events_unbound idle_cull_fn
[  316.404882][  T115] pstate: 804000c5 (Nzcv daIF +PAN -UAO -TCO -DIT
-SSBS BTYPE=--)
[  316.405176][  T115] pc : available_idle_cpu+0x20/0x60
[  316.405368][  T115] lr : select_task_rq_fair+0x1d0/0x17d8
[  316.405574][  T115] sp : ffffffc008dfbb40
[  316.405728][  T115] x29: ffffffc008dfbc10 x28: 0000000000000000
x27: 0000000000000008
[  316.406028][  T115] x26: 0000000000000000 x25: 0000000000000001
x24: 0000000000000008
[  316.406323][  T115] x23: 0000000000000000 x22: 0000000000000400
x21: 0000000000000000
[  316.406623][  T115] x20: 0000000000000008 x19: ffffff8800812380
x18: ffffffc008cdf040
[  316.406925][  T115] x17: 00000000aa3494c0 x16: 00000000aa3494c0
x15: 0000000000019ed5
[  316.407221][  T115] x14: 0000000000000001 x13: 000000000001a2d5
x12: 0000000000000010
[  316.407521][  T115] x11: 0000000000000400 x10: de8448a6b7c5d500 x9 : ffffffd27459f6c0
[  316.407822][  T115] x8 : ffffffd27459f6c0 x7 : 0000000000008080 x6 : 0000000000000000
[  316.408118][  T115] x5 : ffffff894f35c590 x4 : 0000646e756f626e x3 : 0000000000000008
[  316.408418][  T115] x2 : 0000000000000001 x1 : ffffff8800812380 x0 : 0000000000000008
[  316.408724][  T115] Call trace:
[  316.408842][  T115]  available_idle_cpu+0x20/0x60
[  316.409020][  T115]  try_to_wake_up+0x4ec/0x85c
[  316.409190][  T115]  wake_up_process+0x18/0x28
[  316.409359][  T115]  wake_dying_workers+0x5c/0xe8
[  316.409539][  T115]  idle_cull_fn+0xdc/0x11c
[  316.409705][  T115]  process_scheduled_works+0x208/0x45c
[  316.409905][  T115]  worker_thread+0x22c/0x31c
[  316.410074][  T115]  kthread+0x114/0x1c0
[  316.410229][  T115]  ret_from_fork+0x10/0x20
[  316.410399][  T115] Code: b00105c9 911b0129 f8605908 8b090108 (f9455109)
[  316.410651][  T115] ---[ end trace 0000000000000000 ]---
[  316.410853][  T115] Kernel panic - not syncing: Oops: Fatal exception
[  316.411097][  T115] SMP: stopping secondary CPUs

Do you think the change in patch [2] could be related?

Thanks,
Sandeep.

[0] https://lore.kernel.org/all/30625cdd-4d61-594b-8db9-6816b017dde3@amd.com/
[1] https://android.googlesource.com/kernel/common/+/refs/heads/android-mainline
[2] https://lore.kernel.org/all/20230519001709.2563-10-tj@kernel.org/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ