Message-ID: <BY3PR18MB4612EBA035589E65F4E249BFAB969@BY3PR18MB4612.namprd18.prod.outlook.com>
Date: Thu, 28 Jul 2022 18:02:48 +0000
From: Manish Chopra <manishc@...vell.com>
To: David Christensen <drc@...ux.vnet.ibm.com>,
"davem@...emloft.net" <davem@...emloft.net>,
Ariel Elior <aelior@...vell.com>,
Sudarsana Reddy Kalluru <skalluru@...vell.com>,
"GR-everest-linux-l2@...vell.com" <GR-everest-linux-l2@...vell.com>
CC: Netdev <netdev@...r.kernel.org>
Subject: RE: EEH Error With NAPI_STATE_SCHED Set in BNX2X Results in RTNL Held
While Sleeping
> -----Original Message-----
> From: David Christensen <drc@...ux.vnet.ibm.com>
> Sent: Tuesday, July 26, 2022 3:43 AM
> To: davem@...emloft.net; Ariel Elior <aelior@...vell.com>; Sudarsana
> Reddy Kalluru <skalluru@...vell.com>; GR-everest-linux-l2@...vell.com
> Cc: Netdev <netdev@...r.kernel.org>
> Subject: EEH Error With NAPI_STATE_SCHED Set in BNX2X Results in RTNL
> Held While Sleeping
>
> While performing EEH error recovery tests, our internal test team
> encountered an issue where the bnx2x driver holds the RTNL lock while
> sleeping in napi_disable() on a SLES 15 SP3 system (kernel
> 5.3.18-150300.59.68) running network stress on multiple adapters, even
> with a previous fix ("bnx2x: fix napi API usage sequence") applied to
> the bnx2x driver.
>
> crash> foreach UN bt
> PID: 213 TASK: c000000004ba9f80 CPU: 9 COMMAND: "eehd"
> #0 [c000000004c277e0] __schedule at c000000000c756a8
> #1 [c000000004c278b0] schedule at c000000000c75d80
> #2 [c000000004c278e0] schedule_timeout at c000000000c7bc8c
> #3 [c000000004c279c0] msleep at c000000000212bec
> #4 [c000000004c279f0] napi_disable at c000000000a0b2e8
> #5 [c000000004c27a30] bnx2x_netif_stop at c0080000015cbbf4 [bnx2x]
> #6 [c000000004c27a60] bnx2x_io_slot_reset at c00800000159555c [bnx2x]
> #7 [c000000004c27b20] eeh_report_reset at c00000000004ca3c
> #8 [c000000004c27b90] eeh_pe_report at c00000000004d228
> #9 [c000000004c27c40] eeh_handle_normal_event at c00000000004dae4
> #10 [c000000004c27d00] eeh_event_handler at c00000000004e3b0
> #11 [c000000004c27db0] kthread at c000000000175f9c
> #12 [c000000004c27e20] ret_from_kernel_thread at c00000000000cda8
>
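> For reference, napi_disable() can sleep forever here because it
> busy-waits for NAPI_STATE_SCHED to be clear. Paraphrasing
> net/core/dev.c from this kernel generation (worth checking against the
> exact SLES tree):
>
> void napi_disable(struct napi_struct *n)
> {
>         might_sleep();
>         set_bit(NAPI_STATE_DISABLE, &n->state);
>
>         /* Spins (msleep) until it wins NAPI_STATE_SCHED, which never
>          * happens if a scheduled poll can no longer run after the EEH
>          * error froze the device.
>          */
>         while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
>                 msleep(1);
>         while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
>                 msleep(1);
>
>         hrtimer_cancel(&n->timer);
>
>         clear_bit(NAPI_STATE_DISABLE, &n->state);
> }
>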
> What's interesting to me in this case is that the failure only occurs
> when stress testing multiple adapters simultaneously and injecting the
> EEH error into an upstream PCIe switch. If errors are injected into
> the switch while running stress tests on a single NIC, no problem is
> observed.
>
> Reviewing the bnx2x driver code did not reveal any obvious problems,
> but reviewing the NAPI state machine, it appears there is no practical
> method for a driver to abort a scheduled poll (i.e. one with
> NAPI_STATE_SCHED set), so there is no obvious mechanism for the driver
> to recover in this situation, where the NIC registers are inaccessible
> due to the EEH error.
>
> My attempt to introduce such a mechanism, shown below, hasn't resolved
> the problem: the NIC receives a hardware interrupt while the driver is
> initializing, which results in a NULL pointer dereference, suggesting
> I've missed some required step.
>
> /* Cancel a scheduled-but-unrunnable poll: flush GRO state, unhook the
>  * instance from the poll list, and clear SCHED/MISSED so napi_disable()
>  * can make progress. Returns true if a poll was actually pending.
>  */
> bool napi_err_cancel(struct napi_struct *n)
> {
>         unsigned long flags, new, val;
>
>         if (n->gro_bitmask)
>                 napi_gro_flush(n, true);
>
>         gro_normal_list(n);
>
>         if (unlikely(!list_empty(&n->poll_list))) {
>                 /* If n->poll_list is not empty, we need to mask irqs */
>                 local_irq_save(flags);
>                 list_del_init(&n->poll_list);
>                 local_irq_restore(flags);
>         }
>
>         do {
>                 val = READ_ONCE(n->state);
>                 WARN_ON(test_bit(NAPI_STATE_DISABLE, &val));
>
>                 new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
>         } while (cmpxchg(&n->state, val, new) != val);
>
>         return !!(val & NAPIF_STATE_SCHED);
> }
> EXPORT_SYMBOL(napi_err_cancel);
>
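> For context, this is roughly how I intended to call it from the EEH
> path, before bnx2x_netif_stop()/napi_disable(). The helper below is
> hypothetical (it is not in the driver today), but it only uses the
> driver's existing iteration macros:
>
> static void bnx2x_cancel_pending_napi(struct bnx2x *bp)
> {
>         int i;
>
>         for_each_eth_queue(bp, i) {
>                 struct bnx2x_fastpath *fp = &bp->fp[i];
>
>                 /* Drop any poll that can never run now that the device
>                  * registers are inaccessible, so a later napi_disable()
>                  * won't spin on NAPI_STATE_SCHED.
>                  */
>                 if (napi_err_cancel(&fp->napi))
>                         BNX2X_ERR("canceled stuck NAPI on fp[%d]\n", i);
>         }
> }
>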
> My question: is a new function required to abort a scheduled poll, and
> if so, what else is required? I reviewed several other drivers that
> support EEH and did not see any that appear to handle this situation.
>
> Dave
>
> P.S. A few additional details on the failure.
>
> crash> net
> NET_DEVICE NAME IP ADDRESS(ES)
> c000000ef3bd3000 lo 127.0.0.1
> c000000eef1f3000 eth0 9.3.233.22
> c000000002490000 eth8 106.1.233.22 (tg3)
> c000000002498000 eth1 107.1.233.22 (bnx2x)
> c0000000024a4000 eth9 106.2.233.22 (tg3)
> c0000000024b4000 eth10 109.1.233.22 (tg3)
> c000000002494000 eth5 110.1.233.22 (e1000e)
> c0000000024a8000 eth2 107.2.233.22 (bnx2x)
> c0000000024b8000 eth6 109.2.233.22 (tg3)
> c0000000024bc000 eth7 110.2.233.22 (e1000e)
> c000000eb3000000 eth16 111.1.233.22 (mlx5)
> c000000eab100000 eth15 111.2.233.22 (mlx5)
>
> # Get address of bnx2x structure for eth1
> crash> struct -ox net_device
> struct net_device {
> ...
> }
> SIZE: 0x980
>
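> # netdev_priv() data starts right after the aligned net_device, which
> # is why eth1's bnx2x struct lands at c000000002498000 + 0x980 =
> # c000000002498980. For reference (include/linux/netdevice.h):
>
> static inline void *netdev_priv(const struct net_device *dev)
> {
>         return (char *)dev + ALIGN(sizeof(struct net_device), NETDEV_ALIGN);
> }
>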
> crash> struct -x bnx2x c000000002498980
> struct bnx2x {
> fp = 0xc000000002520000,
> ...
> }
>
> # Observe NAPI struct in per-queue fastpath[0]
> crash> struct -x bnx2x_fastpath c000000002520000
> struct bnx2x_fastpath {
> bp = 0xc000000002498980,
> napi = {
> poll_list = {
> next = 0xc000000002520008,
> prev = 0xc000000002520008
> },
> state = 13,
> weight = 64,
> defer_hard_irqs_count = 0,
> gro_bitmask = 0,
> poll = 0xc0080000015c8a88 <bnx2x_poll>,
> poll_owner = -1,
> dev = 0xc000000002498000,
> gro_hash = {{
> list = {
> next = 0xc000000002520048,
> prev = 0xc000000002520048
> },
> count = 0
> }, {
> list = {
> next = 0xc000000002520060,
> prev = 0xc000000002520060
> },
> count = 0
> }, {
> list = {
> next = 0xc000000002520078,
> prev = 0xc000000002520078
> },
> count = 0
> }, {
> list = {
> next = 0xc000000002520090,
> prev = 0xc000000002520090
> },
> count = 0
> }, {
> list = {
> next = 0xc0000000025200a8,
> prev = 0xc0000000025200a8
> },
> count = 0
> }, {
> list = {
> next = 0xc0000000025200c0,
> prev = 0xc0000000025200c0
> },
> count = 0
> }, {
> list = {
> next = 0xc0000000025200d8,
> prev = 0xc0000000025200d8
> },
> count = 0
> }, {
> list = {
> next = 0xc0000000025200f0,
> prev = 0xc0000000025200f0
> },
> count = 0
> }},
> skb = 0x0,
> rx_list = {
> next = 0xc000000002520110,
> prev = 0xc000000002520110
> },
> rx_count = 0,
> timer = {
> node = {
> node = {
> __rb_parent_color = 13835058055321092392,
> rb_right = 0x0,
> rb_left = 0x0
> },
> expires = 0
> },
> _softexpires = 0,
> function = 0xc000000000a0d2d0 <napi_watchdog>,
> base = 0xc000000efce32380,
> state = 0 '\000',
> is_rel = 0 '\000',
> is_soft = 0 '\000'
> },
> dev_list = {
> next = 0xc000000002520168,
> prev = 0xc000000002520168
> },
> napi_hash_node = {
> next = 0x0,
> pprev = 0x5deadbeef0000122
> },
> napi_id = 2284
> },
> ...
> }
>
> crash> struct -ox bnx2x_fastpath
> struct bnx2x_fastpath {
> ...
> }
> SIZE: 0x2f0
>
> # ... and NAPI struct in fastpath[1] (0xc000000002520000 + 0x2f0)
> crash> struct -x bnx2x_fastpath c0000000025202f0
> struct bnx2x_fastpath {
> bp = 0xc000000002498980,
> napi = {
> poll_list = {
> next = 0xc0000000025202f8,
> prev = 0xc0000000025202f8
> },
> state = 0x9,
> weight = 0x40,
> defer_hard_irqs_count = 0x0,
> gro_bitmask = 0x0,
> poll = 0xc0080000015c8a88 <bnx2x_poll>,
> poll_owner = 0xffffffff,
> dev = 0xc000000002498000,
> gro_hash = {{
> list = {
> next = 0xc000000002520338,
> prev = 0xc000000002520338
> },
> count = 0x0
> }, {
> list = {
> next = 0xc000000002520350,
> prev = 0xc000000002520350
> },
> count = 0x0
> }, {
> list = {
> next = 0xc000000002520368,
> prev = 0xc000000002520368
> },
> count = 0x0
> }, {
> list = {
> next = 0xc000000002520380,
> prev = 0xc000000002520380
> },
> count = 0x0
> }, {
> list = {
> next = 0xc000000002520398,
> prev = 0xc000000002520398
> },
> count = 0x0
> }, {
> list = {
> next = 0xc0000000025203b0,
> prev = 0xc0000000025203b0
> },
> count = 0x0
> }, {
> list = {
> next = 0xc0000000025203c8,
> prev = 0xc0000000025203c8
> },
> count = 0x0
> }, {
> list = {
> next = 0xc0000000025203e0,
> prev = 0xc0000000025203e0
> },
> count = 0x0
> }},
> skb = 0x0,
> rx_list = {
> next = 0xc000000002520400,
> prev = 0xc000000002520400
> },
> rx_count = 0x0,
> timer = {
> node = {
> node = {
> __rb_parent_color = 0xc000000002520418,
> rb_right = 0x0,
> rb_left = 0x0
> },
> expires = 0x0
> },
> _softexpires = 0x0,
> function = 0xc000000000a0d2d0 <napi_watchdog>,
> base = 0xc000000efce32380,
> state = 0x0,
> is_rel = 0x0,
> is_soft = 0x0
> },
> dev_list = {
> next = 0xc000000002520458,
> prev = 0xc000000002520458
> },
> napi_hash_node = {
> next = 0x0,
> pprev = 0x5deadbeef0000122
> },
> napi_id = 0x8ed
> },
> ...
> }
>
> # ... fastpath[2..X] structs also show NAPI state as 9
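>
> # Decoding the state values against the NAPI bit layout of this kernel
> # generation (assuming the SLES tree matches mainline 5.3,
> # include/linux/netdevice.h):
>
> enum {
>         NAPI_STATE_SCHED,       /* bit 0 (1): poll is scheduled */
>         NAPI_STATE_MISSED,      /* bit 1 (2): reschedule the NAPI */
>         NAPI_STATE_DISABLE,     /* bit 2 (4): disable pending */
>         NAPI_STATE_NPSVC,       /* bit 3 (8): netpoll, don't dequeue */
> };
>
> /* fastpath[0]:    state = 13 = SCHED | DISABLE | NPSVC
>  *                 (napi_disable stuck waiting for SCHED to clear)
>  * fastpath[1..X]: state =  9 = SCHED | NPSVC
>  *                 (a completed napi_disable leaves these set)
>  */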

Hello David,

It would be interesting to know whether the issue also occurs with the
latest stable Linux kernel.

Thanks,
Manish