[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <02ac44b5-e48c-2abc-9e1c-d49d8d3465c9@acm.org>
Date: Fri, 17 Nov 2017 08:47:35 -0600
From: Corey Minyard <minyard@....org>
To: Masamitsu Yamazaki <m-yamazaki@...jp.nec.com>,
"openipmi-developer@...ts.sourceforge.net"
<openipmi-developer@...ts.sourceforge.net>
Cc: Junichi Nomura <j-nomura@...jp.nec.com>,
Kiyoshi Ueda <k-ueda@...jp.nec.com>,
"linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>
Subject: Re: [Openipmi-developer] [PATCH] ipmi: Stop timers before cleaning up
the module
On 11/15/2017 01:33 AM, Masamitsu Yamazaki wrote:
> System may crash after unloading ipmi_si.ko module
> because a timer may remain and fire after the module cleaned up resources.
This is a big improvement, thanks. I'll work on getting this into 4.15.
Thanks,
-corey
> cleanup_one_si() contains the following processing.
>
> /*
> * Make sure that interrupts, the timer and the thread are
> * stopped and will not run again.
> */
> if (to_clean->irq_cleanup)
> to_clean->irq_cleanup(to_clean);
> wait_for_timer_and_thread(to_clean);
>
> /*
> * Timeouts are stopped, now make sure the interrupts are off
> * in the BMC. Note that timers and CPU interrupts are off,
> * so no need for locks.
> */
> while (to_clean->curr_msg || (to_clean->si_state != SI_NORMAL)) {
> poll(to_clean);
> schedule_timeout_uninterruptible(1);
> }
>
> si_state changes as following in the while loop calling poll(to_clean).
>
> SI_GETTING_MESSAGES
> => SI_CHECKING_ENABLES
> => SI_SETTING_ENABLES
> => SI_GETTING_EVENTS
> => SI_NORMAL
>
> As written in the code comments above,
> timers are expected to stop before the polling loop and not to run again.
> But the timer is set again in the following process
> when si_state becomes SI_SETTING_ENABLES.
>
> => poll
> => smi_event_handler
> => handle_transaction_done
> // smi_info->si_state == SI_SETTING_ENABLES
> => start_getting_events
> => start_new_msg
> => smi_mod_timer
> => mod_timer
>
> As a result, before the timer set in start_new_msg() expires,
> the polling loop may see si_state becoming SI_NORMAL
> and the module clean-up finishes.
>
> For example, hard LOCKUP and panic occurred as following.
> smi_timeout was called after smi_event_handler,
> kcs_event and hangs at port_inb()
> trying to access I/O port after release.
>
> #11 [ffff88069fdc5ef0] end_repeat_nmi at ffffffff816ac8d3
> [exception RIP: port_inb+19]
> RIP: ffffffffc0473053 RSP: ffff88069fdc3d80 RFLAGS: 00000006
> RAX: ffff8806800f8e00 RBX: ffff880682bd9400 RCX: 0000000000000000
> RDX: 0000000000000ca3 RSI: 0000000000000ca3 RDI: ffff8806800f8e40
> RBP: ffff88069fdc3d80 R8: ffffffff81d86dfc R9: ffffffff81e36426
> R10: 00000000000509f0 R11: 0000000000100000 R12: 0000000000]:000000
> R13: 0000000000000000 R14: 0000000000000246 R15: ffff8806800f8e00
> ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0000
> --- <NMI exception stack> ---
> #12 [ffff88069fdc3d80] port_inb at ffffffffc0473053 [ipmi_si]
> #13 [ffff88069fdc3d88] kcs_event at ffffffffc0477952 [ipmi_si]
> #14 [ffff88069fdc3db0] smi_event_handler at ffffffffc047465d [ipmi_si]
> #15 [ffff88069fdc3df0] smi_timeout at ffffffffc0474f9e [ipmi_si]
>
> To fix the problem I defined a flag, timer_can_start,
> as member of struct smi_info.
> The flag is enabled immediately after initializing the timer
> and disabled immediately before waiting for timer deletion.
>
> Fixes: 0cfec916e86d ("ipmi: Start the timer and thread on internal msgs")
> Signed-off-by: Yamazaki Masamitsu <m-yamazaki@...jp.nec.com>
>
> ----
>
> diff -Nurp a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c
> --- a/drivers/char/ipmi/ipmi_si_intf.c 2017-11-09 15:00:31.436926440 +0900
> +++ b/drivers/char/ipmi/ipmi_si_intf.c 2017-11-13 14:14:02.399051610 +0900
> @@ -242,6 +242,9 @@ struct smi_info {
> /* The timer for this si. */
> struct timer_list si_timer;
>
> + /* This flag is set, if the timer can be set */
> + bool timer_can_start;
> +
> /* This flag is set, if the timer is running (timer_pending() isn't enough) */
> bool timer_running;
>
> @@ -417,6 +420,8 @@ out:
>
> static void smi_mod_timer(struct smi_info *smi_info, unsigned long new_val)
> {
> + if (!smi_info->timer_can_start)
> + return;
> smi_info->last_timeout_jiffies = jiffies;
> mod_timer(&smi_info->si_timer, new_val);
> smi_info->timer_running = true;
> @@ -436,21 +441,18 @@ static void start_new_msg(struct smi_inf
> smi_info->handlers->start_transaction(smi_info->si_sm, msg, size);
> }
>
> -static void start_check_enables(struct smi_info *smi_info, bool start_timer)
> +static void start_check_enables(struct smi_info *smi_info)
> {
> unsigned char msg[2];
>
> msg[0] = (IPMI_NETFN_APP_REQUEST << 2);
> msg[1] = IPMI_GET_BMC_GLOBAL_ENABLES_CMD;
>
> - if (start_timer)
> - start_new_msg(smi_info, msg, 2);
> - else
> - smi_info->handlers->start_transaction(smi_info->si_sm, msg, 2);
> + start_new_msg(smi_info, msg, 2);
> smi_info->si_state = SI_CHECKING_ENABLES;
> }
>
> -static void start_clear_flags(struct smi_info *smi_info, bool start_timer)
> +static void start_clear_flags(struct smi_info *smi_info)
> {
> unsigned char msg[3];
>
> @@ -459,10 +461,7 @@ static void start_clear_flags(struct smi
> msg[1] = IPMI_CLEAR_MSG_FLAGS_CMD;
> msg[2] = WDT_PRE_TIMEOUT_INT;
>
> - if (start_timer)
> - start_new_msg(smi_info, msg, 3);
> - else
> - smi_info->handlers->start_transaction(smi_info->si_sm, msg, 3);
> + start_new_msg(smi_info, msg, 3);
> smi_info->si_state = SI_CLEARING_FLAGS;
> }
>
> @@ -497,11 +496,11 @@ static void start_getting_events(struct
> * Note that we cannot just use disable_irq(), since the interrupt may
> * be shared.
> */
> -static inline bool disable_si_irq(struct smi_info *smi_info, bool start_timer)
> +static inline bool disable_si_irq(struct smi_info *smi_info)
> {
> if ((smi_info->irq) && (!smi_info->interrupt_disabled)) {
> smi_info->interrupt_disabled = true;
> - start_check_enables(smi_info, start_timer);
> + start_check_enables(smi_info);
> return true;
> }
> return false;
> @@ -511,7 +510,7 @@ static inline bool enable_si_irq(struct
> {
> if ((smi_info->irq) && (smi_info->interrupt_disabled)) {
> smi_info->interrupt_disabled = false;
> - start_check_enables(smi_info, true);
> + start_check_enables(smi_info);
> return true;
> }
> return false;
> @@ -529,7 +528,7 @@ static struct ipmi_smi_msg *alloc_msg_ha
>
> msg = ipmi_alloc_smi_msg();
> if (!msg) {
> - if (!disable_si_irq(smi_info, true))
> + if (!disable_si_irq(smi_info))
> smi_info->si_state = SI_NORMAL;
> } else if (enable_si_irq(smi_info)) {
> ipmi_free_smi_msg(msg);
> @@ -545,7 +544,7 @@ retry:
> /* Watchdog pre-timeout */
> smi_inc_stat(smi_info, watchdog_pretimeouts);
>
> - start_clear_flags(smi_info, true);
> + start_clear_flags(smi_info);
> smi_info->msg_flags &= ~WDT_PRE_TIMEOUT_INT;
> if (smi_info->intf)
> ipmi_smi_watchdog_pretimeout(smi_info->intf);
> @@ -928,7 +927,7 @@ restart:
> * disable and messages disabled.
> */
> if (smi_info->supports_event_msg_buff || smi_info->irq) {
> - start_check_enables(smi_info, true);
> + start_check_enables(smi_info);
> } else {
> smi_info->curr_msg = alloc_msg_handle_irq(smi_info);
> if (!smi_info->curr_msg)
> @@ -1235,6 +1234,7 @@ static int smi_start_processing(void
>
> /* Set up the timer that drives the interface. */
> setup_timer(&new_smi->si_timer, smi_timeout, (long)new_smi);
> + new_smi->timer_can_start = true;
> smi_mod_timer(new_smi, jiffies + SI_TIMEOUT_JIFFIES);
>
> /* Try to claim any interrupts. */
> @@ -3416,10 +3416,12 @@ static void check_for_broken_irqs(struct
> check_set_rcv_irq(smi_info);
> }
>
> -static inline void wait_for_timer_and_thread(struct smi_info *smi_info)
> +static inline void stop_timer_and_thread(struct smi_info *smi_info)
> {
> if (smi_info->thread != NULL)
> kthread_stop(smi_info->thread);
> +
> + smi_info->timer_can_start = false;
> if (smi_info->timer_running)
> del_timer_sync(&smi_info->si_timer);
> }
> @@ -3593,7 +3595,7 @@ static int try_smi_init(struct smi_info
> * Start clearing the flags before we enable interrupts or the
> * timer to avoid racing with the timer.
> */
> - start_clear_flags(new_smi, false);
> + start_clear_flags(new_smi);
>
> /*
> * IRQ is defined to be set when non-zero. req_events will
> @@ -3662,7 +3664,7 @@ static int try_smi_init(struct smi_info
> return 0;
>
> out_err_stop_timer:
> - wait_for_timer_and_thread(new_smi);
> + stop_timer_and_thread(new_smi);
>
> out_err:
> new_smi->interrupt_disabled = true;
> @@ -3854,7 +3856,7 @@ static void cleanup_one_si(struct smi_in
> */
> if (to_clean->irq_cleanup)
> to_clean->irq_cleanup(to_clean);
> - wait_for_timer_and_thread(to_clean);
> + stop_timer_and_thread(to_clean);
>
> /*
> * Timeouts are stopped, now make sure the interrupts are off
> @@ -3865,7 +3867,7 @@ static void cleanup_one_si(struct smi_in
> poll(to_clean);
> schedule_timeout_uninterruptible(1);
> }
> - disable_si_irq(to_clean, false);
> + disable_si_irq(to_clean);
> while (to_clean->curr_msg || (to_clean->si_state != SI_NORMAL)) {
> poll(to_clean);
> schedule_timeout_uninterruptible(1);
>
> ------------------------------------------------------------------------------
> Check out the vibrant tech community on one of the world's most
> engaging tech sites, Slashdot.org! http://sdm.link/slashdot
> _______________________________________________
> Openipmi-developer mailing list
> Openipmi-developer@...ts.sourceforge.net
> https://lists.sourceforge.net/lists/listinfo/openipmi-developer
Powered by blists - more mailing lists