In order to combine the preemption and need_resched test we need to fold the need_resched information into the preempt_count value. Since the NEED_RESCHED flag is set across CPUs this needs to be an atomic operation, however we very much want to avoid making preempt_count atomic, therefore we keep the existing TIF_NEED_RESCHED infrastructure in place but at 3 sites test it and fold its value into preempt_count; namely: - resched_task() when setting TIF_NEED_RESCHED on the current task - scheduler_ipi() when resched_task() sets TIF_NEED_RESCHED on a remote task it follows it up with a reschedule IPI and we can modify the cpu local preempt_count from there. - cpu_idle_loop() for when resched_task() found tsk_is_polling(). We use an inverted bitmask to indicate need_resched so that a 0 means both need_resched and !atomic. Also remove the barrier() in preempt_enable() between preempt_enable_no_resched() and preempt_check_resched() to avoid having to reload the preemption value and allow the compiler to use the flags of the previuos decrement. I couldn't come up with any sane reason for this barrier() to be there as preempt_enable_no_resched() already has a barrier() before doing the decrement. Suggested-by: Ingo Molnar Signed-off-by: Peter Zijlstra --- include/linux/preempt.h | 42 +++++++++++++++++++++++++++++++++++++----- include/linux/sched.h | 2 +- kernel/context_tracking.c | 2 +- kernel/cpu/idle.c | 7 +++++++ kernel/sched/core.c | 18 ++++++++++++++---- 5 files changed, 60 insertions(+), 11 deletions(-) --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -10,9 +10,19 @@ #include #include +/* + * We use the MSB mostly because its available; see for + * the other bits. + */ +#define PREEMPT_NEED_RESCHED 0x80000000 + +/* + * We mask the PREEMPT_NEED_RESCHED bit so as not to confuse all current users + * that think a non-zero value indicates we cannot preempt. + */ static __always_inline int preempt_count(void) { - return current_thread_info()->preempt_count; + return current_thread_info()->preempt_count & ~PREEMPT_NEED_RESCHED; } static __always_inline int *preempt_count_ptr(void) @@ -20,6 +30,30 @@ static __always_inline int *preempt_coun return ¤t_thread_info()->preempt_count; } +/* + * We fold the NEED_RESCHED bit into the preempt count such that + * preempt_enable() can decrement and test for needing to reschedule with a + * single instruction. + * + * We invert the actual bit, so that when the decrement hits 0 we know we both + * need to resched (the bit is cleared) and can resched (no preempt count). + */ + +static __always_inline void set_preempt_need_resched(void) +{ + *preempt_count_ptr() &= ~PREEMPT_NEED_RESCHED; +} + +static __always_inline void clear_preempt_need_resched(void) +{ + *preempt_count_ptr() |= PREEMPT_NEED_RESCHED; +} + +static __always_inline bool test_preempt_need_resched(void) +{ + return !(*preempt_count_ptr() & PREEMPT_NEED_RESCHED); +} + #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER) extern void add_preempt_count(int val); extern void sub_preempt_count(int val); @@ -37,7 +71,7 @@ asmlinkage void preempt_schedule(void); #define preempt_check_resched() \ do { \ - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \ + if (unlikely(!*preempt_count_ptr())) \ preempt_schedule(); \ } while (0) @@ -47,7 +81,7 @@ void preempt_schedule_context(void); #define preempt_check_resched_context() \ do { \ - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \ + if (unlikely(!*preempt_count_ptr())) \ preempt_schedule_context(); \ } while (0) #else @@ -83,7 +117,6 @@ do { \ #define preempt_enable() \ do { \ preempt_enable_no_resched(); \ - barrier(); \ preempt_check_resched(); \ } while (0) @@ -111,7 +144,6 @@ do { \ #define preempt_enable_notrace() \ do { \ preempt_enable_no_resched_notrace(); \ - barrier(); \ preempt_check_resched_context(); \ } while (0) --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2408,7 +2408,7 @@ static inline int signal_pending_state(l static inline int need_resched(void) { - return unlikely(test_thread_flag(TIF_NEED_RESCHED)); + return unlikely(test_preempt_need_resched()); } /* --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -115,7 +115,7 @@ void __sched notrace preempt_schedule_co { enum ctx_state prev_ctx; - if (likely(!preemptible())) + if (likely(preempt_count() || irqs_disabled())) return; /* --- a/kernel/cpu/idle.c +++ b/kernel/cpu/idle.c @@ -105,6 +105,13 @@ static void cpu_idle_loop(void) __current_set_polling(); } arch_cpu_idle_exit(); + /* + * We need to test and propagate the TIF_NEED_RESCHED + * bit here because we might not have send the + * reschedule IPI to idle tasks. + */ + if (tif_need_resched()) + set_preempt_need_resched(); } tick_nohz_idle_exit(); schedule_preempt_disabled(); --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -525,8 +525,10 @@ void resched_task(struct task_struct *p) set_tsk_need_resched(p); cpu = task_cpu(p); - if (cpu == smp_processor_id()) + if (cpu == smp_processor_id()) { + set_preempt_need_resched(); return; + } /* NEED_RESCHED must be visible before we test polling */ smp_mb(); @@ -1397,6 +1399,14 @@ static void sched_ttwu_pending(void) void scheduler_ipi(void) { + /* + * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting + * TIF_NEED_RESCHED remotely (for the first time) will also send + * this IPI. + */ + if (tif_need_resched()) + set_preempt_need_resched(); + if (llist_empty(&this_rq()->wake_list) && !tick_nohz_full_cpu(smp_processor_id()) && !got_nohz_idle_kick()) @@ -2431,6 +2441,7 @@ static void __sched __schedule(void) put_prev_task(rq, prev); next = pick_next_task(rq); clear_tsk_need_resched(prev); + clear_preempt_need_resched(); rq->skip_clock_update = 0; if (likely(prev != next)) { @@ -2517,7 +2528,7 @@ asmlinkage void __sched notrace preempt_ * If there is a non-zero preempt_count or interrupts are disabled, * we do not want to preempt the current task. Just return.. */ - if (likely(!preemptible())) + if (likely(preempt_count() || irqs_disabled())) return; do { @@ -2542,11 +2553,10 @@ EXPORT_SYMBOL(preempt_schedule); */ asmlinkage void __sched preempt_schedule_irq(void) { - struct thread_info *ti = current_thread_info(); enum ctx_state prev_state; /* Catch callers which need to be fixed */ - BUG_ON(ti->preempt_count || !irqs_disabled()); + BUG_ON(preempt_count() || !irqs_disabled()); prev_state = exception_enter(); -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/