Message-Id: <201605092354.AHF82313.FtQFOMVOFJLOSH@I-love.SAKURA.ne.jp>
Date:	Mon, 9 May 2016 23:54:30 +0900
From:	Tetsuo Handa <penguin-kernel@...ove.SAKURA.ne.jp>
To:	peterz@...radead.org, tglx@...utronix.de, mingo@...nel.org,
	akpm@...ux-foundation.org, mgorman@...hsingularity.net,
	mhocko@...nel.org
Cc:	linux-kernel@...r.kernel.org, linux-mm@...ck.org
Subject: x86_64 Question: Are concurrent IPI requests safe?

Tetsuo Handa wrote:
> I'm hitting this bug while doing an OOM-killer torture test, but I can't
> tell whether this is a mm bug.

I have come to think that this is an IPI handling bug (possibly including
a hardware-dependent bug).

If I understand correctly, all "call a function with an argument" requests
queued on a per-CPU call_single_queue are processed by a single
CALL_FUNCTION_VECTOR IPI request.

----------------------------------------
void smp_call_function_many(const struct cpumask *mask, smp_call_func_t func, void *info, bool wait) { /* kernel/smp.c */
  for_each_cpu(cpu, cfd->cpumask) {
    llist_add(&csd->llist, &per_cpu(call_single_queue, cpu));
  }
  arch_send_call_function_ipi_mask(cfd->cpumask) { /* arch/x86/include/asm/smp.h */
    smp_ops.send_call_func_ipi(mask) {
      void native_send_call_func_ipi(const struct cpumask *mask) { /* arch/x86/kernel/smp.c */
        apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR) {
          static void flat_send_IPI_mask(const struct cpumask *cpumask, int vector) { /* arch/x86/kernel/apic/apic_flat_64.c */
            _flat_send_IPI_mask(mask, vector) {
              __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest) { /* arch/x86/kernel/apic/ipi.c */
                __xapic_wait_icr_idle();
                native_apic_mem_write(APIC_ICR2, cfg);
                native_apic_mem_write(APIC_ICR, cfg);
              }
            }
          }
        }
      }
    }
  }
  for_each_cpu(cpu, cfd->cpumask) {
    csd_lock_wait(csd);
  }
}
----------------------------------------

----------------------------------------
__visible void smp_call_function_interrupt(struct pt_regs *regs) { /* arch/x86/kernel/smp.c */
  ipi_entering_ack_irq();
  __smp_call_function_interrupt() {
    generic_smp_call_function_interrupt() {
      void generic_smp_call_function_single_interrupt(void) { /* kernel/smp.c */
        flush_smp_call_function_queue(true) {
          head = this_cpu_ptr(&call_single_queue);
          entry = llist_del_all(head);
          llist_for_each_entry_safe(csd, csd_next, entry, llist) {
             func(info);
             csd_unlock(csd);
          }
        }
      }
    }
  }
  exiting_irq();
}
----------------------------------------

Therefore, concurrent on_each_cpu_mask() calls with wait == true are safe
as long as llist_del_all() is called from smp_call_function_interrupt()
after llist_add() is called from smp_call_function_many().
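
To make the handoff concrete, here is a minimal user-space sketch of the
pattern above. This is my own illustration, not kernel code: C11 atomics
stand in for kernel/llist.c, and an ipi_pending flag stands in for the
actual CALL_FUNCTION_VECTOR interrupt.

----------------------------------------
/* Build: gcc -std=gnu11 -pthread handoff.c */
#include <stdatomic.h>
#include <stdbool.h>
#include <pthread.h>
#include <stdio.h>

struct csd {
	struct csd *next;
	void (*func)(void *);
	void *info;
};

static _Atomic(struct csd *) call_single_queue; /* per-CPU in the kernel */
static atomic_bool ipi_pending;

/* llist_add(): lock-free push of one request */
static void llist_add(struct csd *csd)
{
	struct csd *head = atomic_load(&call_single_queue);

	do {
		csd->next = head;
	} while (!atomic_compare_exchange_weak(&call_single_queue, &head, csd));
}

/* llist_del_all(): atomically take the whole list */
static struct csd *llist_del_all(void)
{
	return atomic_exchange(&call_single_queue, NULL);
}

static void remote_func(void *info)
{
	printf("func(%s) ran on the target\n", (const char *)info);
}

static void *requester(void *arg)
{
	static struct csd csd = { .func = remote_func, .info = "info" };

	llist_add(&csd);                  /* queue the request first... */
	atomic_store(&ipi_pending, true); /* ...then "send" the IPI */
	return NULL;
}

static void *target(void *arg)
{
	struct csd *csd, *next;

	while (!atomic_load(&ipi_pending)) /* "wait" for the IPI */
		;
	/* the "interrupt handler": drain the list and run every entry */
	for (csd = llist_del_all(); csd; csd = next) {
		next = csd->next;
		csd->func(csd->info);
	}
	return NULL;
}

int main(void)
{
	pthread_t t1, t2;

	pthread_create(&t1, NULL, target, NULL);
	pthread_create(&t2, NULL, requester, NULL);
	pthread_join(t2, NULL);
	pthread_join(t1, NULL);
	return 0;
}
----------------------------------------

As long as the drain runs after the push, no request can be lost at this
level (the kernel additionally reverses the drained list with
llist_reverse_order(), which this sketch omits); the question is whether
the IPI that should trigger the drain ever arrives.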

Since SysRq-l shows multiple CPUs spinning in csd_lock_wait(), I checked
whether llist_del_all() is called after llist_add() using the debug
printk() patch shown below.

----------------------------------------
diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
index e513940..6eb9f79 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -237,6 +237,8 @@ static DECLARE_WORK(sysrq_showallcpus, sysrq_showregs_othercpus);
 
 static void sysrq_handle_showallcpus(int key)
 {
+	extern void show_ipi_sequence(void);
+	show_ipi_sequence();
 	/*
 	 * Fall back to the workqueue based printing if the
 	 * backtrace printing did not succeed or the
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index afdcb7b..3744946 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -56,6 +56,7 @@
 #include <linux/random.h>
 #include <linux/trace_events.h>
 #include <linux/suspend.h>
+extern void show_ipi_sequence(void);
 
 #include "tree.h"
 #include "rcu.h"
@@ -1346,6 +1347,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
 	 * See Documentation/RCU/stallwarn.txt for info on how to debug
 	 * RCU CPU stall warnings.
 	 */
+	show_ipi_sequence();
 	pr_err("INFO: %s detected stalls on CPUs/tasks:",
 	       rsp->name);
 	print_cpu_stall_info_begin();
@@ -1412,6 +1414,7 @@ static void print_cpu_stall(struct rcu_state *rsp)
 	 * See Documentation/RCU/stallwarn.txt for info on how to debug
 	 * RCU CPU stall warnings.
 	 */
+	show_ipi_sequence();
 	pr_err("INFO: %s self-detected stall on CPU", rsp->name);
 	print_cpu_stall_info_begin();
 	print_cpu_stall_info(rsp, smp_processor_id());
diff --git a/kernel/smp.c b/kernel/smp.c
index 7416544..0630eda 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -17,6 +17,21 @@
 
 #include "smpboot.h"
 
+static DEFINE_PER_CPU(unsigned int, ipi_call_func_sequence);
+static DEFINE_PER_CPU(unsigned int, ipi_call_func_last_sequence[4]);
+
+void show_ipi_sequence(void)
+{
+	int cpu;
+	for_each_online_cpu(cpu) {
+		printk(KERN_ERR "IPI(%d): last_requested=%u,%u last_responded=%u,%u\n",
+		       cpu, per_cpu_ptr(ipi_call_func_last_sequence, cpu)[0],
+		       per_cpu_ptr(ipi_call_func_last_sequence, cpu)[2],
+		       per_cpu_ptr(ipi_call_func_last_sequence, cpu)[1],
+		       per_cpu_ptr(ipi_call_func_last_sequence, cpu)[3]);
+	}
+}
+
 enum {
 	CSD_FLAG_LOCK		= 0x01,
 	CSD_FLAG_SYNCHRONOUS	= 0x02,
@@ -218,7 +233,9 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
 	WARN_ON(!irqs_disabled());
 
 	head = this_cpu_ptr(&call_single_queue);
+	this_cpu_write(ipi_call_func_last_sequence[1], this_cpu_inc_return(ipi_call_func_sequence));
 	entry = llist_del_all(head);
+	this_cpu_write(ipi_call_func_last_sequence[3], this_cpu_inc_return(ipi_call_func_sequence));
 	entry = llist_reverse_order(entry);
 
 	/* There shouldn't be any pending callbacks on an offline CPU. */
@@ -452,7 +469,9 @@ void smp_call_function_many(const struct cpumask *mask,
 			csd->flags |= CSD_FLAG_SYNCHRONOUS;
 		csd->func = func;
 		csd->info = info;
+		this_cpu_write(ipi_call_func_last_sequence[0], this_cpu_inc_return(ipi_call_func_sequence));
 		llist_add(&csd->llist, &per_cpu(call_single_queue, cpu));
+		this_cpu_write(ipi_call_func_last_sequence[2], this_cpu_inc_return(ipi_call_func_sequence));
 	}
 
 	/* Send a message to all CPUs in the map */
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 2932f3e9..deb3243 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4404,6 +4404,8 @@ void show_workqueue_state(void)
 	struct worker_pool *pool;
 	unsigned long flags;
 	int pi;
+	extern void show_ipi_sequence(void);
+	show_ipi_sequence();
 
 	rcu_read_lock_sched();
 
----------------------------------------

This debug printk() patch prints sequence numbers in the format

  IPI($cpu): last_requested=$v0,$v2 last_responded=$v1,$v3

and it is expected that $v0 == $v2 - 1 && $v1 == $v3 - 1 && $v2 < $v1
holds most of the time.

However, when I hit this bug, I sometimes get $v2 > $v1.
(The complete log is at http://I-love.SAKURA.ne.jp/tmp/serial-20160509.txt.xz .)
----------------------------------------
[  120.469451] IPI(0): last_requested=21491,21492 last_responded=21493,21494
[  120.471484] IPI(1): last_requested=10485,10486 last_responded=10489,10490
[  120.473622] IPI(2): last_requested=11117,11118 last_responded=11111,11112
[  120.476205] IPI(3): last_requested=9663,9664 last_responded=9667,9668
[  133.889461] IPI(0): last_requested=21491,21492 last_responded=21493,21494
[  133.891353] IPI(1): last_requested=10485,10486 last_responded=10489,10490
[  133.893211] IPI(2): last_requested=11117,11118 last_responded=11111,11112
[  133.895008] IPI(3): last_requested=9663,9664 last_responded=9667,9668
[  150.550605] IPI(0): last_requested=21491,21492 last_responded=21493,21494
[  150.552350] IPI(1): last_requested=10485,10486 last_responded=10489,10490
[  150.554114] IPI(2): last_requested=11117,11118 last_responded=11111,11112
[  150.555865] IPI(3): last_requested=9663,9664 last_responded=9667,9668
----------------------------------------
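
For what it's worth, the expected relation can be checked mechanically over
the four log lines above (a throwaway user-space check added here for
illustration, not kernel code); it flags only CPU 2:

----------------------------------------
#include <stdio.h>

int main(void)
{
	/* { $v0, $v2, $v1, $v3 } as printed above:
	 * last_requested=$v0,$v2 last_responded=$v1,$v3 */
	unsigned int s[4][4] = {
		{ 21491, 21492, 21493, 21494 }, /* CPU 0 */
		{ 10485, 10486, 10489, 10490 }, /* CPU 1 */
		{ 11117, 11118, 11111, 11112 }, /* CPU 2 */
		{  9663,  9664,  9667,  9668 }, /* CPU 3 */
	};
	int cpu;

	for (cpu = 0; cpu < 4; cpu++) {
		unsigned int v0 = s[cpu][0], v2 = s[cpu][1];
		unsigned int v1 = s[cpu][2], v3 = s[cpu][3];

		if (!(v0 == v2 - 1 && v1 == v3 - 1 && v2 < v1))
			printf("CPU %d violates the expected relation\n", cpu);
	}
	return 0;
}
----------------------------------------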
This means that on some CPUs (CPU 2 above, where $v2 = 11118 but
$v1 = 11111), llist_del_all() was NOT called after the last llist_add().
This suggests that IPI requests are being dropped (or cannot be processed)
for some reason.

This leads me to a question about __default_send_IPI_dest_field().

----------------------------------------
void __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest)
{
        unsigned long cfg;

        /*
         * Wait for idle.
         */
        if (unlikely(vector == NMI_VECTOR))
                safe_apic_wait_icr_idle();
        else
                __xapic_wait_icr_idle();

        /*
         * prepare target chip field
         */
        cfg = __prepare_ICR2(mask);
        native_apic_mem_write(APIC_ICR2, cfg);

        /*
         * program the ICR
         */
        cfg = __prepare_ICR(0, vector, dest);

        /*
         * Send the IPI. The write to APIC_ICR fires this off.
         */
        native_apic_mem_write(APIC_ICR, cfg);
}

static inline void __xapic_wait_icr_idle(void)
{
        while (native_apic_mem_read(APIC_ICR) & APIC_ICR_BUSY)
                cpu_relax();
}

static inline u32 native_apic_mem_read(u32 reg)
{
        return *((volatile u32 *)(APIC_BASE + reg));
}

static inline void native_apic_mem_write(u32 reg, u32 v)
{
        volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg);

        alternative_io("movl %0, %P1", "xchgl %0, %P1", X86_BUG_11AP,
                       ASM_OUTPUT2("=r" (v), "=m" (*addr)),
                       ASM_OUTPUT2("0" (v), "m" (*addr)));
}
----------------------------------------

It seems to me that APIC_BASE, APIC_ICR and APIC_ICR_BUSY are all
constants regardless of the calling CPU. Thus, native_apic_mem_read() and
native_apic_mem_write() operate on a globally shared constant memory
address, and __xapic_wait_icr_idle() makes its decision based on that
globally shared constant memory address. Am I right?

Then, what happens if two CPUs call native_apic_mem_write(APIC_ICR)
concurrently, one with vector == CALL_FUNCTION_VECTOR and the other with
vector != CALL_FUNCTION_VECTOR? Since I can't find any exclusion control
between CPUs here, can the native_apic_mem_write() for the
CALL_FUNCTION_VECTOR request be ignored because of a concurrent
!CALL_FUNCTION_VECTOR request?
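
Concretely, the interleaving I am worried about (assuming, as the question
above supposes, that APIC_ICR/APIC_ICR2 really were a single pair of
registers shared by all CPUs) is:

----------------------------------------
  CPU A (CALL_FUNCTION_VECTOR)             CPU B (other vector)

  __xapic_wait_icr_idle() /* sees idle */
                                           __xapic_wait_icr_idle() /* sees idle */
  native_apic_mem_write(APIC_ICR2, cfg_A)
                                           native_apic_mem_write(APIC_ICR2, cfg_B)
  native_apic_mem_write(APIC_ICR, cfg_A)
                                           native_apic_mem_write(APIC_ICR, cfg_B)
----------------------------------------

If both sequences really hit one shared ICR/ICR2 pair, CPU B's writes could
overwrite CPU A's destination and vector before CPU A's IPI is actually
issued, silently dropping the CALL_FUNCTION_VECTOR request.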
