[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20200211172952.GY14914@hirez.programming.kicks-ass.net>
Date:   Tue, 11 Feb 2020 18:29:52 +0100
From:   Peter Zijlstra <peterz@...radead.org>
To:     Steven Rostedt <rostedt@...dmis.org>
Cc:     LKML <linux-kernel@...r.kernel.org>,
        Ingo Molnar <mingo@...nel.org>,
        "Joel Fernandes (Google)" <joel@...lfernandes.org>,
        Greg Kroah-Hartman <gregkh@...uxfoundation.org>,
        "Gustavo A. R. Silva" <gustavo@...eddedor.com>,
        Thomas Gleixner <tglx@...utronix.de>,
        "Paul E. McKenney" <paulmck@...nel.org>,
        Josh Triplett <josh@...htriplett.org>,
        Mathieu Desnoyers <mathieu.desnoyers@...icios.com>,
        Lai Jiangshan <jiangshanlai@...il.com>
Subject: Re: [PATCH v2] tracing/perf: Move rcu_irq_enter/exit_irqson() to
 perf trace point hook
On Tue, Feb 11, 2020 at 11:18:28AM -0500, Steven Rostedt wrote:
> On Tue, 11 Feb 2020 16:34:52 +0100
> Peter Zijlstra <peterz@...radead.org> wrote:
> 
> > > +		if (unlikely(in_nmi()))
> > > +			goto out;  
> > 
> > unless I'm mistaken, we can simply do rcu_nmi_enter() in this case, and
> > rcu_nmi_exit() on the other end.
> > 
> > > +		rcu_irq_enter_irqson();
> 
> The thing is, I don't think this can ever happen. 
Per your own argument it should be true in the trace_preempt_off()
tracepoint from nmi_enter():
  <idle>
    // rcu_is_watching() := false
    <NMI>
      nmi_enter()
        preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET)
	  __preempt_count_add()
	  // in_nmi() := true
	  preempt_latency_start()
	    // .oO -- we'll never hit this tracepoint because idle has
	    // preempt_count() == 1, so we'll have:
	    //   preempt_count() != val
	    // .oO-2 we should be able to this this when the NMI
	    // hits userspace and we have NOHZ_FULL on.
	rcu_nmi_enter() // function tracer
But the point is, if we ever do hit it, rcu_nmi_enter() is the right
thing to call when '!rcu_is_watching() && in_nmi()', because, as per the
rcu_nmi_enter() comments, that function fully supports nested NMIs.
How about something like so? And then you get to use the same in
__trace_stack() or something, and anywhere else you're doing this dance.
---
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index da0af631ded5..e987236abf95 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -89,4 +89,52 @@ extern void irq_exit(void);
 		arch_nmi_exit();				\
 	} while (0)
 
+/*
+ * Tracing vs rcu_is_watching()
+ * ----------------------------
+ *
+ * tracepoints and function-tracing can happen when RCU isn't watching (idle,
+ * or early IRQ/NMI entry).
+ *
+ * When it happens during idle or early during IRQ entry, tracing will have
+ * to inform RCU that it ought to pay attention, this is done by calling
+ * rcu_irq_enter_irqon().
+ *
+ * On NMI entry, we must be very careful that tracing only happens after we've
+ * incremented preempt_count(), otherwise we cannot tell we're in NMI and take
+ * the special path.
+ */
+
+#define __TR_IRQ	1
+#define __TR_NMI	2
+
+#define trace_rcu_enter()					\
+({								\
+	unsigned long state = 0;				\
+	if (!rcu_is_watching())	{				\
+		if (in_nmi()) {					\
+			state = __TR_NMI;			\
+			rcu_nmi_enter();			\
+		} else {					\
+			state = __TR_IRQ;			\
+			rcu_irq_enter_irqson();			\
+		}						\
+	}							\
+	state;							\
+})
+
+#define trace_rcu_exit(state)					\
+do {								\
+	switch (state) {					\
+	case __TR_IRQ:						\
+		rcu_irq_exit_irqson();				\
+		break;						\
+	case __IRQ_NMI:						\
+		rcu_nmi_exit();					\
+		break;						\
+	default:						\
+		break;						\
+	}							\
+} while (0)
+
 #endif /* LINUX_HARDIRQ_H */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e453589da97c..8f34592a1355 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8950,6 +8950,7 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
 {
 	struct perf_sample_data data;
 	struct perf_event *event;
+	unsigned long rcu_flags;
 
 	struct perf_raw_record raw = {
 		.frag = {
@@ -8961,6 +8962,8 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
 	perf_sample_data_init(&data, 0, 0);
 	data.raw = &raw;
 
+	rcu_flags = trace_rcu_enter();
+
 	perf_trace_buf_update(record, event_type);
 
 	hlist_for_each_entry_rcu(event, head, hlist_entry) {
@@ -8996,6 +8999,8 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
 	}
 
 	perf_swevent_put_recursion_context(rctx);
+
+	trace_rcu_exit(rcu_flags);
 }
 EXPORT_SYMBOL_GPL(perf_tp_event);
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 45f79bcc3146..62f87807bbe6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3781,7 +3781,7 @@ static inline void preempt_latency_start(int val)
 	}
 }
 
-void preempt_count_add(int val)
+void notrace preempt_count_add(int val)
 {
 #ifdef CONFIG_DEBUG_PREEMPT
 	/*
@@ -3813,7 +3813,7 @@ static inline void preempt_latency_stop(int val)
 		trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
 }
 
-void preempt_count_sub(int val)
+void notrace preempt_count_sub(int val)
 {
 #ifdef CONFIG_DEBUG_PREEMPT
 	/*
Powered by blists - more mailing lists
 
