[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <Y3hDYiXwRnJr8RYG@xpf.sh.intel.com>
Date: Sat, 19 Nov 2022 10:45:54 +0800
From: Pengfei Xu <pengfei.xu@...el.com>
To: Peter Zijlstra <peterz@...radead.org>
CC: <peter.zijlstra@...el.com>, <linux-kernel@...r.kernel.org>,
<heng.su@...el.com>, Marco Elver <elver@...gle.com>,
Mark Rutland <mark.rutland@....com>
Subject: Re: [Syzkaller & bisect] There is "__perf_event_overflow" WARNING in
v6.1-rc5 kernel in guest
Hi Peter,
On 2022-11-17 at 19:11:37 +0100, Peter Zijlstra wrote:
> On Wed, Nov 16, 2022 at 11:39:53AM +0800, Pengfei Xu wrote:
> > int main(void)
> > {
> > syscall(__NR_mmap, 0x1ffff000ul, 0x1000ul, 0ul, 0x32ul, -1, 0ul);
> > syscall(__NR_mmap, 0x20000000ul, 0x1000000ul, 7ul, 0x32ul, -1, 0ul);
> > syscall(__NR_mmap, 0x21000000ul, 0x1000ul, 0ul, 0x32ul, -1, 0ul);
> > setup_sysctl();
> >
> > *(uint32_t*)0x20000200 = 0;
> > *(uint32_t*)0x20000204 = 0x80;
> > *(uint8_t*)0x20000208 = 0;
> > *(uint8_t*)0x20000209 = 0;
> > *(uint8_t*)0x2000020a = 0;
> > *(uint8_t*)0x2000020b = 0;
> > *(uint32_t*)0x2000020c = 0;
> > *(uint64_t*)0x20000210 = 8;
> > *(uint64_t*)0x20000218 = 0;
> > *(uint64_t*)0x20000220 = 0;
> > STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 0, 1);
> > STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 1, 1);
> > STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 2, 1);
> > STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 3, 1);
> > STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 4, 1);
> > STORE_BY_BITMASK(uint64_t, , 0x20000228, 1, 5, 1);
> > STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 6, 1);
> > STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 7, 1);
> > STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 8, 1);
> > STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 9, 1);
> > STORE_BY_BITMASK(uint64_t, , 0x20000228, 1, 10, 1);
> > STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 11, 1);
> > STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 12, 1);
> > STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 13, 1);
> > STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 14, 1);
> > STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 15, 2);
> > STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 17, 1);
> > STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 18, 1);
> > STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 19, 1);
> > STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 20, 1);
> > STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 21, 1);
> > STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 22, 1);
> > STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 23, 1);
> > STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 24, 1);
> > STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 25, 1);
> > STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 26, 1);
> > STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 27, 1);
> > STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 28, 1);
> > STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 29, 1);
> > STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 30, 1);
> > STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 31, 1);
> > STORE_BY_BITMASK(uint64_t, , 0x20000228, 1, 32, 1);
> > STORE_BY_BITMASK(uint64_t, , 0x20000228, 1, 33, 1);
> > STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 34, 1);
> > STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 35, 1);
> > STORE_BY_BITMASK(uint64_t, , 0x20000228, 1, 36, 1);
> > STORE_BY_BITMASK(uint64_t, , 0x20000228, 1, 37, 1);
> > STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 38, 26);
> > *(uint32_t*)0x20000230 = 0;
> > *(uint32_t*)0x20000234 = 0;
> > *(uint64_t*)0x20000238 = 0;
> > *(uint64_t*)0x20000240 = 0;
> > *(uint64_t*)0x20000248 = 0;
> > *(uint64_t*)0x20000250 = 0;
> > *(uint32_t*)0x20000258 = 0;
> > *(uint32_t*)0x2000025c = 0;
> > *(uint64_t*)0x20000260 = 0;
> > *(uint32_t*)0x20000268 = 0;
> > *(uint16_t*)0x2000026c = 0;
> > *(uint16_t*)0x2000026e = 0;
> > *(uint32_t*)0x20000270 = 0;
> > *(uint32_t*)0x20000274 = 0;
> > *(uint64_t*)0x20000278 = 0xd62;
> > syscall(__NR_perf_event_open, 0x20000200ul, 0, 0ul, -1, 3ul);
> > return 0;
> > }
>
> Putting that next to 'pahole --hex -C perf_event_attr' seems to suggest
> this is HW_CPU_CYCLES with a sampling on and exclude_kernel and sigtrap
> set.
>
> Now, for hardware events like this, you'd expect the PMU OS filter to
> respect exclude_kernel (specifically not set ARCH_PERFMON_EVENTSEL_OS in
> the relevant MSR), except there's a bunch of erratas and skid related
> 'funnies' that mean a user event can trigger across the kernel boundary.
>
> Does the below patch help -- it should filter out those sort of things.
>
> (still also including that previous patch)
>
> ---
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 4ec3717003d5..cc86bf25f0c3 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -9273,6 +9273,21 @@ int perf_event_account_interrupt(struct perf_event *event)
> return __perf_event_account_interrupt(event, 1);
> }
>
> +static inline bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs)
> +{
> + /*
> + * Due to interrupt latency (AKA "skid"), we may enter the
> + * kernel before taking an overflow, even if the PMU is only
> + * counting user events.
> + * To avoid leaking information to userspace, we must always
> + * reject kernel samples when exclude_kernel is set.
> + */
> + if (event->attr.exclude_kernel && !user_mode(regs))
> + return false;
> +
> + return true;
> +}
> +
> /*
> * Generic event overflow handling, sampling.
> */
> @@ -9305,15 +9320,28 @@ static int __perf_event_overflow(struct perf_event *event,
> perf_event_disable_inatomic(event);
> }
>
> - if (event->attr.sigtrap) {
> - /*
> - * Should not be able to return to user space without processing
> - * pending_sigtrap (kernel events can overflow multiple times).
> - */
> - WARN_ON_ONCE(event->pending_sigtrap && event->attr.exclude_kernel);
> + if (event->attr.sigtrap && sample_is_allowed(event, regs)) {
> + unsigned int pending_id = 1;
> +
> + if (regs)
> + pending_id = hash32_ptr((void *)instruction_pointer(regs)) ?: 1;
> if (!event->pending_sigtrap) {
> - event->pending_sigtrap = 1;
> + event->pending_sigtrap = pending_id;
> local_inc(&event->ctx->nr_pending);
> + } else if (event->attr.exclude_kernel) {
> + /*
> + * Should not be able to return to user space without
> + * consuming pending_sigtrap; with exceptions:
> + *
> + * 1. Where !exclude_kernel, events can overflow again
> + * in the kernel without returning to user space.
> + *
> + * 2. Events that can overflow again before the IRQ-
> + * work without user space progress (e.g. hrtimer).
> + * To approximate progress (with false negatives),
> + * check 32-bit hash of the current IP.
> + */
> + WARN_ON_ONCE(event->pending_sigtrap != pending_id);
> }
> event->pending_addr = data->addr;
> irq_work_queue(&event->pending_irq);
Thanks a lot for your patch!
You added below part based on previous patch in below link:
https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/commit/?h=perf/urgent&id=bb88f9695460bec25aa30ba9072595025cf6c8af
"
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 884871427a94..cc86bf25f0c3 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -9273,6 +9273,21 @@ int perf_event_account_interrupt(struct perf_event *event)
return __perf_event_account_interrupt(event, 1);
}
+static inline bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs)
+{
+ /*
+ * Due to interrupt latency (AKA "skid"), we may enter the
+ * kernel before taking an overflow, even if the PMU is only
+ * counting user events.
+ * To avoid leaking information to userspace, we must always
+ * reject kernel samples when exclude_kernel is set.
+ */
+ if (event->attr.exclude_kernel && !user_mode(regs))
+ return false;
+
+ return true;
+}
+
/*
* Generic event overflow handling, sampling.
*/
@@ -9305,7 +9320,7 @@ static int __perf_event_overflow(struct perf_event *event,
perf_event_disable_inatomic(event);
}
- if (event->attr.sigtrap) {
+ if (event->attr.sigtrap && sample_is_allowed(event, regs)) {
unsigned int pending_id = 1;
if (regs)
"
After your patch, I could not reproduce this issue in "15465 times ./repro"
Attached is the dmesg of guest, and this issue could not be reproduced.
The result shows that your additional patch fixed this issue!
If possible, could you add Reported-and-tested-by tag from me.
Thanks!
BR.
View attachment "61rc5_perf2_fixed_dmesg.log" of type "text/plain" (34623 bytes)
Powered by blists - more mailing lists