[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <35394cb7-a490-5aeb-b3a8-0f46e3c8ca28@amd.com>
Date: Tue, 2 Aug 2022 11:40:34 +0530
From: Ravi Bangoria <ravi.bangoria@....com>
To: Peter Zijlstra <peterz@...radead.org>
Cc: acme@...nel.org, alexander.shishkin@...ux.intel.com,
jolsa@...hat.com, namhyung@...nel.org, songliubraving@...com,
eranian@...gle.com, alexey.budankov@...ux.intel.com,
ak@...ux.intel.com, mark.rutland@....com, megha.dey@...el.com,
frederic@...nel.org, maddy@...ux.ibm.com, irogers@...gle.com,
kim.phillips@....com, linux-kernel@...r.kernel.org,
santosh.shukla@....com, ravi.bangoria@....com
Subject: Re: [RFC v2] perf: Rewrite core context handling
On 13-Jun-22 8:25 PM, Peter Zijlstra wrote:
> On Mon, Jun 13, 2022 at 04:35:11PM +0200, Peter Zijlstra wrote:
>> @@ -12125,6 +12232,8 @@ SYSCALL_DEFINE5(perf_event_open,
>> goto err_task;
>> }
>>
>> + // XXX premature; what if this is allowed, but we get moved to a PMU
>> + // that doesn't have this.
>> if (is_sampling_event(event)) {
>> if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
>> err = -EOPNOTSUPP;
>
> No; this really should be against the event's native PMU. If the event
> can't natively sample, it can't sample when placed in another group
> either.
Right. But IIUC, the question was whether there would be any issue if we
allowed grouping a perf_sw_context sampling event as group leader with a
perf_{hw|invalid}_context counting event as group member. I think not; it
should just work fine. And there could be real use cases for it, as you
described in an old thread[1].
TL;DR
Although I can't find any such pmu combination on AMD (not considering real
sw pmus), I just tried the opposite scenario:
Group leader: msr/tsc/ as counting event (perf_sw_context)
Group member: ibs_op/cnt_ctl=1/ as sampling event (perf_invalid_context)
And a simple test program seems to work fine:
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <string.h>
#include <linux/perf_event.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#define PAGE_SIZE sysconf(_SC_PAGESIZE)
#define PERF_MMAP_DATA_PAGES 256
#define PERF_MMAP_DATA_SIZE (PERF_MMAP_DATA_PAGES * PAGE_SIZE)
#define PERF_MMAP_DATA_MASK (PERF_MMAP_DATA_SIZE - 1)
#define PERF_MMAP_TOTAL_PAGES (PERF_MMAP_DATA_PAGES + 1)
#define PERF_MMAP_TOTAL_SIZE (PERF_MMAP_TOTAL_PAGES * PAGE_SIZE)
#define rmb() asm volatile("lfence":::"memory")
/*
 * One opened perf event: its file descriptor plus the address of its
 * mmap()ed buffer (only filled in for events that are actually mapped).
 */
struct perf_event {
	int   fd;	/* descriptor returned by perf_event_open() */
	void *p;	/* mapping returned by perf_event_mmap() */
};
/*
 * Wrapper around the raw perf_event_open system call.
 *
 * Forwards all arguments unchanged to syscall(__NR_perf_event_open, ...).
 * Returns the value of the system call; when that value is negative an
 * error message is printed with perror() before returning it.
 */
static int perf_event_open(struct perf_event_attr *attr, pid_t pid,
			   int cpu, int group_fd, unsigned long flags)
{
	int ret;

	ret = syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
	if (ret >= 0)
		return ret;

	perror("perf_event_open() failed.");
	return ret;
}
/*
 * Map PERF_MMAP_TOTAL_SIZE bytes of the perf event @fd, shared and
 * readable/writable.
 *
 * Returns the mapping address, or MAP_FAILED (after reporting the error
 * with perror()) when mmap() fails.
 */
static void *perf_event_mmap(int fd)
{
	const int prot = PROT_READ | PROT_WRITE;
	void *map;

	map = mmap(NULL, PERF_MMAP_TOTAL_SIZE, prot, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED)
		perror("mmap() failed.");

	return map;
}
/*
 * Copy @size bytes out of the circular data area starting at @src.
 *
 * @offset is the byte position inside the data area at which to start.
 * When the requested range reaches the end of the area
 * (PERF_MMAP_DATA_SIZE), the copy is done in two pieces: the bytes up to
 * the end of the area, then the remainder from the start of the area.
 */
static void
copy_event_data(void *src, unsigned long offset, void *dest, size_t size)
{
	char *ring = src;
	char *out = dest;
	size_t head_len, tail_len;

	if ((offset + size) >= PERF_MMAP_DATA_SIZE) {
		/* Range touches the end of the area: split the copy. */
		head_len = PERF_MMAP_DATA_SIZE - offset;
		tail_len = size - head_len;
		memcpy(out, ring + offset, head_len);
		memcpy(out + head_len, ring, tail_len);
		return;
	}

	memcpy(out, ring + offset, size);
}
/*
 * Consume @size bytes from the perf mmap buffer @p into @dest.
 *
 * Returns 0 after copying the bytes and advancing p->data_tail by @size,
 * or -1 (leaving data_tail untouched) when fewer than @size bytes lie
 * between data_tail and data_head.
 */
static int mmap_read(struct perf_event_mmap_page *p, void *dest, size_t size)
{
	unsigned long head, tail;
	void *data;

	/* The data area begins one page after the control page. */
	data = (char *)p + PAGE_SIZE;

	head = p->data_head;
	rmb();	/* read data_head before looking at anything else */
	tail = p->data_tail;

	if ((head - tail) < size)
		return -1;

	/* Reduce the tail to an offset inside the data area before copying. */
	copy_event_data(data, tail & PERF_MMAP_DATA_MASK, dest, size);
	p->data_tail += size;

	return 0;
}
/*
 * Discard up to @size bytes from the perf mmap buffer @p.
 *
 * Advances p->data_tail by @size, but never past the data_head value
 * sampled on entry: if the skip would overshoot, data_tail is set to that
 * data_head instead.
 */
static void mmap_skip(struct perf_event_mmap_page *p, size_t size)
{
	/*
	 * data_head is a 64-bit field; keep its full width. Storing it in an
	 * int would truncate it and break the comparison below once the
	 * value no longer fits.
	 */
	__u64 data_head = p->data_head;

	rmb();	/* read data_head before touching data_tail */

	if ((p->data_tail + size) > data_head)
		p->data_tail = data_head;
	else
		p->data_tail += size;
}
/*
 * Drain the perf mmap buffer @p, printing pid/tid of every sample record.
 *
 * Expected PERF_RECORD_SAMPLE layout (sample_type == PERF_SAMPLE_TID only):
 *
 *	struct {
 *		struct perf_event_header hdr;
 *		u32 pid;	// PERF_SAMPLE_TID
 *		u32 tid;	// PERF_SAMPLE_TID
 *	};
 *
 * Records of any other type are skipped using the size in their header.
 * Returns when no complete header is left in the buffer, or when a sample
 * payload cannot be read.
 */
static void perf_read_event_details(struct perf_event_mmap_page *p)
{
	struct perf_event_header hdr;
	unsigned int pid, tid;

	while (1) {
		if (mmap_read(p, &hdr, sizeof(hdr)))
			return;

		if (hdr.type != PERF_RECORD_SAMPLE) {
			mmap_skip(p, hdr.size - sizeof(hdr));
			continue;
		}

		/*
		 * mmap_read() does not set errno, so report failures with a
		 * plain message, and stop instead of printing pid/tid values
		 * that were never filled in.
		 */
		if (mmap_read(p, &pid, sizeof(pid))) {
			fprintf(stderr, "Error reading pid.\n");
			return;
		}
		if (mmap_read(p, &tid, sizeof(tid))) {
			fprintf(stderr, "Error reading tid.\n");
			return;
		}

		printf("pid: %u, tid: %u\n", pid, tid);
	}
}
int main(int argc, char *argv[])
{
struct perf_event_attr attr;
struct perf_event events[2];
int i;
long long count1, count2;
memset(&attr, 0, sizeof(struct perf_event_attr));
attr.size = sizeof(struct perf_event_attr);
attr.type = 16; /* /sys/bus/event_source/devices/msr/type */
attr.config = 0x0; /* /sys/bus/event_source/devices/msr/events/tsc */
attr.disabled = 1;
events[0].fd = perf_event_open(&attr, -1, 0, -1, 0);
attr.type = 9; /* /sys/bus/event_source/devices/ibs_op/type */
attr.config = (0x1 << 19); /* /sys/bus/event_source/devices/ibs_op/format/cnt_ctl */
attr.disabled = 1;
/* perf_read_event_details() can parse PERF_SAMPLE_TID only */
attr.sample_type = PERF_SAMPLE_TID;
attr.sample_period = 10000000;
events[1].fd = perf_event_open(&attr, -1, 0, events[0].fd, 0);
events[1].p = perf_event_mmap(events[1].fd);
ioctl(events[0].fd, PERF_EVENT_IOC_RESET, 0);
ioctl(events[1].fd, PERF_EVENT_IOC_RESET, 0);
ioctl(events[0].fd, PERF_EVENT_IOC_ENABLE, 0);
ioctl(events[1].fd, PERF_EVENT_IOC_ENABLE, 0);
i = 5;
while(i--) {
sleep(1);
read(events[0].fd, &count1, sizeof(long long));
read(events[1].fd, &count2, sizeof(long long));
perf_read_event_details(events[1].p);
ioctl(events[0].fd, PERF_EVENT_IOC_RESET, 0);
ioctl(events[1].fd, PERF_EVENT_IOC_RESET, 0);
printf("%lld, %lld\n", count1, count2);
}
close(events[1].fd);
close(events[0].fd);
}
Example run:
[term1~]$ taskset -c 0 top
[term2~]$ pgrep top
85747
[term2~]$ sudo ./perf-group-sample-count
1996319080, 0
pid: 85747, tid: 85747
pid: 85747, tid: 85747
pid: 85747, tid: 85747
pid: 85747, tid: 85747
pid: 85747, tid: 85747
pid: 85747, tid: 85747
pid: 85747, tid: 85747
pid: 85747, tid: 85747
pid: 85747, tid: 85747
pid: 85747, tid: 85747
pid: 85747, tid: 85747
pid: 85747, tid: 85747
pid: 85747, tid: 85747
pid: 85747, tid: 85747
pid: 0, tid: 0
1996510960, 150000000
1996325400, 0
1996348600, 0
pid: 85747, tid: 85747
pid: 85747, tid: 85747
pid: 85747, tid: 85747
pid: 85747, tid: 85747
pid: 85747, tid: 85747
pid: 85747, tid: 85747
pid: 85747, tid: 85747
pid: 85747, tid: 85747
pid: 85747, tid: 85747
pid: 85747, tid: 85747
pid: 85747, tid: 85747
pid: 85747, tid: 85747
pid: 0, tid: 0
1996341420, 130000000
Thanks,
Ravi
[1] https://lore.kernel.org/all/20150204125954.GL21418@twins.programming.kicks-ass.net
Powered by blists - more mailing lists