[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20260128175212.18d7eaaa@gandalf.local.home>
Date: Wed, 28 Jan 2026 17:52:12 -0500
From: Steven Rostedt <rostedt@...dmis.org>
To: Vincent Donnefort <vdonnefort@...gle.com>
Cc: mhiramat@...nel.org, mathieu.desnoyers@...icios.com,
linux-trace-kernel@...r.kernel.org, maz@...nel.org, oliver.upton@...ux.dev,
joey.gouly@....com, suzuki.poulose@....com, yuzenghui@...wei.com,
kvmarm@...ts.linux.dev, linux-arm-kernel@...ts.infradead.org,
jstultz@...gle.com, qperret@...gle.com, will@...nel.org,
aneesh.kumar@...nel.org, kernel-team@...roid.com,
linux-kernel@...r.kernel.org
Subject: Re: [PATCH v10 07/30] tracing: Add non-consuming read to trace
remotes
On Mon, 26 Jan 2026 10:43:56 +0000
Vincent Donnefort <vdonnefort@...gle.com> wrote:
> Allow reading the trace file for trace remotes. This performs a
> non-consuming read of the trace buffer.
>
> Signed-off-by: Vincent Donnefort <vdonnefort@...gle.com>
>
> diff --git a/kernel/trace/trace_remote.c b/kernel/trace/trace_remote.c
> index 49c4ae127533..a744bbf48e88 100644
> --- a/kernel/trace/trace_remote.c
> +++ b/kernel/trace/trace_remote.c
> @@ -18,14 +18,25 @@
> #define TRACEFS_MODE_WRITE 0640
> #define TRACEFS_MODE_READ 0440
>
> +enum tri_type {
> + TRI_CONSUMING,
> + TRI_NONCONSUMING,
> +};
> +
> struct trace_remote_iterator {
> struct trace_remote *remote;
> struct trace_seq seq;
> struct delayed_work poll_work;
> unsigned long lost_events;
> u64 ts;
> + union {
> + struct ring_buffer_iter **rb_iters;
> + struct ring_buffer_iter *rb_iter;
I don't care for the union; it can be error prone, and for what? 8 bytes?
It's not a fast path and the memory is temporary.
Just make two entries, where one is NULL. That way, if there's a mistake and
the wrong one is used, it will be pretty obvious because it triggers a NULL
pointer dereference rather than some random error.
> + };
> int cpu;
> int evt_cpu;
> + loff_t pos;
> + enum tri_type type;
> };
>
> struct trace_remote {
> @@ -36,6 +47,8 @@ struct trace_remote {
> unsigned long trace_buffer_size;
> struct ring_buffer_remote rb_remote;
> struct mutex lock;
> + struct rw_semaphore reader_lock;
> + struct rw_semaphore *pcpu_reader_locks;
> unsigned int nr_readers;
> unsigned int poll_ms;
> bool tracing_on;
> @@ -225,6 +238,20 @@ static int trace_remote_get(struct trace_remote *remote, int cpu)
> if (ret)
> return ret;
>
> + if (cpu != RING_BUFFER_ALL_CPUS && !remote->pcpu_reader_locks) {
> + int lock_cpu;
> +
> + remote->pcpu_reader_locks = kcalloc(nr_cpu_ids, sizeof(*remote->pcpu_reader_locks),
> + GFP_KERNEL);
> + if (!remote->pcpu_reader_locks) {
> + trace_remote_try_unload(remote);
> + return -ENOMEM;
> + }
> +
> + for_each_possible_cpu(lock_cpu)
> + init_rwsem(&remote->pcpu_reader_locks[lock_cpu]);
> + }
> +
> remote->nr_readers++;
>
> return 0;
> @@ -239,6 +266,9 @@ static void trace_remote_put(struct trace_remote *remote)
> if (remote->nr_readers)
> return;
>
> + kfree(remote->pcpu_reader_locks);
> + remote->pcpu_reader_locks = NULL;
> +
> trace_remote_try_unload(remote);
> }
>
> @@ -253,6 +283,48 @@ static void __poll_remote(struct work_struct *work)
> msecs_to_jiffies(iter->remote->poll_ms));
> }
>
> +static int __alloc_ring_buffer_iter(struct trace_remote_iterator *iter, int cpu)
> +{
> + bool once = false;
> +
> + if (cpu != RING_BUFFER_ALL_CPUS) {
> + iter->rb_iter = ring_buffer_read_start(iter->remote->trace_buffer, cpu, GFP_KERNEL);
> +
> + return iter->rb_iter ? 0 : -ENOMEM;
> + }
> +
> + iter->rb_iters = kcalloc(nr_cpu_ids, sizeof(*iter->rb_iters), GFP_KERNEL);
> + if (!iter->rb_iters)
> + return -ENOMEM;
> +
> + for_each_possible_cpu(cpu) {
> + iter->rb_iters[cpu] = ring_buffer_read_start(iter->remote->trace_buffer, cpu,
> + GFP_KERNEL);
> + if (iter->rb_iters[cpu])
> + once = true;
Do we really want to succeed if only one cpu passes?
> + }
> +
> + return once ? 0 : -ENOMEM;
> +}
> +
> +static void __free_ring_buffer_iter(struct trace_remote_iterator *iter, int cpu)
> +{
> + if (!iter->rb_iter)
> + return;
> +
> + if (cpu != RING_BUFFER_ALL_CPUS) {
> + ring_buffer_read_finish(iter->rb_iter);
> + return;
> + }
> +
> + for_each_possible_cpu(cpu) {
> + if (iter->rb_iters[cpu])
> + ring_buffer_read_finish(iter->rb_iters[cpu]);
> + }
> +
> + kfree(iter->rb_iters);
> +}
> +
> static struct trace_remote_iterator
> *trace_remote_iter(struct trace_remote *remote, int cpu, enum tri_type type)
> {
> @@ -261,6 +333,8 @@ static struct trace_remote_iterator
>
> lockdep_assert_held(&remote->lock);
>
> + if (type == TRI_NONCONSUMING && !trace_remote_loaded(remote))
> + return NULL;
>
> ret = trace_remote_get(remote, cpu);
> if (ret)
> @@ -275,9 +349,21 @@ static struct trace_remote_iterator
> if (iter) {
> iter->remote = remote;
> iter->cpu = cpu;
> + iter->type = type;
> trace_seq_init(&iter->seq);
> - INIT_DELAYED_WORK(&iter->poll_work, __poll_remote);
> - schedule_delayed_work(&iter->poll_work, msecs_to_jiffies(remote->poll_ms));
> +
> + switch (type) {
> + case TRI_CONSUMING:
> + INIT_DELAYED_WORK(&iter->poll_work, __poll_remote);
> + schedule_delayed_work(&iter->poll_work, msecs_to_jiffies(remote->poll_ms));
> + break;
> + case TRI_NONCONSUMING:
> + ret = __alloc_ring_buffer_iter(iter, cpu);
> + break;
> + }
> +
> + if (ret)
> + goto err;
>
> return iter;
> }
> @@ -301,10 +387,100 @@ static void trace_remote_iter_free(struct trace_remote_iterator *iter)
>
> lockdep_assert_held(&remote->lock);
>
> + switch (iter->type) {
> + case TRI_CONSUMING:
> + cancel_delayed_work_sync(&iter->poll_work);
> + break;
> + case TRI_NONCONSUMING:
> + __free_ring_buffer_iter(iter, iter->cpu);
> + break;
> + }
> +
> kfree(iter);
> trace_remote_put(remote);
> }
>
> +static void trace_remote_iter_read_start(struct trace_remote_iterator *iter)
> +{
> + struct trace_remote *remote = iter->remote;
> + int cpu = iter->cpu;
> +
> + /* Acquire global reader lock */
> + if (cpu == RING_BUFFER_ALL_CPUS && iter->type == TRI_CONSUMING)
> + down_write(&remote->reader_lock);
> + else
> + down_read(&remote->reader_lock);
> +
> + if (cpu == RING_BUFFER_ALL_CPUS)
> + return;
> +
> + /*
> + * No need for the remote lock here, iter holds a reference on
> + * remote->nr_readers
> + */
> +
> + /* Get the per-CPU one */
> + if (WARN_ON_ONCE(!remote->pcpu_reader_locks))
> + return;
> +
> + if (iter->type == TRI_CONSUMING)
> + down_write(&remote->pcpu_reader_locks[cpu]);
> + else
> + down_read(&remote->pcpu_reader_locks[cpu]);
> +}
> +
> +static void trace_remote_iter_read_finished(struct trace_remote_iterator *iter)
> +{
> + struct trace_remote *remote = iter->remote;
> + int cpu = iter->cpu;
> +
> + /* Release per-CPU reader lock */
> + if (cpu != RING_BUFFER_ALL_CPUS) {
> + /*
> + * No need for the remote lock here, iter holds a reference on
> + * remote->nr_readers
> + */
> + if (iter->type == TRI_CONSUMING)
> + up_write(&remote->pcpu_reader_locks[cpu]);
> + else
> + up_read(&remote->pcpu_reader_locks[cpu]);
> + }
> +
> + /* Release global reader lock */
> + if (cpu == RING_BUFFER_ALL_CPUS && iter->type == TRI_CONSUMING)
> + up_write(&remote->reader_lock);
> + else
> + up_read(&remote->reader_lock);
> +}
> +
> +static struct ring_buffer_iter *__get_rb_iter(struct trace_remote_iterator *iter, int cpu)
> +{
> + return iter->cpu != RING_BUFFER_ALL_CPUS ? iter->rb_iter : iter->rb_iters[cpu];
> +}
> +
> +static struct ring_buffer_event *
> +__peek_event(struct trace_remote_iterator *iter, int cpu, u64 *ts, unsigned long *lost_events)
> +{
> + struct ring_buffer_event *rb_evt;
> + struct ring_buffer_iter *rb_iter;
> +
> + switch (iter->type) {
> + case TRI_CONSUMING:
> + return ring_buffer_peek(iter->remote->trace_buffer, cpu, ts, lost_events);
> + case TRI_NONCONSUMING:
> + rb_iter = __get_rb_iter(iter, cpu);
> + rb_evt = ring_buffer_iter_peek(rb_iter, ts);
> + if (!rb_evt)
> + return NULL;
> +
> + *lost_events = ring_buffer_iter_dropped(rb_iter);
> +
> + return rb_evt;
> + }
> +
> + return NULL;
> +}
> +
> static bool trace_remote_iter_read_event(struct trace_remote_iterator *iter)
> {
> struct trace_buffer *trace_buffer = iter->remote->trace_buffer;
> @@ -314,7 +490,7 @@ static bool trace_remote_iter_read_event(struct trace_remote_iterator *iter)
> if (ring_buffer_empty_cpu(trace_buffer, cpu))
> return false;
>
> - if (!ring_buffer_peek(trace_buffer, cpu, &iter->ts, &iter->lost_events))
> + if (!__peek_event(iter, cpu, &iter->ts, &iter->lost_events))
> return false;
>
> iter->evt_cpu = cpu;
> @@ -329,7 +505,7 @@ static bool trace_remote_iter_read_event(struct trace_remote_iterator *iter)
> if (ring_buffer_empty_cpu(trace_buffer, cpu))
> continue;
>
> - if (!ring_buffer_peek(trace_buffer, cpu, &ts, &lost_events))
> + if (!__peek_event(iter, cpu, &ts, &lost_events))
> continue;
>
> if (ts >= iter->ts)
> @@ -343,7 +519,21 @@ static bool trace_remote_iter_read_event(struct trace_remote_iterator *iter)
> return iter->ts != U64_MAX;
> }
>
> -static int trace_remote_iter_print(struct trace_remote_iterator *iter)
> +static void trace_remote_iter_move(struct trace_remote_iterator *iter)
> +{
> + struct trace_buffer *trace_buffer = iter->remote->trace_buffer;
> +
> + switch (iter->type) {
> + case TRI_CONSUMING:
> + ring_buffer_consume(trace_buffer, iter->evt_cpu, NULL, NULL);
> + break;
> + case TRI_NONCONSUMING:
> + ring_buffer_iter_advance(__get_rb_iter(iter, iter->evt_cpu));
> + break;
> + }
> +}
> +
> +static int trace_remote_iter_print_event(struct trace_remote_iterator *iter)
> {
> unsigned long usecs_rem;
> u64 ts = iter->ts;
> @@ -371,7 +561,11 @@ static int trace_pipe_open(struct inode *inode, struct file *filp)
> cpu = (long)inode->i_cdev - 1;
>
> guard(mutex)(&remote->lock);
> - iter = trace_remote_iter(remote, cpu);
> +
> + iter = trace_remote_iter(remote, cpu, TRI_CONSUMING);
> + if (IS_ERR(iter))
> + return PTR_ERR(iter);
> +
> filp->private_data = iter;
>
> return IS_ERR(iter) ? PTR_ERR(iter) : 0;
> @@ -406,6 +600,8 @@ static ssize_t trace_pipe_read(struct file *filp, char __user *ubuf, size_t cnt,
> if (ret < 0)
> return ret;
>
> + trace_remote_iter_read_start(iter);
> +
> while (trace_remote_iter_read_event(iter)) {
> int prev_len = iter->seq.seq.len;
>
> @@ -414,9 +610,11 @@ static ssize_t trace_pipe_read(struct file *filp, char __user *ubuf, size_t cnt,
> break;
> }
>
> - ring_buffer_consume(trace_buffer, iter->evt_cpu, NULL, NULL);
> + trace_remote_iter_move(iter);
> }
>
> + trace_remote_iter_read_finished(iter);
> +
> goto copy_to_user;
> }
>
> @@ -426,6 +624,119 @@ static const struct file_operations trace_pipe_fops = {
> .release = trace_pipe_release,
> };
>
> +static void *trace_seq_start(struct seq_file *m, loff_t *pos)
Don't call these "trace_seq_*". It's confusing, because functions that start
with "trace_seq_*" are meant to be the API for struct trace_seq instances.
They're static functions, so call them s_start() or whatever ;-)
> +{
> + struct trace_remote_iterator *iter = m->private;
> + loff_t i = *pos;
> +
> + if (!iter)
> + return NULL;
> +
> + if (iter->pos <= *pos) {
> + do {
> + if (!trace_remote_iter_read_event(iter))
> + return NULL;
> +
> + trace_remote_iter_move(iter);
> + iter->pos++;
> + } while (i--);
> + }
> +
> + return iter;
> +}
> +
> +static void *trace_seq_next(struct seq_file *m, void *v, loff_t *pos)
> +{
> + struct trace_remote_iterator *iter = m->private;
> +
> + ++*pos;
> +
> + if (!iter || !trace_remote_iter_read_event(iter))
> + return NULL;
> +
> + trace_remote_iter_move(iter);
> + iter->pos++;
> +
> + return iter;
> +}
BTW, I usually have the start function call the next function to advance the
iterator, so there's no duplicate code.
static void *tri_start(struct seq_file *m, loff_t *pos)
{
struct trace_remote_iterator *iter = m->private;
loff_t i = *pos;
if (!iter)
return NULL;
if (iter->pos <= *pos) {
do {
iter = tri_next(m, v, pos);
if (!iter)
return NULL;
} while (i--);
}
return iter;
}
> +
> +static int trace_seq_show(struct seq_file *m, void *v)
> +{
> + struct trace_remote_iterator *iter = v;
> +
> + trace_seq_init(&iter->seq);
> +
> + if (trace_remote_iter_print_event(iter)) {
> + seq_printf(m, "[EVENT %d PRINT TOO BIG]\n", iter->evt->id);
> + return 0;
> + }
> +
> + return trace_print_seq(m, &iter->seq);
> +}
> +
> +static void trace_seq_stop(struct seq_file *s, void *v) { }
> +
> +static const struct seq_operations trace_seq_ops = {
> + .start = trace_seq_start,
> + .next = trace_seq_next,
> + .show = trace_seq_show,
> + .stop = trace_seq_stop,
> +};
> +
> +static int trace_open(struct inode *inode, struct file *filp)
> +{
> + struct trace_remote *remote = inode->i_private;
> + struct trace_remote_iterator *iter = NULL;
> + int cpu = RING_BUFFER_ALL_CPUS;
> + int ret;
> +
> + if (!(filp->f_mode & FMODE_READ))
> + return 0;
> +
> + if (inode->i_cdev)
> + cpu = (long)inode->i_cdev - 1;
Hmm, we probably should use the helper function here. That is, make
tracing_get_cpu() non-static and use that. When inode->i_cdev is zero, it
returns RING_BUFFER_ALL_CPUS, so you don't need to initialize cpu.
It should be used in the other locations too.
-- Steve
> +
> + guard(mutex)(&remote->lock);
> +
> + iter = trace_remote_iter(remote, cpu, TRI_NONCONSUMING);
> + if (IS_ERR(iter))
> + return PTR_ERR(iter);
> +
> + ret = seq_open(filp, &trace_seq_ops);
> + if (ret) {
> + trace_remote_iter_free(iter);
> + return ret;
> + }
> +
> + if (iter)
> + trace_remote_iter_read_start(iter);
> +
> + ((struct seq_file *)filp->private_data)->private = (void *)iter;
> +
> + return 0;
> +}
> +
> +static int trace_release(struct inode *inode, struct file *filp)
> +{
> + struct trace_remote_iterator *iter;
> +
> + if (!(filp->f_mode & FMODE_READ))
> + return 0;
> +
> + iter = ((struct seq_file *)filp->private_data)->private;
> + seq_release(inode, filp);
> +
> + if (!iter)
> + return 0;
> +
> + guard(mutex)(&iter->remote->lock);
> +
> + trace_remote_iter_read_finished(iter);
> + trace_remote_iter_free(iter);
> +
> + return 0;
> +}
> +
> static ssize_t trace_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos)
> {
> struct inode *inode = file_inode(filp);
> @@ -443,7 +754,11 @@ static ssize_t trace_write(struct file *filp, const char __user *ubuf, size_t cn
> }
>
> static const struct file_operations trace_fops = {
> + .open = trace_open,
> .write = trace_write,
> + .read = seq_read,
> + .read_iter = seq_read_iter,
> + .release = trace_release,
> };
>
> static int trace_remote_init_tracefs(const char *name, struct trace_remote *remote)
> @@ -532,6 +847,7 @@ int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs,
> remote->trace_buffer_size = 7 << 10;
> remote->poll_ms = 100;
> mutex_init(&remote->lock);
> + init_rwsem(&remote->reader_lock);
>
> if (trace_remote_init_tracefs(name, remote)) {
> kfree(remote);
Powered by blists - more mailing lists