Message-ID: <20250128182932.GC24845@redhat.com>
Date: Tue, 28 Jan 2025 19:29:33 +0100
From: Oleg Nesterov <oleg@...hat.com>
To: Mateusz Guzik <mjguzik@...il.com>
Cc: brauner@...nel.org, akpm@...ux-foundation.org, linux-mm@...ck.org,
linux-kernel@...r.kernel.org,
"Eric W. Biederman" <ebiederm@...ssion.com>
Subject: Re: [PATCH v2] exit: perform randomness and pid work without
tasklist_lock
(Add Eric).

On 01/28, Mateusz Guzik wrote:
>
> Both add_device_randomness() and attach_pid()/detach_pid()

So afaics this patch does 2 different things, and I do think this needs
2 separate patches. Can you split this change please?

As for add_device_randomness(). I must have missed something, but I still can't
understand why we can't simply shift add_device_randomness(p->sum_exec_runtime)
to release_task() and avoid release_task_post->randomness.

You said:

	I wanted to keep the load where it was

but why??? Again, I must have missed something, but to me this simply adds
unnecessary complications. Either way, ->sum_exec_runtime is not stable even if
task-to-release != current, so what is the point?
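
IOW, something like the untested sketch below; afaics p can't go away until
put_task_struct_rcu_user() at the end of release_task(), so we can read
->sum_exec_runtime directly once tasklist_lock is dropped:

	write_unlock_irq(&tasklist_lock);

	/* p is still pinned until put_task_struct_rcu_user() below */
	add_device_randomness(&p->se.sum_exec_runtime,
			      sizeof(p->se.sum_exec_runtime));
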
Oleg.
> have their
> own synchronisation mechanisms.
>
> The clone side calls them *without* the tasklist_lock held, meaning
> parallel calls can contend on their locks.
>
> The exit side calls them *with* the tasklist_lock held, which means the
> hold time is avoidably extended by waiting on either of the 2 locks, in
> turn exacerbating contention on tasklist_lock itself.
>
> Postponing the work until after the lock is dropped bumps thread
> creation/destruction rate by 15% on a 24 core vm.
>
> Bench (plop into will-it-scale):
> $ cat tests/threadspawn1.c
>
> #include <pthread.h>
> #include <assert.h>
>
> char *testcase_description = "Thread creation and teardown";
>
> static void *worker(void *arg)
> {
> return (NULL);
> }
>
> void testcase(unsigned long long *iterations, unsigned long nr)
> {
> pthread_t thread;
> int error;
>
> while (1) {
> error = pthread_create(&thread, NULL, worker, NULL);
> assert(error == 0);
> error = pthread_join(thread, NULL);
> assert(error == 0);
> (*iterations)++;
> }
> }
>
> Run:
> $ ./threadspawn1_processes -t 24
>
> Signed-off-by: Mateusz Guzik <mjguzik@...il.com>
> ---
>
> v2:
> - introduce a struct to collect work
> - move out pids as well
>
> there is more which can be pulled out
>
> this may look suspicious:
> + proc_flush_pid(p->thread_pid);
>
> AFAICS this is constant for the duration of the lifetime, so I don't think
> there is a problem
>
> include/linux/pid.h | 1 +
> kernel/exit.c | 53 +++++++++++++++++++++++++++++++++------------
> kernel/pid.c | 23 +++++++++++++++-----
> 3 files changed, 58 insertions(+), 19 deletions(-)
>
> diff --git a/include/linux/pid.h b/include/linux/pid.h
> index 98837a1ff0f3..6e9fcacd02cd 100644
> --- a/include/linux/pid.h
> +++ b/include/linux/pid.h
> @@ -101,6 +101,7 @@ extern struct pid *get_task_pid(struct task_struct *task, enum pid_type type);
> * these helpers must be called with the tasklist_lock write-held.
> */
> extern void attach_pid(struct task_struct *task, enum pid_type);
> +extern struct pid *detach_pid_return(struct task_struct *task, enum pid_type);
> extern void detach_pid(struct task_struct *task, enum pid_type);
> extern void change_pid(struct task_struct *task, enum pid_type,
> struct pid *pid);
> diff --git a/kernel/exit.c b/kernel/exit.c
> index 1dcddfe537ee..4e452d3e3a89 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -122,14 +122,23 @@ static __init int kernel_exit_sysfs_init(void)
> late_initcall(kernel_exit_sysfs_init);
> #endif
>
> -static void __unhash_process(struct task_struct *p, bool group_dead)
> +/*
> + * For things release_task would like to do *after* tasklist_lock is released.
> + */
> +struct release_task_post {
> + unsigned long long randomness;
> + struct pid *pids[PIDTYPE_MAX];
> +};
> +
> +static void __unhash_process(struct release_task_post *post, struct task_struct *p,
> + bool group_dead)
> {
> nr_threads--;
> - detach_pid(p, PIDTYPE_PID);
> + post->pids[PIDTYPE_PID] = detach_pid_return(p, PIDTYPE_PID);
> if (group_dead) {
> - detach_pid(p, PIDTYPE_TGID);
> - detach_pid(p, PIDTYPE_PGID);
> - detach_pid(p, PIDTYPE_SID);
> + post->pids[PIDTYPE_TGID] = detach_pid_return(p, PIDTYPE_TGID);
> + post->pids[PIDTYPE_PGID] = detach_pid_return(p, PIDTYPE_PGID);
> + post->pids[PIDTYPE_SID] = detach_pid_return(p, PIDTYPE_SID);
>
> list_del_rcu(&p->tasks);
> list_del_init(&p->sibling);
> @@ -141,7 +150,8 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
> /*
> * This function expects the tasklist_lock write-locked.
> */
> -static void __exit_signal(struct task_struct *tsk)
> +static void __exit_signal(struct release_task_post *post,
> + struct task_struct *tsk)
> {
> struct signal_struct *sig = tsk->signal;
> bool group_dead = thread_group_leader(tsk);
> @@ -174,8 +184,7 @@ static void __exit_signal(struct task_struct *tsk)
> sig->curr_target = next_thread(tsk);
> }
>
> - add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
> - sizeof(unsigned long long));
> + post->randomness = tsk->se.sum_exec_runtime;
>
> /*
> * Accumulate here the counters for all threads as they die. We could
> @@ -197,7 +206,7 @@ static void __exit_signal(struct task_struct *tsk)
> task_io_accounting_add(&sig->ioac, &tsk->ioac);
> sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
> sig->nr_threads--;
> - __unhash_process(tsk, group_dead);
> + __unhash_process(post, tsk, group_dead);
> write_sequnlock(&sig->stats_lock);
>
> /*
> @@ -240,9 +249,13 @@ void __weak release_thread(struct task_struct *dead_task)
> void release_task(struct task_struct *p)
> {
> struct task_struct *leader;
> - struct pid *thread_pid;
> int zap_leader;
> + struct release_task_post post;
> repeat:
> + memset(&post, 0, sizeof(post));
> +
> + proc_flush_pid(p->thread_pid);
> +
> /* don't need to get the RCU readlock here - the process is dead and
> * can't be modifying its own credentials. But shut RCU-lockdep up */
> rcu_read_lock();
> @@ -253,8 +266,7 @@ void release_task(struct task_struct *p)
>
> write_lock_irq(&tasklist_lock);
> ptrace_release_task(p);
> - thread_pid = get_pid(p->thread_pid);
> - __exit_signal(p);
> + __exit_signal(&post, p);
>
> /*
> * If we are the last non-leader member of the thread
> @@ -276,8 +288,21 @@ void release_task(struct task_struct *p)
> }
>
> write_unlock_irq(&tasklist_lock);
> - proc_flush_pid(thread_pid);
> - put_pid(thread_pid);
> +
> + /*
> + * Process clean up deferred to after we drop the tasklist lock.
> + */
> + add_device_randomness((const void*) &post.randomness,
> + sizeof(unsigned long long));
> + if (post.pids[PIDTYPE_PID])
> + free_pid(post.pids[PIDTYPE_PID]);
> + if (post.pids[PIDTYPE_TGID])
> + free_pid(post.pids[PIDTYPE_TGID]);
> + if (post.pids[PIDTYPE_PGID])
> + free_pid(post.pids[PIDTYPE_PGID]);
> + if (post.pids[PIDTYPE_SID])
> + free_pid(post.pids[PIDTYPE_SID]);
> +
> release_thread(p);
> put_task_struct_rcu_user(p);
>
> diff --git a/kernel/pid.c b/kernel/pid.c
> index 3a10a7b6fcf8..047cdbcef5cf 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -343,7 +343,7 @@ void attach_pid(struct task_struct *task, enum pid_type type)
> hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]);
> }
>
> -static void __change_pid(struct task_struct *task, enum pid_type type,
> +static struct pid *__change_pid(struct task_struct *task, enum pid_type type,
> struct pid *new)
> {
> struct pid **pid_ptr = task_pid_ptr(task, type);
> @@ -362,20 +362,33 @@ static void __change_pid(struct task_struct *task, enum pid_type type,
>
> for (tmp = PIDTYPE_MAX; --tmp >= 0; )
> if (pid_has_task(pid, tmp))
> - return;
> + return NULL;
>
> - free_pid(pid);
> + return pid;
> +}
> +
> +struct pid *detach_pid_return(struct task_struct *task, enum pid_type type)
> +{
> + return __change_pid(task, type, NULL);
> }
>
> void detach_pid(struct task_struct *task, enum pid_type type)
> {
> - __change_pid(task, type, NULL);
> + struct pid *pid;
> +
> + pid = detach_pid_return(task, type);
> + if (pid)
> + free_pid(pid);
> }
>
> void change_pid(struct task_struct *task, enum pid_type type,
> struct pid *pid)
> {
> - __change_pid(task, type, pid);
> + struct pid *opid;
> +
> + opid = __change_pid(task, type, pid);
> + if (opid)
> + free_pid(opid);
> attach_pid(task, type);
> }
>
> --
> 2.43.0
>