include/linux/psi.h | 1 + include/linux/psi_types.h | 3 ++ kernel/cgroup/cgroup.c | 2 +- kernel/sched/psi.c | 71 ++++++++++++++++++++++++++++++++--------------- 4 files changed, 54 insertions(+), 23 deletions(-) diff --git a/include/linux/psi.h b/include/linux/psi.h index 65eb1476ac70..9ec9468d6068 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -33,6 +33,7 @@ void cgroup_move_task(struct task_struct *p, struct css_set *to); struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf, size_t nbytes, enum psi_res res); void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *t); +void psi_trigger_release(void **trigger_ptr); __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file, poll_table *wait); diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 0a23300d49af..eab79e68bf56 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -132,6 +132,9 @@ struct psi_trigger { /* Refcounting to prevent premature destruction */ struct kref refcount; + + /* Previous trigger that this one replaced */ + struct psi_trigger *stale_trigger; }; struct psi_group { diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 919194de39c8..801d0aec0443 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3684,7 +3684,7 @@ static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, static void cgroup_pressure_release(struct kernfs_open_file *of) { - psi_trigger_replace(&of->priv, NULL); + psi_trigger_release(&of->priv); } bool cgroup_psi_enabled(void) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 1652f2bb54b7..10430f75f21a 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1152,6 +1152,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, t->last_event_time = 0; init_waitqueue_head(&t->event_wait); kref_init(&t->refcount); + t->stale_trigger = NULL; mutex_lock(&group->trigger_lock); @@ -1223,12 +1224,6 @@ static void psi_trigger_destroy(struct kref *ref) mutex_unlock(&group->trigger_lock); - /* - * Wait for both *trigger_ptr from psi_trigger_replace and - * poll_task RCUs to complete their read-side critical sections - * before destroying the trigger and optionally the poll_task - */ - synchronize_rcu(); /* * Stop kthread 'psimon' after releasing trigger_lock to prevent a * deadlock while waiting for psi_poll_work to acquire trigger_lock @@ -1243,16 +1238,48 @@ static void psi_trigger_destroy(struct kref *ref) kfree(t); } +/* + * Replacing a trigger must not throw away the old one, since it + * can still have pending waiters. + * + * Possible optimization: after successfully installing a new + * trigger, we could release the old one from the stale list + * early. Not done here yet - needs locking with psi_trigger_poll. + */ void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new) { - struct psi_trigger *old = *trigger_ptr; + if (static_branch_likely(&psi_disabled)) + return; + + for (;;) { + struct psi_trigger *old = *trigger_ptr; + + new->stale_trigger = old; + if (try_cmpxchg(trigger_ptr, old, new)) + break; + } + + /* + * Now that the new one has been installed, we could + * check if the stale one has an empty wait-queue + * and release it early... But we'd need some locking + * with enw poll users to be sure. + */ +} + +/* No locking needed for final release */ +void psi_trigger_release(void **trigger_ptr) +{ + struct psi_trigger *trigger; if (static_branch_likely(&psi_disabled)) return; - rcu_assign_pointer(*trigger_ptr, new); - if (old) - kref_put(&old->refcount, psi_trigger_destroy); + while (trigger) { + struct psi_trigger *next = trigger->stale_trigger; + kref_put(&trigger->refcount, psi_trigger_destroy); + trigger = next; + } } __poll_t psi_trigger_poll(void **trigger_ptr, @@ -1264,24 +1291,24 @@ __poll_t psi_trigger_poll(void **trigger_ptr, if (static_branch_likely(&psi_disabled)) return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; - rcu_read_lock(); - - t = rcu_dereference(*(void __rcu __force **)trigger_ptr); - if (!t) { - rcu_read_unlock(); + /* + * See psi_trigger_replace(): finding a trigger means + * that it is guaranteed to have an elevated refcount + * for the lifetime of this file descriptor. + * + * If we had locking, we could release it early. As it + * is, we'll only release it with psi_trigger_release() + * at the very end. + */ + t = READ_ONCE(*trigger_ptr); + if (!t) return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; - } - kref_get(&t->refcount); - - rcu_read_unlock(); poll_wait(file, &t->event_wait, wait); if (cmpxchg(&t->event, 1, 0) == 1) ret |= EPOLLPRI; - kref_put(&t->refcount, psi_trigger_destroy); - return ret; } @@ -1347,7 +1374,7 @@ static int psi_fop_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; - psi_trigger_replace(&seq->private, NULL); + psi_trigger_release(&seq->private); return single_release(inode, file); }