[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <803cf1bf-1934-abf7-727a-91a0458e3b3a@colorfullife.com>
Date: Sat, 6 Nov 2021 08:50:39 +0100
From: Manfred Spraul <manfred@...orfullife.com>
To: "Eric W. Biederman" <ebiederm@...ssion.com>
Cc: Alexander Mikhalitsyn <alexander.mikhalitsyn@...tuozzo.com>,
linux-kernel@...r.kernel.org,
Andrew Morton <akpm@...ux-foundation.org>,
Davidlohr Bueso <dave@...olabs.net>,
Greg KH <gregkh@...uxfoundation.org>,
Andrei Vagin <avagin@...il.com>,
Pavel Tikhomirov <ptikhomirov@...tuozzo.com>,
Vasily Averin <vvs@...tuozzo.com>,
Alexander Mikhalitsyn <alexander@...alicyn.com>,
stable@...r.kernel.org
Subject: Re: [RFC] shm: extend forced shm destroy to support objects from
several IPC nses (simplified)
Hi Eric,
On 11/5/21 22:34, Eric W. Biederman wrote:
> I have to dash so this is short.
As last time, I'll review the change and check for new/good ideas.
As first question: Is the change tested?
[...]
>
> /* The task created the shm object. NULL if the task is dead. */
> - struct task_struct *shm_creator;
> + struct task_struct __rcu *shm_creator;
> struct list_head shm_clist; /* list by creator */
> + struct ipc_namespace *shm_ns; /* valid when shm_nattch != 0 */
> } __randomize_layout;
>
There is no reason to modify shm_creator:
We need _one_ indicator that the creator has died, not two.
We have both list_empty() and shm_creator. Thus we should/must define
what is the relevant indicator, and every function must use the same one.
exit_sem() must walk shm_clist. list_empty() must return the correct answer.
Thus I think it is simpler that list_empty() is the indicator.
In addition, as you have correctly noticed: If we make shm_creator==NULL
the indicator, then we must use at __rcu or at least READ_ONCE() accessors.
But: This would only solve a self created problem. Just leave
shm_creator unmodified - and the need for READ_ONCE() goes away.
> /* shm_mode upper byte flags */
> @@ -106,29 +107,17 @@ void shm_init_ns(struct ipc_namespace *ns)
> ipc_init_ids(&shm_ids(ns));
> }
>
> -/*
> - * Called with shm_ids.rwsem (writer) and the shp structure locked.
> - * Only shm_ids.rwsem remains locked on exit.
> - */
> -static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
> +static void do_shm_destroy(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
> {
> - struct shmid_kernel *shp;
> -
> - shp = container_of(ipcp, struct shmid_kernel, shm_perm);
> -
> - if (shp->shm_nattch) {
> - shp->shm_perm.mode |= SHM_DEST;
> - /* Do not find it any more */
> - ipc_set_key_private(&shm_ids(ns), &shp->shm_perm);
> - shm_unlock(shp);
> - } else
> - shm_destroy(ns, shp);
> + struct shmid_kernel *shp =
> + container_of(ipcp, struct shmid_kernel, shm_perm);
> + shm_destroy(ns, shp);
> }
>
> #ifdef CONFIG_IPC_NS
> void shm_exit_ns(struct ipc_namespace *ns)
> {
> - free_ipcs(ns, &shm_ids(ns), do_shm_rmid);
> + free_ipcs(ns, &shm_ids(ns), do_shm_destroy);
> idr_destroy(&ns->ids[IPC_SHM_IDS].ipcs_idr);
> rhashtable_destroy(&ns->ids[IPC_SHM_IDS].key_ht);
> }
> @@ -225,9 +214,22 @@ static void shm_rcu_free(struct rcu_head *head)
> kfree(shp);
> }
>
> +static inline void shm_clist_del(struct shmid_kernel *shp)
> +{
> + struct task_struct *creator;
> +
> + rcu_read_lock();
> + creator = rcu_dereference(shp->shm_creator);
> + if (creator) {
> + task_lock(creator);
> + list_del(&shp->shm_clist);
Does this work? You are using list_del() instead of list_del_init().
I fear that this might break exit_sem()
> + task_unlock(creator);
> + }
> + rcu_read_unlock();
> +}
> +
> static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s)
> {
> - list_del(&s->shm_clist);
> ipc_rmid(&shm_ids(ns), &s->shm_perm);
> }
>
> @@ -283,7 +285,9 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
> shm_file = shp->shm_file;
> shp->shm_file = NULL;
> ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT;
> + shm_clist_del(shp);
> shm_rmid(ns, shp);
> + shp->shm_ns = NULL;
> shm_unlock(shp);
> if (!is_file_hugepages(shm_file))
> shmem_lock(shm_file, 0, shp->mlock_ucounts);
> @@ -361,7 +365,7 @@ static int shm_try_destroy_orphaned(int id, void *p, void *data)
> *
> * As shp->* are changed under rwsem, it's safe to skip shp locking.
> */
> - if (shp->shm_creator != NULL)
> + if (rcu_access_pointer(shp->shm_creator) != NULL)
> return 0;
>
> if (shm_may_destroy(ns, shp)) {
> @@ -382,48 +386,62 @@ void shm_destroy_orphaned(struct ipc_namespace *ns)
> /* Locking assumes this will only be called with task == current */
> void exit_shm(struct task_struct *task)
> {
> - struct ipc_namespace *ns = task->nsproxy->ipc_ns;
> - struct shmid_kernel *shp, *n;
> -
> - if (list_empty(&task->sysvshm.shm_clist))
> - return;
> -
> - /*
> - * If kernel.shm_rmid_forced is not set then only keep track of
> - * which shmids are orphaned, so that a later set of the sysctl
> - * can clean them up.
> - */
> - if (!ns->shm_rmid_forced) {
> - down_read(&shm_ids(ns).rwsem);
> - list_for_each_entry(shp, &task->sysvshm.shm_clist, shm_clist)
> - shp->shm_creator = NULL;
> - /*
> - * Only under read lock but we are only called on current
> - * so no entry on the list will be shared.
> - */
> - list_del(&task->sysvshm.shm_clist);
> - up_read(&shm_ids(ns).rwsem);
> - return;
> - }
> + struct list_head *head = &task->sysvshm.shm_clist;
>
> /*
> * Destroy all already created segments, that were not yet mapped,
> * and mark any mapped as orphan to cover the sysctl toggling.
> * Destroy is skipped if shm_may_destroy() returns false.
> */
> - down_write(&shm_ids(ns).rwsem);
> - list_for_each_entry_safe(shp, n, &task->sysvshm.shm_clist, shm_clist) {
> - shp->shm_creator = NULL;
> + for (;;) {
> + struct ipc_namespace *ns;
> + struct shmid_kernel *shp;
>
> - if (shm_may_destroy(ns, shp)) {
> + task_lock(task);
> + if (list_empty(head)) {
> + task_unlock(task);
> + break;
> + }
> +
> + shp = list_first_entry(head, struct shmid_kernel, shm_clist);
> +
> + list_del(&shp->shm_clist);
> + rcu_assign_pointer(shp->shm_creator, NULL);
> +
> + /*
> + * Guarantee that ns lives after task_list is dropped.
> + *
> + * This shm segment may not be attached and it's ipc
> + * namespace may be exiting. If so ignore the shm
> + * segment as it will be destroyed by shm_exit_ns.
> + */
> + ns = get_ipc_ns_not_zero(shp->shm_ns);
> + if (!ns) {
> + task_unlock(task);
> + continue;
> + }
> +
> + /* Guarantee shp lives after task_lock is dropped */
> + ipc_getref(&shp->shm_perm);
> +
> + /* Drop task_lock so that shm_destroy may take it */
> + task_unlock(task);
> +
> + /* Can the shm segment be destroyed? */
> + down_write(&shm_ids(ns).rwsem);
> + shm_lock_by_ptr(shp);
> + if (ipc_valid_object(&shp->shm_perm) &&
> + shm_may_destroy(ns, shp)) {
> shm_lock_by_ptr(shp);
> shm_destroy(ns, shp);
> + } else {
> + shm_unlock(shp);
> }
> - }
>
> - /* Remove the list head from any segments still attached. */
> - list_del(&task->sysvshm.shm_clist);
> - up_write(&shm_ids(ns).rwsem);
> + ipc_rcu_putref(&shp->shm_perm, shm_rcu_free);
> + up_write(&shm_ids(ns).rwsem);
> + put_ipc_ns(ns);
> + }
> }
>
> static vm_fault_t shm_fault(struct vm_fault *vmf)
> @@ -673,14 +691,17 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
> shp->shm_segsz = size;
> shp->shm_nattch = 0;
> shp->shm_file = file;
> - shp->shm_creator = current;
> + RCU_INIT_POINTER(shp->shm_creator, current);
> + shp->shm_ns = ns;
>
> /* ipc_addid() locks shp upon success. */
> error = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni);
> if (error < 0)
> goto no_id;
>
> + task_lock(current);
> list_add(&shp->shm_clist, ¤t->sysvshm.shm_clist);
> + task_unlock(current);
>
> /*
> * shmid gets reported as "inode#" in /proc/pid/maps.
> @@ -913,8 +934,14 @@ static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd,
> switch (cmd) {
> case IPC_RMID:
> ipc_lock_object(&shp->shm_perm);
> - /* do_shm_rmid unlocks the ipc object and rcu */
> - do_shm_rmid(ns, ipcp);
> + if (shp->shm_nattch) {
> + shp->shm_perm.mode |= SHM_DEST;
> + /* Do not find it any more */
> + ipc_set_key_private(&shm_ids(ns), &shp->shm_perm);
> + shm_unlock(shp);
> + } else
> + shm_destroy(ns, shp);
> + /* shm_unlock unlocked the ipc object and rcu */
> goto out_up;
> case IPC_SET:
> ipc_lock_object(&shp->shm_perm);
> diff --git a/ipc/util.c b/ipc/util.c
> index fa2d86ef3fb8..58228f342397 100644
> --- a/ipc/util.c
> +++ b/ipc/util.c
> @@ -525,6 +525,11 @@ void ipc_set_key_private(struct ipc_ids *ids, struct kern_ipc_perm *ipcp)
> ipcp->key = IPC_PRIVATE;
> }
>
> +void ipc_getref(struct kern_ipc_perm *ptr)
> +{
> + return refcount_inc(&ptr->refcount);
> +}
> +
> bool ipc_rcu_getref(struct kern_ipc_perm *ptr)
> {
> return refcount_inc_not_zero(&ptr->refcount);
> diff --git a/ipc/util.h b/ipc/util.h
> index 2dd7ce0416d8..e13b46ff675f 100644
> --- a/ipc/util.h
> +++ b/ipc/util.h
> @@ -170,6 +170,7 @@ static inline int ipc_get_maxidx(struct ipc_ids *ids)
> * refcount is initialized by ipc_addid(), before that point call_rcu()
> * must be used.
> */
> +void ipc_getref(struct kern_ipc_perm *ptr);
> bool ipc_rcu_getref(struct kern_ipc_perm *ptr);
> void ipc_rcu_putref(struct kern_ipc_perm *ptr,
> void (*func)(struct rcu_head *head));
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 38681ad44c76..3e881f78bcf2 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -3095,7 +3095,6 @@ int ksys_unshare(unsigned long unshare_flags)
> if (unshare_flags & CLONE_NEWIPC) {
> /* Orphan segments in old ns (see sem above). */
> exit_shm(current);
> - shm_init_task(current);
> }
>
> if (new_nsproxy)
Powered by blists - more mailing lists