[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <bd0a1f71-4624-d88a-98a8-6550926349b3@colorfullife.com>
Date: Thu, 22 Jul 2021 20:46:14 +0200
From: Manfred Spraul <manfred@...orfullife.com>
To: Alexander Mikhalitsyn <alexander.mikhalitsyn@...tuozzo.com>,
linux-kernel@...r.kernel.org
Cc: "Eric W . Biederman" <ebiederm@...ssion.com>,
Andrew Morton <akpm@...ux-foundation.org>,
Davidlohr Bueso <dave@...olabs.net>,
Johannes Weiner <hannes@...xchg.org>,
Michal Hocko <mhocko@...e.com>,
Vladimir Davydov <vdavydov.dev@...il.com>,
Andrei Vagin <avagin@...il.com>,
Christian Brauner <christian@...uner.io>,
Pavel Tikhomirov <ptikhomirov@...tuozzo.com>,
Alexander Mikhalitsyn <alexander@...alicyn.com>
Subject: Re: [RFC PATCH] shm: extend forced shm destroy to support objects
from several IPC nses
Hi Alexander,
A few more remarks.
On 7/14/21 7:30 PM, Alexander Mikhalitsyn wrote:
> This is total rework of fix.
> Thanks to Eric Biederman for suggestions (but may be I've misunderstood some of them :))
>
> I've tested it with reproducer of the original problem. But of course it needs
> detailed testing. I hope that I get some general comments about design and implementation.
>
> ToDo: remove unneeded "ns" argument from shm_destroy, shm_rmid and other functions.
What ensures the that shp->ns is not destroyed prematurely?
I did some tests, and it seems that shmat() acquires a namespace
refcount, and shm_release() puts it again, and the shm_release is late
enough to ensure that the ns cannot go out of scope.
But I haven't checked all combinations (with/without shm_rmid_forced,
delete via exit(), shmctl(), shmdt(), mmap()).
And: This should be documented somewhere.
> --- a/ipc/shm.c
> +++ b/ipc/shm.c
> @@ -65,6 +65,7 @@ struct shmid_kernel /* private to the kernel */
> /* The task created the shm object. NULL if the task is dead. */
> struct task_struct *shm_creator;
> struct list_head shm_clist; /* list by creator */
I think the comments are wrong/outdated.
Some parts of the new code checks with list_empty(shm_clist), not by
looking at shm_creator.
> + struct ipc_namespace *ns;
> } __randomize_layout;
>
> /* shm_mode upper byte flags */
> @@ -115,6 +116,7 @@ static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
> struct shmid_kernel *shp;
>
> shp = container_of(ipcp, struct shmid_kernel, shm_perm);
> + BUG_ON(shp->ns && ns != shp->ns);
Is shp->ns == NULL allowed/possible? From what I see, it is impossible.
I think we should not have NULL check in a few codepaths, but not in
other codepaths. Either everywhere, or nowhere.
>
> if (shp->shm_nattch) {
> shp->shm_perm.mode |= SHM_DEST;
> @@ -225,9 +227,32 @@ static void shm_rcu_free(struct rcu_head *head)
> kfree(shp);
> }
>
> +static inline void task_shm_clist_lock(struct task_struct *task)
> +{
> + spin_lock(&task->sysvshm.shm_clist_lock);
> +}
> +
> +static inline void task_shm_clist_unlock(struct task_struct *task)
> +{
> + spin_unlock(&task->sysvshm.shm_clist_lock);
> +}
> +
> +void shm_clist_rm(struct shmid_kernel *shp)
> +{
> + if (!shp->shm_creator)
> + return;
> +
> + task_shm_clist_lock(shp->shm_creator);
> + list_del_init(&shp->shm_clist);
> + task_shm_clist_unlock(shp->shm_creator);
> + shp->shm_creator = NULL;
> +}
> +
> static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s)
> {
> - list_del(&s->shm_clist);
> + WARN_ON(s->ns && ns != s->ns);
> + //list_del_init(&s->shm_clist);
> + shm_clist_rm(s);
> ipc_rmid(&shm_ids(ns), &s->shm_perm);
> }
>
> @@ -306,10 +331,10 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
> *
> * 2) sysctl kernel.shm_rmid_forced is set to 1.
> */
> -static bool shm_may_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
> +static bool shm_may_destroy(struct shmid_kernel *shp)
> {
> return (shp->shm_nattch == 0) &&
> - (ns->shm_rmid_forced ||
> + (shp->ns->shm_rmid_forced ||
> (shp->shm_perm.mode & SHM_DEST));
> }
>
As written before: what ensures that shp->ns->shm_rmid_forced was not
released already?
> @@ -340,7 +365,7 @@ static void shm_close(struct vm_area_struct *vma)
> ipc_update_pid(&shp->shm_lprid, task_tgid(current));
> shp->shm_dtim = ktime_get_real_seconds();
> shp->shm_nattch--;
> - if (shm_may_destroy(ns, shp))
> + if (shm_may_destroy(shp))
> shm_destroy(ns, shp);
> else
> shm_unlock(shp);
> @@ -361,10 +386,10 @@ static int shm_try_destroy_orphaned(int id, void *p, void *data)
> *
> * As shp->* are changed under rwsem, it's safe to skip shp locking.
> */
> - if (shp->shm_creator != NULL)
> + if (!list_empty(&shp->shm_clist))
> return 0;
>
This collides with the comment above: here, list_empty() is used.
> - if (shm_may_destroy(ns, shp)) {
> + if (shm_may_destroy(shp)) {
> shm_lock_by_ptr(shp);
> shm_destroy(ns, shp);
> }
> @@ -379,51 +404,77 @@ void shm_destroy_orphaned(struct ipc_namespace *ns)
> up_write(&shm_ids(ns).rwsem);
> }
>
> +void shm_init_task(struct task_struct *task)
> +{
> + INIT_LIST_HEAD(&(task)->sysvshm.shm_clist);
> + spin_lock_init(&task->sysvshm.shm_clist_lock);
> +}
> +
> /* Locking assumes this will only be called with task == current */
> void exit_shm(struct task_struct *task)
> {
> - struct ipc_namespace *ns = task->nsproxy->ipc_ns;
> + LIST_HEAD(tmp);
> struct shmid_kernel *shp, *n;
>
> if (list_empty(&task->sysvshm.shm_clist))
> return;
>
> - /*
> - * If kernel.shm_rmid_forced is not set then only keep track of
> - * which shmids are orphaned, so that a later set of the sysctl
> - * can clean them up.
> - */
> - if (!ns->shm_rmid_forced) {
> - down_read(&shm_ids(ns).rwsem);
> - list_for_each_entry(shp, &task->sysvshm.shm_clist, shm_clist)
> - shp->shm_creator = NULL;
> - /*
> - * Only under read lock but we are only called on current
> - * so no entry on the list will be shared.
> - */
> - list_del(&task->sysvshm.shm_clist);
> - up_read(&shm_ids(ns).rwsem);
> - return;
> - }
> + rcu_read_lock(); /* for refcount_inc_not_zero */
> + task_shm_clist_lock(task);
>
> - /*
> - * Destroy all already created segments, that were not yet mapped,
> - * and mark any mapped as orphan to cover the sysctl toggling.
> - * Destroy is skipped if shm_may_destroy() returns false.
> - */
> - down_write(&shm_ids(ns).rwsem);
> list_for_each_entry_safe(shp, n, &task->sysvshm.shm_clist, shm_clist) {
> + struct ipc_namespace *ns = shp->ns;
> +
> + /*
> + * Remove shm from task list and nullify shm_creator which
> + * marks object as orphaned.
> + *
> + * If kernel.shm_rmid_forced is not set then only keep track of
> + * which shmids are orphaned, so that a later set of the sysctl
> + * can clean them up.
> + */
> + list_del_init(&shp->shm_clist);
> shp->shm_creator = NULL;
>
> - if (shm_may_destroy(ns, shp)) {
> - shm_lock_by_ptr(shp);
> - shm_destroy(ns, shp);
> + printk("exit_shm() %px refcnt=%u, id=%d,key=%x\n", shp,
> + refcount_read(&shp->shm_perm.refcount),
> + shp->shm_perm.id, shp->shm_perm.key
> + );
> +
> + /*
> + * Will destroy all already created segments, that were not yet mapped,
> + * and mark any mapped as orphan to cover the sysctl toggling.
> + * Destroy is skipped if shm_may_destroy() returns false.
> + */
> + if (shp->ns->shm_rmid_forced && shm_may_destroy(shp)) {
> + /*
> + * We may race with shm_exit_ns() if refcounter
> + * already zero. Let's skip shm_destroy() of such
> + * shm object as it will be destroyed during shm_exit_ns()
> + */
> + if (!refcount_inc_not_zero(&ns->ns.count)) /* get_ipc_ns */
> + continue;
> +
> + list_add(&shp->shm_clist, &tmp);
> }
> }
>
> - /* Remove the list head from any segments still attached. */
> list_del(&task->sysvshm.shm_clist);
> - up_write(&shm_ids(ns).rwsem);
> + task_shm_clist_unlock(task);
> + rcu_read_unlock();
> +
> + list_for_each_entry_safe(shp, n, &tmp, shm_clist) {
> + struct ipc_namespace *ns = shp->ns;
> +
> + list_del_init(&shp->shm_clist);
> +
> + down_write(&shm_ids(ns).rwsem);
> + shm_lock_by_ptr(shp);
> + /* will do put_ipc_ns(shp->ns) */
> + shm_destroy(ns, shp);
> + up_write(&shm_ids(ns).rwsem);
> + put_ipc_ns(ns); /* see refcount_inc_not_zero */
> + }
> }
>
I do not see the advantage of first collecting everything in a local
list, and then destroying the elements.
Attached is my current test case. Feel free to merge whatever you
consider as useful into your change.
--
Manfred
View attachment "0003-ipc-shm-locking-questions.patch" of type "text/x-patch" (10630 bytes)
Powered by blists - more mailing lists