Date:   Fri, 5 Nov 2021 20:03:21 +0100
From:   Manfred Spraul <manfred@...orfullife.com>
To:     "Eric W. Biederman" <ebiederm@...ssion.com>,
        Alexander Mikhalitsyn <alexander.mikhalitsyn@...tuozzo.com>
Cc:     linux-kernel@...r.kernel.org,
        Andrew Morton <akpm@...ux-foundation.org>,
        Davidlohr Bueso <dave@...olabs.net>,
        Greg KH <gregkh@...uxfoundation.org>,
        Andrei Vagin <avagin@...il.com>,
        Pavel Tikhomirov <ptikhomirov@...tuozzo.com>,
        Vasily Averin <vvs@...tuozzo.com>,
        Alexander Mikhalitsyn <alexander@...alicyn.com>,
        stable@...r.kernel.org
Subject: Re: [PATCH 2/2] shm: extend forced shm destroy to support objects
 from several IPC nses

Hi Eric,

On 11/5/21 18:46, Eric W. Biederman wrote:
>
>> -static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s)
>> +/*
>> + * It has to be called with shp locked.
>> + * It must be called before ipc_rmid()
>> + */
>> +static inline void shm_clist_rm(struct shmid_kernel *shp)
>>   {
>> -	list_del(&s->shm_clist);
>> -	ipc_rmid(&shm_ids(ns), &s->shm_perm);
>> +	struct task_struct *creator;
>> +
>> +	/*
>> +	 * A concurrent exit_shm may do a list_del_init() as well.
>> +	 * Just do nothing if exit_shm already did the work
>> +	 */
>> +	if (list_empty(&shp->shm_clist))
>> +		return;
> This looks like a problem.  With no lock held, the list_empty here is
> fundamentally an optimization.  So the rest of the function should run
> properly if this list_empty is removed.
>
> It does not look to me like the rest of the function will run properly
> if list_empty is removed.
>
> The code needs an rcu_lock or something like that to ensure that
> shm_creator does not go away between the time it is read and when the
> lock is taken.

>> +
>> +	/*
>> +	 * shp->shm_creator is guaranteed to be valid *only*
>> +	 * if shp->shm_clist is not empty.
>> +	 */
>> +	creator = shp->shm_creator;
>> +
>> +	task_lock(creator);
>> +	list_del_init(&shp->shm_clist);
>> +	task_unlock(creator);
>> +}
>> +

You are right!
I had checked the function several times, but I overlooked the
simple case. exit_shm() contains:

> task_lock()
> list_del_init()
> task_unlock()
>
> down_write(&shm_ids(ns).rwsem);
> shm_lock_by_ptr(shp);
>
<<< Since shm_clist_rm() is called while holding the shp lock,
exit_shm() cannot proceed past shm_lock_by_ptr(). Thus !list_empty()
guarantees that ->creator will not disappear.

But: for !shm_rmid_forced, exit_shm() never takes the shp lock :-(
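
To spell out the case I overlooked: with !shm_rmid_forced, nothing on the
exit_shm() side ever waits for the shp lock, so the creator task can go
away in the middle of shm_clist_rm(). Roughly (just a sketch of the
interleaving, with CPU A being e.g. a concurrent shmctl(IPC_RMID) doing
shm_destroy() -> shm_rmid() -> shm_clist_rm() with the shp locked):

CPU A (shm_clist_rm(), shp locked):
	if (list_empty(&shp->shm_clist))	/* false, still linked */
	creator = shp->shm_creator;

CPU B (creator task exiting, ns->shm_rmid_forced == 0):
	exit_shm():
		task_lock(task);
		list_del_init(&shp->shm_clist);
		task_unlock(task);
		/* !shm_rmid_forced: just continue, the shp lock is never taken */
	... exit completes, task_struct is eventually freed

CPU A:
	task_lock(creator);	<<< use-after-free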


>> +static inline void shm_rmid(struct shmid_kernel *s)
>> +{
>> +	shm_clist_rm(s);
>> +	ipc_rmid(&shm_ids(s->ns), &s->shm_perm);
>>   }
>>   
>>   
>> @@ -283,7 +319,7 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
>>   	shm_file = shp->shm_file;
>>   	shp->shm_file = NULL;
>>   	ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT;
>> -	shm_rmid(ns, shp);
>> +	shm_rmid(shp);
>>   	shm_unlock(shp);
>>   	if (!is_file_hugepages(shm_file))
>>   		shmem_lock(shm_file, 0, shp->mlock_ucounts);
>> @@ -306,10 +342,10 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
>>    *
>>    * 2) sysctl kernel.shm_rmid_forced is set to 1.
>>    */
>> -static bool shm_may_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
>> +static bool shm_may_destroy(struct shmid_kernel *shp)
>>   {
>>   	return (shp->shm_nattch == 0) &&
>> -	       (ns->shm_rmid_forced ||
>> +	       (shp->ns->shm_rmid_forced ||
>>   		(shp->shm_perm.mode & SHM_DEST));
>>   }
>>   
>> @@ -340,7 +376,7 @@ static void shm_close(struct vm_area_struct *vma)
>>   	ipc_update_pid(&shp->shm_lprid, task_tgid(current));
>>   	shp->shm_dtim = ktime_get_real_seconds();
>>   	shp->shm_nattch--;
>> -	if (shm_may_destroy(ns, shp))
>> +	if (shm_may_destroy(shp))
>>   		shm_destroy(ns, shp);
>>   	else
>>   		shm_unlock(shp);
>> @@ -361,10 +397,10 @@ static int shm_try_destroy_orphaned(int id, void *p, void *data)
>>   	 *
>>   	 * As shp->* are changed under rwsem, it's safe to skip shp locking.
>>   	 */
> We should add a comment why testing list_empty here is safe/reliable.
>
> Now that the list deletion is only protected by task_lock it feels like
> this introduces a race.
>
> I don't think the race is meaningful as either the list is non-empty
> or it is empty.  Plus none of the following tests are racy.  So there
> is no danger of an attached segment being destroyed.
The shp can be destroyed, in the sense that ->deleted is set. But this is
handled.
>> -	if (shp->shm_creator != NULL)
>> +	if (!list_empty(&shp->shm_clist))
>>   		return 0;
>>   
>> -	if (shm_may_destroy(ns, shp)) {
>> +	if (shm_may_destroy(shp)) {
>>   		shm_lock_by_ptr(shp);
>>   		shm_destroy(ns, shp);
>>   	}
>> @@ -382,48 +418,87 @@ void shm_destroy_orphaned(struct ipc_namespace *ns)
>>   /* Locking assumes this will only be called with task == current */
>>   void exit_shm(struct task_struct *task)
>>   {
>> -	struct ipc_namespace *ns = task->nsproxy->ipc_ns;
>> -	struct shmid_kernel *shp, *n;
>> +	for (;;) {
>> +		struct shmid_kernel *shp;
>> +		struct ipc_namespace *ns;
>>   
>> -	if (list_empty(&task->sysvshm.shm_clist))
>> -		return;
>> +		task_lock(task);
>> +
>> +		if (list_empty(&task->sysvshm.shm_clist)) {
>> +			task_unlock(task);
>> +			break;
>> +		}
>> +
>> +		shp = list_first_entry(&task->sysvshm.shm_clist, struct shmid_kernel,
>> +				shm_clist);
>> +
>> +		/* 1) unlink */
>> +		list_del_init(&shp->shm_clist);
>                  ^^^^^^^
>                  The code should also clear shm_creator here as well.
>                  So that a stale reference becomes a NULL pointer
>                  dereference instead of use-after-free.  Something like:
list_del_init() already contains a WRITE_ONCE(), and that pairs with the
READ_ONCE() in list_empty().

Using both shp->shm_creator == NULL and list_empty() as protection
doesn't help; it can only introduce new races.
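
(For reference, the pairing I mean is in include/linux/list.h -- quoting
roughly from memory, so please check against the actual tree:)

static inline int list_empty(const struct list_head *head)
{
	return READ_ONCE(head->next) == head;
}

static inline void INIT_LIST_HEAD(struct list_head *list)
{
	WRITE_ONCE(list->next, list);
	list->prev = list;
}

static inline void list_del_init(struct list_head *entry)
{
	__list_del_entry(entry);
	INIT_LIST_HEAD(entry);	/* the WRITE_ONCE that pairs with list_empty() */
}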



> 		/*
>                   * The old shm_creator value will remain valid for
>                   * at least an rcu grace period after this, see
> 		 * put_task_struct_rcu_user.
>                   */
>                   
> 		rcu_assign_pointer(shp->shm_creator, NULL);
>
> This allows shm_clist_rm to look like:
> static inline void shm_clist_rm(struct shmid_kernel *shp)
> {
> 	struct task_struct *creator;
>
> 	rcu_read_lock();
>          creator = rcu_dereference(shp->shm_creator);

We must protect against a parallel:
exit_shm(); <...>; kmem_cache_free(, creator), correct?

No other races are relevant, as shp->shm_creator is written once and 
then never updated.

Thus, my current understanding: We need the rcu_read_lock().

And rcu_read_lock() is sufficient, as release_task ends with 
put_task_struct_rcu_user().

>          if (creator) {
>          	task_lock(creator);
>                  list_del_init(&shp->shm_clist);
>                  task_unlock(creator);
>          }
>          rcu_read_unlock();
> }
> 	
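
In other words, keeping list_empty() as the test and leaving shm_creator
untouched, I would expect shm_clist_rm() to end up roughly like this
(only a sketch of my current understanding, not tested):

static inline void shm_clist_rm(struct shmid_kernel *shp)
{
	struct task_struct *creator;

	/* ensure that shm_creator does not disappear */
	rcu_read_lock();

	/*
	 * A concurrent exit_shm may do a list_del_init() as well.
	 * Just do nothing if exit_shm already did the work.
	 */
	if (!list_empty(&shp->shm_clist)) {
		/*
		 * shp->shm_creator is guaranteed to be valid *only*
		 * if shp->shm_clist is not empty.
		 */
		creator = shp->shm_creator;

		task_lock(creator);
		/* list_del_init() is a nop if exit_shm() won the race. */
		list_del_init(&shp->shm_clist);
		task_unlock(creator);
	}
	rcu_read_unlock();
}
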
>>   
>> -	/*
>> -	 * If kernel.shm_rmid_forced is not set then only keep track of
>> -	 * which shmids are orphaned, so that a later set of the sysctl
>> -	 * can clean them up.
>> -	 */
>> -	if (!ns->shm_rmid_forced) {
>> -		down_read(&shm_ids(ns).rwsem);
>> -		list_for_each_entry(shp, &task->sysvshm.shm_clist, shm_clist)
>> -			shp->shm_creator = NULL;
>>   		/*
>> -		 * Only under read lock but we are only called on current
>> -		 * so no entry on the list will be shared.
>> +		 * 2) Get a pointer to the ipc namespace. It is worth noting
>> +		 * that this pointer is guaranteed to be valid because
>> +		 * the shp lifetime is always shorter than the lifetime of
>> +		 * the namespace in which shp lives.
>> +		 * We have taken task_lock(), which means that shp won't be freed.
>>   		 */
>> -		list_del(&task->sysvshm.shm_clist);
>> -		up_read(&shm_ids(ns).rwsem);
>> -		return;
>> -	}
>> +		ns = shp->ns;
>>   
>> -	/*
>> -	 * Destroy all already created segments, that were not yet mapped,
>> -	 * and mark any mapped as orphan to cover the sysctl toggling.
>> -	 * Destroy is skipped if shm_may_destroy() returns false.
>> -	 */
>> -	down_write(&shm_ids(ns).rwsem);
>> -	list_for_each_entry_safe(shp, n, &task->sysvshm.shm_clist, shm_clist) {
>> -		shp->shm_creator = NULL;
>> +		/*
>> +		 * 3) If kernel.shm_rmid_forced is not set then only keep track of
>> +		 * which shmids are orphaned, so that a later set of the sysctl
>> +		 * can clean them up.
>> +		 */
>> +		if (!ns->shm_rmid_forced) {
>> +			task_unlock(task);
>> +			continue;
>> +		}
>>   
>> -		if (shm_may_destroy(ns, shp)) {
>> +		/*
>> +		 * 4) get a reference to the namespace.
>> +		 *    The refcount could already be 0. If it is 0, then
>> +		 *    the shm objects will be freed by free_ipc_work().
>> +		 */
>> +		ns = get_ipc_ns_not_zero(ns);
>> +		if (ns) {
>                  ^^^^^^^^^
>
>                  This test is probably easier to follow if it was simply:
>                  if (!ns) {
>                  	task_unlock(task);
>                          continue;
>                  }
>
>                  Then the basic logic can all stay at the same
>                  indentation level, and ns does not need to be
>                  tested a second time.
>
>> +			/*
>> +			 * 5) get a reference to the shp itself.
>> +			 *   This cannot fail: shm_clist_rm() is called before
>> +			 *   ipc_rmid(), thus the refcount cannot be 0.
>> +			 */
>> +			WARN_ON(!ipc_rcu_getref(&shp->shm_perm));
>                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>                          This calls for an ipc_getref that simply calls
>                          refcount_inc.  Then the refcount code can
>                          perform all of the sanity checks for you,
>                          and the WARN_ON becomes unnecessary.
>
>                          Plus the code then documents the fact you know
>                          the refcount must be non-zero here.
>> +		}
>> +
>> +		task_unlock(task);
>> +
>> +		if (ns) {
>> +			down_write(&shm_ids(ns).rwsem);
>>   			shm_lock_by_ptr(shp);
>> -			shm_destroy(ns, shp);
>> +			/*
>> +			 * rcu_read_lock was implicitly taken in
>> +			 * shm_lock_by_ptr, it's safe to call
>> +			 * ipc_rcu_putref here
>                            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>                            This comment should say something like:
>
>                            rcu_read_lock was taken in shm_lock_by_ptr.
>                            With rcu protecting our accesses of shp
>                            holding a reference to shp is unnecessary.
>                            
>> +			 */
>> +			ipc_rcu_putref(&shp->shm_perm, shm_rcu_free);
>                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>                          It probably makes most sense just to move
>                          this decrement of the extra reference down to
>                          just before put_ipc_ns.  Removing the need
>                          for the comment and understanding the subtleties
>                          there, and keeping all of the taking and putting
>                          in a consistent order.
>                          
>
>> +
>> +			if (ipc_valid_object(&shp->shm_perm)) {
>> +				if (shm_may_destroy(shp))
>> +					shm_destroy(ns, shp);
>> +				else
>> +					shm_unlock(shp);
>> +			} else {
>> +				/*
>> +				 * Someone else deleted the shp from the namespace
>> +				 * idr/kht while we were waiting.
>> +				 * Just unlock and continue.
>> +				 */
>> +				shm_unlock(shp);
>> +			}
>> +
>> +			up_write(&shm_ids(ns).rwsem);
>> +			put_ipc_ns(ns); /* paired with get_ipc_ns_not_zero */
>>   		}
>>   	}
>> -
>> -	/* Remove the list head from any segments still attached. */
>> -	list_del(&task->sysvshm.shm_clist);
>> -	up_write(&shm_ids(ns).rwsem);
>>   }
>>   
>>   static vm_fault_t shm_fault(struct vm_fault *vmf)
>> @@ -680,7 +755,11 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
>>   	if (error < 0)
>>   		goto no_id;
>>   
>> +	shp->ns = ns;
>> +
>> +	task_lock(current);
>>   	list_add(&shp->shm_clist, &current->sysvshm.shm_clist);
>> +	task_unlock(current);
>>   
>>   	/*
>>   	 * shmid gets reported as "inode#" in /proc/pid/maps.
>> @@ -1573,7 +1652,8 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg,
>>   	down_write(&shm_ids(ns).rwsem);
>>   	shp = shm_lock(ns, shmid);
>>   	shp->shm_nattch--;
>> -	if (shm_may_destroy(ns, shp))
>> +
>> +	if (shm_may_destroy(shp))
>>   		shm_destroy(ns, shp);
>>   	else
>>   		shm_unlock(shp);
> Eric

