linux-kernel - Re: Bug: fio traps into kernel without exiting because futex has a deadloop

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <4A309D16.4040505@us.ibm.com>
Date:	Wed, 10 Jun 2009 22:58:46 -0700
From:	Darren Hart <dvhltc@...ibm.com>
To:	"Zhang, Yanmin" <yanmin_zhang@...ux.intel.com>
CC:	Peter Zijlstra <peterz@...radead.org>,
	Rusty Russell <rusty@...tcorp.com.au>,
	LKML <linux-kernel@...r.kernel.org>,
	Thomas Gleixner <tglx@...utronix.de>
Subject: Re: Bug: fio traps into kernel without exiting because futex has
 a deadloop

Zhang, Yanmin wrote:

Hi Zhang,

> I investigated a fio hang issue. When I run fio multi-process
> testing on many disks, fio traps into the kernel and doesn't exit
> (mostly hit once after running sub test cases hundreds of times).
> 
> Oprofile data shows the kernel spending its time in some futex functions.
> The kill command couldn't kill the process, and a machine reboot also hangs.
> 
> Eventually, I located the root cause as a bug in futex. The kernel enters
> a deadloop between 'retry' and 'goto retry' in function futex_wake_op.
> For an unknown reason (might be an issue in fio or glibc), parameter uaddr2
> points to an area which is READONLY. So futex_atomic_op_inuser returns
> -EFAULT when trying to change the data at uaddr2, but the later get_user
> still succeeds because the area is READONLY, i.e. still readable. Then it
> goes back to retry.
> 
> I created a simple test case to trigger it, which just shmat()s a READONLY
> area for address uaddr2.
> 
> It could be used as a DoS attack.

Nice work on the diagnosis.  I recall discussing something like this a
couple of weeks back.  I thought this was fixed with a patch to ensure the
pages were writable.  Cc'ing Thomas G. to confirm.  I didn't see a
kernel version in your report; what are you running?

--
Darren

> 
> 'git log kernel/futex.c' shows that the commit below evidently introduced the issue.
> 
> commit e4dc5b7a36a49eff97050894cf1b3a9a02523717
> Author: Darren Hart <dvhltc@...ibm.com>
> Date:   Thu Mar 12 00:56:13 2009 -0700
> 
>     futex: clean up fault logic
>     
>     Impact: cleanup
>     
>     Older versions of the futex code held the mmap_sem which had to
>     be dropped in order to call get_user(), so a two-pronged fault
>     handling mechanism was employed to handle faults of the atomic
>     operations.  The mmap_sem is no longer held, so get_user()
>     should be adequate.  This patch greatly simplifies the logic and
>     improves legibility.
> 
> 
> Reverting it fixes the issue. In addition, the patch deleted some comments
> around the calls to futex_handle_fault, but forgot the one in futex_unlock_pi.
> 
> There is a conflict when reverting the commit, so I worked out a patch.
> 
> The futex code seems complicated. We could work out cleanup patches later, after
> applying the revert patch.
> 
> ---
> 
> --- linux-2.6.30-rc8/kernel/futex.c	2009-06-10 06:32:19.000000000 +0800
> +++ linux-2.6.30-rc8_futex/kernel/futex.c	2009-06-10 00:07:08.000000000 +0800
> @@ -300,6 +300,41 @@ static int get_futex_value_locked(u32 *d
>  	return ret ? -EFAULT : 0;
>  }
> 
> +/*
> + * Fault handling.
> + */
> +static int futex_handle_fault(unsigned long address, int attempt)
> +{
> +	struct vm_area_struct * vma;
> +	struct mm_struct *mm = current->mm;
> +	int ret = -EFAULT;
> +
> +	if (attempt > 2)
> +		return ret;
> +
> +	down_read(&mm->mmap_sem);
> +	vma = find_vma(mm, address);
> +	if (vma && address >= vma->vm_start &&
> +	    (vma->vm_flags & VM_WRITE)) {
> +		int fault;
> +		fault = handle_mm_fault(mm, vma, address, 1);
> +		if (unlikely((fault & VM_FAULT_ERROR))) {
> +#if 0
> +			/* XXX: let's do this when we verify it is OK */
> +			if (ret & VM_FAULT_OOM)
> +				ret = -ENOMEM;
> +#endif
> +		} else {
> +			ret = 0;
> +			if (fault & VM_FAULT_MAJOR)
> +				current->maj_flt++;
> +			else
> +				current->min_flt++;
> +		}
> +	}
> +	up_read(&mm->mmap_sem);
> +	return ret;
> +}
> 
>  /*
>   * PI code:
> @@ -722,9 +757,9 @@ futex_wake_op(u32 __user *uaddr1, int fs
>  	struct futex_hash_bucket *hb1, *hb2;
>  	struct plist_head *head;
>  	struct futex_q *this, *next;
> -	int ret, op_ret;
> +	int ret, op_ret, attempt = 0;
> 
> -retry:
> +retryfull:
>  	ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);
>  	if (unlikely(ret != 0))
>  		goto out;
> @@ -735,8 +770,9 @@ retry:
>  	hb1 = hash_futex(&key1);
>  	hb2 = hash_futex(&key2);
> 
> +retry:
>  	double_lock_hb(hb1, hb2);
> -retry_private:
> +
>  	op_ret = futex_atomic_op_inuser(op, uaddr2);
>  	if (unlikely(op_ret < 0)) {
>  		u32 dummy;
> @@ -757,16 +793,28 @@ retry_private:
>  			goto out_put_keys;
>  		}
> 
> +		/*
> +		 * futex_atomic_op_inuser needs to both read and write
> +		 * *(int __user *)uaddr2, but we can't modify it
> +		 * non-atomically.  Therefore, if get_user below is not
> +		 * enough, we need to handle the fault ourselves, while
> +		 * still holding the mmap_sem.
> +		 */
> +		if (attempt++) {
> +			ret = futex_handle_fault((unsigned long)uaddr2,
> +						 attempt);
> +			if (ret)
> +				goto out_put_keys;
> +			goto retry;
> +		}
> +
>  		ret = get_user(dummy, uaddr2);
>  		if (ret)
>  			goto out_put_keys;
> 
> -		if (!fshared)
> -			goto retry_private;
> -
>  		put_futex_key(fshared, &key2);
>  		put_futex_key(fshared, &key1);
> -		goto retry;
> +		goto retryfull;
>  	}
> 
>  	head = &hb1->chain;
> @@ -826,7 +874,6 @@ retry:
>  	hb1 = hash_futex(&key1);
>  	hb2 = hash_futex(&key2);
> 
> -retry_private:
>  	double_lock_hb(hb1, hb2);
> 
>  	if (likely(cmpval != NULL)) {
> @@ -837,16 +884,15 @@ retry_private:
>  		if (unlikely(ret)) {
>  			double_unlock_hb(hb1, hb2);
> 
> +			put_futex_key(fshared, &key2);
> +			put_futex_key(fshared, &key1);
> +
>  			ret = get_user(curval, uaddr1);
> -			if (ret)
> -				goto out_put_keys;
> 
> -			if (!fshared)
> -				goto retry_private;
> +			if (!ret)
> +				goto retry;
> 
> -			put_futex_key(fshared, &key2);
> -			put_futex_key(fshared, &key1);
> -			goto retry;
> +			goto out_put_keys;
>  		}
>  		if (curval != *cmpval) {
>  			ret = -EAGAIN;
> @@ -1026,7 +1072,7 @@ static int fixup_pi_state_owner(u32 __us
>  	struct futex_pi_state *pi_state = q->pi_state;
>  	struct task_struct *oldowner = pi_state->owner;
>  	u32 uval, curval, newval;
> -	int ret;
> +	int ret, attempt = 0;
> 
>  	/* Owner died? */
>  	if (!pi_state->owner)
> @@ -1097,7 +1143,7 @@ retry:
>  handle_fault:
>  	spin_unlock(q->lock_ptr);
> 
> -	ret = get_user(uval, uaddr);
> +	ret = futex_handle_fault((unsigned long)uaddr, attempt++);
> 
>  	spin_lock(q->lock_ptr);
> 
> @@ -1146,7 +1192,6 @@ retry:
>  	if (unlikely(ret != 0))
>  		goto out;
> 
> -retry_private:
>  	hb = queue_lock(&q);
> 
>  	/*
> @@ -1173,16 +1218,13 @@ retry_private:
> 
>  	if (unlikely(ret)) {
>  		queue_unlock(&q, hb);
> +		put_futex_key(fshared, &q.key);
> 
>  		ret = get_user(uval, uaddr);
> -		if (ret)
> -			goto out_put_key;
> 
> -		if (!fshared)
> -			goto retry_private;
> -
> -		put_futex_key(fshared, &q.key);
> -		goto retry;
> +		if (!ret)
> +			goto retry;
> +		goto out;
>  	}
>  	ret = -EWOULDBLOCK;
>  	if (unlikely(uval != val)) {
> @@ -1316,7 +1358,7 @@ static int futex_lock_pi(u32 __user *uad
>  	struct futex_hash_bucket *hb;
>  	u32 uval, newval, curval;
>  	struct futex_q q;
> -	int ret, lock_taken, ownerdied = 0;
> +	int ret, lock_taken, ownerdied = 0, attempt = 0;
> 
>  	if (refill_pi_state_cache())
>  		return -ENOMEM;
> @@ -1336,7 +1378,7 @@ retry:
>  	if (unlikely(ret != 0))
>  		goto out;
> 
> -retry_private:
> +retry_unlocked:
>  	hb = queue_lock(&q);
> 
>  retry_locked:
> @@ -1561,15 +1603,18 @@ uaddr_faulted:
>  	 */
>  	queue_unlock(&q, hb);
> 
> -	ret = get_user(uval, uaddr);
> -	if (ret)
> -		goto out_put_key;
> +	if (attempt++) {
> +		ret = futex_handle_fault((unsigned long)uaddr, attempt);
> +		if (ret)
> +			goto out_put_key;
> +		goto retry_unlocked;
> +	}
> 
> -	if (!fshared)
> -		goto retry_private;
> +	ret = get_user(uval, uaddr);
> +	if (!ret)
> +		goto retry_unlocked;
> 
> -	put_futex_key(fshared, &q.key);
> -	goto retry;
> +	goto out_put_key;
>  }
> 
> 
> @@ -1585,7 +1630,7 @@ static int futex_unlock_pi(u32 __user *u
>  	u32 uval;
>  	struct plist_head *head;
>  	union futex_key key = FUTEX_KEY_INIT;
> -	int ret;
> +	int ret, attempt = 0;
> 
>  retry:
>  	if (get_user(uval, uaddr))
> @@ -1601,6 +1646,7 @@ retry:
>  		goto out;
> 
>  	hb = hash_futex(&key);
> +retry_unlocked:
>  	spin_lock(&hb->lock);
> 
>  	/*
> @@ -1665,9 +1711,17 @@ pi_faulted:
>  	 * we have to drop the mmap_sem in order to call get_user().
>  	 */
>  	spin_unlock(&hb->lock);
> -	put_futex_key(fshared, &key);
> +
> +	if (attempt++) {
> +		ret = futex_handle_fault((unsigned long)uaddr, attempt);
> +		if (ret)
> +			goto out;
> +		uval = 0;
> +		goto retry_unlocked;
> +	}
> 
>  	ret = get_user(uval, uaddr);
> +	put_futex_key(fshared, &key);
>  	if (!ret)
>  		goto retry;
> 
> 
> 


-- 
Darren Hart
IBM Linux Technology Center
Real-Time Linux Team
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/