This patch modifies futex_wait() to use an hrtimer + schedule() in place of schedule_timeout(). schedule_timeout() is tick based, therefore the timeout granularity is the tick (1 ms, 4 ms or 10 ms depending on HZ). By using a high resolution timer for timeout wakeup, we can attain a much finer timeout granularity (in the microsecond range). This parallels what is already done for futex_lock_pi(). The timeout passed to the syscall is no longer converted to jiffies and is therefore passed to do_futex() and futex_wait() as a timespec therefore keeping nanosecond resolution. Also this removes the need to pass the nanoseconds timeout part to futex_lock_pi() in val2. In futex_wait(), if there is no timeout then a regular schedule() is performed. Otherwise, an hrtimer is fired before schedule() is called. Signed-off-by: Sébastien Dugué Signed-off-by: Pierre Peiffer --- include/linux/futex.h | 2 - kernel/futex.c | 59 +++++++++++++++++++++++++++++++++----------------- kernel/futex_compat.c | 12 ++-------- 3 files changed, 43 insertions(+), 30 deletions(-) Index: b/kernel/futex.c =================================================================== --- a/kernel/futex.c +++ b/kernel/futex.c @@ -998,7 +998,7 @@ static void unqueue_me_pi(struct futex_q drop_futex_key_refs(&q->key); } -static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) +static int futex_wait(u32 __user *uaddr, u32 val, struct timespec *time) { struct task_struct *curr = current; DECLARE_WAITQUEUE(wait, curr); @@ -1006,6 +1006,8 @@ static int futex_wait(u32 __user *uaddr, struct futex_q q; u32 uval; int ret; + struct hrtimer_sleeper t; + int rem = 0; q.pi_state = NULL; retry: @@ -1083,8 +1085,31 @@ static int futex_wait(u32 __user *uaddr, * !plist_node_empty() is safe here without any lock. * q.lock_ptr != 0 is not safe, because of ordering against wakeup. */ - if (likely(!plist_node_empty(&q.list))) - time = schedule_timeout(time); + if (likely(!plist_node_empty(&q.list))) { + if (!time) + schedule(); + else { + hrtimer_init(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_init_sleeper(&t, current); + t.timer.expires = timespec_to_ktime(*time); + + hrtimer_start(&t.timer, t.timer.expires, HRTIMER_MODE_REL); + + /* + * the timer could have already expired, in which + * case current would be flagged for rescheduling. + * Don't bother calling schedule. + */ + if (likely(t.task)) + schedule(); + + hrtimer_cancel(&t.timer); + + /* Flag if a timeout occured */ + rem = (t.task == NULL); + } + } + __set_current_state(TASK_RUNNING); /* @@ -1095,7 +1120,7 @@ static int futex_wait(u32 __user *uaddr, /* If we were woken (and unqueued), we succeeded, whatever. */ if (!unqueue_me(&q)) return 0; - if (time == 0) + if (rem) return -ETIMEDOUT; /* * We expect signal_pending(current), but another thread may @@ -1117,8 +1142,8 @@ static int futex_wait(u32 __user *uaddr, * if there are waiters then it will block, it does PI, etc. (Due to * races the kernel might see a 0 value of the futex too.) */ -static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, - long nsec, int trylock) +static int futex_lock_pi(u32 __user *uaddr, int detect, struct timespec *time, + int trylock) { struct hrtimer_sleeper timeout, *to = NULL; struct task_struct *curr = current; @@ -1130,11 +1155,11 @@ static int futex_lock_pi(u32 __user *uad if (refill_pi_state_cache()) return -ENOMEM; - if (sec != MAX_SCHEDULE_TIMEOUT) { + if (time) { to = &timeout; hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); hrtimer_init_sleeper(to, current); - to->timer.expires = ktime_set(sec, nsec); + to->timer.expires = timespec_to_ktime(*time); } q.pi_state = NULL; @@ -1770,7 +1795,7 @@ void exit_robust_list(struct task_struct } } -long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout, +long do_futex(u32 __user *uaddr, int op, u32 val, struct timespec *timeout, u32 __user *uaddr2, u32 val2, u32 val3) { int ret; @@ -1796,13 +1821,13 @@ long do_futex(u32 __user *uaddr, int op, ret = futex_wake_op(uaddr, uaddr2, val, val2, val3); break; case FUTEX_LOCK_PI: - ret = futex_lock_pi(uaddr, val, timeout, val2, 0); + ret = futex_lock_pi(uaddr, val, timeout, 0); break; case FUTEX_UNLOCK_PI: ret = futex_unlock_pi(uaddr); break; case FUTEX_TRYLOCK_PI: - ret = futex_lock_pi(uaddr, 0, timeout, val2, 1); + ret = futex_lock_pi(uaddr, 0, timeout, 1); break; default: ret = -ENOSYS; @@ -1815,8 +1840,7 @@ asmlinkage long sys_futex(u32 __user *ua struct timespec __user *utime, u32 __user *uaddr2, u32 val3) { - struct timespec t; - unsigned long timeout = MAX_SCHEDULE_TIMEOUT; + struct timespec t, *tp = NULL; u32 val2 = 0; if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) { @@ -1824,12 +1848,7 @@ asmlinkage long sys_futex(u32 __user *ua return -EFAULT; if (!timespec_valid(&t)) return -EINVAL; - if (op == FUTEX_WAIT) - timeout = timespec_to_jiffies(&t) + 1; - else { - timeout = t.tv_sec; - val2 = t.tv_nsec; - } + tp = &t; } /* * requeue parameter in 'utime' if op == FUTEX_REQUEUE. @@ -1837,7 +1856,7 @@ asmlinkage long sys_futex(u32 __user *ua if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE) val2 = (u32) (unsigned long) utime; - return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); + return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); } static int futexfs_get_sb(struct file_system_type *fs_type, Index: b/include/linux/futex.h =================================================================== --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -94,7 +94,7 @@ struct robust_list_head { #define ROBUST_LIST_LIMIT 2048 #ifdef __KERNEL__ -long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout, +long do_futex(u32 __user *uaddr, int op, u32 val, struct timespec *timeout, u32 __user *uaddr2, u32 val2, u32 val3); extern int Index: b/kernel/futex_compat.c =================================================================== --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -141,8 +141,7 @@ asmlinkage long compat_sys_futex(u32 __u struct compat_timespec __user *utime, u32 __user *uaddr2, u32 val3) { - struct timespec t; - unsigned long timeout = MAX_SCHEDULE_TIMEOUT; + struct timespec t, *tp = NULL; int val2 = 0; if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) { @@ -150,15 +149,10 @@ asmlinkage long compat_sys_futex(u32 __u return -EFAULT; if (!timespec_valid(&t)) return -EINVAL; - if (op == FUTEX_WAIT) - timeout = timespec_to_jiffies(&t) + 1; - else { - timeout = t.tv_sec; - val2 = t.tv_nsec; - } + tp = &t; } if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE) val2 = (int) (unsigned long) utime; - return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); + return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); } -- -- Pierre Peiffer - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/