/* -*- mode:C; tab-width:8; c-basic-offset:8 -*-
 * Anticipatory & deadline i/o scheduler.
 *
 * Copyright (C) 2002 Jens Axboe
 *		      Nick Piggin
 * Copyright (C) 2009 Corrado Zoccolo
 *
 */
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/bio.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/compiler.h>
#include <linux/rbtree.h>
#include <linux/interrupt.h>

/*
 * See Documentation/block/as-iosched.txt
 */

#define debug_printk(X...) ((void)0)

/*
 * max time we may wait to anticipate (default around 6ms)
 */
static const unsigned long default_antic_expire = (HZ / 150) ? HZ / 150 : 1;

/*
 * Base of all time allocation computations.
 * Should roughly be 1/3 of the maximum acceptable latency.
 * Default 100ms
 */
static const unsigned long default_time_quantum = (HZ / 10) ? HZ / 10 : 1;

/*
 * Number of times that other workloads can be scheduled before async
 */
static const unsigned long default_max_async_starved = 2;

/*
 * Keep track of up to 20ms thinktimes. We can go as big as we like here,
 * however huge values tend to interfere and not decay fast enough. A program
 * might be in a non-io phase of operation. Waiting on user input for example,
 * or doing a lengthy computation. A small penalty can be justified there, and
 * will still catch out those processes that constantly have large thinktimes.
 */
#define MAX_THINKTIME (HZ/50UL)

/* Bits in as_io_context.state */
enum as_io_states {
	AS_TASK_RUNNING=0,	/* Process has not exited */
	AS_TASK_IOSTARTED,	/* Process has started some IO */
	AS_TASK_IORUNNING,	/* Process has completed some IO */
};

enum anticipation_status {
	ANTIC_OFF=0,		/* Not anticipating (normal operation) */
	ANTIC_WAIT_REQ,		/* The last read has not yet completed */
	ANTIC_WAIT_NEXT,	/* Currently anticipating a request vs
				   last read (which has completed) */
	ANTIC_FINISHED,		/* Anticipating but have found a candidate
				 * or timed out */
};

struct as_data {
	/*
	 * run time data
	 */
	struct request_queue *q;	/* the "owner" queue */

	/*
	 * per_workload state
	 */
	struct async_wl {
		struct rb_root sort_list;
		struct list_head fifo_list;
		unsigned count;
	} async_workload;

	struct random_sync_wl {
		struct rb_root phases[2];
		struct list_head fifo_list;
		unsigned current_phase;
		unsigned count;
	} random_sync_workload[2];

	struct seq_sync_wl {
		struct list_head fifo_list;
		struct list_head fast_path;
		unsigned count;
		unsigned long expire_current;	/* jiffies + f(time_quantum) */
	} seq_sync_workload;

	struct request *next_rq;	/* for async & random sync wl */
	sector_t active_sector;		/* for close_req computation */

	/* phase data */
	unsigned long workload_expires;
	unsigned int current_workload : 2;
	unsigned int rand_sync_ddir : 1;
	unsigned int async_starved : 4;

	/* as state */
	unsigned int ioc_finished : 1;	/* IO associated with io_context is finished */
	unsigned int reads_delayed : 8;

	/* same as Anticipatory I/O scheduler */
	enum anticipation_status antic_status;
	unsigned long antic_start;	/* jiffies: when it started */
	struct timer_list antic_timer;	/* anticipatory scheduling timer */
	struct work_struct antic_work;	/* Deferred unplugging */
	struct io_context *io_context;	/* Identify the expected process */
	int nr_dispatched;

	/*
	 * settings that change how the i/o scheduler behaves
	 */
	unsigned long antic_expire;
	unsigned long time_quantum;
	unsigned long max_async_starved;

	/*
	 * statistics collection
	 */
	unsigned stats[2][5];
};

/*
 * per-request data.
 */
enum arq_state {
	AS_RQ_NEW=0,		/* New - not referenced and not on any lists */
	AS_RQ_QUEUED,		/* In the request queue. It belongs to the
				   scheduler */
	AS_RQ_DISPATCHED,	/* On the dispatch list. It belongs to the
				   driver now */
	AS_RQ_PRESCHED,		/* Debug poisoning for requests being used */
	AS_RQ_REMOVED,
	AS_RQ_MERGED,
	AS_RQ_POSTSCHED,	/* when they shouldn't be */
};

enum arq_workload {
	AS_RQ_ASYNC=0,
	AS_RQ_RANDSYNC,
	AS_RQ_SEQSYNC
};

static inline struct io_context *RQ_IOC(struct request *rq)
{
	return (struct io_context *)rq->elevator_private;
}

static inline enum arq_state RQ_STATE(struct request *rq)
{
	return ((unsigned)rq->elevator_private2) & 7;
}

static inline enum arq_state RQ_WORKLOAD(struct request *rq)
{
	return ((unsigned)rq->elevator_private2) >> 3;
}

/*
 * The per-request state and workload are packed together in
 * rq->elevator_private2: the low 3 bits hold the arq_state, the remaining
 * bits hold the workload index assigned in as_add_request().
 */
static inline void RQ_SET_STATE_WL(struct request *rq, enum arq_state state,
				   unsigned workload)
{
	rq->elevator_private2 = (void *)((state & 7) | (workload << 3));
}

#define RQ_SET_STATE(rq, state) RQ_SET_STATE_WL(rq, state, RQ_WORKLOAD(rq))
#define RQ_SET_WL(rq, workload) RQ_SET_STATE_WL(rq, RQ_STATE(rq), workload)

static DEFINE_PER_CPU(unsigned long, ioc_count);
static struct completion *ioc_gone;
static DEFINE_SPINLOCK(ioc_gone_lock);

static void as_move_to_dispatch(struct as_data *ad, struct request *rq);
static void as_antic_stop(struct as_data *ad);

/*
 * IO Context helper functions
 * We share the same ioc pointer slot that is used by AS,
 * so we have to fill the information consistently
 */

/* Called to deallocate the as_io_context */
static void free_as_io_context(struct as_io_context *aic)
{
	kfree(aic);
	elv_ioc_count_dec(ioc_count);
	if (ioc_gone) {
		/*
		 * AS scheduler is exiting, grab exit lock and check
		 * the pending io context count. If it hits zero,
		 * complete ioc_gone and set it back to NULL.
		 */
		spin_lock(&ioc_gone_lock);
		if (ioc_gone && !elv_ioc_count_read(ioc_count)) {
			complete(ioc_gone);
			ioc_gone = NULL;
		}
		spin_unlock(&ioc_gone_lock);
	}
}

static void as_trim(struct io_context *ioc)
{
	spin_lock_irq(&ioc->lock);
	if (ioc->aic)
		free_as_io_context(ioc->aic);
	ioc->aic = NULL;
	spin_unlock_irq(&ioc->lock);
}

/* Called when the task exits */
static void exit_as_io_context(struct as_io_context *aic)
{
	WARN_ON(!test_bit(AS_TASK_RUNNING, &aic->state));
	clear_bit(AS_TASK_RUNNING, &aic->state);
}

static struct as_io_context *alloc_as_io_context(void)
{
	struct as_io_context *ret;

	ret = kmalloc(sizeof(*ret), GFP_ATOMIC);
	if (ret) {
		ret->dtor = free_as_io_context;
		ret->exit = exit_as_io_context;
		ret->state = 1 << AS_TASK_RUNNING;
		atomic_set(&ret->nr_queued, 0);
		atomic_set(&ret->nr_dispatched, 0);
		spin_lock_init(&ret->lock);
		ret->ttime_total = 0;
		ret->ttime_samples = 0;
		ret->ttime_mean = 0;
		ret->seek_total = 0;
		ret->seek_samples = 0;
		ret->seek_mean = 0;
		elv_ioc_count_inc(ioc_count);
	}

	return ret;
}
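/*
 * Summary of the per-process state kept in as_io_context:
 *  - ttime_total/ttime_samples/ttime_mean: decayed average "thinktime",
 *    i.e. the gap between a request completing and the next one arriving;
 *    used to decide whether anticipating on this process is worthwhile.
 *  - seek_total/seek_samples/seek_mean: decayed average seek distance,
 *    used to classify a process as sequential or random.
 *  - nr_queued/nr_dispatched: how much of this process' IO currently sits
 *    inside the scheduler or the driver.
 *  - last_end_request/last_request_pos: the timestamp and position that
 *    feed the two averages above.
 */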
/*
 * If the current task has no AS IO context then create one and initialise it.
 * Then take a ref on the task's io context and return it.
 */
static struct io_context *as_get_io_context(int node)
{
	struct io_context *ioc = get_io_context(GFP_ATOMIC, node);
	if (ioc && !ioc->aic) {
		ioc->aic = alloc_as_io_context();
		if (!ioc->aic) {
			put_io_context(ioc);
			ioc = NULL;
		}
	}
	return ioc;
}

static void as_put_io_context(struct request *rq)
{
	struct as_io_context *aic;

	if (unlikely(!RQ_IOC(rq)))
		return;

	aic = RQ_IOC(rq)->aic;

	if (rq_is_sync(rq) && aic) {
		unsigned long flags;

		spin_lock_irqsave(&aic->lock, flags);
		set_bit(AS_TASK_IORUNNING, &aic->state);
		aic->last_end_request = jiffies;
		spin_unlock_irqrestore(&aic->lock, flags);
	}

	put_io_context(RQ_IOC(rq));
}

/*
 * Request to workload data structure mapping.
 * Per-request workload indices (assigned in as_add_request): 0 = async,
 * 1/2 = random sync (one rb-tree per elevator phase, per direction),
 * 3 = sequential sync fifo, 4 = sequential sync fast path.
 */
static inline unsigned *as_count(struct as_data *ad, struct request *rq)
{
	const int ddir = rq_data_dir(rq);
	const int rq_wl = RQ_WORKLOAD(rq);
	unsigned *counts[] = {
		&ad->async_workload.count,
		&ad->random_sync_workload[ddir].count,
		&ad->random_sync_workload[ddir].count,
		&ad->seq_sync_workload.count,
		&ad->seq_sync_workload.count
	};
	return counts[rq_wl];
}

static inline struct list_head *as_fifo(struct as_data *ad, struct request *rq)
{
	const int ddir = rq_data_dir(rq);
	const int rq_wl = RQ_WORKLOAD(rq);
	struct list_head *fifos[] = {
		&ad->async_workload.fifo_list,
		&ad->random_sync_workload[ddir].fifo_list,
		&ad->random_sync_workload[ddir].fifo_list,
		&ad->seq_sync_workload.fifo_list,
		&ad->seq_sync_workload.fast_path
	};
	return fifos[rq_wl];
}

static inline struct rb_root *as_rb_root(struct as_data *ad, struct request *rq)
{
	const int ddir = rq_data_dir(rq);
	const int rq_wl = RQ_WORKLOAD(rq);
	struct rb_root *roots[] = {
		&ad->async_workload.sort_list,
		&ad->random_sync_workload[ddir].phases[0],
		&ad->random_sync_workload[ddir].phases[1],
		NULL,
		NULL
	};
	return roots[rq_wl];
}

/*
 * rb tree support functions
 */
static inline struct request *as_next_request(struct request *rq)
{
	struct rb_node *node = rb_next(&rq->rb_node);
	if (node)
		return rb_entry_rq(node);
	return NULL;
}

static void as_add_rq_rb(struct as_data *ad, struct request *rq)
{
	struct request *__alias;
	struct rb_root *root = as_rb_root(ad, rq);

	if (root)
		while (unlikely(__alias = elv_rb_add(root, rq))) {
			as_move_to_dispatch(ad, __alias);
			as_antic_stop(ad);
		}

	++*as_count(ad, rq);
}

static void as_del_rq_rb(struct as_data *ad, struct request *rq)
{
	struct rb_root *root = as_rb_root(ad, rq);

	if (ad->next_rq == rq)
		ad->next_rq = root ? as_next_request(rq) : NULL;

	if (root)
		elv_rb_del(root, rq);

	--*as_count(ad, rq);
}

/*
 * anticipatory scheduling functions follow
 */

/*
 * as_antic_expired tells us when we have anticipated too long.
 */
static inline int as_antic_expired(struct as_data *ad)
{
	return time_after(jiffies, ad->antic_start + ad->antic_expire);
}

/*
 * as_antic_waitnext starts anticipating that a nice request will soon be
 * submitted. See also as_antic_waitreq
 */
static void as_antic_waitnext(struct as_data *ad)
{
	unsigned long timeout;

	BUG_ON(ad->antic_status != ANTIC_OFF
			&& ad->antic_status != ANTIC_WAIT_REQ);

	timeout = ad->antic_start + ad->antic_expire;

	mod_timer(&ad->antic_timer, timeout);
	ad->antic_status = ANTIC_WAIT_NEXT;
}
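/*
 * Anticipation state machine (same shape as in the AS scheduler):
 *
 *   ANTIC_OFF --as_antic_waitreq()--> ANTIC_WAIT_REQ
 *       (the request we are anticipating on is still in flight; if it has
 *        already completed we go straight to ANTIC_WAIT_NEXT)
 *   ANTIC_WAIT_REQ --as_completed_request()--> ANTIC_WAIT_NEXT
 *       (request done, timer armed for antic_expire jiffies)
 *   ANTIC_WAIT_* --as_antic_stop()/as_antic_timeout()--> ANTIC_FINISHED
 *       (a candidate arrived or we timed out; kblockd kicks the queue)
 */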
/*
 * as_antic_waitreq starts anticipating. We don't start timing the anticipation
 * until the request that we're anticipating on has finished. This means we
 * are timing from when the candidate process wakes up hopefully.
 */
static void as_antic_waitreq(struct as_data *ad)
{
	BUG_ON(ad->antic_status == ANTIC_FINISHED);
	if (ad->antic_status == ANTIC_OFF) {
		if (!ad->io_context || ad->ioc_finished)
			as_antic_waitnext(ad);
		else
			ad->antic_status = ANTIC_WAIT_REQ;
	}
}

/*
 * This is called directly by the functions in this file to stop anticipation.
 * We kill the timer and schedule a call to the request_fn asap.
 */
static void as_antic_stop(struct as_data *ad)
{
	int status = ad->antic_status;

	if (status == ANTIC_WAIT_REQ || status == ANTIC_WAIT_NEXT) {
		if (status == ANTIC_WAIT_NEXT)
			del_timer(&ad->antic_timer);
		ad->antic_status = ANTIC_FINISHED;
		/* see as_work_handler */
		kblockd_schedule_work(ad->q, &ad->antic_work);
	}
}

/*
 * as_antic_timeout is the timer function set by as_antic_waitnext.
 */
static void as_antic_timeout(unsigned long data)
{
	struct request_queue *q = (struct request_queue *)data;
	struct as_data *ad = q->elevator->elevator_data;
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	if (ad->antic_status == ANTIC_WAIT_REQ
			|| ad->antic_status == ANTIC_WAIT_NEXT) {
		struct as_io_context *aic;

		spin_lock(&ad->io_context->lock);
		aic = ad->io_context->aic;

		ad->antic_status = ANTIC_FINISHED;
		kblockd_schedule_work(q, &ad->antic_work);
		spin_unlock(&ad->io_context->lock);
	}
	spin_unlock_irqrestore(q->queue_lock, flags);
}

static void as_update_thinktime(struct as_data *ad, struct as_io_context *aic,
				unsigned long ttime)
{
	/* fixed point: 1.0 == 1<<8 */
	aic->ttime_samples = (7*aic->ttime_samples + 256) / 8;
	aic->ttime_total = (7*aic->ttime_total + 256*ttime) / 8;
	aic->ttime_mean = (aic->ttime_total + 128) / aic->ttime_samples;
}

static void as_update_seekdist(struct as_data *ad, struct as_io_context *aic,
			       sector_t sdist)
{
	u64 total;

	/*
	 * Don't allow the seek distance to get too large from the
	 * odd fragment, pagein, etc
	 */
	if (aic->seek_samples <= 60) /* second&third seek */
		sdist = min(sdist, (aic->seek_mean * 4) + 2*1024*1024);
	else
		sdist = min(sdist, (aic->seek_mean * 4) + 2*1024*64);

	aic->seek_samples = (7*aic->seek_samples + 256) / 8;
	aic->seek_total = (7*aic->seek_total + (u64)256*sdist) / 8;
	total = aic->seek_total + (aic->seek_samples/2);
	do_div(total, aic->seek_samples);
	aic->seek_mean = (sector_t)total;
}

/*
 * as_update_iohist keeps a decaying histogram of IO thinktimes, and
 * updates @aic->ttime_mean based on that. It is called when a new
 * request is queued.
 */
static void as_update_iohist(struct as_data *ad, struct as_io_context *aic,
			     struct request *rq)
{
	unsigned long thinktime = 0;
	sector_t seek_dist;

	if (aic == NULL)
		return;

	if (rq_is_sync(rq) == BLK_RW_SYNC) {
		unsigned long in_flight = atomic_read(&aic->nr_queued)
					+ atomic_read(&aic->nr_dispatched);
		spin_lock(&aic->lock);
		if (test_bit(AS_TASK_IORUNNING, &aic->state) ||
		    test_bit(AS_TASK_IOSTARTED, &aic->state)) {
			/* Calculate read -> read thinktime */
			if (test_bit(AS_TASK_IORUNNING, &aic->state)
							&& in_flight == 0) {
				thinktime = jiffies - aic->last_end_request;
				thinktime = min(thinktime, MAX_THINKTIME-1);
			}
			as_update_thinktime(ad, aic, thinktime);

			/* Calculate read -> read seek distance */
			if (aic->last_request_pos < rq->sector)
				seek_dist = rq->sector - aic->last_request_pos;
			else
				seek_dist = aic->last_request_pos - rq->sector;
			as_update_seekdist(ad, aic, seek_dist);
		}
		aic->last_request_pos = rq->sector + rq->nr_sectors;
		set_bit(AS_TASK_IOSTARTED, &aic->state);
		spin_unlock(&aic->lock);
	}
}
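/*
 * Worked example of the decayed averages above (fixed point, 1.0 == 256):
 * each update keeps 7/8 of the old value and adds 1/8 of the new sample.
 * For a steady thinktime of t jiffies, ttime_samples converges to 256 and
 * ttime_total to 256*t, so ttime_mean ~= t; stale history fades by a factor
 * of 7/8 per new request.  Thinktimes are clamped to MAX_THINKTIME-1 so a
 * single long idle period cannot dominate the mean.
 */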
/*
 * as_close_req decides if one request is considered "close" to the
 * previous one issued.
 */
static int as_close_req(struct as_data *ad, struct as_io_context *aic,
			struct request *rq)
{
	unsigned long delay;	/* jiffies */
	sector_t last = ad->active_sector;
	sector_t next = rq->sector;
	sector_t delta;	/* acceptable close offset (in sectors) */
	sector_t s;

	if (ad->antic_status == ANTIC_OFF || !ad->ioc_finished)
		delay = 0;
	else
		delay = jiffies - ad->antic_start;

	if (delay == 0)
		delta = 8192;
	else if (delay <= (20 * HZ / 1000) && delay <= ad->antic_expire)
		delta = 8192 << delay;
	else
		return 1;

	if ((last <= next + (delta>>1)) && (next <= last + delta))
		return 1;

	if (last < next)
		s = next - last;
	else
		s = last - next;

	if (aic->seek_samples == 0) {
		return 1;
	} else {
		if (aic->seek_mean > s) {
			/* this request is better than what we're expecting */
			return 1;
		}
	}

	return 0;
}

/*
 * as_can_break_anticipation returns true if we have been anticipating this
 * request, or anticipation expired (including switching to a different
 * workload).
 *
 * It also returns true if the process against which we are anticipating
 * submits a write - that's presumably an fsync, O_SYNC write, etc. We want to
 * dispatch it ASAP, because we know that application will not be submitting
 * any new reads.
 *
 * If the task which has submitted the request has exited, break anticipation.
 *
 * If this task has queued some other IO, do not enter anticipation.
 */
static int as_can_break_anticipation(struct as_data *ad, struct request *rq)
{
	struct io_context *ioc;
	struct as_io_context *aic;

	ioc = ad->io_context;
	BUG_ON(!ioc);
	spin_lock(&ioc->lock);

	if (rq && ioc == RQ_IOC(rq)) {
		/* request from same process */
		spin_unlock(&ioc->lock);
		return 1;
	}

	if (rq && ad->current_workload == AS_RQ_RANDSYNC &&
	    (RQ_WORKLOAD(rq) == 1 || RQ_WORKLOAD(rq) == 2)) {
		/*
		 * random sync request, matching current workload,
		 * we break anticipation
		 */
		spin_unlock(&ioc->lock);
		return 1;
	}

	if (ad->ioc_finished && as_antic_expired(ad)) {
		/*
		 * In this situation status should really be FINISHED,
		 * however the timer hasn't had the chance to run yet.
		 */
		spin_unlock(&ioc->lock);
		return 1;
	}

	if (rq && ioc != RQ_IOC(rq) && ad->current_workload == AS_RQ_SEQSYNC &&
	    time_after(jiffies, ad->seq_sync_workload.expire_current)) {
		/*
		 * request from different process, but former process
		 * exhausted its time quantum
		 */
		spin_unlock(&ioc->lock);
		return 1;
	}

	aic = ioc->aic;
	if (!aic) {
		spin_unlock(&ioc->lock);
		return 0;
	}

	if (atomic_read(&aic->nr_queued) > 0) {
		/* process has more requests queued */
		spin_unlock(&ioc->lock);
		return 1;
	}

	if (atomic_read(&aic->nr_dispatched) > 0) {
		/* process has more requests dispatched */
		spin_unlock(&ioc->lock);
		return 1;
	}

	if (rq && rq_is_sync(rq) && as_close_req(ad, aic, rq)) {
		/*
		 * Found a close request that is not one of ours.
		 *
		 * This makes close requests from another process update
		 * our IO history. Is generally useful when there are
		 * two or more cooperating processes working in the same
		 * area.
		 */
		as_update_iohist(ad, aic, rq);
		spin_unlock(&ioc->lock);
		return 1;
	}

	if (!test_bit(AS_TASK_RUNNING, &aic->state)) {
		/* process anticipated on has exited */
		spin_unlock(&ioc->lock);
		return 1;
	}

	if (aic->ttime_samples == 0) {
		spin_unlock(&ioc->lock);
		return 1;
	} else if (aic->ttime_mean > ad->antic_expire) {
		/* the process thinks too much between requests */
		spin_unlock(&ioc->lock);
		return 1;
	}

	spin_unlock(&ioc->lock);
	return 0;
}
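/*
 * Note on as_close_req() above: with no elapsed anticipation the window is
 * 8192 sectors (4MiB with 512-byte sectors), doubling for every jiffy spent
 * waiting, up to 20ms and never past antic_expire; beyond that every request
 * counts as close.  The window is biased forward (backward jumps qualify only
 * within half the distance), and a request nearer than the process' own
 * seek_mean is also accepted.
 */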
/*
 * as_can_anticipate indicates whether we should either run rq
 * or keep anticipating a better request.
 */
static int as_can_anticipate(struct as_data *ad, struct request *rq)
{
	if (!ad->io_context)
		/*
		 * Last request submitted was a write
		 */
		return 0;

	if (ad->antic_status == ANTIC_FINISHED)
		/*
		 * Don't restart if we have just finished. Run the next request
		 */
		return 0;

	if (as_can_break_anticipation(ad, rq))
		/*
		 * This request is a good candidate. Don't keep anticipating,
		 * run it.
		 */
		return 0;

	/*
	 * OK from here, we haven't finished, and don't have a decent request!
	 * Status is either ANTIC_OFF so start waiting,
	 * ANTIC_WAIT_REQ so continue waiting for request to finish
	 * or ANTIC_WAIT_NEXT so continue waiting for an acceptable request.
	 */

	return 1;
}

/*
 * as_update_rq must be called whenever a request (rq) is added to
 * the sort_list. This function keeps caches up to date, and checks if the
 * request might be one we are "anticipating"
 */
static void as_update_rq(struct as_data *ad, struct request *rq)
{
	/*
	 * have we been anticipating this request?
	 * or does it come from the same process as the one we are anticipating
	 * for?
	 */
	if (ad->antic_status == ANTIC_WAIT_REQ
			|| ad->antic_status == ANTIC_WAIT_NEXT) {
		if (as_can_break_anticipation(ad, rq))
			as_antic_stop(ad);
	}
}

/*
 * as_completed_request is to be called when a request has completed and
 * returned something to the requesting process, be it an error or data.
 */
static void as_completed_request(struct request_queue *q, struct request *rq)
{
	struct as_data *ad = q->elevator->elevator_data;

	WARN_ON(!list_empty(&rq->queuelist));

	if (RQ_STATE(rq) != AS_RQ_REMOVED) {
		WARN(1, "rq->state %d\n", RQ_STATE(rq));
		goto out;
	}

	WARN_ON(ad->nr_dispatched == 0);
	ad->nr_dispatched--;

	/*
	 * Start counting the batch from when a request of that direction is
	 * actually serviced. This should help devices with big TCQ windows
	 * and writeback caches
	 */
	if (ad->io_context == RQ_IOC(rq) && ad->io_context) {
		ad->antic_start = jiffies;
		ad->ioc_finished = 1;
		if (ad->antic_status == ANTIC_WAIT_REQ) {
			/*
			 * We were waiting on this request, now anticipate
			 * the next one
			 */
			as_antic_waitnext(ad);
		}
	}

	as_put_io_context(rq);
out:
	RQ_SET_STATE(rq, AS_RQ_POSTSCHED);
}

/*
 * as_remove_queued_request removes a request from the pre dispatch queue
 * without updating refcounts. It is expected the caller will drop the
 * reference unless it replaces the request at some part of the elevator
 * (ie. the dispatch queue)
 */
static void as_remove_queued_request(struct request_queue *q,
				     struct request *rq)
{
	struct as_data *ad = q->elevator->elevator_data;
	struct io_context *ioc;

	WARN_ON(RQ_STATE(rq) != AS_RQ_QUEUED);

	ioc = RQ_IOC(rq);
	if (ioc && ioc->aic) {
		BUG_ON(!atomic_read(&ioc->aic->nr_queued));
		atomic_dec(&ioc->aic->nr_queued);
	}

	rq_fifo_clear(rq);
	as_del_rq_rb(ad, rq);
}

/*
 * move an entry to dispatch queue
 */
static void as_move_to_dispatch(struct as_data *ad, struct request *rq)
{
	as_antic_stop(ad);
	ad->antic_status = ANTIC_OFF;

	ad->active_sector = rq->sector + rq->nr_sectors;

	if (RQ_WORKLOAD(rq) > 0) {
		struct io_context *ioc = RQ_IOC(rq);
		/*
		 * Logic to determine if we are delaying reads too much:
		 * in that case, we penalize async writes.
		 */
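		/*
		 * reads_delayed records how many 2*time_quantum periods have
		 * elapsed since the worst-delayed read was queued;
		 * as_dispatch_request() uses it to make async writes wait
		 * longer between slices (max_async_starved * (1 +
		 * reads_delayed)) and to shrink the async slice itself
		 * (adjusted_time_quantum >>= reads_delayed).
		 */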
		if (!time_before(jiffies, rq_fifo_time(rq) + 2*ad->time_quantum)
		    && rq_data_dir(rq) == READ) {
			ad->reads_delayed =
				max((int)((jiffies - rq_fifo_time(rq)) /
					  (2*ad->time_quantum)),
				    (int)ad->reads_delayed);

			debug_printk(KERN_INFO "late request (delay = %d, queues: [%d,%d,%d]): rq=%p ioc=%p sync=%d wl=%d\n",
				     (int)(jiffies - (rq_fifo_time(rq) +
						      2*ad->time_quantum)),
				     ad->async_workload.count,
				     ad->random_sync_workload[0].count +
				     ad->random_sync_workload[1].count,
				     ad->seq_sync_workload.count,
				     rq, ioc, rq_is_sync(rq), RQ_WORKLOAD(rq));
		}

		/* In case we have to anticipate after this */
		if (ioc != ad->io_context) {
			copy_io_context(&ad->io_context, &ioc);
			if (RQ_WORKLOAD(rq) >= 3) {
				/*
				 * This process' time slice among sequential
				 * processes
				 */
				ad->seq_sync_workload.expire_current = jiffies +
					max(4*ad->antic_expire,
					    4 * ad->time_quantum /
					    (1 + ad->seq_sync_workload.count/2));
			}
		}
	} else {
		put_io_context(ad->io_context);
		ad->io_context = NULL;
	}
	ad->ioc_finished = 0;

	ad->next_rq = (!RB_EMPTY_NODE(&rq->rb_node)) ? as_next_request(rq) : NULL;

	/*
	 * take it off the sort and fifo list, add to dispatch queue
	 */
	as_remove_queued_request(ad->q, rq);
	WARN_ON(RQ_STATE(rq) != AS_RQ_QUEUED);

	elv_dispatch_add_tail(ad->q, rq);

	RQ_SET_STATE(rq, AS_RQ_DISPATCHED);
	if (RQ_IOC(rq) && RQ_IOC(rq)->aic)
		atomic_inc(&RQ_IOC(rq)->aic->nr_dispatched);
	ad->nr_dispatched++;
}

typedef int dispatcher_fun(struct request_queue *, int);
typedef dispatcher_fun *dispatcher_fun_ptr;

static dispatcher_fun as_dispatch_async, as_dispatch_random_sync,
		      as_dispatch_seq_sync;

/*
 * as_dispatch_request selects the best request according to
 * read/write expire, batch expire, etc, and moves it to the dispatch
 * queue. Returns 1 if a request was found, 0 otherwise.
 */
static int as_dispatch_request(struct request_queue *q, int force)
{
	struct as_data *ad = q->elevator->elevator_data;
	static dispatcher_fun_ptr const dispatch_workloads[] = {
		as_dispatch_async,
		as_dispatch_random_sync,
		as_dispatch_seq_sync
	};

	/* Current workload not expired, with more data already scheduled */
	if (ad->next_rq && time_before(jiffies, ad->workload_expires))
		return dispatch_workloads[ad->current_workload](q, force);

	/* honor anticipation */
	if (!force && (ad->antic_status == ANTIC_WAIT_REQ ||
		       ad->antic_status == ANTIC_WAIT_NEXT))
		return 0;

	{
		int pending[] = {
			ad->async_workload.count,
			ad->random_sync_workload[0].count +
				ad->random_sync_workload[1].count,
			ad->seq_sync_workload.count
		};
		int sync_pending = pending[AS_RQ_RANDSYNC] +
			pending[AS_RQ_SEQSYNC];
		int workloads = !!pending[AS_RQ_RANDSYNC] +
			!!pending[AS_RQ_SEQSYNC] + !!pending[AS_RQ_ASYNC];
		unsigned long adjusted_time_quantum = ad->time_quantum;

		if (!workloads) {
			ad->reads_delayed = 0;
			return 0;
		}

		/*
		 * just finished workload still counts for time share
		 * computations
		 */
		workloads += !pending[ad->current_workload];

		/* Current workload not expired, with more data */
		if (time_before(jiffies, ad->workload_expires) &&
		    pending[ad->current_workload])
			return dispatch_workloads[ad->current_workload](q, force);

		/* Current workload is (seq or random) sync reads ... */
		if (ad->current_workload != AS_RQ_ASYNC) {
			/*
			 * ... and current sweep/fast_path is not yet finished,
			 * we give some more time for it
			 */
			if (ad->next_rq && rq_data_dir(ad->next_rq) == READ &&
			    time_before(jiffies, ad->workload_expires +
					ad->time_quantum/workloads))
				return dispatch_workloads[ad->current_workload](q, force);

			/*
			 * ... not expired, no more data, current process has
			 * nothing pending, can we anticipate?
			 */
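			/*
			 * Only wait if the process we are idling for has
			 * nothing queued inside the scheduler;
			 * as_can_anticipate(ad, NULL) then checks its
			 * thinktime and liveness, so a well-behaved
			 * sequential reader keeps its workload instead of
			 * forcing a switch.
			 */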
			if (time_before(jiffies, ad->workload_expires) &&
			    !force &&
			    (ad->io_context && ad->io_context->aic &&
			     !atomic_read(&ad->io_context->aic->nr_queued)) &&
			    as_can_anticipate(ad, NULL)) {
				debug_printk(KERN_INFO "as_dispatch_request is anticipating: [%d,%d,%d] cur=%d next=%p starv=%d\n",
					     pending[0], pending[1], pending[2],
					     ad->current_workload, ad->next_rq,
					     ad->async_starved);
				ad->reads_delayed = 0;
				/*
				 * Anticipate other requests for current
				 * workload
				 */
				as_antic_waitreq(ad);
				return 0;
			}
		}

		debug_printk(KERN_INFO "as_dispatch_request [%d,%d,%d] cur=%d next=%p starv=%d\n",
			     pending[0], pending[1], pending[2],
			     ad->current_workload, ad->next_rq,
			     ad->async_starved);

		/* default expiration of workload, can be overridden below */
		adjusted_time_quantum = 3 * ad->time_quantum / workloads;

		if (ad->current_workload != AS_RQ_ASYNC &&
		    pending[AS_RQ_ASYNC] &&
		    (!sync_pending || ad->async_starved++ >=
		     ad->max_async_starved * (1+ad->reads_delayed))) {
			ad->next_rq = NULL;
			ad->current_workload = AS_RQ_ASYNC;
			ad->async_starved = 0;
			adjusted_time_quantum /= 3;
			adjusted_time_quantum >>= ad->reads_delayed;
		} else if (ad->current_workload != AS_RQ_RANDSYNC &&
			   pending[AS_RQ_RANDSYNC]) {
			ad->next_rq = NULL;
			ad->current_workload = AS_RQ_RANDSYNC;
		} else if (ad->current_workload != AS_RQ_SEQSYNC &&
			   pending[AS_RQ_SEQSYNC]) {
			ad->next_rq = NULL;
			ad->current_workload = AS_RQ_SEQSYNC;
			adjusted_time_quantum /=
				(sync_pending/pending[AS_RQ_SEQSYNC]);
		}

		ad->workload_expires = jiffies + max(4*ad->antic_expire,
						     adjusted_time_quantum);
	}

	return dispatch_workloads[ad->current_workload](q, force);
}

static int as_dispatch_async(struct request_queue *q, int force)
{
	struct as_data *ad = q->elevator->elevator_data;
	struct request *rq = ad->next_rq;

	if (!rq && !list_empty(&ad->async_workload.fifo_list))
		rq = rq_entry_fifo(ad->async_workload.fifo_list.next);

	if (rq) {
		as_move_to_dispatch(ad, rq);
		return 1;
	}
	return 0;
}

static int as_dispatch_random_sync(struct request_queue *q, int force)
{
	struct as_data *ad = q->elevator->elevator_data;
	struct request *rq = ad->next_rq;

	if (!rq) {
		int cases = (ad->random_sync_workload[0].count > 0) * 4 +
			    (ad->random_sync_workload[1].count > 0) * 2 +
			    !ad->rand_sync_ddir;
		static const int case_to_ddir[] = {-1, -1, 1, 1, 0, 0, 0, 1};
		int ddir = case_to_ddir[cases];

		if (ddir >= 0) {
			unsigned *cur_phase =
				&ad->random_sync_workload[ddir].current_phase;
			struct rb_node *fn =
				rb_first(&ad->random_sync_workload[ddir].phases[*cur_phase]);
			if (!fn) {
				*cur_phase = !*cur_phase;
				fn = rb_first(&ad->random_sync_workload[ddir].phases[*cur_phase]);
			}
			BUG_ON(!fn);
			rq = rb_entry_rq(fn);
			ad->rand_sync_ddir = ddir;
		}

		debug_printk(KERN_INFO "as_dispatch_random_sync cases=%d ddir=%d rq=%p\n",
			     cases, ddir, rq);
	}

	if (rq) {
		as_move_to_dispatch(ad, rq);
		return 1;
	}
	return 0;
}

static int as_dispatch_seq_sync(struct request_queue *q, int force)
{
	struct as_data *ad = q->elevator->elevator_data;
	struct request *rq = ad->next_rq;

	if (!rq && !list_empty(&ad->seq_sync_workload.fast_path))
		rq = rq_entry_fifo(ad->seq_sync_workload.fast_path.next);

	if (!rq && !list_empty(&ad->seq_sync_workload.fifo_list)) {
		rq = rq_entry_fifo(ad->seq_sync_workload.fifo_list.next);

		debug_printk(KERN_INFO "as_dispatch_seq_sync rq=%p\n", rq);

		/*
		 * Anticipate other sequential requests
		 */
		if (rq && !force && ad->io_context != RQ_IOC(rq) &&
		    as_can_anticipate(ad, rq)) {
			debug_printk(KERN_INFO "as_dispatch_seq_sync anticipate, waiting4=%p\n",
				     ad->io_context);
			as_antic_waitreq(ad);
			return 0;
		}

		if (rq && RQ_IOC(rq)) {
			struct io_context *ioc = RQ_IOC(rq);
			struct list_head *entry, *next;
			unsigned cnt = 0;
			/*
			 * Move all requests from next process in the fifo
			 * to the fast path queue
			 */
			list_for_each_safe(entry, next,
					   &ad->seq_sync_workload.fifo_list) {
				struct request *pos = list_entry_rq(entry);
				if (RQ_IOC(pos) == ioc) {
					if (pos == rq)
						continue;
					list_del(entry);
					list_add_tail(&pos->queuelist,
						      &ad->seq_sync_workload.fast_path);
					++cnt;
				}
			}
			debug_printk(KERN_INFO "as_dispatch_seq_sync switched to %p, moved=%u\n",
				     ioc, cnt);
		}
	}

	if (rq) {
		as_move_to_dispatch(ad, rq);
		if (!list_empty(&ad->seq_sync_workload.fast_path))
			ad->next_rq = rq_entry_fifo(ad->seq_sync_workload.fast_path.next);
		return 1;
	}
	return 0;
}

/*
 * add rq to rbtree and fifo
 * determines workload for each request
 */
static void as_add_request(struct request_queue *q, struct request *rq)
{
	struct as_data *ad = q->elevator->elevator_data;
	const int ddir = rq_data_dir(rq);
	const bool sync = rq_is_sync(rq);
	bool is_seq = false;
	bool fast_path = false;
	struct io_context *rq_ioc = as_get_io_context(q->node);

	RQ_SET_STATE(rq, AS_RQ_NEW);
	rq->elevator_private = rq_ioc;

	if (rq_ioc) {
		struct as_io_context *aic = rq_ioc->aic;
		if (sync) {
			/*
			 * does this belong to current process doing
			 * sequential I/O?
			 */
			is_seq = fast_path =
				ad->current_workload == AS_RQ_SEQSYNC &&
				rq_ioc == ad->io_context &&
				time_before(jiffies, ad->workload_expires);
			if (!fast_path) {
				/*
				 * The subtraction in the "wrong" direction
				 * wraps around to a huge sector_t, so min()
				 * keeps the distance measured in the actual
				 * direction of the gap; forward gaps are
				 * halved, i.e. tolerated at twice the
				 * distance.
				 */
				sector_t delta = min(aic->last_request_pos - rq->sector,
						     (rq->sector - aic->last_request_pos)/2);
				/*
				 * is this a good candidate to be followed by
				 * other sequential requests?
				 */
				is_seq = fast_path ||
					((delta < 1024) &&
					 (aic->ttime_mean <= ad->antic_expire) &&
					 (aic->seek_mean <= 1024));
			}
			as_update_iohist(ad, aic, rq);
		}
		atomic_inc(&aic->nr_queued);
	}

	RQ_SET_WL(rq, is_seq ? 3 + fast_path :
		      sync ? 1 + !ad->random_sync_workload[ddir].current_phase : 0);

	ad->stats[ddir][RQ_WORKLOAD(rq)]++;

	debug_printk(KERN_INFO "as_add_request(rq=%p ioc=%p) sync=%d ddir=%d seq=%d,%d wl=%d\n",
		     rq, rq_ioc, sync, ddir, is_seq, fast_path, RQ_WORKLOAD(rq));

	as_add_rq_rb(ad, rq);

	/*
	 * set insertion time and add to fifo list
	 */
	rq_set_fifo_time(rq, jiffies);
	list_add_tail(&rq->queuelist, as_fifo(ad, rq));

	if (fast_path && !ad->next_rq &&
	    !list_empty(&ad->seq_sync_workload.fast_path))
		ad->next_rq = rq_entry_fifo(ad->seq_sync_workload.fast_path.next);

	as_update_rq(ad, rq); /* keep state machine up to date */
	RQ_SET_STATE(rq, AS_RQ_QUEUED);
}

static void as_activate_request(struct request_queue *q, struct request *rq)
{
	WARN_ON(RQ_STATE(rq) != AS_RQ_DISPATCHED);
	RQ_SET_STATE(rq, AS_RQ_REMOVED);
	if (RQ_IOC(rq) && RQ_IOC(rq)->aic)
		atomic_dec(&RQ_IOC(rq)->aic->nr_dispatched);
}

static void as_deactivate_request(struct request_queue *q, struct request *rq)
{
	WARN_ON(RQ_STATE(rq) != AS_RQ_REMOVED);
	RQ_SET_STATE(rq, AS_RQ_DISPATCHED);
	if (RQ_IOC(rq) && RQ_IOC(rq)->aic)
		atomic_inc(&RQ_IOC(rq)->aic->nr_dispatched);
}
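/*
 * Lifecycle of a request inside this elevator:
 *   AS_RQ_NEW -> AS_RQ_QUEUED          in as_add_request()
 *   AS_RQ_QUEUED -> AS_RQ_DISPATCHED   in as_move_to_dispatch()
 *   AS_RQ_DISPATCHED -> AS_RQ_REMOVED  in as_activate_request()
 *   AS_RQ_REMOVED -> AS_RQ_DISPATCHED  in as_deactivate_request() (requeue)
 *   AS_RQ_REMOVED -> AS_RQ_POSTSCHED   in as_completed_request()
 *   AS_RQ_QUEUED -> AS_RQ_MERGED       in as_merged_requests() (rq absorbed)
 */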
/*
 * as_queue_empty tells us if there are requests left in the device. It may
 * not be the case that a driver can get the next request even if the queue
 * is not empty - it is used in the block layer to check for plugging and
 * merging opportunities
 */
static int as_queue_empty(struct request_queue *q)
{
	struct as_data *ad = q->elevator->elevator_data;
	unsigned i;

	if (ad->async_workload.count || ad->seq_sync_workload.count)
		return 0;

	for (i = 0; i < 2; ++i) {
		if (ad->random_sync_workload[i].count)
			return 0;
	}
	return 1;
}

static void as_merged_requests(struct request_queue *q, struct request *req,
			       struct request *next)
{
	/*
	 * if next expires before req, assign its expire time to req
	 * and move into next position (next will be deleted) in fifo
	 */
	if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) {
		if (time_before(rq_fifo_time(next), rq_fifo_time(req))) {
			list_move(&req->queuelist, &next->queuelist);
			rq_set_fifo_time(req, rq_fifo_time(next));
		}
	}

	/*
	 * kill knowledge of next, this one is a goner
	 */
	as_remove_queued_request(q, next);
	as_put_io_context(next);
	RQ_SET_STATE(next, AS_RQ_MERGED);
}

/*
 * This is executed in a "deferred" process context, by kblockd. It calls the
 * driver's request_fn so the driver can submit that request.
 *
 * IMPORTANT! This guy will reenter the elevator, so set up all queue global
 * state before calling, and don't rely on any state over calls.
 *
 * FIXME! dispatch queue is not a queue at all!
 */
static void as_work_handler(struct work_struct *work)
{
	struct as_data *ad = container_of(work, struct as_data, antic_work);
	struct request_queue *q = ad->q;
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	blk_start_queueing(q);
	spin_unlock_irqrestore(q->queue_lock, flags);
}

static int as_may_queue(struct request_queue *q, int rw)
{
	int ret = ELV_MQUEUE_MAY;
	struct as_data *ad = q->elevator->elevator_data;
	struct io_context *ioc;

	if (ad->antic_status == ANTIC_WAIT_REQ ||
	    ad->antic_status == ANTIC_WAIT_NEXT) {
		ioc = as_get_io_context(q->node);
		if (ad->io_context == ioc)
			ret = ELV_MQUEUE_MUST;
		put_io_context(ioc);
	}

	return ret;
}

static void as_exit_queue(struct elevator_queue *e)
{
	struct as_data *ad = e->elevator_data;

	del_timer_sync(&ad->antic_timer);
	cancel_work_sync(&ad->antic_work);

	BUG_ON(!list_empty(&ad->async_workload.fifo_list));
	BUG_ON(!list_empty(&ad->random_sync_workload[READ].fifo_list));
	BUG_ON(!list_empty(&ad->random_sync_workload[WRITE].fifo_list));
	BUG_ON(!list_empty(&ad->seq_sync_workload.fifo_list));
	BUG_ON(!list_empty(&ad->seq_sync_workload.fast_path));

	put_io_context(ad->io_context);
	kfree(ad);
}
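/*
 * Default tuning (see the constants at the top of the file): antic_expire
 * is about 6ms, time_quantum about 100ms and max_async_starved is 2.  The
 * sysfs store functions below keep the pair consistent: antic_expire is
 * capped at time_quantum/4 and time_quantum cannot drop below
 * 4*antic_expire.
 */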
/*
 * initialize elevator private data (as_data).
 */
static void *as_init_queue(struct request_queue *q)
{
	struct as_data *ad;
	unsigned i;

	ad = kmalloc_node(sizeof(*ad), GFP_KERNEL | __GFP_ZERO, q->node);
	if (!ad)
		return NULL;

	ad->q = q;	/* Identify what queue the data belongs to */

	/* anticipatory scheduling helpers */
	ad->antic_timer.function = as_antic_timeout;
	ad->antic_timer.data = (unsigned long)q;
	init_timer(&ad->antic_timer);
	INIT_WORK(&ad->antic_work, as_work_handler);

	ad->async_workload.sort_list = RB_ROOT;
	for (i = 0; i < 2; ++i) {
		ad->random_sync_workload[i].phases[0] = RB_ROOT;
		ad->random_sync_workload[i].phases[1] = RB_ROOT;
		INIT_LIST_HEAD(&ad->random_sync_workload[i].fifo_list);
	}
	INIT_LIST_HEAD(&ad->async_workload.fifo_list);
	INIT_LIST_HEAD(&ad->seq_sync_workload.fifo_list);
	INIT_LIST_HEAD(&ad->seq_sync_workload.fast_path);

	ad->antic_expire = default_antic_expire;
	ad->time_quantum = default_time_quantum;
	ad->max_async_starved = default_max_async_starved;

	return ad;
}

/*
 * sysfs parts below
 */

static ssize_t as_var_show(unsigned int var, char *page)
{
	return sprintf(page, "%d\n", var);
}

static ssize_t as_var_store(unsigned long *var, const char *page, size_t count)
{
	char *p = (char *) page;

	*var = simple_strtoul(p, &p, 10);
	return count;
}

static ssize_t stats_show(struct elevator_queue *e, char *page)
{
	struct as_data *ad = e->elevator_data;
	int pos = 0;
	unsigned i, j;

	for (i = 0; i < 2; ++i) {
		for (j = 0; j < 5; ++j)
			pos += sprintf(page+pos, "[%5u]", ad->stats[i][j]);
		page[pos++] = '\n';
	}
	page[pos] = '\0';
	return pos;
}

#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
{									\
	struct as_data *ad = e->elevator_data;				\
	return as_var_show((__CONV)?jiffies_to_msecs((__VAR)):(__VAR), (page)); \
}
SHOW_FUNCTION(as_antic_expire_show, ad->antic_expire, 1);
SHOW_FUNCTION(as_time_quantum_show, ad->time_quantum, 1);
SHOW_FUNCTION(as_max_async_starved_show, ad->max_async_starved, 0);
#undef SHOW_FUNCTION

#define STORE_FUNCTION(__FUNC, __PTR, __CONV, MIN, MAX)			\
static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
{									\
	struct as_data *ad = e->elevator_data;				\
	int ret = as_var_store(__PTR, (page), count);			\
	if (__CONV)							\
		*(__PTR) = msecs_to_jiffies(*(__PTR));			\
	if (*(__PTR) < (MIN))						\
		*(__PTR) = (MIN);					\
	else if (*(__PTR) > (MAX))					\
		*(__PTR) = (MAX);					\
	return ret;							\
}
STORE_FUNCTION(as_antic_expire_store, &ad->antic_expire, 1, 0,
	       max(1UL, ad->time_quantum/4));
STORE_FUNCTION(as_time_quantum_store, &ad->time_quantum, 1,
	       4*ad->antic_expire, INT_MAX);
STORE_FUNCTION(as_max_async_starved_store, &ad->max_async_starved, 0, 0, 16);
#undef STORE_FUNCTION

#define AS_ATTR(name) \
	__ATTR(name, S_IRUGO|S_IWUSR, as_##name##_show, as_##name##_store)

static struct elv_fs_entry as_attrs[] = {
	__ATTR_RO(stats),
	AS_ATTR(antic_expire),
	AS_ATTR(time_quantum),
	AS_ATTR(max_async_starved),
	__ATTR_NULL
};

static struct elevator_type iosched_as = {
	.ops = {
		.elevator_merge_req_fn =	as_merged_requests,
		.elevator_dispatch_fn =		as_dispatch_request,
		.elevator_add_req_fn =		as_add_request,
		.elevator_activate_req_fn =	as_activate_request,
		.elevator_deactivate_req_fn =	as_deactivate_request,
		.elevator_queue_empty_fn =	as_queue_empty,
		.elevator_completed_req_fn =	as_completed_request,
		.elevator_former_req_fn =	elv_rb_former_request,
		.elevator_latter_req_fn =	elv_rb_latter_request,
		.elevator_may_queue_fn =	as_may_queue,
		.elevator_init_fn =		as_init_queue,
		.elevator_exit_fn =		as_exit_queue,
		.trim =				as_trim,
	},

	.elevator_attrs = as_attrs,
	.elevator_name = "fas",
	.elevator_owner = THIS_MODULE,
};
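/*
 * Usage sketch (standard elevator sysfs interface; paths assume the usual
 * /sys/block layout and "sda" is just an example device):
 *
 *   # echo fas > /sys/block/sda/queue/scheduler
 *   # cat /sys/block/sda/queue/iosched/time_quantum
 *   # echo 50 > /sys/block/sda/queue/iosched/time_quantum
 *   # cat /sys/block/sda/queue/iosched/stats
 *
 * time_quantum and antic_expire are shown and stored in milliseconds (see
 * the __CONV flag in the SHOW/STORE macros above); max_async_starved is a
 * plain count.
 */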
static int __init as_init(void)
{
	elv_register(&iosched_as);
	return 0;
}

static void __exit as_exit(void)
{
	DECLARE_COMPLETION_ONSTACK(all_gone);

	elv_unregister(&iosched_as);
	ioc_gone = &all_gone;
	/* ioc_gone's update must be visible before reading ioc_count */
	smp_wmb();
	if (elv_ioc_count_read(ioc_count))
		wait_for_completion(&all_gone);
	synchronize_rcu();
}

module_init(as_init);
module_exit(as_exit);

MODULE_AUTHOR("Corrado Zoccolo");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Fair Anticipatory IO scheduler");