block: initial patch for on-stack per-task plugging
This patch adds support for creating a queuing context outside
of the queue itself. This enables us to batch up pieces of IO
before grabbing the block device queue lock and submitting them to
the IO scheduler.
The context is created on the stack of the process and assigned in
the task structure, so that we can auto-unplug it if we hit a schedule
event.
The current queue plugging happens implicitly if IO is submitted to
an empty device, yet callers have to remember to unplug that IO when
they are going to wait for it. This is an ugly API and has caused bugs
in the past. Additionally, it requires hacks in the vm (->sync_page()
callback) to handle that logic. By switching to an explicit plugging
scheme we make the API a lot nicer and can get rid of the ->sync_page()
hack in the vm.
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
diff --git a/block/blk-core.c b/block/blk-core.c
index e958c7a..6efb55c 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -27,6 +27,7 @@
#include <linux/writeback.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/fault-inject.h>
+#include <linux/list_sort.h>
#define CREATE_TRACE_POINTS
#include <trace/events/block.h>
@@ -203,7 +204,7 @@
q = container_of(work, struct request_queue, delay_work.work);
spin_lock_irq(q->queue_lock);
- q->request_fn(q);
+ __blk_run_queue(q);
spin_unlock_irq(q->queue_lock);
}
@@ -686,6 +687,8 @@
static inline void blk_free_request(struct request_queue *q, struct request *rq)
{
+ BUG_ON(rq->cmd_flags & REQ_ON_PLUG);
+
if (rq->cmd_flags & REQ_ELVPRIV)
elv_put_request(q, rq);
mempool_free(rq, q->rq.rq_pool);
@@ -1051,6 +1054,13 @@
}
EXPORT_SYMBOL(blk_requeue_request);
+static void add_acct_request(struct request_queue *q, struct request *rq,
+ int where)
+{
+ drive_stat_acct(rq, 1);
+ __elv_add_request(q, rq, where, 0);
+}
+
/**
* blk_insert_request - insert a special request into a request queue
* @q: request queue where request should be inserted
@@ -1093,8 +1103,7 @@
if (blk_rq_tagged(rq))
blk_queue_end_tag(q, rq);
- drive_stat_acct(rq, 1);
- __elv_add_request(q, rq, where, 0);
+ add_acct_request(q, rq, where);
__blk_run_queue(q);
spin_unlock_irqrestore(q->queue_lock, flags);
}
@@ -1215,6 +1224,113 @@
}
EXPORT_SYMBOL_GPL(blk_add_request_payload);
+static bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
+ struct bio *bio)
+{
+ const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
+
+ /*
+ * Debug stuff, kill later
+ */
+ if (!rq_mergeable(req)) {
+ blk_dump_rq_flags(req, "back");
+ return false;
+ }
+
+ if (!ll_back_merge_fn(q, req, bio))
+ return false;
+
+ trace_block_bio_backmerge(q, bio);
+
+ if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
+ blk_rq_set_mixed_merge(req);
+
+ req->biotail->bi_next = bio;
+ req->biotail = bio;
+ req->__data_len += bio->bi_size;
+ req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
+
+ drive_stat_acct(req, 0);
+ return true;
+}
+
+static bool bio_attempt_front_merge(struct request_queue *q,
+ struct request *req, struct bio *bio)
+{
+ const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
+ sector_t sector;
+
+ /*
+ * Debug stuff, kill later
+ */
+ if (!rq_mergeable(req)) {
+ blk_dump_rq_flags(req, "front");
+ return false;
+ }
+
+ if (!ll_front_merge_fn(q, req, bio))
+ return false;
+
+ trace_block_bio_frontmerge(q, bio);
+
+ if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
+ blk_rq_set_mixed_merge(req);
+
+ sector = bio->bi_sector;
+
+ bio->bi_next = req->bio;
+ req->bio = bio;
+
+ /*
+ * may not be valid. if the low level driver said
+ * it didn't need a bounce buffer then it better
+ * not touch req->buffer either...
+ */
+ req->buffer = bio_data(bio);
+ req->__sector = bio->bi_sector;
+ req->__data_len += bio->bi_size;
+ req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
+
+ drive_stat_acct(req, 0);
+ return true;
+}
+
+/*
+ * Attempts to merge with the plugged list in the current process. Returns
+ * true if merge was succesful, otherwise false.
+ */
+static bool attempt_plug_merge(struct task_struct *tsk, struct request_queue *q,
+ struct bio *bio)
+{
+ struct blk_plug *plug;
+ struct request *rq;
+ bool ret = false;
+
+ plug = tsk->plug;
+ if (!plug)
+ goto out;
+
+ list_for_each_entry_reverse(rq, &plug->list, queuelist) {
+ int el_ret;
+
+ if (rq->q != q)
+ continue;
+
+ el_ret = elv_try_merge(rq, bio);
+ if (el_ret == ELEVATOR_BACK_MERGE) {
+ ret = bio_attempt_back_merge(q, rq, bio);
+ if (ret)
+ break;
+ } else if (el_ret == ELEVATOR_FRONT_MERGE) {
+ ret = bio_attempt_front_merge(q, rq, bio);
+ if (ret)
+ break;
+ }
+ }
+out:
+ return ret;
+}
+
void init_request_from_bio(struct request *req, struct bio *bio)
{
req->cpu = bio->bi_comp_cpu;
@@ -1230,26 +1346,12 @@
blk_rq_bio_prep(req->q, req, bio);
}
-/*
- * Only disabling plugging for non-rotational devices if it does tagging
- * as well, otherwise we do need the proper merging
- */
-static inline bool queue_should_plug(struct request_queue *q)
-{
- return !(blk_queue_nonrot(q) && blk_queue_tagged(q));
-}
-
static int __make_request(struct request_queue *q, struct bio *bio)
{
- struct request *req;
- int el_ret;
- unsigned int bytes = bio->bi_size;
- const unsigned short prio = bio_prio(bio);
const bool sync = !!(bio->bi_rw & REQ_SYNC);
- const bool unplug = !!(bio->bi_rw & REQ_UNPLUG);
- const unsigned long ff = bio->bi_rw & REQ_FAILFAST_MASK;
- int where = ELEVATOR_INSERT_SORT;
- int rw_flags;
+ struct blk_plug *plug;
+ int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;
+ struct request *req;
/*
* low level driver can indicate that it wants pages above a
@@ -1258,78 +1360,36 @@
*/
blk_queue_bounce(q, &bio);
- spin_lock_irq(q->queue_lock);
-
if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
+ spin_lock_irq(q->queue_lock);
where = ELEVATOR_INSERT_FLUSH;
goto get_rq;
}
- if (elv_queue_empty(q))
- goto get_rq;
+ /*
+ * Check if we can merge with the plugged list before grabbing
+ * any locks.
+ */
+ if (attempt_plug_merge(current, q, bio))
+ goto out;
+
+ spin_lock_irq(q->queue_lock);
el_ret = elv_merge(q, &req, bio);
- switch (el_ret) {
- case ELEVATOR_BACK_MERGE:
- BUG_ON(!rq_mergeable(req));
-
- if (!ll_back_merge_fn(q, req, bio))
- break;
-
- trace_block_bio_backmerge(q, bio);
-
- if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
- blk_rq_set_mixed_merge(req);
-
- req->biotail->bi_next = bio;
- req->biotail = bio;
- req->__data_len += bytes;
- req->ioprio = ioprio_best(req->ioprio, prio);
- if (!blk_rq_cpu_valid(req))
- req->cpu = bio->bi_comp_cpu;
- drive_stat_acct(req, 0);
- elv_bio_merged(q, req, bio);
- if (!attempt_back_merge(q, req))
- elv_merged_request(q, req, el_ret);
- goto out;
-
- case ELEVATOR_FRONT_MERGE:
- BUG_ON(!rq_mergeable(req));
-
- if (!ll_front_merge_fn(q, req, bio))
- break;
-
- trace_block_bio_frontmerge(q, bio);
-
- if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) {
- blk_rq_set_mixed_merge(req);
- req->cmd_flags &= ~REQ_FAILFAST_MASK;
- req->cmd_flags |= ff;
+ if (el_ret == ELEVATOR_BACK_MERGE) {
+ BUG_ON(req->cmd_flags & REQ_ON_PLUG);
+ if (bio_attempt_back_merge(q, req, bio)) {
+ if (!attempt_back_merge(q, req))
+ elv_merged_request(q, req, el_ret);
+ goto out_unlock;
}
-
- bio->bi_next = req->bio;
- req->bio = bio;
-
- /*
- * may not be valid. if the low level driver said
- * it didn't need a bounce buffer then it better
- * not touch req->buffer either...
- */
- req->buffer = bio_data(bio);
- req->__sector = bio->bi_sector;
- req->__data_len += bytes;
- req->ioprio = ioprio_best(req->ioprio, prio);
- if (!blk_rq_cpu_valid(req))
- req->cpu = bio->bi_comp_cpu;
- drive_stat_acct(req, 0);
- elv_bio_merged(q, req, bio);
- if (!attempt_front_merge(q, req))
- elv_merged_request(q, req, el_ret);
- goto out;
-
- /* ELV_NO_MERGE: elevator says don't/can't merge. */
- default:
- ;
+ } else if (el_ret == ELEVATOR_FRONT_MERGE) {
+ BUG_ON(req->cmd_flags & REQ_ON_PLUG);
+ if (bio_attempt_front_merge(q, req, bio)) {
+ if (!attempt_front_merge(q, req))
+ elv_merged_request(q, req, el_ret);
+ goto out_unlock;
+ }
}
get_rq:
@@ -1356,20 +1416,35 @@
*/
init_request_from_bio(req, bio);
- spin_lock_irq(q->queue_lock);
if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
- bio_flagged(bio, BIO_CPU_AFFINE))
- req->cpu = blk_cpu_to_group(smp_processor_id());
- if (queue_should_plug(q) && elv_queue_empty(q))
- blk_plug_device(q);
+ bio_flagged(bio, BIO_CPU_AFFINE)) {
+ req->cpu = blk_cpu_to_group(get_cpu());
+ put_cpu();
+ }
- /* insert the request into the elevator */
- drive_stat_acct(req, 1);
- __elv_add_request(q, req, where, 0);
+ plug = current->plug;
+ if (plug && !sync) {
+ if (!plug->should_sort && !list_empty(&plug->list)) {
+ struct request *__rq;
+
+ __rq = list_entry_rq(plug->list.prev);
+ if (__rq->q != q)
+ plug->should_sort = 1;
+ }
+ /*
+ * Debug flag, kill later
+ */
+ req->cmd_flags |= REQ_ON_PLUG;
+ list_add_tail(&req->queuelist, &plug->list);
+ drive_stat_acct(req, 1);
+ } else {
+ spin_lock_irq(q->queue_lock);
+ add_acct_request(q, req, where);
+ __blk_run_queue(q);
+out_unlock:
+ spin_unlock_irq(q->queue_lock);
+ }
out:
- if (unplug || !queue_should_plug(q))
- __generic_unplug_device(q);
- spin_unlock_irq(q->queue_lock);
return 0;
}
@@ -1772,9 +1847,7 @@
*/
BUG_ON(blk_queued_rq(rq));
- drive_stat_acct(rq, 1);
- __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0);
-
+ add_acct_request(q, rq, ELEVATOR_INSERT_BACK);
spin_unlock_irqrestore(q->queue_lock, flags);
return 0;
@@ -2659,6 +2732,106 @@
}
EXPORT_SYMBOL(kblockd_schedule_delayed_work);
+#define PLUG_MAGIC 0x91827364
+
+void blk_start_plug(struct blk_plug *plug)
+{
+ struct task_struct *tsk = current;
+
+ plug->magic = PLUG_MAGIC;
+ INIT_LIST_HEAD(&plug->list);
+ plug->should_sort = 0;
+
+ /*
+ * If this is a nested plug, don't actually assign it. It will be
+ * flushed on its own.
+ */
+ if (!tsk->plug) {
+ /*
+ * Store ordering should not be needed here, since a potential
+ * preempt will imply a full memory barrier
+ */
+ tsk->plug = plug;
+ }
+}
+EXPORT_SYMBOL(blk_start_plug);
+
+static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+ struct request *rqa = container_of(a, struct request, queuelist);
+ struct request *rqb = container_of(b, struct request, queuelist);
+
+ return !(rqa->q == rqb->q);
+}
+
+static void flush_plug_list(struct blk_plug *plug)
+{
+ struct request_queue *q;
+ unsigned long flags;
+ struct request *rq;
+
+ BUG_ON(plug->magic != PLUG_MAGIC);
+
+ if (list_empty(&plug->list))
+ return;
+
+ if (plug->should_sort)
+ list_sort(NULL, &plug->list, plug_rq_cmp);
+
+ q = NULL;
+ local_irq_save(flags);
+ while (!list_empty(&plug->list)) {
+ rq = list_entry_rq(plug->list.next);
+ list_del_init(&rq->queuelist);
+ BUG_ON(!(rq->cmd_flags & REQ_ON_PLUG));
+ BUG_ON(!rq->q);
+ if (rq->q != q) {
+ if (q) {
+ __blk_run_queue(q);
+ spin_unlock(q->queue_lock);
+ }
+ q = rq->q;
+ spin_lock(q->queue_lock);
+ }
+ rq->cmd_flags &= ~REQ_ON_PLUG;
+
+ /*
+ * rq is already accounted, so use raw insert
+ */
+ __elv_add_request(q, rq, ELEVATOR_INSERT_SORT, 0);
+ }
+
+ if (q) {
+ __blk_run_queue(q);
+ spin_unlock(q->queue_lock);
+ }
+
+ BUG_ON(!list_empty(&plug->list));
+ local_irq_restore(flags);
+}
+
+static void __blk_finish_plug(struct task_struct *tsk, struct blk_plug *plug)
+{
+ flush_plug_list(plug);
+
+ if (plug == tsk->plug)
+ tsk->plug = NULL;
+}
+
+void blk_finish_plug(struct blk_plug *plug)
+{
+ if (plug)
+ __blk_finish_plug(current, plug);
+}
+EXPORT_SYMBOL(blk_finish_plug);
+
+void __blk_flush_plug(struct task_struct *tsk, struct blk_plug *plug)
+{
+ __blk_finish_plug(tsk, plug);
+ tsk->plug = plug;
+}
+EXPORT_SYMBOL(__blk_flush_plug);
+
int __init blk_dev_init(void)
{
BUILD_BUG_ON(__REQ_NR_BITS > 8 *
diff --git a/block/blk-flush.c b/block/blk-flush.c
index a867e3f..1e2aa8a 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -264,10 +264,9 @@
static void flush_data_end_io(struct request *rq, int error)
{
struct request_queue *q = rq->q;
- bool was_empty = elv_queue_empty(q);
/* after populating an empty queue, kick it to avoid stall */
- if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error) && was_empty)
+ if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error))
__blk_run_queue(q);
}
diff --git a/block/elevator.c b/block/elevator.c
index f98e92e..2571392 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -113,7 +113,7 @@
}
EXPORT_SYMBOL(elv_rq_merge_ok);
-static inline int elv_try_merge(struct request *__rq, struct bio *bio)
+int elv_try_merge(struct request *__rq, struct bio *bio)
{
int ret = ELEVATOR_NO_MERGE;
@@ -421,6 +421,8 @@
struct list_head *entry;
int stop_flags;
+ BUG_ON(rq->cmd_flags & REQ_ON_PLUG);
+
if (q->last_merge == rq)
q->last_merge = NULL;
@@ -696,6 +698,8 @@
void __elv_add_request(struct request_queue *q, struct request *rq, int where,
int plug)
{
+ BUG_ON(rq->cmd_flags & REQ_ON_PLUG);
+
if (rq->cmd_flags & REQ_SOFTBARRIER) {
/* barriers are scheduling boundary, update end_sector */
if (rq->cmd_type == REQ_TYPE_FS ||
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index dddedfc..16b2864 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -152,6 +152,7 @@
__REQ_IO_STAT, /* account I/O stat */
__REQ_MIXED_MERGE, /* merge of different types, fail separately */
__REQ_SECURE, /* secure discard (used with __REQ_DISCARD) */
+ __REQ_ON_PLUG, /* on plug list */
__REQ_NR_BITS, /* stops here */
};
@@ -193,5 +194,6 @@
#define REQ_IO_STAT (1 << __REQ_IO_STAT)
#define REQ_MIXED_MERGE (1 << __REQ_MIXED_MERGE)
#define REQ_SECURE (1 << __REQ_SECURE)
+#define REQ_ON_PLUG (1 << __REQ_ON_PLUG)
#endif /* __LINUX_BLK_TYPES_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f55b2a8..5873037 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -871,6 +871,31 @@
struct request_queue *blk_alloc_queue_node(gfp_t, int);
extern void blk_put_queue(struct request_queue *);
+struct blk_plug {
+ unsigned long magic;
+ struct list_head list;
+ unsigned int should_sort;
+};
+
+extern void blk_start_plug(struct blk_plug *);
+extern void blk_finish_plug(struct blk_plug *);
+extern void __blk_flush_plug(struct task_struct *, struct blk_plug *);
+
+static inline void blk_flush_plug(struct task_struct *tsk)
+{
+ struct blk_plug *plug = tsk->plug;
+
+ if (unlikely(plug))
+ __blk_flush_plug(tsk, plug);
+}
+
+static inline bool blk_needs_flush_plug(struct task_struct *tsk)
+{
+ struct blk_plug *plug = tsk->plug;
+
+ return plug && !list_empty(&plug->list);
+}
+
/*
* tag stuff
*/
@@ -1294,6 +1319,23 @@
return 0;
}
+static inline void blk_start_plug(struct list_head *list)
+{
+}
+
+static inline void blk_finish_plug(struct list_head *list)
+{
+}
+
+static inline void blk_flush_plug(struct task_struct *tsk)
+{
+}
+
+static inline bool blk_needs_flush_plug(struct task_struct *tsk)
+{
+ return false;
+}
+
#endif /* CONFIG_BLOCK */
#endif
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 39b68ed..8857cf9 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -105,6 +105,7 @@
extern void __elv_add_request(struct request_queue *, struct request *, int, int);
extern void elv_insert(struct request_queue *, struct request *, int);
extern int elv_merge(struct request_queue *, struct request **, struct bio *);
+extern int elv_try_merge(struct request *, struct bio *);
extern void elv_merge_requests(struct request_queue *, struct request *,
struct request *);
extern void elv_merged_request(struct request_queue *, struct request *, int);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 777d8a5..96ac226 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -99,6 +99,7 @@
struct bio_list;
struct fs_struct;
struct perf_event_context;
+struct blk_plug;
/*
* List of flags we want to share for kernel threads,
@@ -1429,6 +1430,11 @@
/* stacked block device info */
struct bio_list *bio_list;
+#ifdef CONFIG_BLOCK
+/* stack plugging */
+ struct blk_plug *plug;
+#endif
+
/* VM state */
struct reclaim_state *reclaim_state;
diff --git a/kernel/exit.c b/kernel/exit.c
index f9a45eb..6a488ad 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -908,6 +908,7 @@
profile_task_exit(tsk);
WARN_ON(atomic_read(&tsk->fs_excl));
+ WARN_ON(blk_needs_flush_plug(tsk));
if (unlikely(in_interrupt()))
panic("Aiee, killing interrupt handler!");
diff --git a/kernel/fork.c b/kernel/fork.c
index 25e4291..027c80e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1204,6 +1204,9 @@
* Clear TID on mm_release()?
*/
p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
+#ifdef CONFIG_BLOCK
+ p->plug = NULL;
+#endif
#ifdef CONFIG_FUTEX
p->robust_list = NULL;
#ifdef CONFIG_COMPAT
diff --git a/kernel/sched.c b/kernel/sched.c
index 18d38e4..ca098bf 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3978,6 +3978,16 @@
switch_count = &prev->nvcsw;
}
+ /*
+ * If we are going to sleep and we have plugged IO queued, make
+ * sure to submit it to avoid deadlocks.
+ */
+ if (prev->state != TASK_RUNNING && blk_needs_flush_plug(prev)) {
+ raw_spin_unlock(&rq->lock);
+ blk_flush_plug(prev);
+ raw_spin_lock(&rq->lock);
+ }
+
pre_schedule(rq, prev);
if (unlikely(!rq->nr_running))
@@ -5333,6 +5343,7 @@
delayacct_blkio_start();
atomic_inc(&rq->nr_iowait);
+ blk_flush_plug(current);
current->in_iowait = 1;
schedule();
current->in_iowait = 0;
@@ -5348,6 +5359,7 @@
delayacct_blkio_start();
atomic_inc(&rq->nr_iowait);
+ blk_flush_plug(current);
current->in_iowait = 1;
ret = schedule_timeout(timeout);
current->in_iowait = 0;