[patch] deadline-ioscheduler rb-tree sort

From: Jens Axboe (axboe@suse.de)
Date: Thu Oct 31 2002 - 08:43:15 EST


Hi,

This is something I whipped together the other day to test the overhead
of the O(N) insert scan that the deadline io scheduler does. Instead of
using a plain linked list, I adapted it to use an rb tree, since one was
already available via the generic implementation in the kernel.

Results are quite promising; overhead is down a lot. I'm not pushing
this for inclusion, just sending it out so others can test etc. The
patch is against 2.5.45.
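
For anyone who wants the idea without reading the whole patch: requests
are keyed on their starting sector, and insertion walks down the tree
instead of scanning the sort list backwards, so a sort insert drops from
O(N) to O(log N) comparisons. Below is a minimal userspace sketch of
that descent. It mirrors the shape of __deadline_rb_add() but uses a
plain (unbalanced) binary search tree so it compiles outside the kernel;
the struct and function names are invented for the example, and the
rebalancing done by rb_link_node()/rb_insert_color() is left out.

/*
 * illustration only: unbalanced BST keyed on a request's starting
 * sector, mirroring the descent in __deadline_rb_add(). sort_node
 * and sort_insert are made-up names for this example.
 */
#include <stdio.h>
#include <stdlib.h>

typedef unsigned long long sector_t;

struct sort_node {
        sector_t key;                   /* request starting sector */
        struct sort_node *left, *right;
};

/* returns 0 on success, 1 on alias (key already in the tree) */
static int sort_insert(struct sort_node **root, struct sort_node *node)
{
        struct sort_node **p = root;

        while (*p) {
                if (node->key < (*p)->key)
                        p = &(*p)->left;
                else if (node->key > (*p)->key)
                        p = &(*p)->right;
                else
                        return 1;
        }

        node->left = node->right = NULL;
        *p = node;
        return 0;
}

int main(void)
{
        struct sort_node *root = NULL;
        sector_t sectors[] = { 4096, 128, 8192, 64, 128 };
        int i;

        for (i = 0; i < 5; i++) {
                struct sort_node *n = malloc(sizeof(*n));

                n->key = sectors[i];
                printf("insert %llu: %s\n", (unsigned long long) n->key,
                       sort_insert(&root, n) ? "alias" : "ok");
        }
        return 0;
}

The alias case (two requests starting at the same sector) is the one
the real patch treats as "cannot happen" and punts straight to the
dispatch queue.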

===== drivers/block/ll_rw_blk.c 1.135 vs edited =====
--- 1.135/drivers/block/ll_rw_blk.c Mon Oct 28 20:57:59 2002
+++ edited/drivers/block/ll_rw_blk.c Thu Oct 31 14:31:09 2002
@@ -2175,8 +2183,8 @@
         queue_nr_requests = (total_ram >> 9) & ~7;
         if (queue_nr_requests < 16)
                 queue_nr_requests = 16;
-        if (queue_nr_requests > 128)
-                queue_nr_requests = 128;
+        if (queue_nr_requests > 512)
+                queue_nr_requests = 512;
 
         batch_requests = queue_nr_requests / 8;
         if (batch_requests > 8)
===== drivers/block/deadline-iosched.c 1.9 vs edited =====
--- 1.9/drivers/block/deadline-iosched.c Tue Oct 29 12:19:59 2002
+++ edited/drivers/block/deadline-iosched.c Thu Oct 31 10:48:09 2002
@@ -17,6 +17,7 @@
 #include <linux/init.h>
 #include <linux/compiler.h>
 #include <linux/hash.h>
+#include <linux/rbtree.h>
 
 /*
  * feel free to try other values :-). read_expire value is the timeout for
@@ -33,7 +34,7 @@
  */
 static int writes_starved = 2;
 
-static const int deadline_hash_shift = 8;
+static const int deadline_hash_shift = 10;
 #define DL_HASH_BLOCK(sec) ((sec) >> 3)
 #define DL_HASH_FN(sec) (hash_long(DL_HASH_BLOCK((sec)), deadline_hash_shift))
 #define DL_HASH_ENTRIES (1 << deadline_hash_shift)
@@ -48,19 +49,20 @@
         /*
          * run time data
          */
-        struct list_head sort_list[2]; /* sorted listed */
+        struct rb_root rb_list[2];
         struct list_head read_fifo; /* fifo list */
         struct list_head *dispatch; /* driver dispatch queue */
         struct list_head *hash; /* request hash */
         sector_t last_sector; /* last sector sent to drive */
         unsigned long hash_valid_count; /* barrier hash count */
         unsigned int starved; /* writes starved */
+        unsigned int rb_entries;
 
         /*
          * settings that change how the i/o scheduler behaves
          */
         unsigned int fifo_batch;
-        unsigned long read_expire;
+        unsigned int read_expire;
         unsigned int seek_cost;
         unsigned int writes_starved;
 };
@@ -69,10 +71,24 @@
  * pre-request data.
  */
 struct deadline_rq {
-        struct list_head fifo;
+        /*
+         * rbtree index, key is the starting offset
+         */
+        struct rb_node rb_node;
+        sector_t rb_key;
+
+        struct request *request;
+
+        /*
+         * request hash, key is the ending offset (for back merge lookup)
+         */
         struct list_head hash;
         unsigned long hash_valid_count;
-        struct request *request;
+
+        /*
+         * expire fifo
+         */
+        struct list_head fifo;
         unsigned long expires;
 };
 
@@ -81,23 +97,23 @@
 #define RQ_DATA(rq) ((struct deadline_rq *) (rq)->elevator_private)
 
 /*
- * rq hash
+ * the back merge hash support functions
  */
-static inline void __deadline_del_rq_hash(struct deadline_rq *drq)
+static inline void __deadline_hash_del(struct deadline_rq *drq)
 {
         drq->hash_valid_count = 0;
         list_del_init(&drq->hash);
 }
 
 #define ON_HASH(drq) (drq)->hash_valid_count
-static inline void deadline_del_rq_hash(struct deadline_rq *drq)
+static inline void deadline_hash_del(struct deadline_rq *drq)
 {
         if (ON_HASH(drq))
-                __deadline_del_rq_hash(drq);
+                __deadline_hash_del(drq);
 }
 
 static inline void
-deadline_add_rq_hash(struct deadline_data *dd, struct deadline_rq *drq)
+deadline_hash_add(struct deadline_data *dd, struct deadline_rq *drq)
 {
         struct request *rq = drq->request;
 
@@ -109,33 +125,30 @@
 
 #define list_entry_hash(ptr) list_entry((ptr), struct deadline_rq, hash)
 static struct request *
-deadline_find_hash(struct deadline_data *dd, sector_t offset)
+deadline_hash_find(struct deadline_data *dd, sector_t offset)
 {
         struct list_head *hash_list = &dd->hash[DL_HASH_FN(offset)];
         struct list_head *entry, *next = hash_list->next;
-        struct deadline_rq *drq;
-        struct request *rq = NULL;
 
         while ((entry = next) != hash_list) {
+                struct deadline_rq *drq = list_entry_hash(entry);
+                struct request *__rq = drq->request;
+
                 next = entry->next;
                 
-                drq = list_entry_hash(entry);
-
                 BUG_ON(!drq->hash_valid_count);
 
-                if (!rq_mergeable(drq->request)
+                if (!rq_mergeable(__rq)
                    || drq->hash_valid_count != dd->hash_valid_count) {
-                        __deadline_del_rq_hash(drq);
+                        __deadline_hash_del(drq);
                         continue;
                 }
 
-                if (drq->request->sector + drq->request->nr_sectors == offset) {
-                        rq = drq->request;
-                        break;
-                }
+                if (__rq->sector + __rq->nr_sectors == offset)
+                        return __rq;
         }
 
-        return rq;
+        return NULL;
 }
 
 static sector_t deadline_get_last_sector(struct deadline_data *dd)
@@ -154,86 +167,135 @@
         return last_sec;
 }
 
+/*
+ * rb tree support functions
+ */
+#define RB_NONE (2)
+#define RB_EMPTY(root) ((root)->rb_node == NULL)
+#define ON_RB(node) ((node)->rb_color != RB_NONE)
+#define RB_CLEAR(node) ((node)->rb_color = RB_NONE)
+#define deadline_rb_entry(node) rb_entry((node), struct deadline_rq, rb_node)
+#define DRQ_RB_ROOT(dd, drq) (&(dd)->rb_list[rq_data_dir((drq)->request)])
+
+static inline int
+__deadline_rb_add(struct deadline_data *dd, struct deadline_rq *drq)
+{
+        struct rb_node **p = &DRQ_RB_ROOT(dd, drq)->rb_node;
+        struct rb_node *parent = NULL;
+        struct deadline_rq *__drq;
+
+        while (*p) {
+                parent = *p;
+                __drq = deadline_rb_entry(parent);
+
+                if (drq->rb_key < __drq->rb_key)
+                        p = &(*p)->rb_left;
+                else if (drq->rb_key > __drq->rb_key)
+                        p = &(*p)->rb_right;
+                else
+                        return 1;
+        }
+
+        rb_link_node(&drq->rb_node, parent, p);
+        return 0;
+}
+
+static void deadline_rb_add(struct deadline_data *dd, struct deadline_rq *drq)
+{
+        drq->rb_key = drq->request->sector;
+
+        if (!__deadline_rb_add(dd, drq)) {
+                rb_insert_color(&drq->rb_node, DRQ_RB_ROOT(dd, drq));
+                dd->rb_entries++;
+                return;
+        }
+
+        /*
+         * this cannot happen
+         */
+        blk_dump_rq_flags(drq->request, "deadline_rb_add alias");
+        list_add_tail(&drq->request->queuelist, dd->dispatch);
+}
+
+static inline void
+deadline_rb_del(struct deadline_data *dd, struct deadline_rq *drq)
+{
+        if (ON_RB(&drq->rb_node)) {
+                rb_erase(&drq->rb_node, DRQ_RB_ROOT(dd, drq));
+                RB_CLEAR(&drq->rb_node);
+                dd->rb_entries--;
+                BUG_ON(dd->rb_entries < 0);
+        }
+}
+
+static struct request *
+deadline_rb_find(struct deadline_data *dd, sector_t sector, int data_dir)
+{
+        struct rb_node *n = dd->rb_list[data_dir].rb_node;
+        struct deadline_rq *drq;
+
+        while (n) {
+                drq = deadline_rb_entry(n);
+
+                if (sector < drq->rb_key)
+                        n = n->rb_left;
+                else if (sector > drq->rb_key)
+                        n = n->rb_right;
+                else
+                        return drq->request;
+        }
+
+        return NULL;
+}
+
 static int
 deadline_merge(request_queue_t *q, struct list_head **insert, struct bio *bio)
 {
         struct deadline_data *dd = q->elevator.elevator_data;
-        const int data_dir = bio_data_dir(bio);
-        struct list_head *entry, *sort_list;
-        struct request *__rq;
         int ret = ELEVATOR_NO_MERGE;
+        struct request *__rq;
 
         /*
          * try last_merge to avoid going to hash
          */
         ret = elv_try_last_merge(q, bio);
         if (ret != ELEVATOR_NO_MERGE) {
-                *insert = q->last_merge;
+                __rq = list_entry_rq(q->last_merge);
                 goto out;
         }
 
         /*
          * see if the merge hash can satisfy a back merge
          */
-        if ((__rq = deadline_find_hash(dd, bio->bi_sector))) {
+        if ((__rq = deadline_hash_find(dd, bio->bi_sector))) {
                 BUG_ON(__rq->sector + __rq->nr_sectors != bio->bi_sector);
 
                 if (elv_rq_merge_ok(__rq, bio)) {
-                        *insert = &__rq->queuelist;
                         ret = ELEVATOR_BACK_MERGE;
                         goto out;
                 }
         }
 
         /*
-         * scan list from back to find insertion point.
+         * probably not worth the extra overhead
          */
-        entry = sort_list = &dd->sort_list[data_dir];
-        while ((entry = entry->prev) != sort_list) {
-                __rq = list_entry_rq(entry);
-
-                BUG_ON(__rq->flags & REQ_STARTED);
-
-                if (!(__rq->flags & REQ_CMD))
-                        continue;
+        __rq = deadline_rb_find(dd, bio->bi_sector + bio_sectors(bio), bio_data_dir(bio));
+        if (__rq) {
+                BUG_ON(bio->bi_sector + bio_sectors(bio) != __rq->sector);
 
-                /*
-                 * it's not necessary to break here, and in fact it could make
-                 * us loose a front merge. emperical evidence shows this to
-                 * be a big waste of cycles though, so quit scanning
-                 */
-                if (!*insert && bio_rq_in_between(bio, __rq, sort_list)) {
-                        *insert = &__rq->queuelist;
-                        break;
-                }
-
-                if (__rq->flags & REQ_BARRIER)
-                        break;
-
-                /*
-                 * checking for a front merge, hash will miss those
-                 */
-                if (__rq->sector - bio_sectors(bio) == bio->bi_sector) {
-                        ret = elv_try_merge(__rq, bio);
-                        if (ret != ELEVATOR_NO_MERGE) {
-                                *insert = &__rq->queuelist;
-                                break;
-                        }
+                if (elv_rq_merge_ok(__rq, bio)) {
+                        ret = ELEVATOR_FRONT_MERGE;
+                        goto out;
                 }
         }
 
-        /*
-         * no insertion point found, check the very front
-         */
-        if (!*insert && !list_empty(sort_list)) {
-                __rq = list_entry_rq(sort_list->next);
-
-                if (bio->bi_sector + bio_sectors(bio) < __rq->sector &&
-                    bio->bi_sector > deadline_get_last_sector(dd))
-                        *insert = sort_list;
+        *insert = NULL;
+out:
+        if (ret != ELEVATOR_NO_MERGE) {
+                *insert = &__rq->queuelist;
+                q->last_merge = &__rq->queuelist;
         }
 
-out:
         return ret;
 }
 
@@ -242,8 +304,16 @@
         struct deadline_data *dd = q->elevator.elevator_data;
         struct deadline_rq *drq = RQ_DATA(req);
 
-        deadline_del_rq_hash(drq);
-        deadline_add_rq_hash(dd, drq);
+        deadline_hash_del(drq);
+        deadline_hash_add(dd, drq);
+
+        /*
+         * the merge was a front merge, we need to reposition request
+         */
+        if (req->sector != drq->rb_key) {
+                deadline_rb_del(dd, drq);
+                deadline_rb_add(dd, drq);
+        }
 
         q->last_merge = &req->queuelist;
 }
@@ -258,8 +328,13 @@
         BUG_ON(!drq);
         BUG_ON(!dnext);
 
-        deadline_del_rq_hash(drq);
-        deadline_add_rq_hash(dd, drq);
+        deadline_hash_del(drq);
+        deadline_hash_add(dd, drq);
+
+        if (req->sector != drq->rb_key) {
+                deadline_rb_del(dd, drq);
+                deadline_rb_add(dd, drq);
+        }
 
         /*
          * if dnext expires before drq, assign it's expire time to drq
@@ -274,16 +349,16 @@
 }
 
 /*
- * move request from sort list to dispatch queue. maybe remove from rq hash
- * here too?
+ * move request from sort list to dispatch queue.
  */
 static inline void
 deadline_move_to_dispatch(struct deadline_data *dd, struct request *rq)
 {
         struct deadline_rq *drq = RQ_DATA(rq);
 
-        list_move_tail(&rq->queuelist, dd->dispatch);
+        deadline_rb_del(dd, drq);
         list_del_init(&drq->fifo);
+        list_add_tail(&rq->queuelist, dd->dispatch);
 }
 
 /*
@@ -291,12 +366,12 @@
  */
 static void deadline_move_requests(struct deadline_data *dd, struct request *rq)
 {
-        struct list_head *sort_head = &dd->sort_list[rq_data_dir(rq)];
         sector_t last_sec = deadline_get_last_sector(dd);
         int batch_count = dd->fifo_batch;
+        struct deadline_rq *drq = RQ_DATA(rq);
 
         do {
-                struct list_head *nxt = rq->queuelist.next;
+                struct rb_node *rbnext = rb_next(&drq->rb_node);
                 int this_rq_cost;
 
                 /*
@@ -308,7 +383,7 @@
                 /*
                  * if this is the last entry, don't bother doing accounting
                  */
-                if (nxt == sort_head)
+                if (rbnext == NULL)
                         break;
 
                 this_rq_cost = dd->seek_cost;
@@ -320,7 +395,8 @@
                         break;
 
                 last_sec = rq->sector + rq->nr_sectors;
-                rq = list_entry_rq(nxt);
+                drq = deadline_rb_entry(rbnext);
+                rq = drq->request;
         } while (1);
 }
 
@@ -343,25 +419,11 @@
         return 0;
 }
 
-static struct request *deadline_next_request(request_queue_t *q)
+static int deadline_dispatch_requests(struct deadline_data *dd)
 {
-        struct deadline_data *dd = q->elevator.elevator_data;
+        const int writes = !RB_EMPTY(&dd->rb_list[WRITE]);
         struct deadline_rq *drq;
-        struct list_head *nxt;
-        struct request *rq;
-        int writes;
-
-        /*
-         * if still requests on the dispatch queue, just grab the first one
-         */
-        if (!list_empty(&q->queue_head)) {
-dispatch:
-                rq = list_entry_rq(q->queue_head.next);
-                dd->last_sector = rq->sector + rq->nr_sectors;
-                return rq;
-        }
-
-        writes = !list_empty(&dd->sort_list[WRITE]);
+        int ret = 0;
 
         /*
          * if we have expired entries on the fifo list, move some to dispatch
@@ -370,19 +432,20 @@
                 if (writes && (dd->starved++ >= dd->writes_starved))
                         goto dispatch_writes;
 
-                nxt = dd->read_fifo.next;
-                drq = list_entry_fifo(nxt);
+                drq = list_entry_fifo(dd->read_fifo.next);
                 deadline_move_requests(dd, drq->request);
-                goto dispatch;
+                ret = 1;
+                goto out;
         }
 
-        if (!list_empty(&dd->sort_list[READ])) {
+        if (!RB_EMPTY(&dd->rb_list[READ])) {
                 if (writes && (dd->starved++ >= dd->writes_starved))
                         goto dispatch_writes;
 
-                nxt = dd->sort_list[READ].next;
-                deadline_move_requests(dd, list_entry_rq(nxt));
-                goto dispatch;
+                drq = deadline_rb_entry(rb_first(&dd->rb_list[READ]));
+                deadline_move_requests(dd, drq->request);
+                ret = 1;
+                goto out;
         }
 
         /*
@@ -391,14 +454,41 @@
          */
         if (writes) {
 dispatch_writes:
-                nxt = dd->sort_list[WRITE].next;
-                deadline_move_requests(dd, list_entry_rq(nxt));
+                drq = deadline_rb_entry(rb_first(&dd->rb_list[WRITE]));
+                deadline_move_requests(dd, drq->request);
                 dd->starved = 0;
-                goto dispatch;
+                ret = 1;
         }
 
-        BUG_ON(!list_empty(&dd->sort_list[READ]));
-        BUG_ON(writes);
+out:
+        return ret;
+}
+
+static struct request *deadline_next_request(request_queue_t *q)
+{
+        struct deadline_data *dd = q->elevator.elevator_data;
+        struct request *rq;
+
+        /*
+         * if there are still requests on the dispatch queue, grab the first one
+         */
+        if (!list_empty(&q->queue_head)) {
+dispatch:
+                rq = list_entry_rq(q->queue_head.next);
+                dd->last_sector = rq->sector + rq->nr_sectors;
+                return rq;
+        }
+
+        if (deadline_dispatch_requests(dd))
+                goto dispatch;
+
+        /*
+         * if we have entries on the read or write sorted list, it's a bug
+         * if deadline_dispatch_requests() didn't move any
+         */
+        BUG_ON(!RB_EMPTY(&dd->rb_list[READ]));
+        BUG_ON(!RB_EMPTY(&dd->rb_list[WRITE]));
+
         return NULL;
 }
 
@@ -409,28 +499,23 @@
         struct deadline_rq *drq = RQ_DATA(rq);
         const int data_dir = rq_data_dir(rq);
 
-        /*
-         * flush hash on barrier insert, as not to allow merges before a
-         * barrier.
-         */
         if (unlikely(rq->flags & REQ_BARRIER)) {
                 DL_INVALIDATE_HASH(dd);
                 q->last_merge = NULL;
         }
 
-        /*
-         * add to sort list
-         */
-        if (!insert_here)
-                insert_here = dd->sort_list[data_dir].prev;
+        if (unlikely(!(rq->flags & REQ_CMD))) {
+                if (!insert_here)
+                        insert_here = dd->dispatch->prev;

-        list_add(&rq->queuelist, insert_here);
-
-        if (unlikely(!(rq->flags & REQ_CMD)))
+                list_add(&rq->queuelist, insert_here);
                 return;
+        }
+
+        deadline_rb_add(dd, drq);

         if (rq_mergeable(rq)) {
-                deadline_add_rq_hash(dd, drq);
+                deadline_hash_add(dd, drq);
 
                 if (!q->last_merge)
                         q->last_merge = &rq->queuelist;
@@ -443,15 +528,27 @@
                 drq->expires = jiffies + dd->read_expire;
                 list_add_tail(&drq->fifo, &dd->read_fifo);
         }
+
+        /*
+         * heuristic to offload the time spent moving entries in interrupt
+         * context. this happens when a driver queues new request(s) from
+         * its isr
+         */
+#if 0
+        if (dd->rb_entries >= 8)
+                deadline_dispatch_requests(dd);
+#endif
 }
 
 static void deadline_remove_request(request_queue_t *q, struct request *rq)
 {
+        struct deadline_data *dd = q->elevator.elevator_data;
         struct deadline_rq *drq = RQ_DATA(rq);
 
         if (drq) {
                 list_del_init(&drq->fifo);
-                deadline_del_rq_hash(drq);
+                deadline_hash_del(drq);
+                deadline_rb_del(dd, drq);
         }
 }
 
@@ -459,8 +556,8 @@
 {
         struct deadline_data *dd = q->elevator.elevator_data;
 
-        if (!list_empty(&dd->sort_list[WRITE]) ||
-            !list_empty(&dd->sort_list[READ]) ||
+        if (!RB_EMPTY(&dd->rb_list[WRITE]) ||
+            !RB_EMPTY(&dd->rb_list[READ]) ||
             !list_empty(&q->queue_head))
                 return 0;
 
@@ -473,7 +570,7 @@
 {
         struct deadline_data *dd = q->elevator.elevator_data;
 
-        return &dd->sort_list[rq_data_dir(rq)];
+        return dd->dispatch;
 }
 
 static void deadline_exit(request_queue_t *q, elevator_t *e)
@@ -484,8 +581,8 @@
         int i;
 
         BUG_ON(!list_empty(&dd->read_fifo));
-        BUG_ON(!list_empty(&dd->sort_list[READ]));
-        BUG_ON(!list_empty(&dd->sort_list[WRITE]));
+        BUG_ON(!RB_EMPTY(&dd->rb_list[READ]));
+        BUG_ON(!RB_EMPTY(&dd->rb_list[WRITE]));
 
         for (i = READ; i <= WRITE; i++) {
                 struct request_list *rl = &q->rq[i];
@@ -538,8 +635,8 @@
                 INIT_LIST_HEAD(&dd->hash[i]);
 
         INIT_LIST_HEAD(&dd->read_fifo);
-        INIT_LIST_HEAD(&dd->sort_list[READ]);
-        INIT_LIST_HEAD(&dd->sort_list[WRITE]);
+        dd->rb_list[READ] = RB_ROOT;
+        dd->rb_list[WRITE] = RB_ROOT;
         dd->dispatch = &q->queue_head;
         dd->fifo_batch = fifo_batch;
         dd->read_expire = read_expire;
@@ -567,6 +664,7 @@
                         memset(drq, 0, sizeof(*drq));
                         INIT_LIST_HEAD(&drq->fifo);
                         INIT_LIST_HEAD(&drq->hash);
+                        RB_CLEAR(&drq->rb_node);
                         drq->request = rq;
                         rq->elevator_private = drq;
                 }

-- 
Jens Axboe



