[RFC] cfq: adapt slice to number of processes doing I/O (v2)

From: Corrado Zoccolo
Date: Tue Sep 15 2009 - 13:02:34 EST


[This applies on top of git://git.kernel.dk/linux-2.6-block.git for-2.6.32]

When the number of processes performing I/O concurrently increases,
a fixed time slice per process will cause large latencies.

This (v2) patch will scale the time slice assigned to each process,
according to a target latency (tunable from sysfs, default 300ms).

In order to keep fairness among processes, we adopt two devices, w.r.t. v1.

* The number of active processes is computed using a special form of
running average, that quickly follows sudden increases (to keep latency low),
and decrease slowly (to have fairness in spite of rapid decreases of this value).

* The idle time is computed using remaining slice as a maximum.

To safeguard sequential bandwidth, we impose a minimum time slice
(computed using 2*cfq_slice_idle as base, adjusted according to priority
and async-ness).

Signed-off-by: Corrado Zoccolo <czoccolo@xxxxxxxxx>

---
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 0e3814b..e1a1e4d 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -27,6 +27,8 @@ static const int cfq_slice_sync = HZ / 10;
static int cfq_slice_async = HZ / 25;
static const int cfq_slice_async_rq = 2;
static int cfq_slice_idle = HZ / 125;
+static int cfq_preferred_latency = HZ * 3/10; /* 300 ms */
+static int cfq_queue_hist_divisor = 4;

/*
* offset from end of service tree
@@ -134,6 +136,9 @@ struct cfq_data {
struct rb_root prio_trees[CFQ_PRIO_LISTS];

unsigned int busy_queues;
+ unsigned int busy_queues_avg;
+ unsigned int busy_rt_queues;
+ unsigned int busy_rt_queues_avg;

int rq_in_driver[2];
int sync_flight;
@@ -173,6 +178,8 @@ struct cfq_data {
unsigned int cfq_slice[2];
unsigned int cfq_slice_async_rq;
unsigned int cfq_slice_idle;
+ unsigned int cfq_preferred_latency;
+ unsigned int cfq_queue_hist_divisor;

struct list_head cic_list;

@@ -301,10 +308,37 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
}

+static inline unsigned
+cfq_get_interested_queues(struct cfq_data *cfqd, bool rt) {
+ unsigned min_q, max_q;
+ unsigned mult = cfqd->cfq_queue_hist_divisor - 1;
+ unsigned round = cfqd->cfq_queue_hist_divisor / 2;
+ if (rt) {
+ min_q = min(cfqd->busy_rt_queues_avg, cfqd->busy_rt_queues);
+ max_q = max(cfqd->busy_rt_queues_avg, cfqd->busy_rt_queues);
+ cfqd->busy_rt_queues_avg = (mult * max_q + min_q + round) / cfqd->cfq_queue_hist_divisor;
+ return cfqd->busy_rt_queues_avg;
+ } else {
+ min_q = min(cfqd->busy_queues_avg, cfqd->busy_queues);
+ max_q = max(cfqd->busy_queues_avg, cfqd->busy_queues);
+ cfqd->busy_queues_avg = (mult * max_q + min_q + round) / cfqd->cfq_queue_hist_divisor;
+ return cfqd->busy_queues_avg;
+ }
+}
+
static inline void
cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
- cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies;
+ unsigned process_threshold = cfqd->cfq_preferred_latency / cfqd->cfq_slice[1];
+ unsigned interested_queues = cfq_get_interested_queues(cfqd, cfq_class_rt(cfqq));
+ unsigned slice = cfq_prio_to_slice(cfqd, cfqq);
+
+ if (interested_queues > process_threshold) {
+ unsigned low_slice = min(slice, 2 * slice * cfqd->cfq_slice_idle / cfqd->cfq_slice[1]);
+ slice = max(slice * process_threshold / interested_queues, low_slice);
+ }
+
+ cfqq->slice_end = jiffies + slice;
cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies);
}

@@ -646,6 +680,8 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
BUG_ON(cfq_cfqq_on_rr(cfqq));
cfq_mark_cfqq_on_rr(cfqq);
cfqd->busy_queues++;
+ if (cfq_class_rt(cfqq))
+ cfqd->busy_rt_queues++;

cfq_resort_rr_list(cfqd, cfqq);
}
@@ -669,6 +705,8 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)

BUG_ON(!cfqd->busy_queues);
cfqd->busy_queues--;
+ if (cfq_class_rt(cfqq))
+ cfqd->busy_rt_queues--;
}

/*
@@ -1092,7 +1130,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
* fair distribution of slice time for a process doing back-to-back
* seeks. so allow a little bit of time for him to submit a new rq
*/
- sl = cfqd->cfq_slice_idle;
+ sl = min(cfqd->cfq_slice_idle, (unsigned)(cfqq->slice_end - jiffies));
if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic))
sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT));

@@ -2480,6 +2518,8 @@ static void *cfq_init_queue(struct request_queue *q)
cfqd->cfq_slice[1] = cfq_slice_sync;
cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
cfqd->cfq_slice_idle = cfq_slice_idle;
+ cfqd->cfq_preferred_latency = cfq_preferred_latency;
+ cfqd->cfq_queue_hist_divisor = cfq_queue_hist_divisor;
cfqd->hw_tag = 1;

return cfqd;
@@ -2549,6 +2589,8 @@ SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
+SHOW_FUNCTION(cfq_preferred_latency_show, cfqd->cfq_preferred_latency, 1);
+SHOW_FUNCTION(cfq_queue_hist_divisor_show, cfqd->cfq_queue_hist_divisor, 0);
#undef SHOW_FUNCTION

#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
@@ -2580,6 +2622,10 @@ STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
UINT_MAX, 0);
+
+STORE_FUNCTION(cfq_preferred_latency_store, &cfqd->cfq_preferred_latency, 1, 1000, 1);
+STORE_FUNCTION(cfq_queue_hist_divisor_store, &cfqd->cfq_queue_hist_divisor, 1, 100, 0);
+
#undef STORE_FUNCTION

#define CFQ_ATTR(name) \
@@ -2595,6 +2641,8 @@ static struct elv_fs_entry cfq_attrs[] = {
CFQ_ATTR(slice_async),
CFQ_ATTR(slice_async_rq),
CFQ_ATTR(slice_idle),
+ CFQ_ATTR(preferred_latency),
+ CFQ_ATTR(queue_hist_divisor),
__ATTR_NULL
};


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/