Re: [RFC] [PATCH 8/8] cfq-iosched: Introduce hierarchical schedulingwith CFQ queue and group at the same level

From: Gui Jianfeng
Date: Sun Nov 28 2010 - 21:42:41 EST


Vivek Goyal wrote:
> On Sun, Nov 14, 2010 at 04:25:49PM +0800, Gui Jianfeng wrote:
>> This patch makes CFQ queue and CFQ group schedules at the same level.
>> Consider the following hierarchy:
>>
>> Root
>> / | \
>> q1 q2 G1
>> / \
>> q3 G2
>>
>> q1 q2 and q3 are CFQ queues G1 and G2 are CFQ groups. Currently, q1, q2
>> and G1 are scheduling on a same service tree in Root CFQ group. q3 and G2
>> are schedluing under G1. Note, for the time being, CFQ group is treated
>> as "BE and SYNC" workload, and is put on "BE and SYNC" service tree. That
>> means service differentiate only happens in "BE and SYNC" service tree.
>> Later, we may introduce "IO Class" for CFQ group.
>>
>
> Have you got rid of flat mode (existing default mode). IOW, I don't see
> the introduction of "use_hierarchy" which will differentiate between
> whether to treat an hierarchy as flat or not?

Vivek,

As I said in [PATCH 0/8], yes, for the time being, I get rid of flat mode.
Hierarchical cgroup creation is just merge into block-tree, not in Mainline
now, .so I think it's ok I'd like to post "use_hierarchy" patchset separately
when this patchset get merged. How do you say, Vivek?

Gui

>
> Vivek
>
>> Signed-off-by: Gui Jianfeng <guijianfeng@xxxxxxxxxxxxxx>
>> ---
>> block/cfq-iosched.c | 483 ++++++++++++++++++++++++++++++++++-----------------
>> 1 files changed, 324 insertions(+), 159 deletions(-)
>>
>> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
>> index 1df0928..9def3a2 100644
>> --- a/block/cfq-iosched.c
>> +++ b/block/cfq-iosched.c
>> @@ -105,6 +105,9 @@ struct io_sched_entity {
>> u64 vdisktime;
>> bool is_group_entity;
>> unsigned int weight;
>> + struct io_sched_entity *parent;
>> + /* Reposition time */
>> + unsigned long reposition_time;
>> };
>>
>> /*
>> @@ -113,8 +116,6 @@ struct io_sched_entity {
>> struct cfq_queue {
>> /* The schedule entity */
>> struct io_sched_entity queue_entity;
>> - /* Reposition time */
>> - unsigned long reposition_time;
>> /* reference count */
>> atomic_t ref;
>> /* various state flags, see below */
>> @@ -193,6 +194,9 @@ struct cfq_group {
>> /* number of cfqq currently on this group */
>> int nr_cfqq;
>>
>> + /* number of sub cfq groups */
>> + int nr_subgp;
>> +
>> /* Per group busy queus average. Useful for workload slice calc. */
>> unsigned int busy_queues_avg[2];
>> /*
>> @@ -219,8 +223,6 @@ struct cfq_group {
>> */
>> struct cfq_data {
>> struct request_queue *queue;
>> - /* Root service tree for cfq_groups */
>> - struct cfq_rb_root grp_service_tree;
>> struct cfq_group root_group;
>>
>> /*
>> @@ -337,8 +339,6 @@ cfqg_of_entity(struct io_sched_entity *io_entity)
>> return NULL;
>> }
>>
>> -static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
>> -
>> static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
>> enum wl_prio_t prio,
>> enum wl_type_t type)
>> @@ -629,10 +629,15 @@ static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
>> static inline unsigned
>> cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> {
>> - struct cfq_rb_root *st = &cfqd->grp_service_tree;
>> struct io_sched_entity *group_entity = &cfqg->group_entity;
>> + struct cfq_rb_root *st = group_entity->service_tree;
>>
>> - return cfq_target_latency * group_entity->weight / st->total_weight;
>> + if (st)
>> + return cfq_target_latency * group_entity->weight
>> + / st->total_weight;
>> + else
>> + /* If this is the root group, give it a full slice. */
>> + return cfq_target_latency;
>> }
>>
>> static inline void
>> @@ -795,17 +800,6 @@ static struct io_sched_entity *cfq_rb_first(struct cfq_rb_root *root)
>> return NULL;
>> }
>>
>> -static struct io_sched_entity *cfq_rb_first_entity(struct cfq_rb_root *root)
>> -{
>> - if (!root->left)
>> - root->left = rb_first(&root->rb);
>> -
>> - if (root->left)
>> - return rb_entry_entity(root->left);
>> -
>> - return NULL;
>> -}
>> -
>> static void rb_erase_init(struct rb_node *n, struct rb_root *root)
>> {
>> rb_erase(n, root);
>> @@ -887,6 +881,7 @@ io_entity_service_tree_add(struct cfq_rb_root *st,
>> struct io_sched_entity *io_entity)
>> {
>> __io_entity_service_tree_add(st, io_entity);
>> + io_entity->reposition_time = jiffies;
>> st->count++;
>> st->total_weight += io_entity->weight;
>> }
>> @@ -894,29 +889,49 @@ io_entity_service_tree_add(struct cfq_rb_root *st,
>> static void
>> cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> {
>> - struct cfq_rb_root *st = &cfqd->grp_service_tree;
>> struct rb_node *n;
>> struct io_sched_entity *group_entity = &cfqg->group_entity;
>> - struct io_sched_entity *__group_entity;
>> + struct io_sched_entity *entity;
>> + struct cfq_rb_root *st;
>> + struct cfq_group *__cfqg;
>>
>> cfqg->nr_cfqq++;
>> +
>> + /*
>> + * Root group doesn't belongs to any service
>> + */
>> + if (cfqg == &cfqd->root_group)
>> + return;
>> +
>> if (!RB_EMPTY_NODE(&group_entity->rb_node))
>> return;
>>
>> - /*
>> - * Currently put the group at the end. Later implement something
>> - * so that groups get lesser vtime based on their weights, so that
>> - * if group does not loose all if it was not continously backlogged.
>> + /*
>> + * Enqueue this group and its ancestors onto their service tree.
>> */
>> - n = rb_last(&st->rb);
>> - if (n) {
>> - __group_entity = rb_entry_entity(n);
>> - group_entity->vdisktime = __group_entity->vdisktime +
>> - CFQ_IDLE_DELAY;
>> - } else
>> - group_entity->vdisktime = st->min_vdisktime;
>> + while (group_entity && group_entity->parent) {
>> + if (!RB_EMPTY_NODE(&group_entity->rb_node))
>> + return;
>> + /*
>> + * Currently put the group at the end. Later implement
>> + * something so that groups get lesser vtime based on their
>> + * weights, so that if group does not loose all if it was not
>> + * continously backlogged.
>> + */
>> + st = group_entity->service_tree;
>> + n = rb_last(&st->rb);
>> + if (n) {
>> + entity = rb_entry_entity(n);
>> + group_entity->vdisktime = entity->vdisktime +
>> + CFQ_IDLE_DELAY;
>> + } else
>> + group_entity->vdisktime = st->min_vdisktime;
>>
>> - io_entity_service_tree_add(st, group_entity);
>> + io_entity_service_tree_add(st, group_entity);
>> + group_entity = group_entity->parent;
>> + __cfqg = cfqg_of_entity(group_entity);
>> + __cfqg->nr_subgp++;
>> + }
>> }
>>
>> static void
>> @@ -933,27 +948,47 @@ io_entity_service_tree_del(struct cfq_rb_root *st,
>> if (!RB_EMPTY_NODE(&io_entity->rb_node)) {
>> __io_entity_service_tree_del(st, io_entity);
>> st->total_weight -= io_entity->weight;
>> - io_entity->service_tree = NULL;
>> }
>> }
>>
>> static void
>> cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> {
>> - struct cfq_rb_root *st = &cfqd->grp_service_tree;
>> struct io_sched_entity *group_entity = &cfqg->group_entity;
>> + struct cfq_group *__cfqg, *p_cfqg;
>>
>> BUG_ON(cfqg->nr_cfqq < 1);
>> cfqg->nr_cfqq--;
>>
>> + /*
>> + * Root group doesn't belongs to any service
>> + */
>> + if (cfqg == &cfqd->root_group)
>> + return;
>> +
>> /* If there are other cfq queues under this group, don't delete it */
>> if (cfqg->nr_cfqq)
>> return;
>> -
>> - cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
>> - io_entity_service_tree_del(st, group_entity);
>> - cfqg->saved_workload_slice = 0;
>> - cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
>> + /* If child group exists, don't dequeue it */
>> + if (cfqg->nr_subgp)
>> + return;
>> +
>> + /*
>> + * Dequeue this group and its ancestors from their service tree.
>> + */
>> + while (group_entity && group_entity->parent) {
>> + __cfqg = cfqg_of_entity(group_entity);
>> + p_cfqg = cfqg_of_entity(group_entity->parent);
>> + io_entity_service_tree_del(group_entity->service_tree,
>> + group_entity);
>> + cfq_blkiocg_update_dequeue_stats(&__cfqg->blkg, 1);
>> + cfq_log_cfqg(cfqd, __cfqg, "del_from_rr group");
>> + __cfqg->saved_workload_slice = 0;
>> + group_entity = group_entity->parent;
>> + p_cfqg->nr_subgp--;
>> + if (p_cfqg->nr_cfqq || p_cfqg->nr_subgp)
>> + return;
>> + }
>> }
>>
>> static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
>> @@ -985,7 +1020,6 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
>> static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
>> struct cfq_queue *cfqq)
>> {
>> - struct cfq_rb_root *st = &cfqd->grp_service_tree;
>> unsigned int used_sl, charge;
>> int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
>> - cfqg->service_tree_idle.count;
>> @@ -999,10 +1033,21 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
>> else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
>> charge = cfqq->allocated_slice;
>>
>> - /* Can't update vdisktime while group is on service tree */
>> - __io_entity_service_tree_del(st, group_entity);
>> - group_entity->vdisktime += cfq_scale_slice(charge, group_entity);
>> - __io_entity_service_tree_add(st, group_entity);
>> + /*
>> + * Update the vdisktime on the whole chain.
>> + */
>> + while (group_entity && group_entity->parent) {
>> + struct cfq_rb_root *st = group_entity->service_tree;
>> +
>> + /* Can't update vdisktime while group is on service tree */
>> + __io_entity_service_tree_del(st, group_entity);
>> + group_entity->vdisktime += cfq_scale_slice(charge,
>> + group_entity);
>> + __io_entity_service_tree_add(st, group_entity);
>> + st->count++;
>> + group_entity->reposition_time = jiffies;
>> + group_entity = group_entity->parent;
>> + }
>>
>> /* This group is being expired. Save the context */
>> if (time_after(cfqd->workload_expires, jiffies)) {
>> @@ -1014,7 +1059,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
>> cfqg->saved_workload_slice = 0;
>>
>> cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu",
>> - group_entity->vdisktime, st->min_vdisktime);
>> + cfqg->group_entity.vdisktime,
>> + cfqg->group_entity.service_tree->min_vdisktime);
>> cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
>> " sect=%u", used_sl, cfqq->slice_dispatch, charge,
>> iops_mode(cfqd), cfqq->nr_sectors);
>> @@ -1036,35 +1082,27 @@ void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
>> cfqg_of_blkg(blkg)->group_entity.weight = weight;
>> }
>>
>> -static struct cfq_group *
>> -cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>> +static void init_group_entity(struct blkio_cgroup *blkcg,
>> + struct cfq_group *cfqg)
>> +{
>> + struct io_sched_entity *group_entity = &cfqg->group_entity;
>> +
>> + group_entity->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
>> + RB_CLEAR_NODE(&group_entity->rb_node);
>> + group_entity->is_group_entity = true;
>> + group_entity->parent = NULL;
>> +}
>> +
>> +static void init_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg,
>> + struct cfq_group *cfqg)
>> {
>> - struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
>> - struct cfq_group *cfqg = NULL;
>> - void *key = cfqd;
>> int i, j;
>> struct cfq_rb_root *st;
>> - struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>> unsigned int major, minor;
>> -
>> - cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>> - if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
>> - sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>> - cfqg->blkg.dev = MKDEV(major, minor);
>> - goto done;
>> - }
>> - if (cfqg || !create)
>> - goto done;
>> -
>> - cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
>> - if (!cfqg)
>> - goto done;
>> + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>>
>> for_each_cfqg_st(cfqg, i, j, st)
>> *st = CFQ_RB_ROOT;
>> - RB_CLEAR_NODE(&cfqg->group_entity.rb_node);
>> -
>> - cfqg->group_entity.is_group_entity = true;
>>
>> /*
>> * Take the initial reference that will be released on destroy
>> @@ -1074,24 +1112,119 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>> */
>> atomic_set(&cfqg->ref, 1);
>>
>> + /* Add group onto cgroup list */
>> + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>> + cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
>> + MKDEV(major, minor));
>> + /* Initiate group entity */
>> + init_group_entity(blkcg, cfqg);
>> + /* Add group on cfqd list */
>> + hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
>> +}
>> +
>> +static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg);
>> +
>> +static void uninit_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> +{
>> + if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg))
>> + cfq_destroy_cfqg(cfqd, cfqg);
>> +}
>> +
>> +static void cfqg_set_parent(struct cfq_data *cfqd, struct cfq_group *cfqg,
>> + struct cfq_group *p_cfqg)
>> +{
>> + struct io_sched_entity *group_entity, *p_group_entity;
>> +
>> + group_entity = &cfqg->group_entity;
>> +
>> + p_group_entity = &p_cfqg->group_entity;
>> +
>> + group_entity->parent = p_group_entity;
>> +
>> /*
>> - * Add group onto cgroup list. It might happen that bdi->dev is
>> - * not initiliazed yet. Initialize this new group without major
>> - * and minor info and this info will be filled in once a new thread
>> - * comes for IO. See code above.
>> + * Currently, just put cfq group entity on "BE:SYNC" workload
>> + * service tree.
>> */
>> - if (bdi->dev) {
>> - sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>> - cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
>> - MKDEV(major, minor));
>> - } else
>> - cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
>> - 0);
>> + group_entity->service_tree = service_tree_for(p_cfqg, BE_WORKLOAD,
>> + SYNC_WORKLOAD);
>> + /* child reference */
>> + atomic_inc(&p_cfqg->ref);
>> +}
>>
>> - cfqg->group_entity.weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
>> +int cfqg_chain_alloc(struct cfq_data *cfqd, struct cgroup *cgroup)
>> +{
>> + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
>> + struct blkio_cgroup *p_blkcg;
>> + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>> + unsigned int major, minor;
>> + struct cfq_group *cfqg, *p_cfqg;
>> + void *key = cfqd;
>> + int ret;
>>
>> - /* Add group on cfqd list */
>> - hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
>> + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>> + if (cfqg) {
>> + if (!cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
>> + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>> + cfqg->blkg.dev = MKDEV(major, minor);
>> + }
>> + /* chain has already been built */
>> + return 0;
>> + }
>> +
>> + cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
>> + if (!cfqg)
>> + return -1;
>> +
>> + init_cfqg(cfqd, blkcg, cfqg);
>> +
>> + /* Already to the top group */
>> + if (!cgroup->parent)
>> + return 0;
>> +
>> + /* Allocate CFQ groups on the chain */
>> + ret = cfqg_chain_alloc(cfqd, cgroup->parent);
>> + if (ret == -1) {
>> + uninit_cfqg(cfqd, cfqg);
>> + return -1;
>> + }
>> +
>> + p_blkcg = cgroup_to_blkio_cgroup(cgroup->parent);
>> + p_cfqg = cfqg_of_blkg(blkiocg_lookup_group(p_blkcg, key));
>> + BUG_ON(p_cfqg == NULL);
>> +
>> + cfqg_set_parent(cfqd, cfqg, p_cfqg);
>> + return 0;
>> +}
>> +
>> +static struct cfq_group *
>> +cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>> +{
>> + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
>> + struct cfq_group *cfqg = NULL;
>> + void *key = cfqd;
>> + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>> + unsigned int major, minor;
>> + int ret;
>> +
>> + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>> + if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
>> + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>> + cfqg->blkg.dev = MKDEV(major, minor);
>> + goto done;
>> + }
>> + if (cfqg || !create)
>> + goto done;
>> +
>> + /*
>> + * For hierarchical cfq group scheduling, we need to allocate
>> + * the whole cfq group chain.
>> + */
>> + ret = cfqg_chain_alloc(cfqd, cgroup);
>> + if (!ret) {
>> + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>> + BUG_ON(cfqg == NULL);
>> + goto done;
>> + }
>>
>> done:
>> return cfqg;
>> @@ -1136,12 +1269,22 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
>> {
>> struct cfq_rb_root *st;
>> int i, j;
>> + struct io_sched_entity *group_entity;
>> + struct cfq_group *p_cfqg;
>>
>> BUG_ON(atomic_read(&cfqg->ref) <= 0);
>> if (!atomic_dec_and_test(&cfqg->ref))
>> return;
>> for_each_cfqg_st(cfqg, i, j, st)
>> BUG_ON(!RB_EMPTY_ROOT(&st->rb));
>> +
>> + group_entity = &cfqg->group_entity;
>> + if (group_entity->parent) {
>> + p_cfqg = cfqg_of_entity(group_entity->parent);
>> + /* Drop the reference taken by children */
>> + atomic_dec(&p_cfqg->ref);
>> + }
>> +
>> kfree(cfqg);
>> }
>>
>> @@ -1336,7 +1479,6 @@ insert:
>> io_entity_service_tree_add(service_tree, queue_entity);
>>
>> update_min_vdisktime(service_tree);
>> - cfqq->reposition_time = jiffies;
>> if ((add_front || !new_cfqq) && !group_changed)
>> return;
>> cfq_group_service_tree_add(cfqd, cfqq->cfqg);
>> @@ -1779,28 +1921,30 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
>> return cfqq_of_entity(cfq_rb_first(service_tree));
>> }
>>
>> -static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
>> +static struct io_sched_entity *
>> +cfq_get_next_entity_forced(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> {
>> - struct cfq_group *cfqg;
>> - struct io_sched_entity *queue_entity;
>> + struct io_sched_entity *entity;
>> int i, j;
>> struct cfq_rb_root *st;
>>
>> if (!cfqd->rq_queued)
>> return NULL;
>>
>> - cfqg = cfq_get_next_cfqg(cfqd);
>> - if (!cfqg)
>> - return NULL;
>> -
>> for_each_cfqg_st(cfqg, i, j, st) {
>> - queue_entity = cfq_rb_first(st);
>> - if (queue_entity != NULL)
>> - return cfqq_of_entity(queue_entity);
>> + entity = cfq_rb_first(st);
>> +
>> + if (entity && !entity->is_group_entity)
>> + return entity;
>> + else if (entity && entity->is_group_entity) {
>> + cfqg = cfqg_of_entity(entity);
>> + return cfq_get_next_entity_forced(cfqd, cfqg);
>> + }
>> }
>> return NULL;
>> }
>>
>> +
>> /*
>> * Get and set a new active queue for service.
>> */
>> @@ -2155,8 +2299,7 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
>> static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
>> struct cfq_group *cfqg, enum wl_prio_t prio)
>> {
>> - struct io_sched_entity *queue_entity;
>> - struct cfq_queue *cfqq;
>> + struct io_sched_entity *entity;
>> unsigned long lowest_start_time;
>> int i;
>> bool time_valid = false;
>> @@ -2167,12 +2310,11 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
>> * type. But for the time being just make use of reposition_time only.
>> */
>> for (i = 0; i <= SYNC_WORKLOAD; ++i) {
>> - queue_entity = cfq_rb_first(service_tree_for(cfqg, prio, i));
>> - cfqq = cfqq_of_entity(queue_entity);
>> - if (queue_entity &&
>> + entity = cfq_rb_first(service_tree_for(cfqg, prio, i));
>> + if (entity &&
>> (!time_valid ||
>> - cfqq->reposition_time < lowest_start_time)) {
>> - lowest_start_time = cfqq->reposition_time;
>> + entity->reposition_time < lowest_start_time)) {
>> + lowest_start_time = entity->reposition_time;
>> cur_best = i;
>> time_valid = true;
>> }
>> @@ -2181,47 +2323,13 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
>> return cur_best;
>> }
>>
>> -static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> +static void set_workload_expire(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> {
>> unsigned slice;
>> unsigned count;
>> struct cfq_rb_root *st;
>> unsigned group_slice;
>>
>> - if (!cfqg) {
>> - cfqd->serving_prio = IDLE_WORKLOAD;
>> - cfqd->workload_expires = jiffies + 1;
>> - return;
>> - }
>> -
>> - /* Choose next priority. RT > BE > IDLE */
>> - if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
>> - cfqd->serving_prio = RT_WORKLOAD;
>> - else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
>> - cfqd->serving_prio = BE_WORKLOAD;
>> - else {
>> - cfqd->serving_prio = IDLE_WORKLOAD;
>> - cfqd->workload_expires = jiffies + 1;
>> - return;
>> - }
>> -
>> - /*
>> - * For RT and BE, we have to choose also the type
>> - * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
>> - * expiration time
>> - */
>> - st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
>> - count = st->count;
>> -
>> - /*
>> - * check workload expiration, and that we still have other queues ready
>> - */
>> - if (count && !time_after(jiffies, cfqd->workload_expires))
>> - return;
>> -
>> - /* otherwise select new workload type */
>> - cfqd->serving_type =
>> - cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
>> st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
>> count = st->count;
>>
>> @@ -2262,26 +2370,51 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> cfqd->workload_expires = jiffies + slice;
>> }
>>
>> -static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
>> +static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> {
>> - struct cfq_rb_root *st = &cfqd->grp_service_tree;
>> - struct cfq_group *cfqg;
>> - struct io_sched_entity *group_entity;
>> + struct cfq_rb_root *st;
>> + unsigned count;
>>
>> - if (RB_EMPTY_ROOT(&st->rb))
>> - return NULL;
>> - group_entity = cfq_rb_first_entity(st);
>> - cfqg = cfqg_of_entity(group_entity);
>> - BUG_ON(!cfqg);
>> - update_min_vdisktime(st);
>> - return cfqg;
>> + if (!cfqg) {
>> + cfqd->serving_prio = IDLE_WORKLOAD;
>> + cfqd->workload_expires = jiffies + 1;
>> + return;
>> + }
>> +
>> + /* Choose next priority. RT > BE > IDLE */
>> + if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
>> + cfqd->serving_prio = RT_WORKLOAD;
>> + else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
>> + cfqd->serving_prio = BE_WORKLOAD;
>> + else {
>> + cfqd->serving_prio = IDLE_WORKLOAD;
>> + cfqd->workload_expires = jiffies + 1;
>> + return;
>> + }
>> +
>> + /*
>> + * For RT and BE, we have to choose also the type
>> + * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
>> + * expiration time
>> + */
>> + st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
>> + count = st->count;
>> +
>> + /*
>> + * check workload expiration, and that we still have other queues ready
>> + */
>> + if (count && !time_after(jiffies, cfqd->workload_expires))
>> + return;
>> +
>> + /* otherwise select new workload type */
>> + cfqd->serving_type =
>> + cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
>> }
>>
>> -static void cfq_choose_cfqg(struct cfq_data *cfqd)
>> +struct io_sched_entity *choose_serving_entity(struct cfq_data *cfqd,
>> + struct cfq_group *cfqg)
>> {
>> - struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd);
>> -
>> - cfqd->serving_group = cfqg;
>> + struct cfq_rb_root *service_tree;
>>
>> /* Restore the workload type data */
>> if (cfqg->saved_workload_slice) {
>> @@ -2292,8 +2425,21 @@ static void cfq_choose_cfqg(struct cfq_data *cfqd)
>> cfqd->workload_expires = jiffies - 1;
>>
>> choose_service_tree(cfqd, cfqg);
>> -}
>>
>> + service_tree = service_tree_for(cfqg, cfqd->serving_prio,
>> + cfqd->serving_type);
>> +
>> + if (!cfqd->rq_queued)
>> + return NULL;
>> +
>> + /* There is nothing to dispatch */
>> + if (!service_tree)
>> + return NULL;
>> + if (RB_EMPTY_ROOT(&service_tree->rb))
>> + return NULL;
>> +
>> + return cfq_rb_first(service_tree);
>> +}
>> /*
>> * Select a queue for service. If we have a current active queue,
>> * check whether to continue servicing it, or retrieve and set a new one.
>> @@ -2301,6 +2447,8 @@ static void cfq_choose_cfqg(struct cfq_data *cfqd)
>> static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
>> {
>> struct cfq_queue *cfqq, *new_cfqq = NULL;
>> + struct cfq_group *cfqg;
>> + struct io_sched_entity *entity;
>>
>> cfqq = cfqd->active_queue;
>> if (!cfqq)
>> @@ -2389,8 +2537,23 @@ new_queue:
>> * Current queue expired. Check if we have to switch to a new
>> * service tree
>> */
>> - if (!new_cfqq)
>> - cfq_choose_cfqg(cfqd);
>> + cfqg = &cfqd->root_group;
>> +
>> + if (!new_cfqq) {
>> + do {
>> + entity = choose_serving_entity(cfqd, cfqg);
>> + if (entity && !entity->is_group_entity) {
>> + /* This is the CFQ queue that should run */
>> + new_cfqq = cfqq_of_entity(entity);
>> + cfqd->serving_group = cfqg;
>> + set_workload_expire(cfqd, cfqg);
>> + break;
>> + } else if (entity && entity->is_group_entity) {
>> + /* Continue to lookup in this CFQ group */
>> + cfqg = cfqg_of_entity(entity);
>> + }
>> + } while (entity && entity->is_group_entity);
>> + }
>>
>> cfqq = cfq_set_active_queue(cfqd, new_cfqq);
>> keep_queue:
>> @@ -2421,10 +2584,14 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd)
>> {
>> struct cfq_queue *cfqq;
>> int dispatched = 0;
>> + struct io_sched_entity *entity;
>> + struct cfq_group *root = &cfqd->root_group;
>>
>> /* Expire the timeslice of the current active queue first */
>> cfq_slice_expired(cfqd, 0);
>> - while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) {
>> + while ((entity = cfq_get_next_entity_forced(cfqd, root)) != NULL) {
>> + BUG_ON(entity->is_group_entity);
>> + cfqq = cfqq_of_entity(entity);
>> __cfq_set_active_queue(cfqd, cfqq);
>> dispatched += __cfq_forced_dispatch_cfqq(cfqq);
>> }
>> @@ -3954,9 +4121,6 @@ static void *cfq_init_queue(struct request_queue *q)
>>
>> cfqd->cic_index = i;
>>
>> - /* Init root service tree */
>> - cfqd->grp_service_tree = CFQ_RB_ROOT;
>> -
>> /* Init root group */
>> cfqg = &cfqd->root_group;
>> for_each_cfqg_st(cfqg, i, j, st)
>> @@ -3966,6 +4130,7 @@ static void *cfq_init_queue(struct request_queue *q)
>> /* Give preference to root group over other groups */
>> cfqg->group_entity.weight = 2*BLKIO_WEIGHT_DEFAULT;
>> cfqg->group_entity.is_group_entity = true;
>> + cfqg->group_entity.parent = NULL;
>>
>> #ifdef CONFIG_CFQ_GROUP_IOSCHED
>> /*
>> --
>> 1.6.5.2
>>
>>
>

--
Regards
Gui Jianfeng
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/