[PATCH v5] blk: fix a wrong accounting of hd_struct->in_flight

From: Yasuaki Ishimatsu
Date: Mon Oct 18 2010 - 08:20:08 EST


Hi Jens,

Jens Axboe wrote:
> On 2010-10-18 10:28, Yasuaki Ishimatsu wrote:
>> Hi Jens,
>>
>>> This looks good! To quiesce the queue, something like the below.
>>> Completely untested.
>> Thank you for your advice.
>> I applied your idea to the patch.
>
> But you changed it, though:
>
>> if (old_ptbl) {
>> rcu_assign_pointer(old_ptbl->last_lookup, NULL);
>> + spin_lock_irq(q->queue_lock);
>> + elv_quiesce_start(q);
>> call_rcu(&old_ptbl->rcu_head, disk_free_ptbl_rcu_cb);
>> + elv_quiesce_end(q);
>> + spin_unlock_irq(q->queue_lock);
>> }
>> }
>
> That is not going to work. The point is to start the drain period
> before, then end it when the callback has gone through. By placing it
> just after the call_rcu() call, there's no guarantee that the RCU grace
> period has elapsed. That is why I placed it inside the rcu callback. Why
> did you move it?

Ah...
I misunderstood the purpose of the call_rcu().
I moved elv_quiesce_end() to the rcu callback.

Regards,
Yasuaki Ishimatsu.
===

From: Yasuaki Ishimatsu <isimatu.yasuaki@xxxxxxxxxxxxxx>

/proc/diskstats can display strange output such as the following.

$ cat /proc/diskstats |grep sda
8 0 sda 90524 7579 102154 20464 0 0 0 0 0 14096 20089
8 1 sda1 19085 1352 21841 4209 0 0 0 0 4294967064 15689 4293424691
                                       ~~~~~~~~~~
8 2 sda2 71252 3624 74891 15950 0 0 0 0 232 23995 1562390
8 3 sda3 54 487 2188 92 0 0 0 0 0 88 92
8 4 sda4 4 0 8 0 0 0 0 0 0 0 0
8 5 sda5 81 2027 2130 138 0 0 0 0 0 87 137

The cause is incorrect accounting of hd_struct->in_flight when a bio is
merged, by ELEVATOR_FRONT_MERGE, into a request that belongs to a different
partition.

The detailed root cause is as follows.

Assume that there are two partitions, sda1 and sda2.

1. A request for sda2 is in the request_queue. Hence sda1's
hd_struct->in_flight is 0 and sda2's is 1.

| hd_struct->in_flight
---------------------------
sda1 | 0
sda2 | 1
---------------------------

2. A bio belonging to sda1 is issued and is merged into the request mentioned
in step 1 by ELEVATOR_FRONT_MERGE. The first sector of the request changes
from the sda2 region to the sda1 region. However, the two partitions'
hd_struct->in_flight counters are not changed.

| hd_struct->in_flight
---------------------------
sda1 | 0
sda2 | 1
---------------------------

3. The request is finished and blk_account_io_done() is called. In this case,
sda1's hd_struct->in_flight, not sda2's, is decremented, because the partition
is looked up again from the request's (now changed) first sector.

| hd_struct->in_flight
---------------------------
sda1 | -1
sda2 | 1
---------------------------

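The patch fixes the problem by recording the partition in rq->part when the
request is first accounted, and by using that pointer for all later accounting
instead of looking the partition up again from the request's first sector.
Because a request now holds a pointer to its partition, the queue is quiesced
while a partition or a partition table is being freed.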

Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@xxxxxxxxxxxxxx>
---
block/blk-core.c | 24 ++++++++++++++++--------
block/blk-merge.c | 2 +-
block/blk.h | 4 ----
block/genhd.c | 14 ++++++++++++++
fs/partitions/check.c | 12 ++++++++++++
include/linux/blkdev.h | 1 +
include/linux/elevator.h | 2 ++
include/linux/genhd.h | 1 +
8 files changed, 47 insertions(+), 13 deletions(-)

Index: linux-2.6.36-rc7/block/blk-core.c
===================================================================
--- linux-2.6.36-rc7.orig/block/blk-core.c 2010-10-15 09:21:37.000000000 +0900
+++ linux-2.6.36-rc7/block/blk-core.c 2010-10-18 14:45:19.000000000 +0900
@@ -64,13 +64,15 @@ static void drive_stat_acct(struct reque
return;

cpu = part_stat_lock();
- part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));

- if (!new_io)
+ if (!new_io) {
+ part = rq->part;
part_stat_inc(cpu, part, merges[rw]);
- else {
+ } else {
+ part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
part_round_stats(cpu, part);
part_inc_in_flight(part, rw);
+ rq->part = part;
}

part_stat_unlock();
@@ -128,6 +130,7 @@ void blk_rq_init(struct request_queue *q
rq->ref_count = 1;
rq->start_time = jiffies;
set_start_time_ns(rq);
+ rq->part = NULL;
}
EXPORT_SYMBOL(blk_rq_init);

@@ -796,11 +799,16 @@ static struct request *get_request(struc
rl->starved[is_sync] = 0;

priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
- if (priv)
+ if (priv) {
rl->elvpriv++;

- if (blk_queue_io_stat(q))
- rw_flags |= REQ_IO_STAT;
+ /*
+ * Don't do stats for non-priv requests
+ */
+ if (blk_queue_io_stat(q))
+ rw_flags |= REQ_IO_STAT;
+ }
+
spin_unlock_irq(q->queue_lock);

rq = blk_alloc_request(q, rw_flags, priv, gfp_mask);
@@ -1759,7 +1767,7 @@ static void blk_account_io_completion(st
int cpu;

cpu = part_stat_lock();
- part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
+ part = req->part;
part_stat_add(cpu, part, sectors[rw], bytes >> 9);
part_stat_unlock();
}
@@ -1779,7 +1787,7 @@ static void blk_account_io_done(struct r
int cpu;

cpu = part_stat_lock();
- part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
+ part = req->part;

part_stat_inc(cpu, part, ios[rw]);
part_stat_add(cpu, part, ticks[rw], duration);
Index: linux-2.6.36-rc7/block/blk-merge.c
===================================================================
--- linux-2.6.36-rc7.orig/block/blk-merge.c 2010-10-07 05:39:52.000000000 +0900
+++ linux-2.6.36-rc7/block/blk-merge.c 2010-10-18 14:41:03.000000000 +0900
@@ -343,7 +343,7 @@ static void blk_account_io_merge(struct
int cpu;

cpu = part_stat_lock();
- part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
+ part = req->part;

part_round_stats(cpu, part);
part_dec_in_flight(part, rq_data_dir(req));
Index: linux-2.6.36-rc7/include/linux/blkdev.h
===================================================================
--- linux-2.6.36-rc7.orig/include/linux/blkdev.h 2010-10-15 09:21:37.000000000 +0900
+++ linux-2.6.36-rc7/include/linux/blkdev.h 2010-10-15 09:26:22.000000000 +0900
@@ -115,6 +115,7 @@ struct request {
void *elevator_private3;

struct gendisk *rq_disk;
+ struct hd_struct *part;
unsigned long start_time;
#ifdef CONFIG_BLK_CGROUP
unsigned long long start_time_ns;
Index: linux-2.6.36-rc7/block/genhd.c
===================================================================
--- linux-2.6.36-rc7.orig/block/genhd.c 2010-10-07 05:39:52.000000000 +0900
+++ linux-2.6.36-rc7/block/genhd.c 2010-10-18 20:50:18.000000000 +0900
@@ -925,8 +925,15 @@ static void disk_free_ptbl_rcu_cb(struct
{
struct disk_part_tbl *ptbl =
container_of(head, struct disk_part_tbl, rcu_head);
+ struct gendisk *disk = ptbl->disk;
+ struct request_queue *q = disk->queue;
+ unsigned long flags;

kfree(ptbl);
+
+ spin_lock_irqsave(q->queue_lock, flags);
+ elv_quiesce_end(q);
+ spin_unlock_irqrestore(q->queue_lock, flags);
}

/**
@@ -944,11 +951,17 @@ static void disk_replace_part_tbl(struct
struct disk_part_tbl *new_ptbl)
{
struct disk_part_tbl *old_ptbl = disk->part_tbl;
+ struct request_queue *q = disk->queue;

rcu_assign_pointer(disk->part_tbl, new_ptbl);

if (old_ptbl) {
rcu_assign_pointer(old_ptbl->last_lookup, NULL);
+
+ spin_lock_irq(q->queue_lock);
+ elv_quiesce_start(q);
+ spin_unlock_irq(q->queue_lock);
+
call_rcu(&old_ptbl->rcu_head, disk_free_ptbl_rcu_cb);
}
}
@@ -989,6 +1002,7 @@ int disk_expand_part_tbl(struct gendisk
return -ENOMEM;

new_ptbl->len = target;
+ new_ptbl->disk = disk;

for (i = 0; i < len; i++)
rcu_assign_pointer(new_ptbl->part[i], old_ptbl->part[i]);
Index: linux-2.6.36-rc7/fs/partitions/check.c
===================================================================
--- linux-2.6.36-rc7.orig/fs/partitions/check.c 2010-10-07 05:39:52.000000000 +0900
+++ linux-2.6.36-rc7/fs/partitions/check.c 2010-10-18 20:20:43.000000000 +0900
@@ -364,17 +364,25 @@ struct device_type part_type = {
static void delete_partition_rcu_cb(struct rcu_head *head)
{
struct hd_struct *part = container_of(head, struct hd_struct, rcu_head);
+ struct gendisk *disk = part_to_disk(part);
+ struct request_queue *q = disk->queue;
+ unsigned long flags;

part->start_sect = 0;
part->nr_sects = 0;
part_stat_set_all(part, 0);
put_device(part_to_dev(part));
+
+ spin_lock_irqsave(q->queue_lock, flags);
+ elv_quiesce_end(q);
+ spin_unlock_irqrestore(q->queue_lock, flags);
}

void delete_partition(struct gendisk *disk, int partno)
{
struct disk_part_tbl *ptbl = disk->part_tbl;
struct hd_struct *part;
+ struct request_queue *q = disk->queue;

if (partno >= ptbl->len)
return;
@@ -389,6 +397,10 @@ void delete_partition(struct gendisk *di
kobject_put(part->holder_dir);
device_del(part_to_dev(part));

+ spin_lock_irq(q->queue_lock);
+ elv_quiesce_start(q);
+ spin_unlock_irq(q->queue_lock);
+
call_rcu(&part->rcu_head, delete_partition_rcu_cb);
}

Index: linux-2.6.36-rc7/block/blk.h
===================================================================
--- linux-2.6.36-rc7.orig/block/blk.h 2010-10-07 05:39:52.000000000 +0900
+++ linux-2.6.36-rc7/block/blk.h 2010-10-18 16:22:47.000000000 +0900
@@ -110,10 +110,6 @@ void blk_queue_congestion_threshold(stru

int blk_dev_init(void);

-void elv_quiesce_start(struct request_queue *q);
-void elv_quiesce_end(struct request_queue *q);
-
-
/*
* Return the threshold (number of used requests) at which the queue is
* considered to be congested. It include a little hysteresis to keep the
Index: linux-2.6.36-rc7/include/linux/elevator.h
===================================================================
--- linux-2.6.36-rc7.orig/include/linux/elevator.h 2010-10-07 05:39:52.000000000 +0900
+++ linux-2.6.36-rc7/include/linux/elevator.h 2010-10-18 17:09:58.000000000 +0900
@@ -121,6 +121,8 @@ extern void elv_completed_request(struct
extern int elv_set_request(struct request_queue *, struct request *, gfp_t);
extern void elv_put_request(struct request_queue *, struct request *);
extern void elv_drain_elevator(struct request_queue *);
+extern void elv_quiesce_start(struct request_queue *);
+extern void elv_quiesce_end(struct request_queue *);

/*
* io scheduler registration
Index: linux-2.6.36-rc7/include/linux/genhd.h
===================================================================
--- linux-2.6.36-rc7.orig/include/linux/genhd.h 2010-10-07 05:39:52.000000000 +0900
+++ linux-2.6.36-rc7/include/linux/genhd.h 2010-10-18 19:57:36.000000000 +0900
@@ -130,6 +130,7 @@ struct disk_part_tbl {
struct rcu_head rcu_head;
int len;
struct hd_struct *last_lookup;
+ struct gendisk *disk;
struct hd_struct *part[];
};


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/