md versus partition scanning (bd_invalidated)

From: Dan Williams
Date: Wed Jul 21 2010 - 15:44:25 EST


Hi,

We (Intel software raid devs) are seeing a problem with the detection of
partitions on md devices. The trace below shows that the block device
inode is dropped in-between when the array comes up and when it is first
opened (timestamp 655.498875). When this happens bdev->bd_invalidated
is cleared by bdget() and we never detect partitions. (Seen on a 2.6.32
based kernel, but I presume it is still present)

# tracer: nop
#
# TASK-PID CPU# TIMESTAMP FUNCTION
# | | | | |
<...>-1114 [000] 655.488730: check_disk_size_change: ffff8800374d3780: (md1) check_disk_size_change:1103 open: 1 invalidated: 0
<...>-1114 [000] 655.497906: check_disk_size_change: ffff8800374d3780: (md1) check_disk_size_change:1112 open: 1 invalidated: 0
<...>-1114 [000] 655.497908: flush_disk: ffff8800374d3780: (md1) flush_disk:1084 open: 1 invalidated: 1
<...>-1114 [000] 655.497909: flush_disk: ffff8800374d3780: (md1) flush_disk:1086 open: 1 invalidated: 1
<...>-1117 [003] 655.498875: bdget: ffff8800375aec40: () bdget:595 open: 0 invalidated: 0
<...>-1117 [003] 655.498878: __blkdev_get: ffff8800375aec40: (md1) __blkdev_get:1229 open: 0 invalidated: 0
<...>-1117 [003] 655.498879: __blkdev_get: ffff8800375aec40: (md1) __blkdev_get:1233 open: 0 invalidated: 0
<...>-1117 [003] 655.498880: __blkdev_get: ffff8800375aec40: (md1) __blkdev_get:1239 open: 0 invalidated: 0
<...>-1117 [003] 655.498882: __blkdev_get: ffff8800375aec40: (md1) __blkdev_get:1307 open: 1 invalidated: 0

These traces generated by:

#define dbg(bdev) ({ if (debug_partitions) {\
char name[BDEVNAME_SIZE] = "";\
if (bdev->bd_disk)\
disk_name(bdev->bd_disk, 0, name);\
trace_printk("%p: (%s) %s:%d open: %d invalidated: %d\n", bdev, name, __func__, __LINE__, bdev->bd_openers, bdev->bd_invalidated); \
} 0; })

The patch below (2.6.32 based) moves the block_device bd_invalidated
field to a gendisk flag, as it seems this info wants a longer lifetime.

Thoughts on this fix? Maybe it wants to be a standalone integer flag so
we don't need to add locking to nbd.

Thanks,
Dan

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index cc923a5..de8a4a4 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -607,8 +607,10 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo,
if (S_ISSOCK(inode->i_mode)) {
lo->file = file;
lo->sock = SOCKET_I(inode);
+ mutex_lock(&bdev->bd_mutex);
if (max_part > 0)
- bdev->bd_invalidated = 1;
+ bdev->bd_disk->flags |= GENHD_FL_INVALIDATED;
+ mutex_unlock(&bdev->bd_mutex);
return 0;
} else {
fput(file);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 6dcee88..147f449 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -571,7 +571,6 @@ struct block_device *bdget(dev_t dev)
bdev->bd_inode = inode;
bdev->bd_block_size = (1 << inode->i_blkbits);
bdev->bd_part_count = 0;
- bdev->bd_invalidated = 0;
inode->i_mode = S_IFBLK;
inode->i_rdev = dev;
inode->i_bdev = bdev;
@@ -1069,7 +1068,7 @@ static void flush_disk(struct block_device *bdev)
if (!bdev->bd_disk)
return;
if (disk_partitionable(bdev->bd_disk))
- bdev->bd_invalidated = 1;
+ bdev->bd_disk->flags |= GENHD_FL_INVALIDATED;
}

/**
@@ -1243,7 +1242,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
bdi = &default_backing_dev_info;
bdev->bd_inode->i_data.backing_dev_info = bdi;
}
- if (bdev->bd_invalidated)
+ if (bdev->bd_disk->flags & GENHD_FL_INVALIDATED)
rescan_partitions(disk, bdev);
} else {
struct block_device *whole;
@@ -1276,7 +1275,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
if (ret)
goto out_unlock_bdev;
}
- if (bdev->bd_invalidated)
+ if (bdev->bd_disk->flags & GENHD_FL_INVALIDATED)
rescan_partitions(bdev->bd_disk, bdev);
}
}
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index e8865c1..7872269 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -519,7 +519,7 @@ void register_disk(struct gendisk *disk)
if (!bdev)
goto exit;

- bdev->bd_invalidated = 1;
+ disk->flags |= GENHD_FL_INVALIDATED;
err = blkdev_get(bdev, FMODE_READ);
if (err < 0)
goto exit;
@@ -558,7 +558,7 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
if (disk->fops->revalidate_disk)
disk->fops->revalidate_disk(disk);
check_disk_size_change(disk, bdev);
- bdev->bd_invalidated = 0;
+ bdev->bd_disk->flags &= ~GENHD_FL_INVALIDATED;
if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))
return 0;
if (IS_ERR(state)) /* I/O error reading the partition table */
@@ -609,7 +609,7 @@ try_scan:
if (capacity > get_capacity(disk)) {
set_capacity(disk, capacity);
check_disk_size_change(disk, bdev);
- bdev->bd_invalidated = 0;
+ bdev->bd_disk->flags &= ~GENHD_FL_INVALIDATED;
}
goto try_scan;
} else {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 1e58fc8..367875a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -661,7 +661,6 @@ struct block_device {
struct hd_struct * bd_part;
/* number of times partitions within this device have been opened. */
unsigned bd_part_count;
- int bd_invalidated;
struct gendisk * bd_disk;
struct list_head bd_list;
/*
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index c6c0c41..d97cdec 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -116,6 +116,7 @@ struct hd_struct {
#define GENHD_FL_SUPPRESS_PARTITION_INFO 32
#define GENHD_FL_EXT_DEVT 64 /* allow extended devt */
#define GENHD_FL_NATIVE_CAPACITY 128
+#define GENHD_FL_INVALIDATED 256

#define BLK_SCSI_MAX_CMDS (256)
#define BLK_SCSI_CMD_PER_LONG (BLK_SCSI_MAX_CMDS / (sizeof(long) * 8))


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/