Fwd: ext2/3 create large filesystem takes too much time; solutions

From: Pavel Mironchik
Date: Thu Sep 14 2006 - 03:52:47 EST


---------- Forwarded message ----------
From: Pavel Mironchik <tibor0@xxxxxxxxx>
Date: Sep 12, 2006 2:07 PM
Subject: ext2/3 create large filesystem takes too much time; solutions
To: linux-ext4@xxxxxxxxxxxxxxx


Hi,

Ext2/3 erases (zeroes) the inode tables when a new filesystem is created.
This is a very, very long operation when the target filesystem volume is larger than
2 TB. Other filesystems are not affected by such a huge delay at filesystem
creation time. My goal is to improve the design of ext3 so that creating
large ext3 volumes on storage servers takes less time.
In general, to solve the problem we should defer the job of cleaning inodes to
the kernel. In e2fsprogs there is a LAZY_BG option, but all it does is skip
the erasing of the inodes.

I see several solutions to this problem:
1) Add special bitmaps to the fs header (the inode group descriptors?).
By looking at those bitmaps the kernel could determine whether an inode has not
been cleaned yet, and then initialize that inode properly.
2) Add a special identifier to each inode. If the superblock id != the inode id,
the inode is dirty
and should be cleaned by the kernel; the superblock id is generated at
filesystem creation time.

I chose the second (much easier - just a few lines of code)
and implemented a patch for e2fsprogs and the kernel's ext3. It is just a proof of concept.
With the help of this patch I could create multi-terabyte volumes quickly.
Of course, this patch breaks compatibility with existing filesystems.
The more correct approach is to choose the first option and not break compatibility.

With this mail I just want to check whether there is any interest in this problem
from the community.
I would like future ext4 filesystems to be fast to create,
and I would appreciate any thoughts and remarks.
---------------------------
Pavel S. Mironchik




--
-----------------------------------
Pavel S. Mironchik
diff -ruN upload/fs/ext3/inode.c l1/linux-2.6.17/fs/ext3/inode.c
--- upload/fs/ext3/inode.c 2006-08-24 00:16:33 +0300
+++ linux-2.6.17/fs/ext3/inode.c 2006-09-06 16:22:17 +0300
@@ -2576,6 +2576,17 @@
inode->i_flags |= S_DIRSYNC;
}

+void ext3_check_uuid_inode(struct ext3_inode *rinode, struct inode *vinode)
+{
+ struct ext3_sb_info* sbi = (struct ext3_sb_info*)(vinode->i_sb)->s_fs_info;
+ int length = sizeof(u8)*16;
+
+ if(memcmp(rinode->i_uuid,sbi->s_uuid, length)) {
+ memset(rinode,0,sizeof(struct ext3_inode));
+ memcpy(rinode->i_uuid,sbi->s_uuid, length);
+ }
+}
+
void ext3_read_inode(struct inode * inode)
{
struct ext3_iloc iloc;
@@ -2594,6 +2605,9 @@
goto bad_inode;
bh = iloc.bh;
raw_inode = ext3_raw_inode(&iloc);
+
+ ext3_check_uuid_inode(raw_inode,inode);
+
inode->i_mode = le16_to_cpu(raw_inode->i_mode);
inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
@@ -2713,6 +2727,8 @@
return;
}

+
+
/*
* Post the struct inode info into an on-disk inode location in the
* buffer-cache. This gobbles the caller's reference to the
@@ -2725,6 +2741,7 @@
struct ext3_iloc *iloc)
{
struct ext3_inode *raw_inode = ext3_raw_inode(iloc);
+ struct ext3_sb_info* sbi = (struct ext3_sb_info*)(inode->i_sb)->s_fs_info;
struct ext3_inode_info *ei = EXT3_I(inode);
struct buffer_head *bh = iloc->bh;
int err = 0, rc, block;
@@ -2773,6 +2790,8 @@
raw_inode->i_fsize = ei->i_frag_size;
#endif
raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl);
+ memcpy(raw_inode->i_uuid, sbi->s_uuid, sizeof(u8)*16);
+
if (!S_ISREG(inode->i_mode)) {
raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
} else {
@@ -2832,7 +2851,7 @@
}

/*
- * ext3_write_inode()
+ * ext3_write_inode
*
* We are called from a few places:
*
diff -ruN upload/fs/ext3/super.c l1/linux-2.6.17/fs/ext3/super.c
--- upload/fs/ext3/super.c 2006-08-24 00:16:33 +0300
+++ linux-2.6.17/fs/ext3/super.c 2006-09-06 16:03:25 +0300
@@ -1464,6 +1425,8 @@
sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
sbi->s_resgid = le16_to_cpu(es->s_def_resgid);

+ memcpy(sbi->s_uuid,es->s_uuid, sizeof(u8)*16);
+
set_opt(sbi->s_mount_opt, RESERVATION);

if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
diff -ruN upload/include/linux/ext3_fs.h l1/linux-2.6.17/include/linux/ext3_fs.h
--- upload/include/linux/ext3_fs.h 2006-08-24 00:16:33 +0300
+++ l1/linux-2.6.17/include/linux/ext3_fs.h 2006-09-06 16:20:53 +0300
@@ -284,6 +284,7 @@
__le32 i_file_acl; /* File ACL */
__le32 i_dir_acl; /* Directory ACL */
__le32 i_faddr; /* Fragment address */
+ __le16 i_uuid[8];
union {
struct {
__u8 l_i_frag; /* Fragment number */
@@ -531,7 +532,7 @@
#define EXT3_CURRENT_REV EXT3_GOOD_OLD_REV
#define EXT3_MAX_SUPP_REV EXT3_DYNAMIC_REV

-#define EXT3_GOOD_OLD_INODE_SIZE 128
+#define EXT3_GOOD_OLD_INODE_SIZE (128+(8*16))

/*
* Feature set definitions
diff -ruN upload/include/linux/ext3_fs_sb.h l1/linux-2.6.17/include/linux/ext3_fs_sb.h
--- upload/include/linux/ext3_fs_sb.h 2006-08-24 00:16:33 +0300
+++ l1/linux-2.6.17/include/linux/ext3_fs_sb.h 2006-09-06 15:59:35 +0300
@@ -78,6 +78,7 @@
char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */
int s_jquota_fmt; /* Format of quota to use */
#endif
+ u8 s_uuid[16];
};

#endif /* _LINUX_EXT3_FS_SB */
diff --git a/lib/ext2fs/ext2_fs.h b/lib/ext2fs/ext2_fs.h
index 1cfbc35..7008826 100644
--- a/lib/ext2fs/ext2_fs.h
+++ b/lib/ext2fs/ext2_fs.h
@@ -296,7 +296,8 @@ struct ext2_inode {
__u32 i_generation; /* File version (for NFS) */
__u32 i_file_acl; /* File ACL */
__u32 i_dir_acl; /* Directory ACL */
- __u32 i_faddr; /* Fragment address */
+ __u32 i_faddr; /* Fragment address */
+ __u8 s_uuid[16];
union {
struct {
__u8 l_i_frag; /* Fragment number */
@@ -354,6 +355,7 @@ struct ext2_inode_large {
__u32 i_file_acl; /* File ACL */
__u32 i_dir_acl; /* Directory ACL */
__u32 i_faddr; /* Fragment address */
+ __u8 s_uuid[16];
union {
struct {
__u8 l_i_frag; /* Fragment number */
@@ -547,7 +549,7 @@ #define EXT2_DYNAMIC_REV 1 /* V2 format
#define EXT2_CURRENT_REV EXT2_GOOD_OLD_REV
#define EXT2_MAX_SUPP_REV EXT2_DYNAMIC_REV

-#define EXT2_GOOD_OLD_INODE_SIZE 128
+#define EXT2_GOOD_OLD_INODE_SIZE (128+(16*8))

/*
* Journal inode backup types
diff --git a/lib/ext2fs/inode.c b/lib/ext2fs/inode.c
index 8d528b4..af3b87f 100644
--- a/lib/ext2fs/inode.c
+++ b/lib/ext2fs/inode.c
@@ -381,6 +381,8 @@ static inline int is_empty_scan(ext2_ino
}
#endif

+errcode_t ext2fs_check_inode_uuid(ext2_filsys fs, struct ext2_inode* inode);
+
errcode_t ext2fs_get_next_inode_full(ext2_inode_scan scan, ext2_ino_t *ino,
struct ext2_inode *inode, int bufsize)
{
@@ -452,7 +454,7 @@ #endif
scan->inode_size - extra_bytes);
scan->ptr += scan->inode_size - extra_bytes;
scan->bytes_left -= scan->inode_size - extra_bytes;
-
+ /* CHECK */
#ifdef EXT2FS_ENABLE_SWAPFS
if ((scan->fs->flags & EXT2_FLAG_SWAP_BYTES) ||
(scan->fs->flags & EXT2_FLAG_SWAP_BYTES_READ))
@@ -463,10 +465,12 @@ #ifdef EXT2FS_ENABLE_SWAPFS
else
#endif
*inode = *((struct ext2_inode *) scan->temp_buffer);
+ ext2fs_check_inode_uuid(scan->fs,inode);
if (scan->scan_flags & EXT2_SF_BAD_EXTRA_BYTES)
retval = EXT2_ET_BAD_BLOCK_IN_INODE_TABLE;
scan->scan_flags &= ~EXT2_SF_BAD_EXTRA_BYTES;
} else {
+ /* CHECK */
#ifdef EXT2FS_ENABLE_SWAPFS
if ((scan->fs->flags & EXT2_FLAG_SWAP_BYTES) ||
(scan->fs->flags & EXT2_FLAG_SWAP_BYTES_READ))
@@ -477,6 +481,7 @@ #ifdef EXT2FS_ENABLE_SWAPFS
else
#endif
memcpy(inode, scan->ptr, bufsize);
+ ext2fs_check_inode_uuid(scan->fs,inode);
scan->ptr += scan->inode_size;
scan->bytes_left -= scan->inode_size;
if (scan->scan_flags & EXT2_SF_BAD_INODE_BLK)
@@ -496,6 +501,17 @@ errcode_t ext2fs_get_next_inode(ext2_ino
sizeof(struct ext2_inode));
}

+errcode_t ext2fs_update_inode_uuid(ext2_filsys fs, struct ext2_inode* inode);
+
+errcode_t ext2fs_check_inode_uuid(ext2_filsys fs, struct ext2_inode* inode)
+{
+ if(memcmp(inode->s_uuid,fs->super->s_uuid,sizeof(__u8)*16)) {
+ memset(inode, 0, sizeof(struct ext2_inode));
+ ext2fs_update_inode_uuid(fs, inode);
+ }
+ return 0;
+}
+
/*
* Functions to read and write a single inode.
*/
@@ -589,6 +605,9 @@ #ifdef EXT2FS_ENABLE_SWAPFS
0, length);
#endif

+
+ ext2fs_check_inode_uuid(fs,inode);
+
/* Update the inode cache */
fs->icache->cache_last = (fs->icache->cache_last + 1) %
fs->icache->cache_size;
@@ -605,6 +624,18 @@ errcode_t ext2fs_read_inode(ext2_filsys
sizeof(struct ext2_inode));
}

+
+/*
+ NODE CONNECTIVITY!!!
+ */
+errcode_t ext2fs_update_inode_uuid(ext2_filsys fs, struct ext2_inode* inode)
+{
+
+ memcpy(inode->s_uuid,fs->super->s_uuid, sizeof(__u8) * 16);
+ return 0;
+}
+
+
errcode_t ext2fs_write_inode_full(ext2_filsys fs, ext2_ino_t ino,
struct ext2_inode * inode, int bufsize)
{
@@ -615,6 +646,8 @@ errcode_t ext2fs_write_inode_full(ext2_f
int clen, i, length;

EXT2_CHECK_MAGIC(fs, EXT2_ET_MAGIC_EXT2FS_FILSYS);
+
+ ext2fs_update_inode_uuid(fs,inode);

/* Check to see if user provided an override function */
if (fs->write_inode) {
diff --git a/misc/mke2fs.c b/misc/mke2fs.c
index d008e15..e786938 100644
--- a/misc/mke2fs.c
+++ b/misc/mke2fs.c
@@ -398,7 +398,7 @@ static errcode_t zero_blocks(ext2_filsys

static void write_inode_tables(ext2_filsys fs)
{
- errcode_t retval;
+ errcode_t retval = 0;
blk_t blk;
dgrp_t i;
int num;
@@ -423,7 +423,7 @@ static void write_inode_tables(ext2_fils

if (!(lazy_flag &&
(fs->group_desc[i].bg_flags & EXT2_BG_INODE_UNINIT))) {
- retval = zero_blocks(fs, blk, num, 0, &blk, &num);
+ // retval = zero_blocks(fs, blk, num, 0, &blk, &num);
if (retval) {
fprintf(stderr, _("\nCould not write %d "
"blocks in inode table starting at %u: %s\n"),