[PATCH] ocfs2: Zero the tail cluster when extending past i_size.

From: Joel Becker
Date: Fri Jul 02 2010 - 18:51:19 EST


Linus et al,
Here's the first patch for the problem. This is the corruption
fix. It changes ocfs2 to expect that blocks past i_size will not be
zeroed; ocfs2 now zeros them when i_size expands to encompass them.
This has been tested with various ocfs2 configurations. My test script
was sent as a separate email to ocfs2-devel.
There is still one more patch to come. ocfs2 still tries to
zero entire clusters as it allocates them. Any extra pages past i_size
remain dirty but untouched by writeback. When combined with Dave's
patch, this will still trigger the BUG_ON() in CoW. My next job is to
stop zeroing the pages past i_size when we allocate clusters.
The combination of both patches is the complete fix. Linus, I
intend to send it through the fixes branch of ocfs2.git when I'm done.
I want to get some of our generic test workloads going once the second
patch is written. It will definitely be 2.6.35-rc; I don't want 2.6.35
going out with this problem.

Joel

-----------------------------

ocfs2's allocation unit is the cluster. This can be larger than a block
or even a memory page. This means that a file may have many blocks in
its last extent that are beyond the block containing i_size.

When ocfs2 grows a file, it zeros the entire cluster in order to ensure
future i_size growth will see cleared blocks. Unfortunately,
block_write_full_page() drops the pages past i_size. This means that
ocfs2 is actually leaking garbage data into the tail end of that last
cluster.

We adjust ocfs2_write_begin_nolock() and ocfs2_extend_file() to detect
when a write or truncate is past i_size. If there is any existing
allocation between the block containing the current i_size and the
location of the write or truncate, zeros will be written to that
allocation.

This is only for sparse filesystems. Non-sparse filesystems already get
this via ocfs2_extend_no_holes().

Signed-off-by: Joel Becker <joel.becker@xxxxxxxxxx>
---
fs/ocfs2/aops.c | 22 ++++----
fs/ocfs2/file.c | 145 ++++++++++++++++++++++++++++++++++++++++++++++++-------
fs/ocfs2/file.h | 2 +
3 files changed, 141 insertions(+), 28 deletions(-)

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 3623ca2..96e6aeb 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -196,15 +196,14 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock,
dump_stack();
goto bail;
}
-
- past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
- mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
- (unsigned long long)past_eof);
-
- if (create && (iblock >= past_eof))
- set_buffer_new(bh_result);
}

+ past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
+ mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
+ (unsigned long long)past_eof);
+ if (create && (iblock >= past_eof))
+ set_buffer_new(bh_result);
+
bail:
if (err < 0)
err = -EIO;
@@ -1625,11 +1624,9 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos,
struct ocfs2_write_ctxt *wc)
{
int ret;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
loff_t newsize = pos + len;

- if (ocfs2_sparse_alloc(osb))
- return 0;
+ BUG_ON(ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));

if (newsize <= i_size_read(inode))
return 0;
@@ -1679,7 +1676,10 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
}
}

- ret = ocfs2_expand_nonsparse_inode(inode, pos, len, wc);
+ if (ocfs2_sparse_alloc(osb))
+ ret = ocfs2_zero_tail(inode, di_bh, pos);
+ else
+ ret = ocfs2_expand_nonsparse_inode(inode, pos, len, wc);
if (ret) {
mlog_errno(ret);
goto out;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 6a13ea6..a64ec02 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -848,6 +848,128 @@ out:
return ret;
}

+/*
+ * This function is a helper for ocfs2_zero_tail(). It calculates
+ * what blocks need zeroing and does any CoW necessary.
+ */
+static int ocfs2_zero_tail_prepare(struct inode *inode,
+ struct buffer_head *di_bh,
+ loff_t pos, u64 *start_blkno,
+ u64 *blocks)
+{
+ int rc = 0;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ u32 tail_cpos, pos_cpos, p_cpos;
+ u64 tail_blkno, pos_blkno, blocks_to_zero;
+ unsigned int num_clusters = 0;
+ unsigned int ext_flags = 0;
+
+ /*
+ * The block containing i_size has already been zeroed, so our tail
+ * block is the first block after i_size. The block containing
+ * pos will be zeroed. So we only need to do anything if
+ * tail_blkno is before pos_blkno.
+ */
+ tail_blkno = (i_size_read(inode) >> inode->i_sb->s_blocksize_bits) + 1;
+ pos_blkno = pos >> inode->i_sb->s_blocksize_bits;
+ mlog(0, "tail_blkno = %llu, pos_blkno = %llu\n",
+ (unsigned long long)tail_blkno, (unsigned long long)pos_blkno);
+ if (pos_blkno <= tail_blkno)
+ goto out;
+ blocks_to_zero = pos_blkno - tail_blkno;
+
+ /*
+ * If tail_blkno is in the cluster past i_size, we don't need
+ * to touch the cluster containing i_size at all.
+ */
+ tail_cpos = i_size_read(inode) >> osb->s_clustersize_bits;
+ if (ocfs2_blocks_to_clusters(inode->i_sb, tail_blkno) > tail_cpos)
+ tail_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
+ tail_blkno);
+
+ rc = ocfs2_get_clusters(inode, tail_cpos, &p_cpos, &num_clusters,
+ &ext_flags);
+ if (rc) {
+ mlog_errno(rc);
+ goto out;
+ }
+ /* Are we off the end of the allocation? */
+ if (!p_cpos) {
+ BUG_ON(tail_cpos <=
+ (i_size_read(inode) >> osb->s_clustersize_bits));
+ goto out;
+ }
+
+ pos_cpos = pos >> osb->s_clustersize_bits;
+ if ((tail_cpos + num_clusters) >= pos_cpos) {
+ num_clusters = pos_cpos - tail_cpos;
+ if (pos_blkno >
+ ocfs2_clusters_to_blocks(inode->i_sb, pos_cpos))
+ num_clusters += 1;
+ } else {
+ blocks_to_zero =
+ ocfs2_clusters_to_blocks(inode->i_sb,
+ tail_cpos + num_clusters);
+ blocks_to_zero -= tail_blkno;
+ }
+
+ /* Now CoW the clusters we're about to zero */
+ if (ext_flags & OCFS2_EXT_REFCOUNTED) {
+ rc = ocfs2_refcount_cow(inode, di_bh, tail_cpos,
+ num_clusters, UINT_MAX);
+ if (rc) {
+ mlog_errno(rc);
+ goto out;
+ }
+ }
+
+ *start_blkno = tail_blkno;
+ *blocks = blocks_to_zero;
+ mlog(0, "start_blkno = %llu, blocks = %llu\n",
+ (unsigned long long)(*start_blkno),
+ (unsigned long long)(*blocks));
+
+out:
+ return rc;
+}
+
+/*
+ * This function only does work for sparse filesystems.
+ * ocfs2_extend_no_holes() will do the same work for non-sparse * files.
+ *
+ * If the last extent of the file has blocks beyond i_size, we must zero
+ * them before we can grow i_size to cover them. Specifically, any
+ * allocation between the block containing the current i_size and the block
+ * containing pos must be zeroed.
+ */
+int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
+ loff_t pos)
+{
+ int rc = 0;
+ u64 tail_blkno = 0, blocks_to_zero = 0;
+
+ BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
+
+ rc = ocfs2_zero_tail_prepare(inode, di_bh, pos, &tail_blkno,
+ &blocks_to_zero);
+ if (rc) {
+ mlog_errno(rc);
+ goto out;
+ }
+
+ if (!blocks_to_zero)
+ goto out;
+
+ rc = ocfs2_zero_extend(inode,
+ (tail_blkno + blocks_to_zero) <<
+ inode->i_sb->s_blocksize_bits);
+ if (rc)
+ mlog_errno(rc);
+
+out:
+ return rc;
+}
+
static int ocfs2_extend_file(struct inode *inode,
struct buffer_head *di_bh,
u64 new_i_size)
@@ -862,27 +984,15 @@ static int ocfs2_extend_file(struct inode *inode,
goto out;

if (i_size_read(inode) == new_i_size)
- goto out;
+ goto out;
BUG_ON(new_i_size < i_size_read(inode));

/*
- * Fall through for converting inline data, even if the fs
- * supports sparse files.
- *
- * The check for inline data here is legal - nobody can add
- * the feature since we have i_mutex. We must check it again
- * after acquiring ip_alloc_sem though, as paths like mmap
- * might have raced us to converting the inode to extents.
- */
- if (!(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
- && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
- goto out_update_size;
-
- /*
* The alloc sem blocks people in read/write from reading our
* allocation until we're done changing it. We depend on
* i_mutex to block other extend/truncate calls while we're
- * here.
+ * here. We even have to hold it for sparse files because there
+ * might be some tail zeroing.
*/
down_write(&oi->ip_alloc_sem);

@@ -899,13 +1009,14 @@ static int ocfs2_extend_file(struct inode *inode,
ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
if (ret) {
up_write(&oi->ip_alloc_sem);
-
mlog_errno(ret);
goto out;
}
}

- if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+ if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+ ret = ocfs2_zero_tail(inode, di_bh, new_i_size);
+ else
ret = ocfs2_extend_no_holes(inode, new_i_size, new_i_size);

up_write(&oi->ip_alloc_sem);
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index d66cf4f..7493d97 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -56,6 +56,8 @@ int ocfs2_simple_size_update(struct inode *inode,
u64 new_i_size);
int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size,
u64 zero_to);
+int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
+ loff_t pos);
int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat);
--
1.7.1


--

"Time is an illusion, lunchtime doubly so."
-Douglas Adams

Joel Becker
Consulting Software Developer
Oracle
E-mail: joel.becker@xxxxxxxxxx
Phone: (650) 506-8127
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/