Re: ramdisk corruption problems - was: RE: pivot_root and initrd kernel panic woes

From: Andrew Morton (akpm@zip.com.au)
Date: Sat Jan 05 2002 - 06:43:36 EST


Andrea Arcangeli wrote:
>
> Note: I'm fine to introduce another way to notify the app about -ENOSPC,
> -ENOSPC on mmap is the most obvious one, but we could still allow the
> current "overcommit" behaviour with a kind of sigbus mentioned by
> Andrew (possibly not sigbus though, since it has just well defined
> semantics for MAP_SHARED, maybe they could be extended, anyways as said
> this is only a matter of API). My point is only that some API should be
> added because your mmap on sparse files are unreliable at the moment.
>

The very least we can do is to return a sensible error code from msync().
At present, if you create a 200 meg mapping on a 100 meg disk, dirty it
all and run msync(MS_SYNC), the damn thing returns zero and you don't
know that you lost half your data.

The patch makes filemap_fdatasync() and filemap_fdatawait() return
an error code, and propagates that up through all callers, including
fsync() and fdatasync(). Please review, especially the nfs and
generic_file_direct_IO() changes.

There's also a half-assed attempt to implement MS_ASYNC in here.
If anyone thinks that's not worth the effort, I won't be offended.

--- linux-2.4.18-pre1/include/linux/fs.h Fri Dec 21 11:19:23 2001
+++ linux-akpm/include/linux/fs.h Sat Jan 5 03:21:07 2002
@@ -1212,8 +1212,8 @@ extern int osync_inode_data_buffers(stru
 extern int fsync_inode_buffers(struct inode *);
 extern int fsync_inode_data_buffers(struct inode *);
 extern int inode_has_buffers(struct inode *);
-extern void filemap_fdatasync(struct address_space *);
-extern void filemap_fdatawait(struct address_space *);
+extern int filemap_fdatasync(struct address_space *);
+extern int filemap_fdatawait(struct address_space *);
 extern void sync_supers(kdev_t);
 extern int bmap(struct inode *, int);
 extern int notify_change(struct dentry *, struct iattr *);
--- linux-2.4.18-pre1/mm/filemap.c Wed Dec 26 11:47:41 2001
+++ linux-akpm/mm/filemap.c Sat Jan 5 03:23:09 2002
@@ -582,8 +582,9 @@ EXPORT_SYMBOL(fail_writepage);
  * @mapping: address space structure to write
  *
  */
-void filemap_fdatasync(struct address_space * mapping)
+int filemap_fdatasync(struct address_space * mapping)
 {
+ int ret = 0;
         int (*writepage)(struct page *) = mapping->a_ops->writepage;
 
         spin_lock(&pagecache_lock);
@@ -603,8 +604,11 @@ void filemap_fdatasync(struct address_sp
                 lock_page(page);
 
                 if (PageDirty(page)) {
+ int err;
                         ClearPageDirty(page);
- writepage(page);
+ err = writepage(page);
+ if (err && !ret)
+ ret = err;
                 } else
                         UnlockPage(page);
 
@@ -612,6 +616,7 @@ void filemap_fdatasync(struct address_sp
                 spin_lock(&pagecache_lock);
         }
         spin_unlock(&pagecache_lock);
+ return ret;
 }
 
 /**
@@ -621,8 +626,10 @@ void filemap_fdatasync(struct address_sp
  * @mapping: address space structure to wait for
  *
  */
-void filemap_fdatawait(struct address_space * mapping)
+int filemap_fdatawait(struct address_space * mapping)
 {
+ int ret = 0;
+
         spin_lock(&pagecache_lock);
 
         while (!list_empty(&mapping->locked_pages)) {
@@ -638,11 +645,14 @@ void filemap_fdatawait(struct address_sp
                 spin_unlock(&pagecache_lock);
 
                 ___wait_on_page(page);
+ if (PageError(page))
+ ret = -EIO;
 
                 page_cache_release(page);
                 spin_lock(&pagecache_lock);
         }
         spin_unlock(&pagecache_lock);
+ return ret;
 }
 
 /*
@@ -1519,12 +1529,14 @@ static ssize_t generic_file_direct_IO(in
                 goto out_free;
 
         /*
- * Flush to disk exlusively the _data_, metadata must remains
+ * Flush to disk exclusively the _data_, metadata must remain
          * completly asynchronous or performance will go to /dev/null.
          */
- filemap_fdatasync(mapping);
- retval = fsync_inode_data_buffers(inode);
- filemap_fdatawait(mapping);
+ retval = filemap_fdatasync(mapping);
+ if (retval == 0)
+ retval = fsync_inode_data_buffers(inode);
+ if (retval == 0)
+ retval = filemap_fdatawait(mapping);
         if (retval < 0)
                 goto out_free;
 
@@ -2141,26 +2153,41 @@ int generic_file_mmap(struct file * file
  * The msync() system call.
  */
 
+/*
+ * We attempt to implement MS_ASYNC, but it's lame. There needs to be a way
+ * of starting async writeout of the metadata and inode.
+ */
 static int msync_interval(struct vm_area_struct * vma,
         unsigned long start, unsigned long end, int flags)
 {
+ int ret = 0;
         struct file * file = vma->vm_file;
+
         if (file && (vma->vm_flags & VM_SHARED)) {
- int error;
- error = filemap_sync(vma, start, end-start, flags);
+ /* filemap_sync() cannot fail */
+ ret = filemap_sync(vma, start, end-start, flags);
 
- if (!error && (flags & MS_SYNC)) {
+ if (flags & (MS_SYNC|MS_ASYNC)) {
                         struct inode * inode = file->f_dentry->d_inode;
+
                         down(&inode->i_sem);
- filemap_fdatasync(inode->i_mapping);
- if (file->f_op && file->f_op->fsync)
- error = file->f_op->fsync(file, file->f_dentry, 1);
- filemap_fdatawait(inode->i_mapping);
+ ret = filemap_fdatasync(inode->i_mapping);
+ if (flags & MS_SYNC) {
+ int err;
+
+ if (file->f_op && file->f_op->fsync) {
+ err = file->f_op->fsync(file, file->f_dentry, 1);
+ if (err && !ret)
+ ret = err;
+ }
+ err = filemap_fdatawait(inode->i_mapping);
+ if (err && !ret)
+ ret = err;
+ }
                         up(&inode->i_sem);
                 }
- return error;
         }
- return 0;
+ return ret;
 }
 
 asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
--- linux-2.4.18-pre1/fs/block_dev.c Fri Dec 21 11:19:14 2001
+++ linux-akpm/fs/block_dev.c Sat Jan 5 03:21:07 2002
@@ -171,11 +171,15 @@ static loff_t block_llseek(struct file *
 
 static int __block_fsync(struct inode * inode)
 {
- int ret;
+ int ret, err;
 
- filemap_fdatasync(inode->i_mapping);
- ret = sync_buffers(inode->i_rdev, 1);
- filemap_fdatawait(inode->i_mapping);
+ ret = filemap_fdatasync(inode->i_mapping);
+ err = sync_buffers(inode->i_rdev, 1);
+ if (err && !ret)
+ ret = err;
+ err = filemap_fdatawait(inode->i_mapping);
+ if (err && !ret)
+ ret = err;
 
         return ret;
 }
--- linux-2.4.18-pre1/fs/buffer.c Fri Dec 21 11:19:14 2001
+++ linux-akpm/fs/buffer.c Sat Jan 5 03:21:07 2002
@@ -401,9 +401,9 @@ asmlinkage long sys_fsync(unsigned int f
         struct file * file;
         struct dentry * dentry;
         struct inode * inode;
- int err;
+ int ret, err;
 
- err = -EBADF;
+ ret = -EBADF;
         file = fget(fd);
         if (!file)
                 goto out;
@@ -411,21 +411,27 @@ asmlinkage long sys_fsync(unsigned int f
         dentry = file->f_dentry;
         inode = dentry->d_inode;
 
- err = -EINVAL;
- if (!file->f_op || !file->f_op->fsync)
+ ret = -EINVAL;
+ if (!file->f_op || !file->f_op->fsync) {
+ /* Why? We can still call filemap_fdatasync */
                 goto out_putf;
+ }
 
         /* We need to protect against concurrent writers.. */
         down(&inode->i_sem);
- filemap_fdatasync(inode->i_mapping);
+ ret = filemap_fdatasync(inode->i_mapping);
         err = file->f_op->fsync(file, dentry, 0);
- filemap_fdatawait(inode->i_mapping);
+ if (err && !ret)
+ ret = err;
+ err = filemap_fdatawait(inode->i_mapping);
+ if (err && !ret)
+ ret = err;
         up(&inode->i_sem);
 
 out_putf:
         fput(file);
 out:
- return err;
+ return ret;
 }
 
 asmlinkage long sys_fdatasync(unsigned int fd)
@@ -433,9 +439,9 @@ asmlinkage long sys_fdatasync(unsigned i
         struct file * file;
         struct dentry * dentry;
         struct inode * inode;
- int err;
+ int ret, err;
 
- err = -EBADF;
+ ret = -EBADF;
         file = fget(fd);
         if (!file)
                 goto out;
@@ -443,14 +449,18 @@ asmlinkage long sys_fdatasync(unsigned i
         dentry = file->f_dentry;
         inode = dentry->d_inode;
 
- err = -EINVAL;
+ ret = -EINVAL;
         if (!file->f_op || !file->f_op->fsync)
                 goto out_putf;
 
         down(&inode->i_sem);
- filemap_fdatasync(inode->i_mapping);
+ ret = filemap_fdatasync(inode->i_mapping);
         err = file->f_op->fsync(file, dentry, 1);
- filemap_fdatawait(inode->i_mapping);
+ if (err && !ret)
+ ret = err;
+ err = filemap_fdatawait(inode->i_mapping);
+ if (err && !ret)
+ ret = err;
         up(&inode->i_sem);
 
 out_putf:
--- linux-2.4.18-pre1/fs/nfs/file.c Wed Dec 26 11:47:41 2001
+++ linux-akpm/fs/nfs/file.c Sat Jan 5 03:21:07 2002
@@ -244,6 +244,7 @@ nfs_lock(struct file *filp, int cmd, str
 {
         struct inode * inode = filp->f_dentry->d_inode;
         int status = 0;
+ int status2;
 
         dprintk("NFS: nfs_lock(f=%4x/%ld, t=%x, fl=%x, r=%Ld:%Ld)\n",
                         inode->i_dev, inode->i_ino,
@@ -278,11 +279,18 @@ nfs_lock(struct file *filp, int cmd, str
          * Flush all pending writes before doing anything
          * with locks..
          */
- filemap_fdatasync(inode->i_mapping);
+ /*
+ * Shouldn't filemap_fdatasync/wait be inside i_sem?
+ */
+ status = filemap_fdatasync(inode->i_mapping);
         down(&inode->i_sem);
- status = nfs_wb_all(inode);
+ status2 = nfs_wb_all(inode);
+ if (status2 && !status)
+ status = status2;
         up(&inode->i_sem);
- filemap_fdatawait(inode->i_mapping);
+ status2 = filemap_fdatawait(inode->i_mapping);
+ if (status2 && !status)
+ status = status2;
         if (status < 0)
                 return status;
 
@@ -300,11 +308,17 @@ nfs_lock(struct file *filp, int cmd, str
          */
  out_ok:
         if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
- filemap_fdatasync(inode->i_mapping);
+ status2 = filemap_fdatasync(inode->i_mapping);
+ if (status2 && !status)
+ status = status2;
                 down(&inode->i_sem);
- nfs_wb_all(inode); /* we may have slept */
+ status2 = nfs_wb_all(inode); /* we may have slept */
+ if (status2 && !status)
+ status = status2;
                 up(&inode->i_sem);
- filemap_fdatawait(inode->i_mapping);
+ status2 = filemap_fdatawait(inode->i_mapping);
+ if (status2 && !status)
+ status = status2;
                 nfs_zap_caches(inode);
         }
         return status;
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/



This archive was generated by hypermail 2b29 : Mon Jan 07 2002 - 21:00:28 EST