[Patch] deadlock on write in tmpfs

From: Christoph Rohland (cr@sap.com)
Date: Tue May 01 2001 - 08:39:47 EST


Hi Linus and Stephen,

tmpfs deadlocks when writing into a file from a mapping of the same
file.

The problem is the following:

- shmem_file_write may call shmem_no_page and calls
  shmem_getpage_locked later,
- shmem_no_page calls shmem_getpage_locked
- shmem_getpage_locked may call shmem_writepage on page allocation

- shmem_file_write holds the inode semaphore
- shmem_getpage_locked prevent races against shmem_writepage with the
  shmem spinlock
- shmem_getpage_locked needs serialization against itself and
  shmem_truncate

The last was done with the inode semaphore, which deadlocks with
shmem_write

So I see two choices:

1) Do not serialise the whole of shmem_getpage_locked but protect
   critical pathes with the spinlock and do retries after sleeps
2) Add another semaphore to serialize shmem_getpage_locked and
   shmem_truncate

I tried some time to get 1) done but the retry logic became way too
complicated. So the attached patch implements 2)

I still think it's ugly to add another semaphore, but it works.

Greetings
                Christoph

diff -uNr 2.4.4/include/linux/shmem_fs.h c/include/linux/shmem_fs.h
--- 2.4.4/include/linux/shmem_fs.h Sun Apr 29 20:33:00 2001
+++ c/include/linux/shmem_fs.h Sun Apr 29 22:43:56 2001
@@ -19,6 +19,7 @@
 
 struct shmem_inode_info {
         spinlock_t lock;
+ struct semaphore sem;
         unsigned long max_index;
         swp_entry_t i_direct[SHMEM_NR_DIRECT]; /* for the first blocks */
         swp_entry_t **i_indirect; /* doubly indirect blocks */
diff -uNr 2.4.4/mm/shmem.c c/mm/shmem.c
--- 2.4.4/mm/shmem.c Mon Apr 30 09:45:39 2001
+++ c/mm/shmem.c Tue May 1 15:15:38 2001
@@ -161,6 +161,7 @@
         swp_entry_t **base, **ptr, **last;
         struct shmem_inode_info * info = &inode->u.shmem_i;
 
+ down(&info->sem);
         inode->i_ctime = inode->i_mtime = CURRENT_TIME;
         spin_lock (&info->lock);
         index = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
@@ -197,6 +198,7 @@
         info->swapped -= freed;
         shmem_recalc_inode(inode);
         spin_unlock (&info->lock);
+ up(&info->sem);
 }
 
 static void shmem_delete_inode(struct inode * inode)
@@ -281,15 +283,12 @@
  * still need to guard against racing with shm_writepage(), which might
  * be trying to move the page to the swap cache as we run.
  */
-static struct page * shmem_getpage_locked(struct inode * inode, unsigned long idx)
+static struct page * shmem_getpage_locked(struct shmem_inode_info *info, struct inode * inode, unsigned long idx)
 {
         struct address_space * mapping = inode->i_mapping;
- struct shmem_inode_info *info;
         struct page * page;
         swp_entry_t *entry;
 
- info = &inode->u.shmem_i;
-
 repeat:
         page = find_lock_page(mapping, idx);
         if (page)
@@ -393,6 +392,7 @@
 
 static int shmem_getpage(struct inode * inode, unsigned long idx, struct page **ptr)
 {
+ struct shmem_inode_info *info;
         struct address_space * mapping = inode->i_mapping;
         int error;
 
@@ -407,27 +407,28 @@
                 page_cache_release(*ptr);
         }
 
- down (&inode->i_sem);
- /* retest we may have slept */
+ info = &inode->u.shmem_i;
+ down (&info->sem);
+ /* retest we may have slept */
+
+ *ptr = ERR_PTR(-EFAULT);
         if (inode->i_size < (loff_t) idx * PAGE_CACHE_SIZE)
- goto sigbus;
- *ptr = shmem_getpage_locked(inode, idx);
+ goto failed;
+
+ *ptr = shmem_getpage_locked(&inode->u.shmem_i, inode, idx);
         if (IS_ERR (*ptr))
                 goto failed;
+
         UnlockPage(*ptr);
- up (&inode->i_sem);
+ up (&info->sem);
         return 0;
 failed:
- up (&inode->i_sem);
+ up (&info->sem);
         error = PTR_ERR(*ptr);
- *ptr = NOPAGE_OOM;
- if (error != -EFBIG)
- *ptr = NOPAGE_SIGBUS;
- return error;
-sigbus:
- up (&inode->i_sem);
         *ptr = NOPAGE_SIGBUS;
- return -EFAULT;
+ if (error == -ENOMEM)
+ *ptr = NOPAGE_OOM;
+ return error;
 }
 
 struct page * shmem_nopage(struct vm_area_struct * vma, unsigned long address, int no_share)
@@ -500,6 +501,7 @@
 struct inode *shmem_get_inode(struct super_block *sb, int mode, int dev)
 {
         struct inode * inode;
+ struct shmem_inode_info *info;
 
         spin_lock (&sb->u.shmem_sb.stat_lock);
         if (!sb->u.shmem_sb.free_inodes) {
@@ -519,7 +521,9 @@
                 inode->i_rdev = to_kdev_t(dev);
                 inode->i_mapping->a_ops = &shmem_aops;
                 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- spin_lock_init (&inode->u.shmem_i.lock);
+ info = &inode->u.shmem_i;
+ spin_lock_init (&info->lock);
+ sema_init (&info->sem, 1);
                 switch (mode & S_IFMT) {
                 default:
                         init_special_inode(inode, mode, dev);
@@ -549,6 +553,7 @@
 shmem_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos)
 {
         struct inode *inode = file->f_dentry->d_inode;
+ struct shmem_inode_info *info;
         unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
         loff_t pos;
         struct page *page;
@@ -624,7 +629,11 @@
                         __get_user(dummy, buf+bytes-1);
                 }
 
- page = shmem_getpage_locked(inode, index);
+ info = &inode->u.shmem_i;
+ down (&info->sem);
+ page = shmem_getpage_locked(info, inode, index);
+ up (&info->sem);
+
                 status = PTR_ERR(page);
                 if (IS_ERR(page))
                         break;
@@ -635,7 +644,6 @@
                 }
 
                 kaddr = kmap(page);
-// can this do a truncated write? cr
                 status = copy_from_user(kaddr+offset, buf, bytes);
                 kunmap(page);
                 if (status)
@@ -932,7 +940,7 @@
                 
         inode = dentry->d_inode;
         down(&inode->i_sem);
- page = shmem_getpage_locked(inode, 0);
+ page = shmem_getpage_locked(&inode->u.shmem_i, inode, 0);
         if (IS_ERR(page))
                 goto fail;
         kaddr = kmap(page);

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/



This archive was generated by hypermail 2b29 : Mon May 07 2001 - 21:00:10 EST