[PATCH] tmpfs 3/9 remove info->sem

From: Hugh Dickins (hugh@veritas.com)
Date: Tue Oct 01 2002 - 16:47:44 EST


Between inode->i_sem and info->lock comes info->sem; but it doesn't
guard thoroughly against the difficult races (truncate during read),
and it serializes reads from tmpfs, unlike other filesystems. I'd
prefer to work with just i_sem and info->lock, backtracking when
necessary (when another task allocates a block or metablock at the
same time).
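
If it helps review, here is a minimal userspace analogue of that
backtracking pattern, with a pthread mutex standing in for info->lock
and calloc for get_zeroed_page; every name in it is illustrative,
none of it is kernel code:

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static void *slots[64];                 /* the structure being grown */

static void *slot_get(unsigned idx, void **spare)
{
        if (!slots[idx] && *spare) {
                slots[idx] = *spare;    /* consume the caller's page */
                *spare = NULL;
        }
        return slots[idx];              /* NULL means: need a page */
}

static void *slot_alloc(unsigned idx)
{
        void *spare = NULL;
        void *slot;

        pthread_mutex_lock(&lock);
        while (!(slot = slot_get(idx, &spare))) {
                pthread_mutex_unlock(&lock);
                spare = calloc(1, 4096);        /* may sleep: lock dropped */
                pthread_mutex_lock(&lock);
                if (!spare)
                        break;                  /* -ENOMEM in the kernel */
        }
        if (spare)
                free(spare);    /* another task beat us to it: backtrack */
        pthread_mutex_unlock(&lock);
        return slot;
}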

(I am not satisfied with the locked setting of next_index at the
start of shmem_getpage_locked: it's one lock hold too many, and it
doesn't really handle races against truncate any better than before;
another patch in a later batch will resolve that.)
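
The same reasoning drives the do_shmem_file_read changes below: with
info->sem gone, readers hold nothing against truncate, so anything
derived from i_size must be re-evaluated after a call that may have
slept. A standalone toy model of that rule (again, all names are
illustrative, not kernel API; the fake get_page shrinks the file to
mimic a concurrent truncate):

#include <stdio.h>

#define PAGE_SIZE 4096UL

static unsigned long i_size = 3 * PAGE_SIZE + 100;

/* stands in for shmem_getpage(): pretend a truncate ran while we slept */
static int get_page(unsigned long index)
{
        i_size = index * PAGE_SIZE + 10;        /* file shrank under us */
        return 0;
}

int main(void)
{
        unsigned long index = 1, offset = 50, nr;

        if (get_page(index))                    /* may sleep */
                return 1;

        /* evaluate i_size only after: truncate may have run meanwhile */
        nr = PAGE_SIZE;
        if (index == i_size / PAGE_SIZE) {
                nr = i_size % PAGE_SIZE;
                if (nr <= offset) {
                        puts("page now beyond eof, drop it");
                        return 0;
                }
        }
        printf("can copy %lu bytes\n", nr - offset);
        return 0;
}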

--- tmpfs2/include/linux/shmem_fs.h Thu Aug 1 23:58:33 2002
+++ tmpfs3/include/linux/shmem_fs.h Tue Oct 1 19:49:04 2002
@@ -11,7 +11,6 @@
 
 struct shmem_inode_info {
         spinlock_t lock;
-        struct semaphore sem;
         unsigned long next_index;
         swp_entry_t i_direct[SHMEM_NR_DIRECT]; /* for the first blocks */
         void **i_indirect; /* indirect blocks */
--- tmpfs2/mm/shmem.c Tue Oct 1 19:48:59 2002
+++ tmpfs3/mm/shmem.c Tue Oct 1 19:49:04 2002
@@ -111,11 +111,9 @@
  * @page: optional page to add to the structure. Has to be preset to
  * all zeros
  *
- * If there is no space allocated yet it will return -ENOMEM when
- * page == 0 else it will use the page for the needed block.
- *
- * returns -EFBIG if the index is too big.
- *
+ * If there is no space allocated yet, it will return NULL when no
+ * page is supplied (page NULL, or *page 0); else it will use the
+ * page for the needed block, zeroing *page to show it was consumed.
  *
  * The swap vector is organized the following way:
  *
@@ -143,70 +141,80 @@
  * +-> 48-51
  * +-> 52-55
  */
-static swp_entry_t * shmem_swp_entry (struct shmem_inode_info *info, unsigned long index, unsigned long page)
+static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long index, unsigned long *page)
 {
         unsigned long offset;
         void **dir;
 
+        if (index >= info->next_index)
+                return NULL;
         if (index < SHMEM_NR_DIRECT)
                 return info->i_direct+index;
+        if (!info->i_indirect) {
+                if (page) {
+                        info->i_indirect = (void *) *page;
+                        *page = 0;
+                }
+                return NULL; /* need another page */
+        }
 
         index -= SHMEM_NR_DIRECT;
         offset = index % ENTRIES_PER_PAGE;
         index /= ENTRIES_PER_PAGE;
-
-        if (!info->i_indirect) {
-                info->i_indirect = (void *) page;
-                return ERR_PTR(-ENOMEM);
-        }
-
         dir = info->i_indirect + index;
+
         if (index >= ENTRIES_PER_PAGE/2) {
                 index -= ENTRIES_PER_PAGE/2;
                 dir = info->i_indirect + ENTRIES_PER_PAGE/2
                         + index/ENTRIES_PER_PAGE;
                 index %= ENTRIES_PER_PAGE;
-
-                if(!*dir) {
-                        *dir = (void *) page;
-                        /* We return since we will need another page
-                           in the next step */
-                        return ERR_PTR(-ENOMEM);
+                if (!*dir) {
+                        if (page) {
+                                *dir = (void *) *page;
+                                *page = 0;
+                        }
+                        return NULL; /* need another page */
                 }
                 dir = ((void **)*dir) + index;
         }
+
         if (!*dir) {
-                if (!page)
-                        return ERR_PTR(-ENOMEM);
-                *dir = (void *)page;
+                if (!page || !*page)
+                        return NULL; /* need a page */
+                *dir = (void *) *page;
+                *page = 0;
         }
         return ((swp_entry_t *)*dir) + offset;
 }
 
 /*
- * shmem_alloc_entry - get the position of the swap entry for the
- * page. If it does not exist allocate the entry
+ * shmem_swp_alloc - get the position of the swap entry for the page.
+ * If it does not exist, allocate the entry.
  *
  * @info: info structure for the inode
  * @index: index of the page to find
  */
-static inline swp_entry_t * shmem_alloc_entry (struct shmem_inode_info *info, unsigned long index)
+static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index)
 {
         unsigned long page = 0;
-        swp_entry_t * res;
-
-        if (index >= SHMEM_MAX_INDEX)
-                return ERR_PTR(-EFBIG);
-
-        if (info->next_index <= index)
-                info->next_index = index + 1;
+        swp_entry_t *entry;
 
-        while ((res = shmem_swp_entry(info,index,page)) == ERR_PTR(-ENOMEM)) {
+        while (!(entry = shmem_swp_entry(info, index, &page))) {
+                if (index >= info->next_index) {
+                        entry = ERR_PTR(-EFAULT);
+                        break;
+                }
+                spin_unlock(&info->lock);
                 page = get_zeroed_page(GFP_USER);
+                spin_lock(&info->lock);
                 if (!page)
-                        break;
+                        return ERR_PTR(-ENOMEM);
+        }
+        if (page) {
+                /* another task gave its page, or truncated the file */
+                free_page(page);
         }
-        return res;
+        return entry;
 }
 
 /*
@@ -330,17 +338,15 @@
         unsigned long freed = 0;
         struct shmem_inode_info * info = SHMEM_I(inode);
 
-        down(&info->sem);
         inode->i_ctime = inode->i_mtime = CURRENT_TIME;
-        spin_lock (&info->lock);
         index = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+        spin_lock (&info->lock);
         while (index < info->next_index)
                 freed += shmem_truncate_indirect(info, index);
 
         info->swapped -= freed;
         shmem_recalc_inode(inode);
         spin_unlock (&info->lock);
-        up(&info->sem);
 }
 
 static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
@@ -436,8 +442,8 @@
 
         for (idx = SHMEM_NR_DIRECT; idx < info->next_index;
              idx += ENTRIES_PER_PAGE) {
-                ptr = shmem_swp_entry(info, idx, 0);
-                if (IS_ERR(ptr))
+                ptr = shmem_swp_entry(info, idx, NULL);
+                if (!ptr)
                         continue;
                 offset = info->next_index - idx;
                 if (offset > ENTRIES_PER_PAGE)
@@ -519,10 +525,10 @@
                 return fail_writepage(page);
 
         spin_lock(&info->lock);
-        entry = shmem_swp_entry(info, index, 0);
-        if (IS_ERR(entry)) /* this had been allocated on page allocation */
-                BUG();
         shmem_recalc_inode(inode);
+        entry = shmem_swp_entry(info, index, NULL);
+        if (!entry)
+                BUG();
         if (entry->val)
                 BUG();
 
@@ -570,61 +576,68 @@
         struct shmem_sb_info *sbinfo;
         struct page *page;
         swp_entry_t *entry;
-        int error;
+        swp_entry_t swap;
+        int error = 0;
+
+        if (idx >= SHMEM_MAX_INDEX)
+                return ERR_PTR(-EFBIG);
+
+        /*
+         * When writing, i_sem is held against truncation and other
+         * writing, so next_index will remain as set here; but when
+         * reading, idx must always be checked against next_index
+         * after sleeping, lest truncation occurred meanwhile.
+         */
+        spin_lock(&info->lock);
+        if (info->next_index <= idx)
+                info->next_index = idx + 1;
+        spin_unlock(&info->lock);
 
 repeat:
         page = find_lock_page(mapping, idx);
         if (page)
                 return page;
 
-        entry = shmem_alloc_entry (info, idx);
-        if (IS_ERR(entry))
+        spin_lock(&info->lock);
+        shmem_recalc_inode(inode);
+        entry = shmem_swp_alloc(info, idx);
+        if (IS_ERR(entry)) {
+                spin_unlock(&info->lock);
                 return (void *)entry;
-
-        spin_lock (&info->lock);
-
-        /* The shmem_alloc_entry() call may have blocked, and
-         * shmem_writepage may have been moving a page between the page
-         * cache and swap cache. We need to recheck the page cache
-         * under the protection of the info->lock spinlock. */
-
-        page = find_get_page(mapping, idx);
-        if (page) {
-                if (TestSetPageLocked(page))
-                        goto wait_retry;
-                spin_unlock (&info->lock);
-                return page;
         }
-
-        shmem_recalc_inode(inode);
-        if (entry->val) {
+        swap = *entry;
+
+        if (swap.val) {
                 /* Look it up and read it in.. */
-                page = lookup_swap_cache(*entry);
+                page = lookup_swap_cache(swap);
                 if (!page) {
-                        swp_entry_t swap = *entry;
-                        spin_unlock (&info->lock);
-                        swapin_readahead(*entry);
-                        page = read_swap_cache_async(*entry);
+                        spin_unlock(&info->lock);
+                        swapin_readahead(swap);
+                        page = read_swap_cache_async(swap);
                         if (!page) {
-                                if (entry->val != swap.val)
-                                        goto repeat;
-                                return ERR_PTR(-ENOMEM);
+                                spin_lock(&info->lock);
+                                entry = shmem_swp_alloc(info, idx);
+                                if (IS_ERR(entry))
+                                        error = PTR_ERR(entry);
+                                else if (entry->val == swap.val)
+                                        error = -ENOMEM;
+                                spin_unlock(&info->lock);
+                                if (error)
+                                        return ERR_PTR(error);
+                                goto repeat;
                         }
                         wait_on_page_locked(page);
-                        if (!PageUptodate(page) && entry->val == swap.val) {
-                                page_cache_release(page);
-                                return ERR_PTR(-EIO);
-                        }
-
-                        /* Too bad we can't trust this page, because we
-                         * dropped the info->lock spinlock */
                         page_cache_release(page);
                         goto repeat;
                 }
 
                 /* We have to do this with page locked to prevent races */
-                if (TestSetPageLocked(page))
-                        goto wait_retry;
+                if (TestSetPageLocked(page)) {
+                        spin_unlock(&info->lock);
+                        wait_on_page_locked(page);
+                        page_cache_release(page);
+                        goto repeat;
+                }
                 if (PageWriteback(page)) {
                         spin_unlock(&info->lock);
                         wait_on_page_writeback(page);
@@ -632,42 +645,55 @@
                         page_cache_release(page);
                         goto repeat;
                 }
-                error = move_from_swap_cache(page, idx, mapping);
-                if (error < 0) {
+
+                error = PageUptodate(page)?
+                        move_from_swap_cache(page, idx, mapping): -EIO;
+                if (error) {
                         spin_unlock(&info->lock);
                         unlock_page(page);
                         page_cache_release(page);
                         return ERR_PTR(error);
                 }
 
-                swap_free(*entry);
                 *entry = (swp_entry_t) {0};
                 info->swapped--;
                 spin_unlock (&info->lock);
+                swap_free(swap);
         } else {
+                spin_unlock(&info->lock);
                 sbinfo = SHMEM_SB(inode->i_sb);
-                spin_unlock (&info->lock);
-                spin_lock (&sbinfo->stat_lock);
-                if (sbinfo->free_blocks == 0)
-                        goto no_space;
+                spin_lock(&sbinfo->stat_lock);
+                if (sbinfo->free_blocks == 0) {
+                        spin_unlock(&sbinfo->stat_lock);
+                        return ERR_PTR(-ENOSPC);
+                }
                 sbinfo->free_blocks--;
-                spin_unlock (&sbinfo->stat_lock);
+                spin_unlock(&sbinfo->stat_lock);
 
-                /* Ok, get a new page. We don't have to worry about the
-                 * info->lock spinlock here: we cannot race against
-                 * shm_writepage because we have already verified that
-                 * there is no page present either in memory or in the
-                 * swap cache, so we are guaranteed to be populating a
-                 * new shm entry. The inode semaphore we already hold
-                 * is enough to make this atomic. */
                 page = page_cache_alloc(mapping);
-                if (!page)
-                        goto no_mem;
-                error = add_to_page_cache_lru(page, mapping, idx);
-                if (error < 0) {
+                if (!page) {
+                        spin_lock(&sbinfo->stat_lock);
+                        sbinfo->free_blocks++;
+                        spin_unlock(&sbinfo->stat_lock);
+                        return ERR_PTR(-ENOMEM);
+                }
+
+                spin_lock(&info->lock);
+                entry = shmem_swp_alloc(info, idx);
+                if (IS_ERR(entry))
+                        error = PTR_ERR(entry);
+                if (error || entry->val ||
+                    add_to_page_cache_lru(page, mapping, idx) < 0) {
+                        spin_unlock(&info->lock);
                         page_cache_release(page);
-                        goto no_mem;
+                        spin_lock(&sbinfo->stat_lock);
+                        sbinfo->free_blocks++;
+                        spin_unlock(&sbinfo->stat_lock);
+                        if (error)
+                                return ERR_PTR(error);
+                        goto repeat;
                 }
+                spin_unlock(&info->lock);
                 clear_highpage(page);
                 inode->i_blocks += BLOCKS_PER_PAGE;
         }
@@ -675,22 +701,6 @@
         /* We have the page */
         SetPageUptodate(page);
         return page;
-
-no_mem:
-        spin_lock(&sbinfo->stat_lock);
-        sbinfo->free_blocks++;
-        spin_unlock(&sbinfo->stat_lock);
-        return ERR_PTR(-ENOMEM);
-
-no_space:
-        spin_unlock (&sbinfo->stat_lock);
-        return ERR_PTR(-ENOSPC);
-
-wait_retry:
-        spin_unlock(&info->lock);
-        wait_on_page_locked(page);
-        page_cache_release(page);
-        goto repeat;
 }
 
 static int shmem_getpage(struct inode * inode, unsigned long idx, struct page **ptr)
@@ -698,7 +708,6 @@
         struct shmem_inode_info *info = SHMEM_I(inode);
         int error;
 
-        down (&info->sem);
         *ptr = ERR_PTR(-EFAULT);
         if (inode->i_size <= (loff_t) idx * PAGE_CACHE_SIZE)
                 goto failed;
@@ -708,10 +717,8 @@
                 goto failed;
 
         unlock_page(*ptr);
-        up (&info->sem);
         return 0;
 failed:
-        up (&info->sem);
         error = PTR_ERR(*ptr);
         *ptr = NOPAGE_SIGBUS;
         if (error == -ENOMEM)
@@ -734,8 +741,8 @@
         spin_lock(&info->lock);
         page = find_get_page(inode->i_mapping, idx);
         if (!page) {
-                entry = shmem_swp_entry(info, idx, 0);
-                if (!IS_ERR(entry))
+                entry = shmem_swp_entry(info, idx, NULL);
+                if (entry)
                         swap = *entry;
         }
         spin_unlock(&info->lock);
@@ -814,12 +821,8 @@
                 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
                 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
                 info = SHMEM_I(inode);
-                spin_lock_init (&info->lock);
-                sema_init (&info->sem, 1);
-                info->next_index = 0;
-                memset (info->i_direct, 0, sizeof(info->i_direct));
-                info->i_indirect = NULL;
-                info->swapped = 0;
+                memset(info, 0, (char *)inode - (char *)info);
+                spin_lock_init(&info->lock);
                 info->flags = VM_ACCOUNT;
                 switch (mode & S_IFMT) {
                 default:
@@ -971,9 +974,7 @@
                 }
 
                 info = SHMEM_I(inode);
-                down (&info->sem);
                 page = shmem_getpage_locked(info, inode, index);
-                up (&info->sem);
 
                 status = PTR_ERR(page);
                 if (IS_ERR(page))
@@ -1041,17 +1042,33 @@
                 end_index = inode->i_size >> PAGE_CACHE_SHIFT;
                 if (index > end_index)
                         break;
-                nr = PAGE_CACHE_SIZE;
                 if (index == end_index) {
                         nr = inode->i_size & ~PAGE_CACHE_MASK;
                         if (nr <= offset)
                                 break;
                 }
 
-                nr = nr - offset;
-
-                if ((desc->error = shmem_getpage(inode, index, &page)))
+                desc->error = shmem_getpage(inode, index, &page);
+                if (desc->error) {
+                        if (desc->error == -EFAULT)
+                                desc->error = 0;
                         break;
+                }
+
+                /*
+                 * We must evaluate after, since reads (unlike writes)
+                 * are called without i_sem protection against truncate
+                 */
+                nr = PAGE_CACHE_SIZE;
+                end_index = inode->i_size >> PAGE_CACHE_SHIFT;
+                if (index == end_index) {
+                        nr = inode->i_size & ~PAGE_CACHE_MASK;
+                        if (nr <= offset) {
+                                page_cache_release(page);
+                                break;
+                        }
+                }
+                nr -= offset;
 
                 if (!list_empty(&mapping->i_mmap_shared))
                         flush_dcache_page(page);
@@ -1279,10 +1296,8 @@
                         iput(inode);
                         return -ENOMEM;
                 }
-                down(&info->sem);
                 page = shmem_getpage_locked(info, inode, 0);
                 if (IS_ERR(page)) {
-                        up(&info->sem);
                         vm_unacct_memory(VM_ACCT(1));
                         iput(inode);
                         return PTR_ERR(page);
@@ -1297,7 +1312,6 @@
                 set_page_dirty(page);
                 unlock_page(page);
                 page_cache_release(page);
-                up(&info->sem);
         }
         dir->i_size += BOGO_DIRENT_SIZE;
         dir->i_ctime = dir->i_mtime = CURRENT_TIME;
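
For completeness, a throwaway harness for the slot_alloc analogue
sketched at the top of this mail: appended to that sketch and built
with cc -pthread, both racers end up sharing slot 7, the loser
freeing its spare page. Illustrative only, not part of the patch:

#include <stdio.h>

static void *racer(void *arg)
{
        (void)arg;
        return slot_alloc(7);
}

int main(void)
{
        pthread_t a, b;
        void *pa, *pb;

        pthread_create(&a, NULL, racer, NULL);
        pthread_create(&b, NULL, racer, NULL);
        pthread_join(a, &pa);
        pthread_join(b, &pb);
        printf("same slot: %s\n", pa == pb ? "yes" : "no");
        return 0;
}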
