Re: [Patch] shm bug introduced with pagecache in 2.3.11

Andrea Arcangeli (andrea@suse.de)
Thu, 11 Nov 1999 22:12:20 +0100 (CET)


On 11 Nov 1999, Christoph Rohland wrote:

>The patch also fixes some int/size_t issues.

The patch is buggy. In this path:

swapin_readahead(entry);
+ if (pte_val(pte) != pte_val(SHM_ENTRY(shp, idx))) goto again;
page = read_swap_cache(entry);

you `goto again` without first releasing the big kernel lock and without
reacquiring the shm lock.

But that is only a minor implementation issue; there is a worse problem.
On SMP the code is still racy even with the readahead removed: all of the
pte checks you added are racy.

This patch of mine should fix all the races (both on UP and SMP). Please
try it out; it's against 2.3.27pre4. Just as the anonymous swapin path is
protected by the per-mm semaphore, in shm we must protect ourselves with a
per-shm-segment semaphore to handle the swapin case safely. The design
then ends up almost the same as for the anonymous swapin.
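
To make the diff easier to read, the shape of shm_nopage() with the
per-segment semaphore becomes roughly this (a simplified sketch, error
paths and swap-cache details omitted; the real code is in the patch
below):

	down(&shp->sem);		/* serialize faults on this segment */
	spin_lock(&shm_lock);
	pte = SHM_ENTRY(shp, idx);
	if (!pte_present(pte)) {
		/* allocate a fresh zeroed page or swap the old one in;
		   shm_swap can't change the entry under us because the
		   page is not present and we hold shp->sem */
		...
		pte = pte_mkdirty(mk_pte(page, PAGE_SHARED));
		/* install pte into SHM_ENTRY(shp, idx) */
	}
	get_page(pte_page(pte));
	spin_unlock(&shm_lock);
	up(&shp->sem);
	return pte_page(pte);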

diff -urN 2.3.27pre4/include/linux/swap.h shm/include/linux/swap.h
--- 2.3.27pre4/include/linux/swap.h Thu Nov 11 18:23:09 1999
+++ shm/include/linux/swap.h Thu Nov 11 21:20:36 1999
@@ -112,8 +112,10 @@
extern struct swap_info_struct swap_info[];
extern int is_swap_partition(kdev_t);
extern void si_swapinfo(struct sysinfo *);
-extern swp_entry_t get_swap_page(void);
-extern void swap_free(swp_entry_t);
+extern swp_entry_t __get_swap_page(unsigned short);
+#define get_swap_page() __get_swap_page(1)
+extern void __swap_free(swp_entry_t, unsigned short);
+#define swap_free(entry) __swap_free((entry), 1)
struct swap_list_t {
int head; /* head of priority-ordered swapfile list */
int next; /* swapfile to be used next */
diff -urN 2.3.27pre4/ipc/shm.c shm/ipc/shm.c
--- 2.3.27pre4/ipc/shm.c Wed Nov 10 16:59:27 1999
+++ shm/ipc/shm.c Thu Nov 11 21:58:28 1999
@@ -36,6 +36,7 @@
pte_t **shm_dir; /* ptr to array of ptrs to frames -> SHMMAX */
struct vm_area_struct *attaches; /* descriptors for attaches */
int id; /* backreference to id for shm_close */
+ struct semaphore sem;
};

static int findkey (key_t key);
@@ -61,6 +62,9 @@
static unsigned int num_segs = 0;
static unsigned short shm_seq = 0; /* incremented, for recognizing stale ids */

+/* lock order:
+ shm_lock -> pagecache_lock (end of shm_swap)
+ shp->sem -> other spinlocks (shm_nopage) */
spinlock_t shm_lock = SPIN_LOCK_UNLOCKED;

/* some statistics */
@@ -260,6 +264,7 @@
shp->u.shm_ctime = CURRENT_TIME;
shp->shm_npages = numpages;
shp->id = id;
+ init_MUTEX(&shp->sem);

spin_lock(&shm_lock);

@@ -770,10 +775,13 @@
idx = (address - shmd->vm_start) >> PAGE_SHIFT;
idx += shmd->vm_pgoff;

+ down(&shp->sem);
spin_lock(&shm_lock);
-again:
pte = SHM_ENTRY(shp,idx);
if (!pte_present(pte)) {
+ /* page not present so shm_swap can't race with us,
+ and the semaphore protects us from other tasks that
+ could potentially fault on our pte under us */
if (pte_none(pte)) {
spin_unlock(&shm_lock);
page = get_free_highpage(GFP_HIGHUSER);
@@ -781,8 +789,6 @@
goto oom;
clear_highpage(page);
spin_lock(&shm_lock);
- if (pte_val(pte) != pte_val(SHM_ENTRY(shp, idx)))
- goto changed;
} else {
swp_entry_t entry = pte_to_swp_entry(pte);

@@ -803,9 +809,6 @@
unlock_kernel();
spin_lock(&shm_lock);
shm_swp--;
- pte = SHM_ENTRY(shp, idx);
- if (pte_present(pte))
- goto present;
}
shm_rss++;
pte = pte_mkdirty(mk_pte(page, PAGE_SHARED));
@@ -813,21 +816,15 @@
} else
--current->maj_flt; /* was incremented in do_no_page */

-done:
/* pte_val(pte) == SHM_ENTRY (shp, idx) */
get_page(pte_page(pte));
spin_unlock(&shm_lock);
+ up(&shp->sem);
current->min_flt++;
return pte_page(pte);

-changed:
- __free_page(page);
- goto again;
-present:
- if (page)
- free_page_and_swap_cache(page);
- goto done;
oom:
+ up(&shp->sem);
return NOPAGE_OOM;
}

@@ -851,7 +848,11 @@
if (!counter)
return 0;
lock_kernel();
- swap_entry = get_swap_page();
+ /* subtle: preload the swap count for the swap cache. We can't
+ increase the count inside the critical section as we can't release
+ the shm_lock there. And we can't acquire the big lock with the
+ shm_lock held (otherwise we would deadlock too easily). */
+ swap_entry = __get_swap_page(2);
if (!swap_entry.val) {
unlock_kernel();
return 0;
@@ -893,7 +894,7 @@
failed:
spin_unlock(&shm_lock);
lock_kernel();
- swap_free(swap_entry);
+ __swap_free(swap_entry, 2);
unlock_kernel();
return 0;
}
@@ -905,11 +906,16 @@
swap_successes++;
shm_swp++;
shm_rss--;
+
+ /* add the locked page to the swap cache before allowing
+ the swapin path to run lookup_swap_cache(). This avoids
+ reading a not yet uptodate block from disk.
+ NOTE: we just accounted the swap space reference for this
+ swap cache page at __get_swap_page() time. */
+ add_to_swap_cache(page_map, swap_entry);
spin_unlock(&shm_lock);

lock_kernel();
- swap_duplicate(swap_entry);
- add_to_swap_cache(page_map, swap_entry);
rw_swap_page(WRITE, page_map, 0);
unlock_kernel();

diff -urN 2.3.27pre4/mm/swapfile.c shm/mm/swapfile.c
--- 2.3.27pre4/mm/swapfile.c Sun Nov 7 17:33:38 1999
+++ shm/mm/swapfile.c Thu Nov 11 21:38:01 1999
@@ -25,7 +25,7 @@

#define SWAPFILE_CLUSTER 256

-static inline int scan_swap_map(struct swap_info_struct *si)
+static inline int scan_swap_map(struct swap_info_struct *si, unsigned short count)
{
unsigned long offset;
/*
@@ -73,7 +73,7 @@
si->lowest_bit++;
if (offset == si->highest_bit)
si->highest_bit--;
- si->swap_map[offset] = 1;
+ si->swap_map[offset] = count;
nr_swap_pages--;
si->cluster_next = offset+1;
return offset;
@@ -81,7 +81,7 @@
return 0;
}

-swp_entry_t get_swap_page(void)
+swp_entry_t __get_swap_page(unsigned short count)
{
struct swap_info_struct * p;
unsigned long offset;
@@ -94,11 +94,13 @@
goto out;
if (nr_swap_pages == 0)
goto out;
+ if (count >= SWAP_MAP_MAX)
+ goto bad_count;

while (1) {
p = &swap_info[type];
if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
- offset = scan_swap_map(p);
+ offset = scan_swap_map(p, count);
if (offset) {
entry = SWP_ENTRY(type,offset);
type = swap_info[type].next;
@@ -123,10 +125,15 @@
}
out:
return entry;
+
+bad_count:
+ printk(KERN_ERR "get_swap_page: bad count %hd from %p\n",
+ count, __builtin_return_address(0));
+ goto out;
}


-void swap_free(swp_entry_t entry)
+void __swap_free(swp_entry_t entry, unsigned short count)
{
struct swap_info_struct * p;
unsigned long offset, type;
@@ -148,7 +155,9 @@
if (!p->swap_map[offset])
goto bad_free;
if (p->swap_map[offset] < SWAP_MAP_MAX) {
- if (!--p->swap_map[offset]) {
+ if (p->swap_map[offset] < count)
+ goto bad_count;
+ if (!(p->swap_map[offset] -= count)) {
if (offset < p->lowest_bit)
p->lowest_bit = offset;
if (offset > p->highest_bit)
@@ -170,6 +179,9 @@
goto out;
bad_free:
printk("VM: Bad swap entry %08lx\n", entry.val);
+ goto out;
+bad_count:
+ printk(KERN_ERR "VM: Bad count %hd current count %hd\n", count, p->swap_map[offset]);
goto out;
}
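
To spell out the swap count trick in shm_swap(): __get_swap_page(2)
preloads swap_map[offset] to 2 instead of 1, one reference for the shm
swap entry and one for the swap cache page that add_to_swap_cache() will
own. A rough sketch of the reference flow (not a literal excerpt from the
patch):

	swap_entry = __get_swap_page(2);	/* swap_map[offset] == 2 */
	...
	/* failure path: drop both references in one go */
	__swap_free(swap_entry, 2);
	...
	/* success path: the swap cache reference is already accounted
	   for, so no swap_duplicate() is needed here (we couldn't call
	   it anyway: it wants the big kernel lock, which we must not
	   take while holding the shm_lock) */
	add_to_swap_cache(page_map, swap_entry);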

The only ordering rule I added is that shm_lock must be acquired _before_
pagecache_lock.
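That ordering shows up at the tail of shm_swap()'s success path, where
add_to_swap_cache() ends up grabbing pagecache_lock while we still hold
the shm_lock (sketch):

	spin_lock(&shm_lock);
	...
	add_to_swap_cache(page_map, swap_entry);	/* takes pagecache_lock */
	spin_unlock(&shm_lock);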

I am stressing the code with your shmtst on SMP and it works fine here.

I suggest applying my race fixes to the stock kernel as the design looks
like the right one to me now.

Andrea
