Re: [PATCH -mm] mm, swap: Fix race between swapoff and some swap operations

From: Minchan Kim
Date: Fri Dec 08 2017 - 03:26:54 EST


On Fri, Dec 08, 2017 at 01:41:10PM +0800, Huang, Ying wrote:
> Minchan Kim <minchan@xxxxxxxxxx> writes:
>
> > On Thu, Dec 07, 2017 at 04:29:37PM -0800, Andrew Morton wrote:
> >> On Thu, 7 Dec 2017 09:14:26 +0800 "Huang, Ying" <ying.huang@xxxxxxxxx> wrote:
> >>
> >> > When the swapin is performed, after getting the swap entry information
> >> > from the page table, the PTL (page table lock) will be released, then
> >> > system will go to swap in the swap entry, without any lock held to
> >> > prevent the swap device from being swapoff. This may cause the race
> >> > like below,
> >> >
> >> > CPU 1 CPU 2
> >> > ----- -----
> >> > do_swap_page
> >> > swapin_readahead
> >> > __read_swap_cache_async
> >> > swapoff swapcache_prepare
> >> > p->swap_map = NULL __swap_duplicate
> >> > p->swap_map[?] /* !!! NULL pointer access */
> >> >
> >> > Because swap off is usually done when system shutdown only, the race
> >> > may not hit many people in practice. But it is still a race need to
> >> > be fixed.
> >>
> >> swapoff is so rare that it's hard to get motivated about any fix which
> >> adds overhead to the regular codepaths.
> >
> > That was my concern, too when I see this patch.
> >
> >>
> >> Is there something we can do to ensure that all the overhead of this
> >> fix is placed into the swapoff side? stop_machine() may be a bit
> >> brutal, but a surprising amount of code uses it. Any other ideas?
> >
> > How about this?
> >
> > I think It's same approach with old where we uses si->lock everywhere
> > instead of more fine-grained cluster lock.
> >
> > The reason I repeated to reset p->max to zero in the loop is to avoid
> > using lockdep annotation(maybe, spin_lock_nested(something) to prevent
> > false positive.
> >
> > diff --git a/mm/swapfile.c b/mm/swapfile.c
> > index 42fe5653814a..9ce007a42bbc 100644
> > --- a/mm/swapfile.c
> > +++ b/mm/swapfile.c
> > @@ -2644,6 +2644,19 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
> > swap_file = p->swap_file;
> > old_block_size = p->old_block_size;
> > p->swap_file = NULL;
> > +
> > + if (p->flags & SWP_SOLIDSTATE) {
> > + unsigned long ci, nr_cluster;
> > +
> > + nr_cluster = DIV_ROUND_UP(p->max, SWAPFILE_CLUSTER);
> > + for (ci = 0; ci < nr_cluster; ci++) {
> > + struct swap_cluster_info *sci;
> > +
> > + sci = lock_cluster(p, ci * SWAPFILE_CLUSTER);
> > + p->max = 0;
> > + unlock_cluster(sci);
> > + }
> > + }
> > p->max = 0;
> > swap_map = p->swap_map;
> > p->swap_map = NULL;
> > @@ -3369,10 +3382,10 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
> > goto bad_file;
> > p = swap_info[type];
> > offset = swp_offset(entry);
> > - if (unlikely(offset >= p->max))
> > - goto out;
> >
> > ci = lock_cluster_or_swap_info(p, offset);
> > + if (unlikely(offset >= p->max))
> > + goto unlock_out;
> >
> > count = p->swap_map[offset];
> >
>
> Sorry, this doesn't work, because
>
> lock_cluster_or_swap_info()
>
> Need to read p->cluster_info, which may be freed during swapoff too.
>
>
> To reduce the added overhead in regular code path, Maybe we can use SRCU
> to implement get_swap_device() and put_swap_device()? There is only
> increment/decrement on CPU local variable in srcu_read_lock/unlock().
> Should be acceptable in not so hot swap path?
>
> This needs to select CONFIG_SRCU if CONFIG_SWAP is enabled. But I guess
> that should be acceptable too?
>

Why do we need srcu here? Is it enough with rcu like below?

It might have a bug/room to be optimized about performance/naming.
I just wanted to show my intention.

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 2417d288e016..bfe493f3bcb8 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -273,6 +273,7 @@ struct swap_info_struct {
*/
struct work_struct discard_work; /* discard worker */
struct swap_cluster_list discard_clusters; /* discard clusters list */
+ struct rcu_head rcu;
};

#ifdef CONFIG_64BIT
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 42fe5653814a..ecec064f9b20 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -302,6 +302,7 @@ static inline struct swap_cluster_info *lock_cluster_or_swap_info(
{
struct swap_cluster_info *ci;

+ rcu_read_lock();
ci = lock_cluster(si, offset);
if (!ci)
spin_lock(&si->lock);
@@ -316,6 +317,7 @@ static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
unlock_cluster(ci);
else
spin_unlock(&si->lock);
+ rcu_read_unlock();
}

static inline bool cluster_list_empty(struct swap_cluster_list *list)
@@ -2530,11 +2532,17 @@ bool has_usable_swap(void)
return ret;
}

+static void swap_cluster_info_free(struct rcu_head *rcu)
+{
+ struct swap_info_struct *si = container_of(rcu, struct swap_info_struct, rcu);
+ kvfree(si->cluster_info);
+ si->cluster_info = NULL;
+}
+
SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
{
struct swap_info_struct *p = NULL;
unsigned char *swap_map;
- struct swap_cluster_info *cluster_info;
unsigned long *frontswap_map;
struct file *swap_file, *victim;
struct address_space *mapping;
@@ -2542,6 +2550,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
struct filename *pathname;
int err, found = 0;
unsigned int old_block_size;
+ unsigned long ci, nr_cluster;

if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -2631,6 +2640,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
spin_lock(&p->lock);
drain_mmlist();

+ nr_cluster = DIV_ROUND_UP(p->max, SWAPFILE_CLUSTER);
/* wait for anyone still in scan_swap_map */
p->highest_bit = 0; /* cuts scans short */
while (p->flags >= SWP_SCANNING) {
@@ -2641,14 +2651,33 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
spin_lock(&p->lock);
}

+ if (p->flags & SWP_SOLIDSTATE) {
+ struct swap_cluster_info *sci, *head_cluster;
+
+ head_cluster = p->cluster_info;
+ spin_lock(&head_cluster->lock);
+ sci = head_cluster + 1;
+ for (ci = 1; ci < nr_cluster; ci++, sci++)
+ spin_lock_nest_lock(&sci->lock, &head_cluster->lock);
+ }
+
swap_file = p->swap_file;
old_block_size = p->old_block_size;
p->swap_file = NULL;
p->max = 0;
swap_map = p->swap_map;
p->swap_map = NULL;
- cluster_info = p->cluster_info;
- p->cluster_info = NULL;
+
+ if (p->flags & SWP_SOLIDSTATE) {
+ struct swap_cluster_info *sci, *head_cluster;
+
+ head_cluster = p->cluster_info;
+ sci = head_cluster + 1;
+ for (ci = 1; ci < nr_cluster; ci++, sci++)
+ spin_unlock(&sci->lock);
+ spin_unlock(&head_cluster->lock);
+ }
+
frontswap_map = frontswap_map_get(p);
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
@@ -2658,7 +2687,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
free_percpu(p->percpu_cluster);
p->percpu_cluster = NULL;
vfree(swap_map);
- kvfree(cluster_info);
+ call_rcu(&p->rcu, swap_cluster_info_free);
+ synchronize_rcu();
kvfree(frontswap_map);
/* Destroy swap account information */
swap_cgroup_swapoff(p->type);
@@ -3369,10 +3399,10 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
goto bad_file;
p = swap_info[type];
offset = swp_offset(entry);
- if (unlikely(offset >= p->max))
- goto out;

ci = lock_cluster_or_swap_info(p, offset);
+ if (unlikely(offset >= p->max))
+ goto unlock_out;

count = p->swap_map[offset];