Re: [PATCH -mm -v4 03/21] mm, THP, swap: Support PMD swap mapping in swap_duplicate()

From: Dave Hansen
Date: Mon Jul 09 2018 - 12:51:51 EST


> +static inline bool thp_swap_supported(void)
> +{
> + return IS_ENABLED(CONFIG_THP_SWAP);
> +}

This seems like rather useless abstraction. Why do we need it?

...
> -static inline int swap_duplicate(swp_entry_t swp)
> +static inline int swap_duplicate(swp_entry_t *swp, bool cluster)
> {
> return 0;
> }

FWIW, I despise true/false function arguments like this. When I see
this in code:

swap_duplicate(&entry, false);

I have no idea what false does. I'd much rather see:

enum do_swap_cluster {
SWP_DO_CLUSTER,
SWP_NO_CLUSTER
};

So you see:

swap_duplicate(&entry, SWP_NO_CLUSTER);

vs.

swap_duplicate(&entry, SWP_DO_CLUSTER);


> diff --git a/mm/memory.c b/mm/memory.c
> index e9cac1c4fa69..f3900282e3da 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -951,7 +951,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
> swp_entry_t entry = pte_to_swp_entry(pte);
>
> if (likely(!non_swap_entry(entry))) {
> - if (swap_duplicate(entry) < 0)
> + if (swap_duplicate(&entry, false) < 0)
> return entry.val;
>
> /* make sure dst_mm is on swapoff's mmlist. */

I'll also point out that in a multi-hundred-line patch, adding arguments
to a existing function would not be something I'd try to include in the
patch. I'd break it out separately unless absolutely necessary.

> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index f42b1b0cdc58..48e2c54385ee 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -49,6 +49,9 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
> unsigned char);
> static void free_swap_count_continuations(struct swap_info_struct *);
> static sector_t map_swap_entry(swp_entry_t, struct block_device**);
> +static int add_swap_count_continuation_locked(struct swap_info_struct *si,
> + unsigned long offset,
> + struct page *page);
>
> DEFINE_SPINLOCK(swap_lock);
> static unsigned int nr_swapfiles;
> @@ -319,6 +322,11 @@ static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
> spin_unlock(&si->lock);
> }
>
> +static inline bool is_cluster_offset(unsigned long offset)
> +{
> + return !(offset % SWAPFILE_CLUSTER);
> +}
> +
> static inline bool cluster_list_empty(struct swap_cluster_list *list)
> {
> return cluster_is_null(&list->head);
> @@ -1166,16 +1174,14 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry)
> return NULL;
> }
>
> -static unsigned char __swap_entry_free(struct swap_info_struct *p,
> - swp_entry_t entry, unsigned char usage)
> +static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
> + struct swap_cluster_info *ci,
> + unsigned long offset,
> + unsigned char usage)
> {
> - struct swap_cluster_info *ci;
> - unsigned long offset = swp_offset(entry);
> unsigned char count;
> unsigned char has_cache;
>
> - ci = lock_cluster_or_swap_info(p, offset);
> -
> count = p->swap_map[offset];
>
> has_cache = count & SWAP_HAS_CACHE;
> @@ -1203,6 +1209,17 @@ static unsigned char __swap_entry_free(struct swap_info_struct *p,
> usage = count | has_cache;
> p->swap_map[offset] = usage ? : SWAP_HAS_CACHE;
>
> + return usage;
> +}
> +
> +static unsigned char __swap_entry_free(struct swap_info_struct *p,
> + swp_entry_t entry, unsigned char usage)
> +{
> + struct swap_cluster_info *ci;
> + unsigned long offset = swp_offset(entry);
> +
> + ci = lock_cluster_or_swap_info(p, offset);
> + usage = __swap_entry_free_locked(p, ci, offset, usage);
> unlock_cluster_or_swap_info(p, ci);
>
> return usage;
> @@ -3450,32 +3467,12 @@ void si_swapinfo(struct sysinfo *val)
> spin_unlock(&swap_lock);
> }
>
> -/*
> - * Verify that a swap entry is valid and increment its swap map count.
> - *
> - * Returns error code in following case.
> - * - success -> 0
> - * - swp_entry is invalid -> EINVAL
> - * - swp_entry is migration entry -> EINVAL
> - * - swap-cache reference is requested but there is already one. -> EEXIST
> - * - swap-cache reference is requested but the entry is not used. -> ENOENT
> - * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
> - */
> -static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
> +static int __swap_duplicate_locked(struct swap_info_struct *p,
> + unsigned long offset, unsigned char usage)
> {
> - struct swap_info_struct *p;
> - struct swap_cluster_info *ci;
> - unsigned long offset;
> unsigned char count;
> unsigned char has_cache;
> - int err = -EINVAL;
> -
> - p = get_swap_device(entry);
> - if (!p)
> - goto out;
> -
> - offset = swp_offset(entry);
> - ci = lock_cluster_or_swap_info(p, offset);
> + int err = 0;
>
> count = p->swap_map[offset];
>
> @@ -3485,12 +3482,11 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
> */
> if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
> err = -ENOENT;
> - goto unlock_out;
> + goto out;
> }
>
> has_cache = count & SWAP_HAS_CACHE;
> count &= ~SWAP_HAS_CACHE;
> - err = 0;
>
> if (usage == SWAP_HAS_CACHE) {
>
> @@ -3517,11 +3513,39 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
>
> p->swap_map[offset] = count | has_cache;
>
> -unlock_out:
> +out:
> + return err;
> +}

... and that all looks like refactoring, not actively implementing PMD
swap support. That's unfortunate.

> +/*
> + * Verify that a swap entry is valid and increment its swap map count.
> + *
> + * Returns error code in following case.
> + * - success -> 0
> + * - swp_entry is invalid -> EINVAL
> + * - swp_entry is migration entry -> EINVAL
> + * - swap-cache reference is requested but there is already one. -> EEXIST
> + * - swap-cache reference is requested but the entry is not used. -> ENOENT
> + * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
> + */
> +static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
> +{
> + struct swap_info_struct *p;
> + struct swap_cluster_info *ci;
> + unsigned long offset;
> + int err = -EINVAL;
> +
> + p = get_swap_device(entry);
> + if (!p)
> + goto out;

Is this an error, or just for running into something like a migration
entry? Comments please.

> + offset = swp_offset(entry);
> + ci = lock_cluster_or_swap_info(p, offset);
> + err = __swap_duplicate_locked(p, offset, usage);
> unlock_cluster_or_swap_info(p, ci);
> +
> + put_swap_device(p);
> out:
> - if (p)
> - put_swap_device(p);
> return err;
> }

Not a comment on this patch, but lock_cluster_or_swap_info() is woefully
uncommented.

> @@ -3534,6 +3558,81 @@ void swap_shmem_alloc(swp_entry_t entry)
> __swap_duplicate(entry, SWAP_MAP_SHMEM);
> }
>
> +#ifdef CONFIG_THP_SWAP
> +static int __swap_duplicate_cluster(swp_entry_t *entry, unsigned char usage)
> +{
> + struct swap_info_struct *si;
> + struct swap_cluster_info *ci;
> + unsigned long offset;
> + unsigned char *map;
> + int i, err = 0;

Instead of an #ifdef, is there a reason we can't just do:

if (!IS_ENABLED(THP_SWAP))
return 0;

?

> + si = get_swap_device(*entry);
> + if (!si) {
> + err = -EINVAL;
> + goto out;
> + }
> + offset = swp_offset(*entry);
> + ci = lock_cluster(si, offset);

Could you explain a bit why we do lock_cluster() and not
lock_cluster_or_swap_info() here?

> + if (cluster_is_free(ci)) {
> + err = -ENOENT;
> + goto unlock;
> + }

Needs comments on how this could happen. We just took the lock, so I
assume this is some kind of race, but can you elaborate?

> + if (!cluster_is_huge(ci)) {
> + err = -ENOTDIR;
> + goto unlock;
> + }

Yikes! This function is the core of the new functionality and its
comment count is exactly 0. There was quite a long patch description,
which will be surely lost to the ages, but nothing in the code that
folks _will_ be looking at for decades to come.

Can we fix that?

> + VM_BUG_ON(!is_cluster_offset(offset));
> + VM_BUG_ON(cluster_count(ci) < SWAPFILE_CLUSTER);

So, by this point, we know we are looking at (or supposed to be looking
at) a cluster on the device?

> + map = si->swap_map + offset;
> + if (usage == SWAP_HAS_CACHE) {
> + if (map[0] & SWAP_HAS_CACHE) {
> + err = -EEXIST;
> + goto unlock;
> + }
> + for (i = 0; i < SWAPFILE_CLUSTER; i++) {
> + VM_BUG_ON(map[i] & SWAP_HAS_CACHE);
> + map[i] |= SWAP_HAS_CACHE;
> + }

So, it's OK to race with the first entry, but after that it's a bug
because the tail pages should agree with the head page's state?

> + } else {
> + for (i = 0; i < SWAPFILE_CLUSTER; i++) {
> +retry:
> + err = __swap_duplicate_locked(si, offset + i, usage);
> + if (err == -ENOMEM) {
> + struct page *page;
> +
> + page = alloc_page(GFP_ATOMIC | __GFP_HIGHMEM);

I noticed that the non-clustering analog of this function takes a GFP
mask. Why not this one?

> + err = add_swap_count_continuation_locked(
> + si, offset + i, page);
> + if (err) {
> + *entry = swp_entry(si->type, offset+i);
> + goto undup;
> + }
> + goto retry;
> + } else if (err)
> + goto undup;
> + }
> + cluster_set_count(ci, cluster_count(ci) + usage);
> + }
> +unlock:
> + unlock_cluster(ci);
> + put_swap_device(si);
> +out:
> + return err;
> +undup:
> + for (i--; i >= 0; i--)
> + __swap_entry_free_locked(
> + si, ci, offset + i, usage);
> + goto unlock;
> +}

So, we've basically created a fork of the __swap_duplicate() code for
huge pages, along with a presumably new set of bugs and a second code
path to update. Was this unavoidable? Can we unify this any more with
the small pages path?

> /*
> * Increase reference count of swap entry by 1.
> * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
> @@ -3541,12 +3640,15 @@ void swap_shmem_alloc(swp_entry_t entry)
> * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
> * might occur if a page table entry has got corrupted.
> */
> -int swap_duplicate(swp_entry_t entry)
> +int swap_duplicate(swp_entry_t *entry, bool cluster)
> {
> int err = 0;
>
> - while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
> - err = add_swap_count_continuation(entry, GFP_ATOMIC);
> + if (thp_swap_supported() && cluster)
> + return __swap_duplicate_cluster(entry, 1);
> +
> + while (!err && __swap_duplicate(*entry, 1) == -ENOMEM)
> + err = add_swap_count_continuation(*entry, GFP_ATOMIC);
> return err;
> }

Reading this, I wonder whether this has been refactored as much as
possible. Both add_swap_count_continuation() and
__swap_duplciate_cluster() start off with the same get_swap_device() dance.

> @@ -3558,9 +3660,12 @@ int swap_duplicate(swp_entry_t entry)
> * -EBUSY means there is a swap cache.
> * Note: return code is different from swap_duplicate().
> */
> -int swapcache_prepare(swp_entry_t entry)
> +int swapcache_prepare(swp_entry_t entry, bool cluster)
> {
> - return __swap_duplicate(entry, SWAP_HAS_CACHE);
> + if (thp_swap_supported() && cluster)
> + return __swap_duplicate_cluster(&entry, SWAP_HAS_CACHE);
> + else
> + return __swap_duplicate(entry, SWAP_HAS_CACHE);
> }
>
> struct swap_info_struct *swp_swap_info(swp_entry_t entry)
> @@ -3590,51 +3695,13 @@ pgoff_t __page_file_index(struct page *page)
> }
> EXPORT_SYMBOL_GPL(__page_file_index);
>
> -/*
> - * add_swap_count_continuation - called when a swap count is duplicated
> - * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
> - * page of the original vmalloc'ed swap_map, to hold the continuation count
> - * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called
> - * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc.

This closes out with a lot of refactoring noise. Any chance that can be
isolated into another patch?