Re: [PATCH for v5.8 1/3] mm: workingset: age nonresident information alongside anonymous pages

From: Vlastimil Babka
Date: Mon Jun 29 2020 - 16:35:03 EST


On 6/16/20 8:16 AM, js1304@xxxxxxxxx wrote:
> From: Johannes Weiner <hannes@xxxxxxxxxxx>
>
> After ("mm: workingset: let cache workingset challenge anon fix"), we

This could now be updated to:
After commit 34e58cac6d8f ("mm: workingset: let cache workingset challenge
anon"), we ...

> compare refault distances to active_file + anon. But age of the
> non-resident information is only driven by the file LRU. As a result,
> we may overestimate the recency of any incoming refaults and activate
> them too eagerly, causing unnecessary LRU churn in certain situations.
>
> Make anon aging drive nonresident age as well to address that.

Fixes: 34e58cac6d8f ("mm: workingset: let cache workingset challenge anon")

> Reported-by: Joonsoo Kim <js1304@xxxxxxxxx>
> Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx>
> Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@xxxxxxx>

Acked-by: Vlastimil Babka <vbabka@xxxxxxx>

> ---
> include/linux/mmzone.h | 4 ++--
> include/linux/swap.h | 1 +
> mm/vmscan.c | 3 +++
> mm/workingset.c | 46 +++++++++++++++++++++++++++-------------------
> 4 files changed, 33 insertions(+), 21 deletions(-)
>
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index c4c37fd..f6f8849 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -257,8 +257,8 @@ struct lruvec {
> */
> unsigned long anon_cost;
> unsigned long file_cost;
> - /* Evictions & activations on the inactive file list */
> - atomic_long_t inactive_age;
> + /* Non-resident age, driven by LRU movement */
> + atomic_long_t nonresident_age;
> /* Refaults at the time of last reclaim cycle */
> unsigned long refaults;
> /* Various lruvec state flags (enum lruvec_flags) */
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 4c5974b..5b3216b 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -313,6 +313,7 @@ struct vma_swap_readahead {
> };
>
> /* linux/mm/workingset.c */
> +void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages);
> void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg);
> void workingset_refault(struct page *page, void *shadow);
> void workingset_activation(struct page *page);
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index b6d8432..749d239 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -904,6 +904,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
> __delete_from_swap_cache(page, swap);
> xa_unlock_irqrestore(&mapping->i_pages, flags);
> put_swap_page(page, swap);
> + workingset_eviction(page, target_memcg);
> } else {
> void (*freepage)(struct page *);
> void *shadow = NULL;
> @@ -1884,6 +1885,8 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
> list_add(&page->lru, &pages_to_free);
> } else {
> nr_moved += nr_pages;
> + if (PageActive(page))
> + workingset_age_nonresident(lruvec, nr_pages);
> }
> }
>
> diff --git a/mm/workingset.c b/mm/workingset.c
> index d481ea4..50b7937 100644
> --- a/mm/workingset.c
> +++ b/mm/workingset.c
> @@ -156,8 +156,8 @@
> *
> * Implementation
> *
> - * For each node's file LRU lists, a counter for inactive evictions
> - * and activations is maintained (node->inactive_age).
> + * For each node's LRU lists, a counter for inactive evictions and
> + * activations is maintained (node->nonresident_age).
> *
> * On eviction, a snapshot of this counter (along with some bits to
> * identify the node) is stored in the now empty page cache
> @@ -213,7 +213,17 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
> *workingsetp = workingset;
> }
>
> -static void advance_inactive_age(struct mem_cgroup *memcg, pg_data_t *pgdat)
> +/**
> + * workingset_age_nonresident - age non-resident entries as LRU ages
> + * @memcg: the lruvec that was aged
> + * @nr_pages: the number of pages to count
> + *
> + * As in-memory pages are aged, non-resident pages need to be aged as
> + * well, in order for the refault distances later on to be comparable
> + * to the in-memory dimensions. This function allows reclaim and LRU
> + * operations to drive the non-resident aging along in parallel.
> + */
> +void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages)
> {
> /*
> * Reclaiming a cgroup means reclaiming all its children in a
> @@ -227,11 +237,8 @@ static void advance_inactive_age(struct mem_cgroup *memcg, pg_data_t *pgdat)
> * the root cgroup's, age as well.
> */
> do {
> - struct lruvec *lruvec;
> -
> - lruvec = mem_cgroup_lruvec(memcg, pgdat);
> - atomic_long_inc(&lruvec->inactive_age);
> - } while (memcg && (memcg = parent_mem_cgroup(memcg)));
> + atomic_long_add(nr_pages, &lruvec->nonresident_age);
> + } while ((lruvec = parent_lruvec(lruvec)));
> }
>
> /**
> @@ -254,12 +261,11 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg)
> VM_BUG_ON_PAGE(page_count(page), page);
> VM_BUG_ON_PAGE(!PageLocked(page), page);
>
> - advance_inactive_age(page_memcg(page), pgdat);
> -
> lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
> + workingset_age_nonresident(lruvec, hpage_nr_pages(page));
> /* XXX: target_memcg can be NULL, go through lruvec */
> memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
> - eviction = atomic_long_read(&lruvec->inactive_age);
> + eviction = atomic_long_read(&lruvec->nonresident_age);
> return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));
> }
>
> @@ -309,20 +315,20 @@ void workingset_refault(struct page *page, void *shadow)
> if (!mem_cgroup_disabled() && !eviction_memcg)
> goto out;
> eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
> - refault = atomic_long_read(&eviction_lruvec->inactive_age);
> + refault = atomic_long_read(&eviction_lruvec->nonresident_age);
>
> /*
> * Calculate the refault distance
> *
> * The unsigned subtraction here gives an accurate distance
> - * across inactive_age overflows in most cases. There is a
> + * across nonresident_age overflows in most cases. There is a
> * special case: usually, shadow entries have a short lifetime
> * and are either refaulted or reclaimed along with the inode
> * before they get too old. But it is not impossible for the
> - * inactive_age to lap a shadow entry in the field, which can
> - * then result in a false small refault distance, leading to a
> - * false activation should this old entry actually refault
> - * again. However, earlier kernels used to deactivate
> + * nonresident_age to lap a shadow entry in the field, which
> + * can then result in a false small refault distance, leading
> + * to a false activation should this old entry actually
> + * refault again. However, earlier kernels used to deactivate
> * unconditionally with *every* reclaim invocation for the
> * longest time, so the occasional inappropriate activation
> * leading to pressure on the active list is not a problem.
> @@ -359,7 +365,7 @@ void workingset_refault(struct page *page, void *shadow)
> goto out;
>
> SetPageActive(page);
> - advance_inactive_age(memcg, pgdat);
> + workingset_age_nonresident(lruvec, hpage_nr_pages(page));
> inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
>
> /* Page was active prior to eviction */
> @@ -382,6 +388,7 @@ void workingset_refault(struct page *page, void *shadow)
> void workingset_activation(struct page *page)
> {
> struct mem_cgroup *memcg;
> + struct lruvec *lruvec;
>
> rcu_read_lock();
> /*
> @@ -394,7 +401,8 @@ void workingset_activation(struct page *page)
> memcg = page_memcg_rcu(page);
> if (!mem_cgroup_disabled() && !memcg)
> goto out;
> - advance_inactive_age(memcg, page_pgdat(page));
> + lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
> + workingset_age_nonresident(lruvec, hpage_nr_pages(page));
> out:
> rcu_read_unlock();
> }
>