Re: [PATCH v17 18/21] mm/lru: introduce the relock_page_lruvec function

From: Alexander Duyck
Date: Wed Jul 29 2020 - 13:52:15 EST


On Sat, Jul 25, 2020 at 6:00 AM Alex Shi <alex.shi@xxxxxxxxxxxxxxxxx> wrote:
>
> Use this new function to replace repeated same code, no func change.
>
> Signed-off-by: Alex Shi <alex.shi@xxxxxxxxxxxxxxxxx>
> Cc: Johannes Weiner <hannes@xxxxxxxxxxx>
> Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
> Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> Cc: Andrey Ryabinin <aryabinin@xxxxxxxxxxxxx>
> Cc: Matthew Wilcox <willy@xxxxxxxxxxxxx>
> Cc: Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx>
> Cc: Konstantin Khlebnikov <khlebnikov@xxxxxxxxxxxxxx>
> Cc: Hugh Dickins <hughd@xxxxxxxxxx>
> Cc: Tejun Heo <tj@xxxxxxxxxx>
> Cc: linux-kernel@xxxxxxxxxxxxxxx
> Cc: cgroups@xxxxxxxxxxxxxxx
> Cc: linux-mm@xxxxxxxxx
> ---
> include/linux/memcontrol.h | 40 ++++++++++++++++++++++++++++++++++++++++
> mm/mlock.c | 9 +--------
> mm/swap.c | 33 +++++++--------------------------
> mm/vmscan.c | 8 +-------
> 4 files changed, 49 insertions(+), 41 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 258901021c6c..6e670f991b42 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -1313,6 +1313,46 @@ static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec,
> spin_unlock_irqrestore(&lruvec->lru_lock, flags);
> }
>
> +/* Don't lock again iff page's lruvec locked */
> +static inline struct lruvec *relock_page_lruvec_irq(struct page *page,
> + struct lruvec *locked_lruvec)
> +{
> + struct pglist_data *pgdat = page_pgdat(page);
> + bool locked;
> +
> + rcu_read_lock();
> + locked = mem_cgroup_page_lruvec(page, pgdat) == locked_lruvec;
> + rcu_read_unlock();
> +
> + if (locked)
> + return locked_lruvec;
> +
> + if (locked_lruvec)
> + unlock_page_lruvec_irq(locked_lruvec);
> +
> + return lock_page_lruvec_irq(page);
> +}
> +
> +/* Don't lock again iff page's lruvec locked */
> +static inline struct lruvec *relock_page_lruvec_irqsave(struct page *page,
> + struct lruvec *locked_lruvec, unsigned long *flags)
> +{
> + struct pglist_data *pgdat = page_pgdat(page);
> + bool locked;
> +
> + rcu_read_lock();
> + locked = mem_cgroup_page_lruvec(page, pgdat) == locked_lruvec;
> + rcu_read_unlock();
> +
> + if (locked)
> + return locked_lruvec;
> +
> + if (locked_lruvec)
> + unlock_page_lruvec_irqrestore(locked_lruvec, *flags);
> +
> + return lock_page_lruvec_irqsave(page, flags);
> +}
> +

So, looking these over, they seem to be pretty inefficient for what they
do. Basically, in the worst case (locked_lruvec == NULL) you end up calling
mem_cgroup_page_lruvec and the rcu_read_lock/unlock pair a couple of times
for a single page. It might make more sense to structure this like:
    if (locked_lruvec) {
        if (lruvec_holds_page_lru_lock(page, locked_lruvec))
            return locked_lruvec;

        unlock_page_lruvec_irqrestore(locked_lruvec, *flags);
    }
    return lock_page_lruvec_irqsave(page, flags);

The other piece that has me scratching my head is that I wonder if we
couldn't do this without needing the rcu_read_lock. For example, what
if we were to compare the page's mem_cgroup pointer to the memcg back
pointer stored in the mem_cgroup_per_node? It seems like doing the
comparison this way would significantly reduce the overhead due to the
pointer chasing to see if the page is in the locked lruvec or not.

> #ifdef CONFIG_CGROUP_WRITEBACK
>
> struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);
> diff --git a/mm/mlock.c b/mm/mlock.c
> index 5d40d259a931..bc2fb3bfbe7a 100644
> --- a/mm/mlock.c
> +++ b/mm/mlock.c
> @@ -303,17 +303,10 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
> /* Phase 1: page isolation */
> for (i = 0; i < nr; i++) {
> struct page *page = pvec->pages[i];
> - struct lruvec *new_lruvec;
>
> /* block memcg change in mem_cgroup_move_account */
> lock_page_memcg(page);
> - new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
> - if (new_lruvec != lruvec) {
> - if (lruvec)
> - unlock_page_lruvec_irq(lruvec);
> - lruvec = lock_page_lruvec_irq(page);
> - }
> -
> + lruvec = relock_page_lruvec_irq(page, lruvec);
> if (TestClearPageMlocked(page)) {
> /*
> * We already have pin from follow_page_mask()
> diff --git a/mm/swap.c b/mm/swap.c
> index 09edac441eb6..6d9c7288f7de 100644
> --- a/mm/swap.c
> +++ b/mm/swap.c
> @@ -209,19 +209,12 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
>
> for (i = 0; i < pagevec_count(pvec); i++) {
> struct page *page = pvec->pages[i];
> - struct lruvec *new_lruvec;
>
> /* block memcg migration during page moving between lru */
> if (!TestClearPageLRU(page))
> continue;
>
> - new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
> - if (lruvec != new_lruvec) {
> - if (lruvec)
> - unlock_page_lruvec_irqrestore(lruvec, flags);
> - lruvec = lock_page_lruvec_irqsave(page, &flags);
> - }
> -
> + lruvec = relock_page_lruvec_irqsave(page, lruvec, &flags);
> (*move_fn)(page, lruvec);
>
> SetPageLRU(page);
> @@ -864,17 +857,12 @@ void release_pages(struct page **pages, int nr)
> }
>
> if (PageLRU(page)) {
> - struct lruvec *new_lruvec;
> -
> - new_lruvec = mem_cgroup_page_lruvec(page,
> - page_pgdat(page));
> - if (new_lruvec != lruvec) {
> - if (lruvec)
> - unlock_page_lruvec_irqrestore(lruvec,
> - flags);
> + struct lruvec *prev_lruvec = lruvec;
> +
> + lruvec = relock_page_lruvec_irqsave(page, lruvec,
> + &flags);
> + if (prev_lruvec != lruvec)
> lock_batch = 0;
> - lruvec = lock_page_lruvec_irqsave(page, &flags);
> - }
>
> __ClearPageLRU(page);
> del_page_from_lru_list(page, lruvec, page_off_lru(page));
> @@ -980,15 +968,8 @@ void __pagevec_lru_add(struct pagevec *pvec)
>
> for (i = 0; i < pagevec_count(pvec); i++) {
> struct page *page = pvec->pages[i];
> - struct lruvec *new_lruvec;
> -
> - new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
> - if (lruvec != new_lruvec) {
> - if (lruvec)
> - unlock_page_lruvec_irqrestore(lruvec, flags);
> - lruvec = lock_page_lruvec_irqsave(page, &flags);
> - }
>
> + lruvec = relock_page_lruvec_irqsave(page, lruvec, &flags);
> __pagevec_lru_add_fn(page, lruvec);
> }
> if (lruvec)
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 168c1659e430..bdb53a678e7e 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -4292,15 +4292,9 @@ void check_move_unevictable_pages(struct pagevec *pvec)
>
> for (i = 0; i < pvec->nr; i++) {
> struct page *page = pvec->pages[i];
> - struct lruvec *new_lruvec;
>
> pgscanned++;
> - new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
> - if (lruvec != new_lruvec) {
> - if (lruvec)
> - unlock_page_lruvec_irq(lruvec);
> - lruvec = lock_page_lruvec_irq(page);
> - }
> + lruvec = relock_page_lruvec_irq(page, lruvec);
>
> if (!PageLRU(page) || !PageUnevictable(page))
> continue;
> --
> 1.8.3.1
>