Re: [RFC] mm/vmscan.c: avoid possible long latency caused by too_many_isolated()

From: Yu Zhao
Date: Fri Apr 23 2021 - 16:23:53 EST


On Fri, Apr 23, 2021 at 02:57:07PM +0800, Xing Zhengjun wrote:
> On 4/23/2021 1:13 AM, Yu Zhao wrote:
> > On Thu, Apr 22, 2021 at 04:36:19PM +0800, Xing Zhengjun wrote:
> > > Hi,
> > >
> > > In the system with very few file pages (nr_active_file + nr_inactive_file
> > > < 100), it is easy to reproduce "nr_isolated_file > nr_inactive_file", then
> > > too_many_isolated return true, shrink_inactive_list enter "msleep(100)", the
> > > long latency will happen.
> > >
> > > The test case to reproduce it is very simple: allocate many huge pages(near
> > > the DRAM size), then do free, repeat the same operation many times.
> > > In the test case, the system with very few file pages (nr_active_file +
> > > nr_inactive_file < 100), I have dumpped the numbers of
> > > active/inactive/isolated file pages during the whole test(see in the
> > > attachments) , in shrink_inactive_list "too_many_isolated" is very easy to
> > > return true, then enter "msleep(100)",in "too_many_isolated" sc->gfp_mask is
> > > 0x342cca ("_GFP_IO" and "__GFP_FS" is masked) , it is also very easy to
> > > enter “inactive >>=3”, then “isolated > inactive” will be true.
> > >
> > > So I have a proposal to set a threshold number for the total file pages to
> > > ignore the system with very few file pages, and then bypass the 100ms sleep.
> > > It is hard to set a perfect number for the threshold, so I just give an
> > > example of "256" for it.
> > >
> > > I appreciate it if you can give me your suggestion/comments. Thanks.
> >
> > Hi Zhengjun,
> >
> > It seems to me using the number of isolated pages to keep a lid on
> > direct reclaimers is not a good solution. We shouldn't keep going
> > that direction if we really want to fix the problem because migration
> > can isolate many pages too, which in turn blocks page reclaim.
> >
> > Here is something works a lot better. Please give it a try. Thanks.
>
> Thanks, I will try it with my test cases.

Thanks. I took care my sloppiness from yesterday and tested the
following. It should apply cleanly and work well. Please let me know.

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 47946cec7584..48bb2b77389e 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -832,6 +832,7 @@ typedef struct pglist_data {
#endif

/* Fields commonly accessed by the page reclaim scanner */
+ atomic_t nr_reclaimers;

/*
* NOTE: THIS IS UNUSED IF MEMCG IS ENABLED.
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 562e87cbd7a1..3fcdfbee89c7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1775,43 +1775,6 @@ int isolate_lru_page(struct page *page)
return ret;
}

-/*
- * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
- * then get rescheduled. When there are massive number of tasks doing page
- * allocation, such sleeping direct reclaimers may keep piling up on each CPU,
- * the LRU list will go small and be scanned faster than necessary, leading to
- * unnecessary swapping, thrashing and OOM.
- */
-static int too_many_isolated(struct pglist_data *pgdat, int file,
- struct scan_control *sc)
-{
- unsigned long inactive, isolated;
-
- if (current_is_kswapd())
- return 0;
-
- if (!writeback_throttling_sane(sc))
- return 0;
-
- if (file) {
- inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
- isolated = node_page_state(pgdat, NR_ISOLATED_FILE);
- } else {
- inactive = node_page_state(pgdat, NR_INACTIVE_ANON);
- isolated = node_page_state(pgdat, NR_ISOLATED_ANON);
- }
-
- /*
- * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
- * won't get blocked by normal direct-reclaimers, forming a circular
- * deadlock.
- */
- if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
- inactive >>= 3;
-
- return isolated > inactive;
-}
-
/*
* move_pages_to_lru() moves pages from private @list to appropriate LRU list.
* On return, @list is reused as a list of pages to be freed by the caller.
@@ -1911,20 +1874,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
bool file = is_file_lru(lru);
enum vm_event_item item;
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
- bool stalled = false;
-
- while (unlikely(too_many_isolated(pgdat, file, sc))) {
- if (stalled)
- return 0;
-
- /* wait a bit for the reclaimer. */
- msleep(100);
- stalled = true;
-
- /* We are about to die and free our memory. Return now. */
- if (fatal_signal_pending(current))
- return SWAP_CLUSTER_MAX;
- }

lru_add_drain();

@@ -2903,6 +2852,8 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
unsigned long nr_soft_scanned;
gfp_t orig_mask;
pg_data_t *last_pgdat = NULL;
+ bool should_retry = false;
+ int nr_cpus = num_online_cpus();

/*
* If the number of buffer_heads in the machine exceeds the maximum
@@ -2914,9 +2865,18 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
sc->gfp_mask |= __GFP_HIGHMEM;
sc->reclaim_idx = gfp_zone(sc->gfp_mask);
}
-
+retry:
for_each_zone_zonelist_nodemask(zone, z, zonelist,
sc->reclaim_idx, sc->nodemask) {
+ /*
+ * Shrink each node in the zonelist once. If the zonelist is
+ * ordered by zone (not the default) then a node may be shrunk
+ * multiple times but in that case the user prefers lower zones
+ * being preserved.
+ */
+ if (zone->zone_pgdat == last_pgdat)
+ continue;
+
/*
* Take care memory controller reclaiming has small influence
* to global LRU.
@@ -2941,16 +2901,28 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
sc->compaction_ready = true;
continue;
}
+ }

- /*
- * Shrink each node in the zonelist once. If the
- * zonelist is ordered by zone (not the default) then a
- * node may be shrunk multiple times but in that case
- * the user prefers lower zones being preserved.
- */
- if (zone->zone_pgdat == last_pgdat)
- continue;
+ /*
+ * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from
+ * the LRU list and then get rescheduled. When there are massive
+ * number of tasks doing page allocation, such sleeping direct
+ * reclaimers may keep piling up on each CPU, the LRU list will
+ * go small and be scanned faster than necessary, leading to
+ * unnecessary swapping, thrashing and OOM.
+ */
+ VM_BUG_ON(current_is_kswapd());

+ if (!atomic_add_unless(&zone->zone_pgdat->nr_reclaimers, 1, nr_cpus)) {
+ should_retry = true;
+ continue;
+ }
+
+ if (last_pgdat)
+ atomic_dec(&last_pgdat->nr_reclaimers);
+ last_pgdat = zone->zone_pgdat;
+
+ if (!cgroup_reclaim(sc)) {
/*
* This steals pages from memory cgroups over softlimit
* and returns the number of reclaimed pages and
@@ -2966,13 +2938,20 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
/* need some check for avoid more shrink_zone() */
}

- /* See comment about same check for global reclaim above */
- if (zone->zone_pgdat == last_pgdat)
- continue;
- last_pgdat = zone->zone_pgdat;
shrink_node(zone->zone_pgdat, sc);
}

+ if (last_pgdat)
+ atomic_dec(&last_pgdat->nr_reclaimers);
+ else if (should_retry) {
+ /* wait a bit for the reclaimer. */
+ if (!schedule_timeout_killable(HZ / 10))
+ goto retry;
+
+ /* We are about to die and free our memory. Return now. */
+ sc->nr_reclaimed += SWAP_CLUSTER_MAX;
+ }
+
/*
* Restore to original mask to avoid the impact on the caller if we
* promoted it to __GFP_HIGHMEM.
@@ -4189,6 +4168,15 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
set_task_reclaim_state(p, &sc.reclaim_state);

if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
+ int nr_cpus = num_online_cpus();
+
+ VM_BUG_ON(current_is_kswapd());
+
+ if (!atomic_add_unless(&pgdat->nr_reclaimers, 1, nr_cpus)) {
+ schedule_timeout_killable(HZ / 10);
+ goto out;
+ }
+
/*
* Free memory by calling shrink node with increasing
* priorities until we have enough memory freed.
@@ -4196,8 +4184,10 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
do {
shrink_node(pgdat, &sc);
} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
- }

+ atomic_dec(&pgdat->nr_reclaimers);
+ }
+out:
set_task_reclaim_state(p, NULL);
current->flags &= ~PF_SWAPWRITE;
memalloc_noreclaim_restore(noreclaim_flag);