Re: [PATCH 1/2] mm: page allocator: Adjust the per-cpu counterthreshold when memory is low

From: KAMEZAWA Hiroyuki
Date: Wed Oct 27 2010 - 21:15:06 EST


On Wed, 27 Oct 2010 09:47:35 +0100
Mel Gorman <mel@xxxxxxxxx> wrote:

> Commit [aa45484: calculate a better estimate of NR_FREE_PAGES when
> memory is low] noted that watermarks were based on the vmstat
> NR_FREE_PAGES. To avoid synchronization overhead, these counters are
> maintained on a per-cpu basis and drained both periodically and when a
> threshold is above a threshold. On large CPU systems, the difference
> between the estimate and real value of NR_FREE_PAGES can be very high.
> The system can get into a case where pages are allocated far below the
> min watermark potentially causing livelock issues. The commit solved the
> problem by taking a better reading of NR_FREE_PAGES when memory was low.
>
> Unfortunately, as reported by Shaohua Li this accurate reading can consume
> a large amount of CPU time on systems with many sockets due to cache
> line bouncing. This patch takes a different approach. For large machines
> where counter drift might be unsafe and while kswapd is awake, the per-cpu
> thresholds for the target pgdat are reduced to limit the level of drift
> to what should be a safe level. This incurs a performance penalty in heavy
> memory pressure by a factor that depends on the workload and the machine but
> the machine should function correctly without accidentally exhausting all
> memory on a node. There is an additional cost when kswapd wakes and sleeps
> but the event is not expected to be frequent - in Shaohua's test case,
> there was one recorded sleep and wake event at least.
>
> To ensure that kswapd wakes up, a safe version of zone_watermark_ok()
> is introduced that takes a more accurate reading of NR_FREE_PAGES when
> called from wakeup_kswapd, when deciding whether it is really safe to go
> back to sleep in sleeping_prematurely() and when deciding if a zone is
> really balanced or not in balance_pgdat(). We are still using an expensive
> function but limiting how often it is called.
>
> When the test case is reproduced, the time spent in the watermark functions
> is reduced. The following report is on the percentage of time spent
> cumulatively spent in the functions zone_nr_free_pages(), zone_watermark_ok(),
> __zone_watermark_ok(), zone_watermark_ok_safe(), zone_page_state_snapshot(),
> zone_page_state().
>
> vanilla 11.6615%
> disable-threshold 0.2584%
>
> Reported-by: Shaohua Li <shaohua.li@xxxxxxxxx>
> Signed-off-by: Mel Gorman <mel@xxxxxxxxx>
> ---
> include/linux/mmzone.h | 10 +++-------
> include/linux/vmstat.h | 5 +++++
> mm/mmzone.c | 21 ---------------------
> mm/page_alloc.c | 35 +++++++++++++++++++++++++++--------
> mm/vmscan.c | 23 +++++++++++++----------
> mm/vmstat.c | 46 +++++++++++++++++++++++++++++++++++++++++++++-
> 6 files changed, 93 insertions(+), 47 deletions(-)
>
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 3984c4e..8d789d7 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -448,12 +448,6 @@ static inline int zone_is_oom_locked(const struct zone *zone)
> return test_bit(ZONE_OOM_LOCKED, &zone->flags);
> }
>
> -#ifdef CONFIG_SMP
> -unsigned long zone_nr_free_pages(struct zone *zone);
> -#else
> -#define zone_nr_free_pages(zone) zone_page_state(zone, NR_FREE_PAGES)
> -#endif /* CONFIG_SMP */
> -
> /*
> * The "priority" of VM scanning is how much of the queues we will scan in one
> * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
> @@ -651,7 +645,9 @@ typedef struct pglist_data {
> extern struct mutex zonelists_mutex;
> void build_all_zonelists(void *data);
> void wakeup_kswapd(struct zone *zone, int order);
> -int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
> +bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
> + int classzone_idx, int alloc_flags);
> +bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
> int classzone_idx, int alloc_flags);
> enum memmap_context {
> MEMMAP_EARLY,
> diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
> index eaaea37..e4cc21c 100644
> --- a/include/linux/vmstat.h
> +++ b/include/linux/vmstat.h
> @@ -254,6 +254,8 @@ extern void dec_zone_state(struct zone *, enum zone_stat_item);
> extern void __dec_zone_state(struct zone *, enum zone_stat_item);
>
> void refresh_cpu_vm_stats(int);
> +void reduce_pgdat_percpu_threshold(pg_data_t *pgdat);
> +void restore_pgdat_percpu_threshold(pg_data_t *pgdat);
> #else /* CONFIG_SMP */
>
> /*
> @@ -298,6 +300,9 @@ static inline void __dec_zone_page_state(struct page *page,
> #define dec_zone_page_state __dec_zone_page_state
> #define mod_zone_page_state __mod_zone_page_state
>
> +static inline void reduce_pgdat_percpu_threshold(pg_data_t *pgdat) { }
> +static inline void restore_pgdat_percpu_threshold(pg_data_t *pgdat) { }
> +
> static inline void refresh_cpu_vm_stats(int cpu) { }
> #endif
>
> diff --git a/mm/mmzone.c b/mm/mmzone.c
> index e35bfb8..f5b7d17 100644
> --- a/mm/mmzone.c
> +++ b/mm/mmzone.c
> @@ -87,24 +87,3 @@ int memmap_valid_within(unsigned long pfn,
> return 1;
> }
> #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
> -
> -#ifdef CONFIG_SMP
> -/* Called when a more accurate view of NR_FREE_PAGES is needed */
> -unsigned long zone_nr_free_pages(struct zone *zone)
> -{
> - unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES);
> -
> - /*
> - * While kswapd is awake, it is considered the zone is under some
> - * memory pressure. Under pressure, there is a risk that
> - * per-cpu-counter-drift will allow the min watermark to be breached
> - * potentially causing a live-lock. While kswapd is awake and
> - * free pages are low, get a better estimate for free pages
> - */
> - if (nr_free_pages < zone->percpu_drift_mark &&
> - !waitqueue_active(&zone->zone_pgdat->kswapd_wait))
> - return zone_page_state_snapshot(zone, NR_FREE_PAGES);
> -
> - return nr_free_pages;
> -}
> -#endif /* CONFIG_SMP */
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index f12ad18..0286150 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1454,24 +1454,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
> #endif /* CONFIG_FAIL_PAGE_ALLOC */
>
> /*
> - * Return 1 if free pages are above 'mark'. This takes into account the order
> + * Return true if free pages are above 'mark'. This takes into account the order
> * of the allocation.
> */
> -int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
> - int classzone_idx, int alloc_flags)
> +static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
> + int classzone_idx, int alloc_flags, long free_pages)
> {
> /* free_pages my go negative - that's OK */
> long min = mark;
> - long free_pages = zone_nr_free_pages(z) - (1 << order) + 1;
> int o;
>
> + free_pages -= (1 << order) + 1;
> if (alloc_flags & ALLOC_HIGH)
> min -= min / 2;
> if (alloc_flags & ALLOC_HARDER)
> min -= min / 4;
>
> if (free_pages <= min + z->lowmem_reserve[classzone_idx])
> - return 0;
> + return false;
> for (o = 0; o < order; o++) {
> /* At the next order, this order's pages become unavailable */
> free_pages -= z->free_area[o].nr_free << o;
> @@ -1480,9 +1480,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
> min >>= 1;
>
> if (free_pages <= min)
> - return 0;
> + return false;
> }
> - return 1;
> + return true;
> +}
> +
> +bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
> + int classzone_idx, int alloc_flags)
> +{
> + return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
> + zone_page_state(z, NR_FREE_PAGES));
> +}
> +
> +bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
> + int classzone_idx, int alloc_flags)
> +{
> + long free_pages = zone_page_state(z, NR_FREE_PAGES);
> +
> + if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
> + free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
> +
> + return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
> + free_pages);
> }
>
> #ifdef CONFIG_NUMA
> @@ -2436,7 +2455,7 @@ void show_free_areas(void)
> " all_unreclaimable? %s"
> "\n",
> zone->name,
> - K(zone_nr_free_pages(zone)),
> + K(zone_page_state(zone, NR_FREE_PAGES)),
> K(min_wmark_pages(zone)),
> K(low_wmark_pages(zone)),
> K(high_wmark_pages(zone)),
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index c5dfabf..3e71cb1 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2082,7 +2082,7 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
> if (zone->all_unreclaimable)
> continue;
>
> - if (!zone_watermark_ok(zone, order, high_wmark_pages(zone),
> + if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
> 0, 0))
> return 1;
> }
> @@ -2169,7 +2169,7 @@ loop_again:
> shrink_active_list(SWAP_CLUSTER_MAX, zone,
> &sc, priority, 0);
>
> - if (!zone_watermark_ok(zone, order,
> + if (!zone_watermark_ok_safe(zone, order,
> high_wmark_pages(zone), 0, 0)) {
> end_zone = i;
> break;
> @@ -2215,7 +2215,7 @@ loop_again:
> * We put equal pressure on every zone, unless one
> * zone has way too many pages free already.
> */
> - if (!zone_watermark_ok(zone, order,
> + if (!zone_watermark_ok_safe(zone, order,
> 8*high_wmark_pages(zone), end_zone, 0))
> shrink_zone(priority, zone, &sc);
> reclaim_state->reclaimed_slab = 0;
> @@ -2236,7 +2236,7 @@ loop_again:
> total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
> sc.may_writepage = 1;
>
> - if (!zone_watermark_ok(zone, order,
> + if (!zone_watermark_ok_safe(zone, order,
> high_wmark_pages(zone), end_zone, 0)) {
> all_zones_ok = 0;
> /*
> @@ -2244,7 +2244,7 @@ loop_again:
> * means that we have a GFP_ATOMIC allocation
> * failure risk. Hurry up!
> */
> - if (!zone_watermark_ok(zone, order,
> + if (!zone_watermark_ok_safe(zone, order,
> min_wmark_pages(zone), end_zone, 0))
> has_under_min_watermark_zone = 1;
> }
> @@ -2378,7 +2378,9 @@ static int kswapd(void *p)
> */
> if (!sleeping_prematurely(pgdat, order, remaining)) {
> trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
> + restore_pgdat_percpu_threshold(pgdat);
> schedule();
> + reduce_pgdat_percpu_threshold(pgdat);
> } else {
> if (remaining)
> count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
> @@ -2417,16 +2419,17 @@ void wakeup_kswapd(struct zone *zone, int order)
> if (!populated_zone(zone))
> return;
>
> - pgdat = zone->zone_pgdat;
> - if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
> + if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
> return;
> + pgdat = zone->zone_pgdat;
> if (pgdat->kswapd_max_order < order)
> pgdat->kswapd_max_order = order;
> - trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
> - if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
> - return;
> if (!waitqueue_active(&pgdat->kswapd_wait))
> return;
> + if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
> + return;
> +
> + trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
> wake_up_interruptible(&pgdat->kswapd_wait);
> }
>
> diff --git a/mm/vmstat.c b/mm/vmstat.c
> index 355a9e6..cafcc2d 100644
> --- a/mm/vmstat.c
> +++ b/mm/vmstat.c
> @@ -81,6 +81,12 @@ EXPORT_SYMBOL(vm_stat);
>
> #ifdef CONFIG_SMP
>
> +static int calculate_pressure_threshold(struct zone *zone)
> +{
> + return max(1, (int)((high_wmark_pages(zone) - low_wmark_pages(zone) /
> + num_online_cpus())));
> +}
> +

Could you add background theory of this calculation as a comment to
show the difference with calculate_threshold() ?

And don't we need to have "max=125" thresh here ?


> static int calculate_threshold(struct zone *zone)
> {
> int threshold;
> @@ -159,6 +165,44 @@ static void refresh_zone_stat_thresholds(void)
> }
> }
>
> +void reduce_pgdat_percpu_threshold(pg_data_t *pgdat)
> +{
> + struct zone *zone;
> + int cpu;
> + int threshold;
> + int i;
> +

get_online_cpus();

> + for (i = 0; i < pgdat->nr_zones; i++) {
> + zone = &pgdat->node_zones[i];
> + if (!zone->percpu_drift_mark)
> + continue;
> +
> + threshold = calculate_pressure_threshold(zone);
> + for_each_online_cpu(cpu)
> + per_cpu_ptr(zone->pageset, cpu)->stat_threshold
> + = threshold;
> + }

put_online_cpus();

> +}
> +
> +void restore_pgdat_percpu_threshold(pg_data_t *pgdat)
> +{
> + struct zone *zone;
> + int cpu;
> + int threshold;
> + int i;
> +

get_online_cpus();

> + for (i = 0; i < pgdat->nr_zones; i++) {
> + zone = &pgdat->node_zones[i];
> + if (!zone->percpu_drift_mark)
> + continue;
> +
> + threshold = calculate_threshold(zone);
> + for_each_online_cpu(cpu)
> + per_cpu_ptr(zone->pageset, cpu)->stat_threshold
> + = threshold;
> + }

put_online_cpus();


Thanks,
-Kame

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/