[RFC PATCH 08/11] mm: Add support for creating memory aeration

From: Alexander Duyck
Date: Thu May 30 2019 - 17:58:17 EST


From: Alexander Duyck <alexander.h.duyck@xxxxxxxxxxxxxxx>

Add support for "aerating" memory in a guest by pushing individual pages
out. This patch is meant to add generic support for this by adding a common
framework that can be used later by drivers such as virtio-balloon.

Signed-off-by: Alexander Duyck <alexander.h.duyck@xxxxxxxxxxxxxxx>
---
include/linux/memory_aeration.h | 54 +++++++
mm/Kconfig | 5 +
mm/Makefile | 1
mm/aeration.c | 320 +++++++++++++++++++++++++++++++++++++++
4 files changed, 380 insertions(+)
create mode 100644 include/linux/memory_aeration.h
create mode 100644 mm/aeration.c

diff --git a/include/linux/memory_aeration.h b/include/linux/memory_aeration.h
new file mode 100644
index 000000000000..5ba0e634f240
--- /dev/null
+++ b/include/linux/memory_aeration.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_MEMORY_AERATION_H
+#define _LINUX_MEMORY_AERATION_H
+
+#include <linux/pageblock-flags.h>
+#include <linux/jump_label.h>
+#include <asm/pgtable_types.h>
+
+struct zone;
+
+#define AERATOR_MIN_ORDER pageblock_order
+
+struct aerator_dev_info {
+ unsigned long capacity;
+ struct list_head batch_reactor;
+ atomic_t refcnt;
+ void (*react)(struct aerator_dev_info *a_dev_info);
+};
+
+extern struct static_key aerator_notify_enabled;
+
+void aerator_cycle(void);
+void __aerator_notify(struct zone *zone, int order);
+
+/**
+ * aerator_notify_free - Free page notification that will start page processing
+ * @page: Last page processed
+ * @zone: Pointer to current zone of last page processed
+ * @order: Order of last page added to zone
+ *
+ * This function is meant to act as a screener for __aerator_notify which
+ * will determine if a give zone has crossed over the high-water mark that
+ * will justify us beginning page treatment. If we have crossed that
+ * threshold then it will start the process of pulling some pages and
+ * placing them in the batch_reactor list for treatment.
+ */
+static inline void
+aerator_notify_free(struct page *page, struct zone *zone, int order)
+{
+ if (!static_key_false(&aerator_notify_enabled))
+ return;
+
+ if (order < AERATOR_MIN_ORDER)
+ return;
+
+ __aerator_notify(zone, order);
+}
+
+void aerator_shutdown(void);
+int aerator_startup(struct aerator_dev_info *sdev);
+
+#define AERATOR_ZONE_BITS (BITS_TO_LONGS(MAX_NR_ZONES) * BITS_PER_LONG)
+#define AERATOR_HWM_BITS (AERATOR_ZONE_BITS * MAX_NUMNODES)
+#endif /*_LINUX_MEMORY_AERATION_H */
diff --git a/mm/Kconfig b/mm/Kconfig
index f0c76ba47695..34680214cefa 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -236,6 +236,11 @@ config COMPACTION
linux-mm@xxxxxxxxxx

#
+# support for memory aeration
+config AERATION
+ bool
+
+#
# support for page migration
#
config MIGRATION
diff --git a/mm/Makefile b/mm/Makefile
index ac5e5ba78874..26c2fcd2b89d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -104,3 +104,4 @@ obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o
obj-$(CONFIG_HMM) += hmm.o
obj-$(CONFIG_MEMFD_CREATE) += memfd.o
+obj-$(CONFIG_AERATION) += aeration.o
diff --git a/mm/aeration.c b/mm/aeration.c
new file mode 100644
index 000000000000..aaf8af8d822f
--- /dev/null
+++ b/mm/aeration.c
@@ -0,0 +1,320 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/memory_aeration.h>
+#include <linux/mmzone.h>
+#include <linux/gfp.h>
+#include <linux/export.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+
+static unsigned long *aerator_hwm;
+static struct aerator_dev_info *a_dev_info;
+struct static_key aerator_notify_enabled;
+
+void aerator_shutdown(void)
+{
+ static_key_slow_dec(&aerator_notify_enabled);
+
+ while (atomic_read(&a_dev_info->refcnt))
+ msleep(20);
+
+ kfree(aerator_hwm);
+ aerator_hwm = NULL;
+
+ a_dev_info = NULL;
+}
+EXPORT_SYMBOL_GPL(aerator_shutdown);
+
+int aerator_startup(struct aerator_dev_info *sdev)
+{
+ size_t size = BITS_TO_LONGS(AERATOR_HWM_BITS) * sizeof(unsigned long);
+ unsigned long *hwm;
+
+ if (a_dev_info || aerator_hwm)
+ return -EBUSY;
+
+ a_dev_info = sdev;
+
+ atomic_set(&sdev->refcnt, 0);
+
+ hwm = kzalloc(size, GFP_KERNEL);
+ if (!hwm) {
+ aerator_shutdown();
+ return -ENOMEM;
+ }
+
+ aerator_hwm = hwm;
+
+ static_key_slow_inc(&aerator_notify_enabled);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(aerator_startup);
+
+static inline unsigned long *get_aerator_hwm(int nid)
+{
+ if (!aerator_hwm)
+ return NULL;
+
+ return aerator_hwm + (BITS_TO_LONGS(MAX_NR_ZONES) * nid);
+}
+
+static int __aerator_fill(struct zone *zone, unsigned int size)
+{
+ struct list_head *batch = &a_dev_info->batch_reactor;
+ unsigned long nr_raw = 0;
+ unsigned int len = 0;
+ unsigned int order;
+
+ for (order = MAX_ORDER; order-- != AERATOR_MIN_ORDER;) {
+ struct free_area *area = &(zone->free_area[order]);
+ int mt = area->treatment_mt;
+
+ /*
+ * If there are no untreated pages to pull
+ * then we might as well skip the area.
+ */
+ while (area->nr_free_raw) {
+ unsigned int count = 0;
+ struct page *page;
+
+ /*
+ * If we completed aeration we can let the current
+ * free list work on settling so that a batch of
+ * new raw pages can build. In the meantime move on
+ * to the next migratetype.
+ */
+ if (++mt >= MIGRATE_TYPES)
+ mt = 0;
+
+ /*
+ * Pull pages from free list until we have drained
+ * it or we have filled the batch reactor.
+ */
+ while ((page = get_raw_pages(zone, order, mt))) {
+ list_add(&page->lru, batch);
+
+ if (++count == (size - len))
+ return size;
+ }
+
+ /*
+ * If we pulled any pages from this migratetype then
+ * we must move on to a new free area as we cannot
+ * move the membrane until after we have decanted the
+ * pages currently being aerated.
+ */
+ if (count) {
+ len += count;
+ break;
+ }
+ }
+
+ /*
+ * Keep a running total of the raw packets we have left
+ * behind. We will use this to determine if we should
+ * clear the HWM flag.
+ */
+ nr_raw += area->nr_free_raw;
+ }
+
+ /*
+ * If there are no longer enough free pages to fully populate
+ * the aerator, then we can just shut it down for this zone.
+ */
+ if (nr_raw < a_dev_info->capacity) {
+ unsigned long *hwm = get_aerator_hwm(zone_to_nid(zone));
+
+ clear_bit(zone_idx(zone), hwm);
+ atomic_dec(&a_dev_info->refcnt);
+ }
+
+ return len;
+}
+
+static unsigned int aerator_fill(int nid, int zid, int budget)
+{
+ pg_data_t *pgdat = NODE_DATA(nid);
+ struct zone *zone = &pgdat->node_zones[zid];
+ unsigned long flags;
+ int len;
+
+ spin_lock_irqsave(&zone->lock, flags);
+
+ /* fill aerator with "raw" pages */
+ len = __aerator_fill(zone, budget);
+
+ spin_unlock_irqrestore(&zone->lock, flags);
+
+ return len;
+}
+
+static void aerator_fill_and_react(void)
+{
+ int budget = a_dev_info->capacity;
+ int nr;
+
+ /*
+ * We should never be calling this function while there are already
+ * pages in the reactor being aerated. If we are called under such
+ * a circumstance report an error.
+ */
+ BUG_ON(!list_empty(&a_dev_info->batch_reactor));
+retry:
+ /*
+ * We want to hold one additional reference against the number of
+ * active hints as we may clear the hint that originally brought us
+ * here. We will clear it after we have either vaporized the content
+ * of the pages, or if we discover all pages were stolen out from
+ * under us.
+ */
+ atomic_inc(&a_dev_info->refcnt);
+
+ for_each_set_bit(nr, aerator_hwm, AERATOR_HWM_BITS) {
+ int node_id = nr / AERATOR_ZONE_BITS;
+ int zone_id = nr % AERATOR_ZONE_BITS;
+
+ budget -= aerator_fill(node_id, zone_id, budget);
+ if (!budget)
+ goto start_aerating;
+ }
+
+ if (unlikely(list_empty(&a_dev_info->batch_reactor))) {
+ /*
+ * If we never generated any pages, and we were holding the
+ * only remaining reference to active hints then we can
+ * just let this go for now and go idle.
+ */
+ if (atomic_dec_and_test(&a_dev_info->refcnt))
+ return;
+
+ /*
+ * There must be a bit populated somewhere, try going
+ * back through and finding it.
+ */
+ goto retry;
+ }
+
+start_aerating:
+ a_dev_info->react(a_dev_info);
+}
+
+void aerator_decant(void)
+{
+ struct list_head *list = &a_dev_info->batch_reactor;
+ struct page *page;
+
+ /*
+ * This function should never be called on an empty list. If so it
+ * points to a bug as we should never be running the aerator when
+ * the list is empty.
+ */
+ WARN_ON(list_empty(&a_dev_info->batch_reactor));
+
+ while ((page = list_first_entry_or_null(list, struct page, lru))) {
+ list_del(&page->lru);
+
+ __SetPageTreated(page);
+
+ free_treated_page(page);
+ }
+}
+
+/**
+ * aerator_cycle - drain, fill, and start aerating another batch of pages
+ *
+ * This function is at the heart of the aerator. It should be called after
+ * the previous batch of pages has finished being processed by the aerator.
+ * It will drain the aerator, refill it, and start the next set of pages
+ * being processed.
+ */
+void aerator_cycle(void)
+{
+ aerator_decant();
+
+ /*
+ * Now that the pages have been flushed we can drop our reference to
+ * the active hints list. If there are no further hints that need to
+ * be processed we can simply go idle.
+ */
+ if (atomic_dec_and_test(&a_dev_info->refcnt))
+ return;
+
+ aerator_fill_and_react();
+}
+EXPORT_SYMBOL_GPL(aerator_cycle);
+
+static void __aerator_fill_and_react(struct zone *zone)
+{
+ /*
+ * We should never be calling this function while there are already
+ * pages in the list being aerated. If we are called under such a
+ * circumstance report an error.
+ */
+ BUG_ON(!list_empty(&a_dev_info->batch_reactor));
+
+ /*
+ * We want to hold one additional reference against the number of
+ * active hints as we may clear the hint that originally brought us
+ * here. We will clear it after we have either vaporized the content
+ * of the pages, or if we discover all pages were stolen out from
+ * under us.
+ */
+ atomic_inc(&a_dev_info->refcnt);
+
+ __aerator_fill(zone, a_dev_info->capacity);
+
+ if (unlikely(list_empty(&a_dev_info->batch_reactor))) {
+ /*
+ * If we never generated any pages, and we were holding the
+ * only remaining reference to active hints then we can just
+ * let this go for now and go idle.
+ */
+ if (atomic_dec_and_test(&a_dev_info->refcnt))
+ return;
+
+ /*
+ * Another zone must have populated some raw pages that
+ * need to be processed. Release the zone lock and process
+ * that zone instead.
+ */
+ spin_unlock(&zone->lock);
+ aerator_fill_and_react();
+ } else {
+ /* Release the zone lock and begin the page aerator */
+ spin_unlock(&zone->lock);
+ a_dev_info->react(a_dev_info);
+ }
+
+ /* Reaquire lock so we can resume processing this zone */
+ spin_lock(&zone->lock);
+}
+
+void __aerator_notify(struct zone *zone, int order)
+{
+ int node_id = zone_to_nid(zone);
+ int zone_id = zone_idx(zone);
+ unsigned long *hwm;
+
+ if (zone->free_area[order].nr_free_raw < (2 * a_dev_info->capacity))
+ return;
+
+ hwm = get_aerator_hwm(node_id);
+
+ /*
+ * We an use separate test and set operations here as there
+ * is nothing else that can set or clear this bit while we are
+ * holding the zone lock. The advantage to doing it this way is
+ * that we don't have to dirty the cacheline unless we are
+ * changing the value.
+ */
+ if (test_bit(zone_id, hwm))
+ return;
+ set_bit(zone_id, hwm);
+
+ if (atomic_fetch_inc(&a_dev_info->refcnt))
+ return;
+
+ __aerator_fill_and_react(zone);
+}
+EXPORT_SYMBOL_GPL(__aerator_notify);
+