Re: [PATCH 4/9] kstaled: minimalistic implementation.

From: Zhu Yanhai
Date: Mon Feb 20 2012 - 04:18:25 EST


2011/9/28 Michel Lespinasse <walken@xxxxxxxxxx>:
> Introduce minimal kstaled implementation. The scan rate is controlled by
> /sys/kernel/mm/kstaled/scan_seconds and per-cgroup statistics are output
> into /dev/cgroup/*/memory.idle_page_stats.
>
>
> Signed-off-by: Michel Lespinasse <walken@xxxxxxxxxx>
> ---
>  mm/memcontrol.c |  297 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 files changed, 297 insertions(+), 0 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index e013b8e..e55056f 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -49,6 +49,8 @@
> Â#include <linux/page_cgroup.h>
> Â#include <linux/cpu.h>
> Â#include <linux/oom.h>
> +#include <linux/kthread.h>
> +#include <linux/rmap.h>
> Â#include "internal.h"
>
> Â#include <asm/uaccess.h>
> @@ -283,6 +285,16 @@ struct mem_cgroup {
> Â Â Â Â */
> Â Â Â Âstruct mem_cgroup_stat_cpu nocpu_base;
> Â Â Â Âspinlock_t pcp_counter_lock;
> +
> +#ifdef CONFIG_KSTALED
> + Â Â Â seqcount_t idle_page_stats_lock;
> + Â Â Â struct idle_page_stats {
> + Â Â Â Â Â Â Â unsigned long idle_clean;
> + Â Â Â Â Â Â Â unsigned long idle_dirty_file;
> + Â Â Â Â Â Â Â unsigned long idle_dirty_swap;
> + Â Â Â } idle_page_stats, idle_scan_stats;
> + Â Â Â unsigned long idle_page_scans;
> +#endif
> Â};
>
> Â/* Stuffs for move charges at task migration. */
> @@ -4668,6 +4680,30 @@ static int mem_control_numa_stat_open(struct inode *unused, struct file *file)
> Â}
> Â#endif /* CONFIG_NUMA */
>
> +#ifdef CONFIG_KSTALED
> +static int mem_cgroup_idle_page_stats_read(struct cgroup *cgrp,
> + Â Â Â struct cftype *cft, Âstruct cgroup_map_cb *cb)
> +{
> + Â Â Â struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
> + Â Â Â unsigned int seqcount;
> + Â Â Â struct idle_page_stats stats;
> + Â Â Â unsigned long scans;
> +
> + Â Â Â do {
> + Â Â Â Â Â Â Â seqcount = read_seqcount_begin(&memcg->idle_page_stats_lock);
> + Â Â Â Â Â Â Â stats = memcg->idle_page_stats;
> + Â Â Â Â Â Â Â scans = memcg->idle_page_scans;
> + Â Â Â } while (read_seqcount_retry(&memcg->idle_page_stats_lock, seqcount));
> +
> + Â Â Â cb->fill(cb, "idle_clean", stats.idle_clean * PAGE_SIZE);
> + Â Â Â cb->fill(cb, "idle_dirty_file", stats.idle_dirty_file * PAGE_SIZE);
> + Â Â Â cb->fill(cb, "idle_dirty_swap", stats.idle_dirty_swap * PAGE_SIZE);
> + Â Â Â cb->fill(cb, "scans", scans);
> +
> + Â Â Â return 0;
> +}
> +#endif /* CONFIG_KSTALED */
> +
> Âstatic struct cftype mem_cgroup_files[] = {
> Â Â Â Â{
> Â Â Â Â Â Â Â Â.name = "usage_in_bytes",
> @@ -4738,6 +4774,12 @@ static struct cftype mem_cgroup_files[] = {
> Â Â Â Â Â Â Â Â.mode = S_IRUGO,
> Â Â Â Â},
> Â#endif
> +#ifdef CONFIG_KSTALED
> + Â Â Â {
> + Â Â Â Â Â Â Â .name = "idle_page_stats",
> + Â Â Â Â Â Â Â .read_map = mem_cgroup_idle_page_stats_read,
> + Â Â Â },
> +#endif
> Â};
>
> Â#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
> @@ -5001,6 +5043,9 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
> Â Â Â Âatomic_set(&mem->refcnt, 1);
> Â Â Â Âmem->move_charge_at_immigrate = 0;
> Â Â Â Âmutex_init(&mem->thresholds_lock);
> +#ifdef CONFIG_KSTALED
> + Â Â Â seqcount_init(&mem->idle_page_stats_lock);
> +#endif
> Â Â Â Âreturn &mem->css;
> Âfree_out:
> Â Â Â Â__mem_cgroup_free(mem);
> @@ -5568,3 +5613,255 @@ static int __init enable_swap_account(char *s)
> Â__setup("swapaccount=", enable_swap_account);
>
> Â#endif
> +
> +#ifdef CONFIG_KSTALED
> +
> +static unsigned int kstaled_scan_seconds;
> +static DECLARE_WAIT_QUEUE_HEAD(kstaled_wait);
> +
> +static unsigned kstaled_scan_page(struct page *page)
> +{
> + Â Â Â bool is_locked = false;
> + Â Â Â bool is_file;
> + Â Â Â struct page_referenced_info info;
> + Â Â Â struct page_cgroup *pc;
> + Â Â Â struct idle_page_stats *stats;
> + Â Â Â unsigned nr_pages;
> +
> + Â Â Â /*
> + Â Â Â Â* Before taking the page reference, check if the page is
> + Â Â Â Â* a user page which is not obviously unreclaimable
> + Â Â Â Â* (we will do more complete checks later).
> + Â Â Â Â*/
> + Â Â Â if (!PageLRU(page) ||
> + Â Â Â Â Â (!PageCompound(page) &&
> + Â Â Â Â Â Â(PageMlocked(page) ||
> + Â Â Â Â Â Â (page->mapping == NULL && !PageSwapCache(page)))))
> + Â Â Â Â Â Â Â return 1;
> +
> + Â Â Â if (!get_page_unless_zero(page))
> + Â Â Â Â Â Â Â return 1;
> +
> + Â Â Â /* Recheck now that we have the page reference. */
> + Â Â Â if (unlikely(!PageLRU(page)))
> + Â Â Â Â Â Â Â goto out;
> + Â Â Â nr_pages = 1 << compound_trans_order(page);
> + Â Â Â if (PageMlocked(page))
> + Â Â Â Â Â Â Â goto out;
> +
> + Â Â Â /*
> + Â Â Â Â* Anon and SwapCache pages can be identified without locking.
> + Â Â Â Â* For all other cases, we need the page locked in order to
> + Â Â Â Â* dereference page->mapping.
> + Â Â Â Â*/
> + Â Â Â if (PageAnon(page) || PageSwapCache(page))
> + Â Â Â Â Â Â Â is_file = false;
> + Â Â Â else if (!trylock_page(page)) {
> + Â Â Â Â Â Â Â /*
> + Â Â Â Â Â Â Â Â* We need to lock the page to dereference the mapping.
> + Â Â Â Â Â Â Â Â* But don't risk sleeping by calling lock_page().
> + Â Â Â Â Â Â Â Â* We don't want to stall kstaled, so we conservatively
> + Â Â Â Â Â Â Â Â* count locked pages as unreclaimable.
> + Â Â Â Â Â Â Â Â*/
> + Â Â Â Â Â Â Â goto out;
> + Â Â Â } else {
> + Â Â Â Â Â Â Â struct address_space *mapping = page->mapping;
> +
> + Â Â Â Â Â Â Â is_locked = true;
> +
> + Â Â Â Â Â Â Â /*
> + Â Â Â Â Â Â Â Â* The page is still anon - it has been continuously referenced
> + Â Â Â Â Â Â Â Â* since the prior check.
> + Â Â Â Â Â Â Â Â*/
> + Â Â Â Â Â Â Â VM_BUG_ON(PageAnon(page) || mapping != page_rmapping(page));
> +
> + Â Â Â Â Â Â Â /*
> + Â Â Â Â Â Â Â Â* Check the mapping under protection of the page lock.
> + Â Â Â Â Â Â Â Â* 1. If the page is not swap cache and has no mapping,
> + Â Â Â Â Â Â Â Â* Â Âshrink_page_list can't do anything with it.
> + Â Â Â Â Â Â Â Â* 2. If the mapping is unevictable (as in SHM_LOCK segments),
> + Â Â Â Â Â Â Â Â* Â Âshrink_page_list can't do anything with it.
> + Â Â Â Â Â Â Â Â* 3. If the page is swap cache or the mapping is swap backed
> + Â Â Â Â Â Â Â Â* Â Â(as in shmem), consider it a swappable page.
> + Â Â Â Â Â Â Â Â* 4. If the backing dev has indicated that it does not want
> + Â Â Â Â Â Â Â Â* Â Âits pages sync'd to disk (as in ramfs), take this as
> + Â Â Â Â Â Â Â Â* Â Âa hint that its pages are not reclaimable.
> + Â Â Â Â Â Â Â Â* 5. Otherwise, consider this as a file page reclaimable
> + Â Â Â Â Â Â Â Â* Â Âthrough standard pageout.
> + Â Â Â Â Â Â Â Â*/
> + Â Â Â Â Â Â Â if (!mapping && !PageSwapCache(page))
> + Â Â Â Â Â Â Â Â Â Â Â goto out;
> + Â Â Â Â Â Â Â else if (mapping_unevictable(mapping))
> + Â Â Â Â Â Â Â Â Â Â Â goto out;
> + Â Â Â Â Â Â Â else if (PageSwapCache(page) ||
> + Â Â Â Â Â Â Â Â Â Â Â Âmapping_cap_swap_backed(mapping))
> + Â Â Â Â Â Â Â Â Â Â Â is_file = false;
> + Â Â Â Â Â Â Â else if (!mapping_cap_writeback_dirty(mapping))
> + Â Â Â Â Â Â Â Â Â Â Â goto out;
> + Â Â Â Â Â Â Â else
> + Â Â Â Â Â Â Â Â Â Â Â is_file = true;
> + Â Â Â }
> +
> + Â Â Â /* Find out if the page is idle. Also test for pending mlock. */
> + Â Â Â page_referenced_kstaled(page, is_locked, &info);
> + Â Â Â if ((info.pr_flags & PR_REFERENCED) || (info.vm_flags & VM_LOCKED))
> + Â Â Â Â Â Â Â goto out;
> +
> + Â Â Â /* Locate kstaled stats for the page's cgroup. */
> + Â Â Â pc = lookup_page_cgroup(page);
> + Â Â Â if (!pc)
> + Â Â Â Â Â Â Â goto out;
> + Â Â Â lock_page_cgroup(pc);
> + Â Â Â if (!PageCgroupUsed(pc))
> + Â Â Â Â Â Â Â goto unlock_page_cgroup_out;
> + Â Â Â stats = &pc->mem_cgroup->idle_scan_stats;
Is it safe to dereference it like this? I think we need something like this:
struct mem_cgroup *memcg = pc->mem_cgroup;
if (!memcg || !css_tryget(&memcg->css))
goto out;
And also a matching css_put() somewhere below.
Or simply remove the lock_page_cgroup() above and use
try_get_mem_cgroup_from_page() directly.

--
Thanks,
Zhu Yanhai
> +
> + Â Â Â /* Finally increment the correct statistic for this page. */
> + Â Â Â if (!(info.pr_flags & PR_DIRTY) &&
> + Â Â Â Â Â !PageDirty(page) && !PageWriteback(page))
> + Â Â Â Â Â Â Â stats->idle_clean += nr_pages;
> + Â Â Â else if (is_file)
> + Â Â Â Â Â Â Â stats->idle_dirty_file += nr_pages;
> + Â Â Â else
> + Â Â Â Â Â Â Â stats->idle_dirty_swap += nr_pages;
> +
> + unlock_page_cgroup_out:
> + Â Â Â unlock_page_cgroup(pc);
> +
> + out:
> + Â Â Â if (is_locked)
> + Â Â Â Â Â Â Â unlock_page(page);
> + Â Â Â put_page(page);
> +
> + Â Â Â return nr_pages;
> +}
> +
> +static void kstaled_scan_node(pg_data_t *pgdat)
> +{
> + Â Â Â unsigned long flags;
> + Â Â Â unsigned long pfn, end;
> +
> + Â Â Â pgdat_resize_lock(pgdat, &flags);
> +
> + Â Â Â pfn = pgdat->node_start_pfn;
> + Â Â Â end = pfn + pgdat->node_spanned_pages;
> +
> + Â Â Â while (pfn < end) {
> + Â Â Â Â Â Â Â if (need_resched()) {
> + Â Â Â Â Â Â Â Â Â Â Â pgdat_resize_unlock(pgdat, &flags);
> + Â Â Â Â Â Â Â Â Â Â Â cond_resched();
> + Â Â Â Â Â Â Â Â Â Â Â pgdat_resize_lock(pgdat, &flags);
> +
> +#ifdef CONFIG_MEMORY_HOTPLUG
> + Â Â Â Â Â Â Â Â Â Â Â /* abort if the node got resized */
> + Â Â Â Â Â Â Â Â Â Â Â if (pfn < pgdat->node_start_pfn ||
> + Â Â Â Â Â Â Â Â Â Â Â Â Â end > (pgdat->node_start_pfn +
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Âpgdat->node_spanned_pages))
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â goto abort;
> +#endif
> + Â Â Â Â Â Â Â }
> +
> + Â Â Â Â Â Â Â pfn += pfn_valid(pfn) ?
> + Â Â Â Â Â Â Â Â Â Â Â kstaled_scan_page(pfn_to_page(pfn)) : 1;
> + Â Â Â }
> +
> +abort:
> + Â Â Â pgdat_resize_unlock(pgdat, &flags);
> +}
> +
> +static int kstaled(void *dummy)
> +{
> + Â Â Â while (1) {
> + Â Â Â Â Â Â Â int scan_seconds;
> + Â Â Â Â Â Â Â int nid;
> + Â Â Â Â Â Â Â struct mem_cgroup *memcg;
> +
> + Â Â Â Â Â Â Â wait_event_interruptible(kstaled_wait,
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â(scan_seconds = kstaled_scan_seconds) > 0);
> + Â Â Â Â Â Â Â /*
> + Â Â Â Â Â Â Â Â* We use interruptible wait_event so as not to contribute
> + Â Â Â Â Â Â Â Â* to the machine load average while we're sleeping.
> + Â Â Â Â Â Â Â Â* However, we don't actually expect to receive a signal
> + Â Â Â Â Â Â Â Â* since we run as a kernel thread, so the condition we were
> + Â Â Â Â Â Â Â Â* waiting for should be true once we get here.
> + Â Â Â Â Â Â Â Â*/
> + Â Â Â Â Â Â Â BUG_ON(scan_seconds <= 0);
> +
> + Â Â Â Â Â Â Â for_each_mem_cgroup_all(memcg)
> + Â Â Â Â Â Â Â Â Â Â Â memset(&memcg->idle_scan_stats, 0,
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Âsizeof(memcg->idle_scan_stats));
> +
> + Â Â Â Â Â Â Â for_each_node_state(nid, N_HIGH_MEMORY)
> + Â Â Â Â Â Â Â Â Â Â Â kstaled_scan_node(NODE_DATA(nid));
> +
> + Â Â Â Â Â Â Â for_each_mem_cgroup_all(memcg) {
> + Â Â Â Â Â Â Â Â Â Â Â write_seqcount_begin(&memcg->idle_page_stats_lock);
> + Â Â Â Â Â Â Â Â Â Â Â memcg->idle_page_stats = memcg->idle_scan_stats;
> + Â Â Â Â Â Â Â Â Â Â Â memcg->idle_page_scans++;
> + Â Â Â Â Â Â Â Â Â Â Â write_seqcount_end(&memcg->idle_page_stats_lock);
> + Â Â Â Â Â Â Â }
> +
> + Â Â Â Â Â Â Â schedule_timeout_interruptible(scan_seconds * HZ);
> + Â Â Â }
> +
> + Â Â Â BUG();
> + Â Â Â return 0; Â Â Â /* NOT REACHED */
> +}
> +
> +static ssize_t kstaled_scan_seconds_show(struct kobject *kobj,
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Âstruct kobj_attribute *attr,
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Âchar *buf)
> +{
> + Â Â Â return sprintf(buf, "%u\n", kstaled_scan_seconds);
> +}
> +
> +static ssize_t kstaled_scan_seconds_store(struct kobject *kobj,
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â struct kobj_attribute *attr,
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â const char *buf, size_t count)
> +{
> + Â Â Â int err;
> + Â Â Â unsigned long input;
> +
> + Â Â Â err = kstrtoul(buf, 10, &input);
> + Â Â Â if (err)
> + Â Â Â Â Â Â Â return -EINVAL;
> + Â Â Â kstaled_scan_seconds = input;
> + Â Â Â wake_up_interruptible(&kstaled_wait);
> + Â Â Â return count;
> +}
> +
> +static struct kobj_attribute kstaled_scan_seconds_attr = __ATTR(
> + Â Â Â scan_seconds, 0644,
> + Â Â Â kstaled_scan_seconds_show, kstaled_scan_seconds_store);
> +
> +static struct attribute *kstaled_attrs[] = {
> + Â Â Â &kstaled_scan_seconds_attr.attr,
> + Â Â Â NULL
> +};
> +static struct attribute_group kstaled_attr_group = {
> + Â Â Â .name = "kstaled",
> + Â Â Â .attrs = kstaled_attrs,
> +};
> +
> +static int __init kstaled_init(void)
> +{
> + Â Â Â int error;
> + Â Â Â struct task_struct *thread;
> +
> + Â Â Â error = sysfs_create_group(mm_kobj, &kstaled_attr_group);
> + Â Â Â if (error) {
> + Â Â Â Â Â Â Â pr_err("Failed to create kstaled sysfs node\n");
> + Â Â Â Â Â Â Â return error;
> + Â Â Â }
> +
> + Â Â Â thread = kthread_run(kstaled, NULL, "kstaled");
> + Â Â Â if (IS_ERR(thread)) {
> + Â Â Â Â Â Â Â pr_err("Failed to start kstaled\n");
> + Â Â Â Â Â Â Â return PTR_ERR(thread);
> + Â Â Â }
> +
> + Â Â Â return 0;
> +}
> +module_init(kstaled_init);
> +
> +#endif /* CONFIG_KSTALED */
> --
> 1.7.3.1
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@xxxxxxxxxx  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
> Don't email: <a href=mailto:"dont@xxxxxxxxx";> email@xxxxxxxxx </a>
N‹§²æìr¸›yúèšØb²X¬¶ÇvØ^–)Þ{.nÇ+‰·¥Š{±‘êçzX§¶›¡Ü}©ž²ÆzÚ&j:+v‰¨¾«‘êçzZ+€Ê+zf£¢·hšˆ§~†­†Ûiÿûàz¹®w¥¢¸?™¨è­Ú&¢)ßf”ù^jÇy§m…á@A«a¶Úÿ 0¶ìh®å’i