+unsigned long autonuma_flags __read_mostly =
+ (1<<AUTONUMA_SCHED_LOAD_BALANCE_STRICT_FLAG)|
+ (1<<AUTONUMA_SCHED_CLONE_RESET_FLAG)|
+ (1<<AUTONUMA_SCHED_FORK_RESET_FLAG)|
+#ifdef CONFIG_AUTONUMA_DEFAULT_ENABLED
+ (1<<AUTONUMA_FLAG)|
+#endif
+ (1<<AUTONUMA_SCAN_PMD_FLAG);
+static DEFINE_MUTEX(knumad_mm_mutex);
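+/*
+ * AutoNUMA can never be active when there is a single NUMA node or
+ * when AUTONUMA_IMPOSSIBLE_FLAG was set at boot.
+ */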
+static inline bool autonuma_impossible(void)
+{
+ return num_possible_nodes() <= 1 ||
+ test_bit(AUTONUMA_IMPOSSIBLE_FLAG, &autonuma_flags);
+}
+/* caller already holds the compound_lock */
+void autonuma_migrate_split_huge_page(struct page *page,
+ struct page *page_tail)
+{
+ int nid, last_nid;
+
+ nid = page->autonuma_migrate_nid;
+ VM_BUG_ON(nid >= MAX_NUMNODES);
+ VM_BUG_ON(nid < -1);
+ VM_BUG_ON(page_tail->autonuma_migrate_nid != -1);
+ if (nid >= 0) {
+ VM_BUG_ON(page_to_nid(page) != page_to_nid(page_tail));
+
+ compound_lock(page_tail);
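+/*
+ * Drop any pending migration request for @page: take it off the
+ * destination node's migration queue and clear autonuma_migrate_nid.
+ */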
+void __autonuma_migrate_page_remove(struct page *page)
+{
+ unsigned long flags;
+ int nid;
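+/*
+ * Queue @page, currently on @page_nid, for migration to @dst_nid,
+ * replacing any previously queued destination.
+ */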
+static void __autonuma_migrate_page_add(struct page *page, int dst_nid,
+ int page_nid)
+{
+ VM_BUG_ON(dst_nid >= MAX_NUMNODES);
+ VM_BUG_ON(dst_nid < -1);
+ VM_BUG_ON(page_nid >= MAX_NUMNODES);
+ VM_BUG_ON(page_nid < -1);
+
+ VM_BUG_ON(page_nid == dst_nid);
+ VM_BUG_ON(page_to_nid(page) != page_nid);
+
+ flags = compound_lock_irqsave(page);
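+/*
+ * Lockless fast path: only take the compound lock in
+ * __autonuma_migrate_page_add() if the page is not already queued
+ * for @dst_nid.
+ */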
+static void autonuma_migrate_page_add(struct page *page, int dst_nid,
+ int page_nid)
+{
+ int migrate_nid = ACCESS_ONCE(page->autonuma_migrate_nid);
+ if (migrate_nid != dst_nid)
+ __autonuma_migrate_page_add(page, dst_nid, page_nid);
+}
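+/*
+ * Decide whether @pgdat can receive another @nr_migrate_pages pages,
+ * currently based only on its watermarks (see the FIXME below).
+ */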
+static bool balance_pgdat(struct pglist_data *pgdat,
+ int nr_migrate_pages)
+{
+ /* FIXME: this only checks the wmarks, make it move
+ * "unused" memory or pagecache by queuing it to
+ * pgdat->autonuma_migrate_head[pgdat->node_id].
+ */
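+/*
+ * A new knuma_scand pass reached this task: age the per-task NUMA
+ * fault statistics by halving them, so recent faults weigh more.
+ */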
+static void cpu_follow_memory_pass(struct task_struct *p,
+ struct task_autonuma *task_autonuma,
+ unsigned long *task_numa_fault)
+{
+ int nid;
+ for_each_node(nid)
+ task_numa_fault[nid] >>= 1;
+ task_autonuma->task_numa_fault_tot >>= 1;
+}
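+/*
+ * Account a NUMA hinting fault of @numpages pages on @access_nid in
+ * the per-task statistics, decaying the old counters first when this
+ * is the first fault of a new pass.
+ */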
+static void numa_hinting_fault_cpu_follow_memory(struct task_struct *p,
+ int access_nid,
+ int numpages,
+ bool pass)
+{
+ struct task_autonuma *task_autonuma = p->task_autonuma;
+ unsigned long *task_numa_fault = task_autonuma->task_numa_fault;
+ if (unlikely(pass))
+ cpu_follow_memory_pass(p, task_autonuma, task_numa_fault);
+ task_numa_fault[access_nid] += numpages;
+ task_autonuma->task_numa_fault_tot += numpages;
+}
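+/*
+ * Record the node that last accessed @page. Returns false if the page
+ * looks like it is bouncing between nodes, in which case any pending
+ * migration to a node other than @cpu_nid is cancelled.
+ */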
+static inline bool last_nid_set(struct task_struct *p,
+ struct page *page, int cpu_nid)
+{
+ bool ret = true;
+ int autonuma_last_nid = ACCESS_ONCE(page->autonuma_last_nid);
+ VM_BUG_ON(cpu_nid < 0);
+ VM_BUG_ON(cpu_nid >= MAX_NUMNODES);
+ if (autonuma_last_nid >= 0 && autonuma_last_nid != cpu_nid) {
+ int migrate_nid = ACCESS_ONCE(page->autonuma_migrate_nid);
+ if (migrate_nid >= 0 && migrate_nid != cpu_nid)
+ __autonuma_migrate_page_remove(page);
+ ret = false;
+ }
+ if (autonuma_last_nid != cpu_nid)
+ ACCESS_ONCE(page->autonuma_last_nid) = cpu_nid;
+ return ret;
+}
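+/*
+ * Return the node the page is headed to: the queued migration
+ * destination if there is one, otherwise @page_nid.
+ */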
+static int __page_migrate_nid(struct page *page, int page_nid)
+{
+ int migrate_nid = ACCESS_ONCE(page->autonuma_migrate_nid);
+ if (migrate_nid < 0)
+ migrate_nid = page_nid;
+#if 0
+ return page_nid;
+#endif
+ return migrate_nid;
+}
+
+static int page_migrate_nid(struct page *page)
+{
+ return __page_migrate_nid(page, page_to_nid(page));
+}
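+/*
+ * Scan one pmd worth of @vma: count where the non-shared anonymous
+ * pages currently live and arm NUMA hinting faults (pte_mknuma or
+ * pmd_mknuma) so future accesses can be tracked. Returns the number
+ * of pages processed.
+ */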
+static int knumad_scan_pmd(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long address)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte, *_pte;
+ struct page *page;
+ unsigned long _address, end;
+ spinlock_t *ptl;
+ int ret = 0;
+
+ VM_BUG_ON(address & ~PAGE_MASK);
+
+ pgd = pgd_offset(mm, address);
+ if (!pgd_present(*pgd))
+ goto out;
+
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ goto out;
+
+ pmd = pmd_offset(pud, address);
+ if (pmd_none(*pmd))
+ goto out;
+ if (pmd_trans_huge(*pmd)) {
+ spin_lock(&mm->page_table_lock);
+ if (pmd_trans_huge(*pmd)) {
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ if (unlikely(pmd_trans_splitting(*pmd))) {
+ spin_unlock(&mm->page_table_lock);
+ wait_split_huge_page(vma->anon_vma, pmd);
+ } else {
+ int page_nid;
+ unsigned long *numa_fault_tmp;
+ ret = HPAGE_PMD_NR;
+
+ if (autonuma_scan_use_working_set() &&
+ pmd_numa(*pmd)) {
+ spin_unlock(&mm->page_table_lock);
+ goto out;
+ }
+
+ page = pmd_page(*pmd);
+
+ /* only check non-shared pages */
+ if (page_mapcount(page) != 1) {
+ spin_unlock(&mm->page_table_lock);
+ goto out;
+ }
+
+ page_nid = page_migrate_nid(page);
+ numa_fault_tmp = mm_autonuma_numa_fault_tmp(mm);
+ numa_fault_tmp[page_nid] += ret;
+
+ if (pmd_numa(*pmd)) {
+ spin_unlock(&mm->page_table_lock);
+ goto out;
+ }
+
+ set_pmd_at(mm, address, pmd, pmd_mknuma(*pmd));
+ /* defer TLB flush to lower the overhead */
+ spin_unlock(&mm->page_table_lock);
+ goto out;
+ }
+ } else
+ spin_unlock(&mm->page_table_lock);
+ }
+
+ VM_BUG_ON(!pmd_present(*pmd));
+
+ end = min(vma->vm_end, (address + PMD_SIZE) & PMD_MASK);
+ pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+ for (_address = address, _pte = pte; _address < end;
+ _pte++, _address += PAGE_SIZE) {
+ unsigned long *numa_fault_tmp;
+ pte_t pteval = *_pte;
+ if (!pte_present(pteval))
+ continue;
+ if (autonuma_scan_use_working_set() &&
+ pte_numa(pteval))
+ continue;
+ page = vm_normal_page(vma, _address, pteval);
+ if (unlikely(!page))
+ continue;
+ /* only check non-shared pages */
+ if (page_mapcount(page) != 1)
+ continue;
+
+ numa_fault_tmp = mm_autonuma_numa_fault_tmp(mm);
+ numa_fault_tmp[page_migrate_nid(page)]++;
+ if (pte_numa(pteval))
+ continue;
+ if (!autonuma_scan_pmd())
+ set_pte_at(mm, _address, _pte, pte_mknuma(pteval));
+
+ /* defer TLB flush to lower the overhead */
+ ret++;
+ }
+ pte_unmap_unlock(pte, ptl);
+
+ if (ret && !pmd_numa(*pmd) && autonuma_scan_pmd()) {
+ spin_lock(&mm->page_table_lock);
+ set_pmd_at(mm, address, pmd, pmd_mknuma(*pmd));
+ spin_unlock(&mm->page_table_lock);
+ /* defer TLB flush to lower the overhead */
+ }
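+/*
+ * Fold the temporary per-node counters filled by knumad_scan_pmd()
+ * into mm->mm_autonuma, where the scheduler can see them.
+ */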
+static void mm_numa_fault_flush(struct mm_struct *mm)
+{
+ int nid;
+ struct mm_autonuma *mma = mm->mm_autonuma;
+ unsigned long *numa_fault_tmp = mm_autonuma_numa_fault_tmp(mm);
+ unsigned long tot = 0;
+ /* FIXME: protect this with seqlock against autonuma_balance() */
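+/*
+ * Scan up to pages_to_scan pages of the mm currently selected by
+ * knumad_scan, moving on to the next mm in the list when this one is
+ * finished. Returns the progress made.
+ */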
+static int knumad_do_scan(void)
+{
+ struct mm_struct *mm;
+ struct mm_autonuma *mm_autonuma;
+ unsigned long address;
+ struct vm_area_struct *vma;
+ int progress = 0;
+
+ mm = knumad_scan.mm;
+ if (!mm) {
+ if (unlikely(list_empty(&knumad_scan.mm_head)))
+ return pages_to_scan;
+ down_read(&mm->mmap_sem);
+ if (unlikely(knumad_test_exit(mm)))
+ vma = NULL;
+ else
+ vma = find_vma(mm, address);
+ for (; vma && progress < pages_to_scan; vma = vma->vm_next) {
+ unsigned long start_addr, end_addr;
+ cond_resched();
+ /* process is exiting */
+ if (unlikely(knumad_test_exit(mm))) {
+ progress++;
+ break;
+ }
+ /* only do anonymous memory without explicit numa placement */
+ if (!vma->anon_vma || vma_policy(vma)) {
+ progress++;
+ continue;
+ }
+ if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) {
+ progress++;
+ continue;
+ }
+ if (is_vma_temporary_stack(vma)) {
+ progress++;
+ continue;
+ }
+
+ VM_BUG_ON(address & ~PAGE_MASK);
+ if (address < vma->vm_start)
+ address = vma->vm_start;
+ flush_tlb_range(vma, start_addr, end_addr);
+ mmu_notifier_invalidate_range_end(vma->vm_mm, start_addr,
+ end_addr);
+ }
+ up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */
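+/* Main loop of the knuma_scand kernel thread. */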
+static int knuma_scand(void *none)
+{
+ struct mm_struct *mm = NULL;
+ int progress = 0, _progress;
+ unsigned long total_progress = 0;
+
+ set_freezable();
+
+ knuma_scand_disabled();
+
+ mutex_lock(&knumad_mm_mutex);
+
+ for (;;) {
+ if (unlikely(kthread_should_stop()))
+ break;
+ _progress = knumad_do_scan();
+ progress += _progress;
+ total_progress += _progress;
+ mm = knumad_scan.mm;
+ knumad_scan.mm = NULL;
+ if (mm && knumad_test_exit(mm)) {
+ list_del(&mm->mm_autonuma->mm_node);
+ /* tell autonuma_exit not to list_del */
+ VM_BUG_ON(mm->mm_autonuma->mm != mm);
+ mm->mm_autonuma->mm = NULL;
+ }
+ mutex_unlock(&knumad_mm_mutex);
+
+ if (mm)
+ mmdrop(mm);
+
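+/*
+ * Detach pages queued for migration to @pgdat from their LRU and move
+ * them to @migratepages. Returns the number of base pages isolated.
+ */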
+static int isolate_migratepages(struct list_head *migratepages,
+ struct pglist_data *pgdat)
+{
+ int nr = 0, nid;
+ struct list_head *heads = pgdat->autonuma_migrate_head;
+
+ /* FIXME: THP balancing, restart from last nid */
+ if (PageTransHuge(page)) {
+ VM_BUG_ON(!PageAnon(page));
+ /* FIXME: remove split_huge_page */
+ if (!__isolate_lru_page(page, 0)) {
+ VM_BUG_ON(PageTransCompound(page));
+ del_page_from_lru_list(page, lruvec, page_lru(page));
+ inc_zone_state(zone, page_is_file_cache(page) ?
+ NR_ISOLATED_FILE : NR_ISOLATED_ANON);
+ spin_unlock_irq(&zone->lru_lock);
+ /*
+ * hold the page pin at least until
+ * __isolate_lru_page succeeds
+ * (__isolate_lru_page takes a second pin when
+ * it succeeds). If we release the pin before
+ * __isolate_lru_page returns, the page could
+ * have been freed and reallocated from under
+ * us, rendering our previous checks on the
+ * page, including the split_huge_page call,
+ * worthless.
+ */
+ put_page(page);
+
+ list_add(&page->lru, migratepages);
+ nr += hpage_nr_pages(page);
+ } else {
+ /* FIXME: losing page, safest and simplest for now */
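+/*
+ * Migrate up to pages_to_migrate of the pages queued for @pgdat, as
+ * long as the node stays within its watermarks.
+ */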
+static void knumad_do_migrate(struct pglist_data *pgdat)
+{
+ int nr_migrate_pages = 0;
+ LIST_HEAD(migratepages);
+
+ autonuma_printk("nr_migrate_pages %lu to node %d\n",
+ pgdat->autonuma_nr_migrate_pages, pgdat->node_id);
+ do {
+ int isolated = 0;
+ if (balance_pgdat(pgdat, nr_migrate_pages))
+ isolated = isolate_migratepages(&migratepages, pgdat);
+ /* FIXME: might need to check too many isolated */
+ if (!isolated)
+ break;
+ nr_migrate_pages += isolated;
+ } while (nr_migrate_pages < pages_to_migrate);
+
+ if (nr_migrate_pages) {
+ int err;
+ autonuma_printk("migrate %d to node %d\n", nr_migrate_pages,
+ pgdat->node_id);
+ pages_migrated += nr_migrate_pages; /* FIXME: per node */
+ err = migrate_pages(&migratepages, alloc_migrate_dst_page,
+ pgdat->node_id, false, true);
+ if (err)
+ /* FIXME: requeue failed pages */
+ putback_lru_pages(&migratepages);
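+/* Register @mm with knuma_scand so its address space gets scanned. */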
+void autonuma_enter(struct mm_struct *mm)
+{
+ if (autonuma_impossible())
+ return;
+
+ mutex_lock(&knumad_mm_mutex);
+ list_add_tail(&mm->mm_autonuma->mm_node, &knumad_scan.mm_head);
+ mutex_unlock(&knumad_mm_mutex);
+}
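+/*
+ * Unregister @mm from knuma_scand. If knuma_scand is scanning this mm
+ * right now, leave the list_del to knumad_do_scan() and only wait for
+ * it to drop mmap_sem before the mm goes away.
+ */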
+void autonuma_exit(struct mm_struct *mm)
+{
+ bool serialize;
+
+ if (autonuma_impossible())
+ return;
+ serialize = false;
+ mutex_lock(&knumad_mm_mutex);
+ if (knumad_scan.mm == mm)
+ serialize = true;
+ else if (mm->mm_autonuma->mm) {
+ VM_BUG_ON(mm->mm_autonuma->mm != mm);
+ mm->mm_autonuma->mm = NULL; /* debug */
+ list_del(&mm->mm_autonuma->mm_node);
+ }
+ mutex_unlock(&knumad_mm_mutex);
+
+ if (serialize) {
+ /* prevent the mm from going away under knumad_do_scan's main loop */
+ down_write(&mm->mmap_sem);
+ up_write(&mm->mmap_sem);
+SYSFS_ENTRY(debug, AUTONUMA_DEBUG_FLAG);
+SYSFS_ENTRY(pmd, AUTONUMA_SCAN_PMD_FLAG);
+SYSFS_ENTRY(working_set, AUTONUMA_SCAN_USE_WORKING_SET_FLAG);
+SYSFS_ENTRY(defer, AUTONUMA_MIGRATE_DEFER_FLAG);
+SYSFS_ENTRY(load_balance_strict, AUTONUMA_SCHED_LOAD_BALANCE_STRICT_FLAG);
+SYSFS_ENTRY(clone_reset, AUTONUMA_SCHED_CLONE_RESET_FLAG);
+SYSFS_ENTRY(fork_reset, AUTONUMA_SCHED_FORK_RESET_FLAG);
+#undef SYSFS_ENTRY
+
+enum {
+ SYSFS_KNUMA_SCAND_SLEEP_ENTRY,
+ SYSFS_KNUMA_SCAND_PAGES_ENTRY,
+ SYSFS_KNUMA_MIGRATED_SLEEP_ENTRY,
+ SYSFS_KNUMA_MIGRATED_PAGES_ENTRY,
+};
+SYSFS_ENTRY(scan_sleep_millisecs, SYSFS_KNUMA_SCAND_SLEEP_ENTRY);
+SYSFS_ENTRY(scan_sleep_pass_millisecs, SYSFS_KNUMA_SCAND_SLEEP_ENTRY);
+SYSFS_ENTRY(pages_to_scan, SYSFS_KNUMA_SCAND_PAGES_ENTRY);
+
+SYSFS_ENTRY(migrate_sleep_millisecs, SYSFS_KNUMA_MIGRATED_SLEEP_ENTRY);
+SYSFS_ENTRY(pages_to_migrate, SYSFS_KNUMA_MIGRATED_PAGES_ENTRY);
+SYSFS_ENTRY(full_scans);
+SYSFS_ENTRY(pages_scanned);
+SYSFS_ENTRY(pages_migrated);
+static inline void autonuma_exit_sysfs(struct kobject *autonuma_kobj)
+{
+}
+#endif /* CONFIG_SYSFS */
+
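+/* Boot-time disable hook, presumably wired to a "noautonuma" parameter. */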
+static int __init noautonuma_setup(char *str)
+{
+ if (!autonuma_impossible()) {
+ printk("AutoNUMA permanently disabled\n");
+ set_bit(AUTONUMA_IMPOSSIBLE_FLAG, &autonuma_flags);
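+/*
+ * Allocate and initialize tsk->task_autonuma for a new task; a no-op
+ * when AutoNUMA is impossible.
+ */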
+int alloc_task_autonuma(struct task_struct *tsk, struct task_struct *orig,
+ int node)
+{
+void free_task_autonuma(struct task_struct *tsk)
+{
+ if (autonuma_impossible()) {
+ BUG_ON(tsk->task_autonuma);
+ return;
+ }
+
+ BUG_ON(!tsk->task_autonuma);
+ kmem_cache_free(task_autonuma_cachep, tsk->task_autonuma);
+ tsk->task_autonuma = NULL;
+}
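+/* Free the mm_autonuma state attached to @mm, if any was allocated. */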
+void free_mm_autonuma(struct mm_struct *mm)
+{
+ if (autonuma_impossible()) {
+ BUG_ON(mm->mm_autonuma);
+ return;
+ }
+
+ BUG_ON(!mm->mm_autonuma);