Re: Linux 3.0.39

From: Greg KH
Date: Wed Aug 01 2012 - 15:48:50 EST


diff --git a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
index 12cecc8..4a37c47 100644
--- a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
+++ b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
@@ -379,10 +379,10 @@ EVENT_PROCESS:

# To closer match vmstat scanning statistics, only count isolate_both
# and isolate_inactive as scanning. isolate_active is rotation
- # isolate_inactive == 0
- # isolate_active == 1
- # isolate_both == 2
- if ($isolate_mode != 1) {
+ # isolate_inactive == 1
+ # isolate_active == 2
+ # isolate_both == 3
+ if ($isolate_mode != 2) {
$perprocesspid{$process_pid}->{HIGH_NR_SCANNED} += $nr_scanned;
}
$perprocesspid{$process_pid}->{HIGH_NR_CONTIG_DIRTY} += $nr_contig_dirty;
diff --git a/Makefile b/Makefile
index 5fdfaa8..3ec1722 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
VERSION = 3
PATCHLEVEL = 0
-SUBLEVEL = 38
+SUBLEVEL = 39
EXTRAVERSION =
NAME = Sneaky Weasel

diff --git a/arch/mips/include/asm/thread_info.h b/arch/mips/include/asm/thread_info.h
index 97f8bf6..adda036 100644
--- a/arch/mips/include/asm/thread_info.h
+++ b/arch/mips/include/asm/thread_info.h
@@ -60,6 +60,8 @@ struct thread_info {
register struct thread_info *__current_thread_info __asm__("$28");
#define current_thread_info() __current_thread_info

+#endif /* !__ASSEMBLY__ */
+
/* thread information allocation */
#if defined(CONFIG_PAGE_SIZE_4KB) && defined(CONFIG_32BIT)
#define THREAD_SIZE_ORDER (1)
@@ -97,8 +99,6 @@ register struct thread_info *__current_thread_info __asm__("$28");

#define free_thread_info(info) kfree(info)

-#endif /* !__ASSEMBLY__ */
-
#define PREEMPT_ACTIVE 0x10000000

/*
diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S
index a81176f..be281c6 100644
--- a/arch/mips/kernel/vmlinux.lds.S
+++ b/arch/mips/kernel/vmlinux.lds.S
@@ -1,5 +1,6 @@
#include <asm/asm-offsets.h>
#include <asm/page.h>
+#include <asm/thread_info.h>
#include <asm-generic/vmlinux.lds.h>

#undef mips
@@ -73,7 +74,7 @@ SECTIONS
.data : { /* Data */
. = . + DATAOFFSET; /* for CONFIG_MAPPED_KERNEL */

- INIT_TASK_DATA(PAGE_SIZE)
+ INIT_TASK_DATA(THREAD_SIZE)
NOSAVE_DATA
CACHELINE_ALIGNED_DATA(1 << CONFIG_MIPS_L1_CACHE_SHIFT)
READ_MOSTLY_DATA(1 << CONFIG_MIPS_L1_CACHE_SHIFT)
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 45d7c8f..5fb6aae 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -224,13 +224,48 @@ int memory_isolate_notify(unsigned long val, void *v)
}

/*
+ * The probe routines leave the pages reserved, just as the bootmem code does.
+ * Make sure they're still that way.
+ */
+static bool pages_correctly_reserved(unsigned long start_pfn,
+ unsigned long nr_pages)
+{
+ int i, j;
+ struct page *page;
+ unsigned long pfn = start_pfn;
+
+ /*
+ * memmap between sections is not contiguous except with
+ * SPARSEMEM_VMEMMAP. We lookup the page once per section
+ * and assume memmap is contiguous within each section
+ */
+ for (i = 0; i < sections_per_block; i++, pfn += PAGES_PER_SECTION) {
+ if (WARN_ON_ONCE(!pfn_valid(pfn)))
+ return false;
+ page = pfn_to_page(pfn);
+
+ for (j = 0; j < PAGES_PER_SECTION; j++) {
+ if (PageReserved(page + j))
+ continue;
+
+ printk(KERN_WARNING "section number %ld page number %d "
+ "not reserved, was it already online?\n",
+ pfn_to_section_nr(pfn), j);
+
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/*
* MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
* OK to have direct references to sparsemem variables in here.
*/
static int
memory_block_action(unsigned long phys_index, unsigned long action)
{
- int i;
unsigned long start_pfn, start_paddr;
unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
struct page *first_page;
@@ -238,26 +273,13 @@ memory_block_action(unsigned long phys_index, unsigned long action)

first_page = pfn_to_page(phys_index << PFN_SECTION_SHIFT);

- /*
- * The probe routines leave the pages reserved, just
- * as the bootmem code does. Make sure they're still
- * that way.
- */
- if (action == MEM_ONLINE) {
- for (i = 0; i < nr_pages; i++) {
- if (PageReserved(first_page+i))
- continue;
-
- printk(KERN_WARNING "section number %ld page number %d "
- "not reserved, was it already online?\n",
- phys_index, i);
- return -EBUSY;
- }
- }
-
switch (action) {
case MEM_ONLINE:
start_pfn = page_to_pfn(first_page);
+
+ if (!pages_correctly_reserved(start_pfn, nr_pages))
+ return -EBUSY;
+
ret = online_pages(start_pfn, nr_pages);
break;
case MEM_OFFLINE:
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 9bfd057..42ef54f 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -1210,7 +1210,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
* We need to dec pending if this was a write.
*/
if (rw == WRITE) {
- if (!(bio->bi_rw & REQ_FLUSH))
+ if (!(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD)))
dm_rh_dec(ms->rh, map_context->ll);
return error;
}
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
index 7771ed2..69732e0 100644
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -404,6 +404,9 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio)
return;
}

+ if (bio->bi_rw & REQ_DISCARD)
+ return;
+
/* We must inform the log that the sync count has changed. */
log->type->set_region_sync(log, region, 0);

@@ -524,7 +527,7 @@ void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios)
struct bio *bio;

for (bio = bios->head; bio; bio = bio->bi_next) {
- if (bio->bi_rw & REQ_FLUSH)
+ if (bio->bi_rw & (REQ_FLUSH | REQ_DISCARD))
continue;
rh_inc(rh, dm_rh_bio_to_region(rh, bio));
}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1ac8db5d..57106a9 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -801,7 +801,8 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,

#ifdef CONFIG_MIGRATION
static int btree_migratepage(struct address_space *mapping,
- struct page *newpage, struct page *page)
+ struct page *newpage, struct page *page,
+ enum migrate_mode mode)
{
/*
* we can't safely write a btree page from here,
@@ -816,7 +817,7 @@ static int btree_migratepage(struct address_space *mapping,
if (page_has_private(page) &&
!try_to_release_page(page, GFP_KERNEL))
return -EAGAIN;
- return migrate_page(mapping, newpage, page);
+ return migrate_page(mapping, newpage, page, mode);
}
#endif

diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 6751e74..c71032b 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -85,9 +85,12 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name,

dentry = d_lookup(parent, name);
if (dentry) {
- /* FIXME: check for inode number changes? */
- if (dentry->d_inode != NULL)
+ inode = dentry->d_inode;
+ /* update inode in place if i_ino didn't change */
+ if (inode && CIFS_I(inode)->uniqueid == fattr->cf_uniqueid) {
+ cifs_fattr_to_inode(inode, fattr);
return dentry;
+ }
d_drop(dentry);
dput(dentry);
}
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 8b0c875..6327a06 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -568,7 +568,8 @@ static int hugetlbfs_set_page_dirty(struct page *page)
}

static int hugetlbfs_migrate_page(struct address_space *mapping,
- struct page *newpage, struct page *page)
+ struct page *newpage, struct page *page,
+ enum migrate_mode mode)
{
int rc;

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 2a55347..4f10d81 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -315,7 +315,7 @@ void nfs_commit_release_pages(struct nfs_write_data *data);

#ifdef CONFIG_MIGRATION
extern int nfs_migrate_page(struct address_space *,
- struct page *, struct page *);
+ struct page *, struct page *, enum migrate_mode);
#else
#define nfs_migrate_page NULL
#endif
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index f2f80c0..58bb999 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1662,7 +1662,7 @@ out_error:

#ifdef CONFIG_MIGRATION
int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
- struct page *page)
+ struct page *page, enum migrate_mode mode)
{
/*
* If PagePrivate is set, then the page is currently associated with
@@ -1677,7 +1677,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage,

nfs_fscache_release_page(page, GFP_KERNEL);

- return migrate_page(mapping, newpage, page);
+ return migrate_page(mapping, newpage, page, mode);
}
#endif

diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index c606f01..1250016 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -715,8 +715,12 @@ static int fixup_free_space(struct ubifs_info *c)
lnum = ubifs_next_log_lnum(c, lnum);
}

- /* Fixup the current log head */
- err = fixup_leb(c, c->lhead_lnum, c->lhead_offs);
+ /*
+ * Fixup the log head which contains the only a CS node at the
+ * beginning.
+ */
+ err = fixup_leb(c, c->lhead_lnum,
+ ALIGN(UBIFS_CS_NODE_SZ, c->min_io_size));
if (err)
goto out;

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index e9eaec5..7a7e5fd 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -89,42 +89,33 @@ extern void rebuild_sched_domains(void);
extern void cpuset_print_task_mems_allowed(struct task_struct *p);

/*
- * reading current mems_allowed and mempolicy in the fastpath must protected
- * by get_mems_allowed()
+ * get_mems_allowed is required when making decisions involving mems_allowed
+ * such as during page allocation. mems_allowed can be updated in parallel
+ * and depending on the new value an operation can fail potentially causing
+ * process failure. A retry loop with get_mems_allowed and put_mems_allowed
+ * prevents these artificial failures.
*/
-static inline void get_mems_allowed(void)
+static inline unsigned int get_mems_allowed(void)
{
- current->mems_allowed_change_disable++;
-
- /*
- * ensure that reading mems_allowed and mempolicy happens after the
- * update of ->mems_allowed_change_disable.
- *
- * the write-side task finds ->mems_allowed_change_disable is not 0,
- * and knows the read-side task is reading mems_allowed or mempolicy,
- * so it will clear old bits lazily.
- */
- smp_mb();
+ return read_seqcount_begin(&current->mems_allowed_seq);
}

-static inline void put_mems_allowed(void)
+/*
+ * If this returns false, the operation that took place after get_mems_allowed
+ * may have failed. It is up to the caller to retry the operation if
+ * appropriate.
+ */
+static inline bool put_mems_allowed(unsigned int seq)
{
- /*
- * ensure that reading mems_allowed and mempolicy before reducing
- * mems_allowed_change_disable.
- *
- * the write-side task will know that the read-side task is still
- * reading mems_allowed or mempolicy, don't clears old bits in the
- * nodemask.
- */
- smp_mb();
- --ACCESS_ONCE(current->mems_allowed_change_disable);
+ return !read_seqcount_retry(&current->mems_allowed_seq, seq);
}

static inline void set_mems_allowed(nodemask_t nodemask)
{
task_lock(current);
+ write_seqcount_begin(&current->mems_allowed_seq);
current->mems_allowed = nodemask;
+ write_seqcount_end(&current->mems_allowed_seq);
task_unlock(current);
}

@@ -234,12 +225,14 @@ static inline void set_mems_allowed(nodemask_t nodemask)
{
}

-static inline void get_mems_allowed(void)
+static inline unsigned int get_mems_allowed(void)
{
+ return 0;
}

-static inline void put_mems_allowed(void)
+static inline bool put_mems_allowed(unsigned int seq)
{
+ return true;
}

#endif /* !CONFIG_CPUSETS */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 96b1035..212ea7b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -523,6 +523,7 @@ enum positive_aop_returns {
struct page;
struct address_space;
struct writeback_control;
+enum migrate_mode;

struct iov_iter {
const struct iovec *iov;
@@ -607,9 +608,12 @@ struct address_space_operations {
loff_t offset, unsigned long nr_segs);
int (*get_xip_mem)(struct address_space *, pgoff_t, int,
void **, unsigned long *);
- /* migrate the contents of a page to the specified target */
+ /*
+ * migrate the contents of a page to the specified target. If sync
+ * is false, it must not block.
+ */
int (*migratepage) (struct address_space *,
- struct page *, struct page *);
+ struct page *, struct page *, enum migrate_mode);
int (*launder_page) (struct page *);
int (*is_partially_uptodate) (struct page *, read_descriptor_t *,
unsigned long);
@@ -2478,7 +2482,8 @@ extern int generic_check_addressable(unsigned, u64);

#ifdef CONFIG_MIGRATION
extern int buffer_migrate_page(struct address_space *,
- struct page *, struct page *);
+ struct page *, struct page *,
+ enum migrate_mode);
#else
#define buffer_migrate_page NULL
#endif
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 580f70c..5e41a8e 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -30,6 +30,13 @@ extern struct fs_struct init_fs;
#define INIT_THREADGROUP_FORK_LOCK(sig)
#endif

+#ifdef CONFIG_CPUSETS
+#define INIT_CPUSET_SEQ \
+ .mems_allowed_seq = SEQCNT_ZERO,
+#else
+#define INIT_CPUSET_SEQ
+#endif
+
#define INIT_SIGNALS(sig) { \
.nr_threads = 1, \
.wait_chldexit = __WAIT_QUEUE_HEAD_INITIALIZER(sig.wait_chldexit),\
@@ -193,6 +200,7 @@ extern struct cred init_cred;
INIT_FTRACE_GRAPH \
INIT_TRACE_RECURSION \
INIT_TASK_RCU_PREEMPT(tsk) \
+ INIT_CPUSET_SEQ \
}


diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 313a00e..4a8da84 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -35,7 +35,8 @@ enum mem_cgroup_page_stat_item {
extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
struct list_head *dst,
unsigned long *scanned, int order,
- int mode, struct zone *z,
+ isolate_mode_t mode,
+ struct zone *z,
struct mem_cgroup *mem_cont,
int active, int file);

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index e39aeec..eaf8674 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -6,18 +6,31 @@

typedef struct page *new_page_t(struct page *, unsigned long private, int **);

+/*
+ * MIGRATE_ASYNC means never block
+ * MIGRATE_SYNC_LIGHT in the current implementation means to allow blocking
+ * on most operations but not ->writepage as the potential stall time
+ * is too significant
+ * MIGRATE_SYNC will block when migrating pages
+ */
+enum migrate_mode {
+ MIGRATE_ASYNC,
+ MIGRATE_SYNC_LIGHT,
+ MIGRATE_SYNC,
+};
+
#ifdef CONFIG_MIGRATION
#define PAGE_MIGRATION 1

extern void putback_lru_pages(struct list_head *l);
extern int migrate_page(struct address_space *,
- struct page *, struct page *);
+ struct page *, struct page *, enum migrate_mode);
extern int migrate_pages(struct list_head *l, new_page_t x,
unsigned long private, bool offlining,
- bool sync);
+ enum migrate_mode mode);
extern int migrate_huge_pages(struct list_head *l, new_page_t x,
unsigned long private, bool offlining,
- bool sync);
+ enum migrate_mode mode);

extern int fail_migrate_page(struct address_space *,
struct page *, struct page *);
@@ -36,10 +49,10 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping,
static inline void putback_lru_pages(struct list_head *l) {}
static inline int migrate_pages(struct list_head *l, new_page_t x,
unsigned long private, bool offlining,
- bool sync) { return -ENOSYS; }
+ enum migrate_mode mode) { return -ENOSYS; }
static inline int migrate_huge_pages(struct list_head *l, new_page_t x,
unsigned long private, bool offlining,
- bool sync) { return -ENOSYS; }
+ enum migrate_mode mode) { return -ENOSYS; }

static inline int migrate_prep(void) { return -ENOSYS; }
static inline int migrate_prep_local(void) { return -ENOSYS; }
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index aa2d80b..b32f3f9 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -158,6 +158,20 @@ static inline int is_unevictable_lru(enum lru_list l)
return (l == LRU_UNEVICTABLE);
}

+/* Isolate inactive pages */
+#define ISOLATE_INACTIVE ((__force isolate_mode_t)0x1)
+/* Isolate active pages */
+#define ISOLATE_ACTIVE ((__force isolate_mode_t)0x2)
+/* Isolate clean file */
+#define ISOLATE_CLEAN ((__force isolate_mode_t)0x4)
+/* Isolate unmapped file */
+#define ISOLATE_UNMAPPED ((__force isolate_mode_t)0x8)
+/* Isolate for asynchronous migration */
+#define ISOLATE_ASYNC_MIGRATE ((__force isolate_mode_t)0x10)
+
+/* LRU Isolation modes. */
+typedef unsigned __bitwise__ isolate_mode_t;
+
enum zone_watermarks {
WMARK_MIN,
WMARK_LOW,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4ef452b..443ec43 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1484,7 +1484,7 @@ struct task_struct {
#endif
#ifdef CONFIG_CPUSETS
nodemask_t mems_allowed; /* Protected by alloc_lock */
- int mems_allowed_change_disable;
+ seqcount_t mems_allowed_seq; /* Seqence no to catch updates */
int cpuset_mem_spread_rotor;
int cpuset_slab_spread_rotor;
#endif
diff --git a/include/linux/swap.h b/include/linux/swap.h
index a273468..e73799d 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -243,11 +243,6 @@ static inline void lru_cache_add_file(struct page *page)
__lru_cache_add(page, LRU_INACTIVE_FILE);
}

-/* LRU Isolation modes. */
-#define ISOLATE_INACTIVE 0 /* Isolate inactive pages. */
-#define ISOLATE_ACTIVE 1 /* Isolate active pages. */
-#define ISOLATE_BOTH 2 /* Isolate both active and inactive pages. */
-
/* linux/mm/vmscan.c */
extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
gfp_t gfp_mask, nodemask_t *mask);
@@ -259,7 +254,7 @@ extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
unsigned int swappiness,
struct zone *zone,
unsigned long *nr_scanned);
-extern int __isolate_lru_page(struct page *page, int mode, int file);
+extern int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file);
extern unsigned long shrink_all_memory(unsigned long nr_pages);
extern int vm_swappiness;
extern int remove_mapping(struct address_space *mapping, struct page *page);
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index b2c33bd..edc4b3d 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -179,6 +179,83 @@ DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_memcg_softlimit_re
TP_ARGS(nr_reclaimed)
);

+TRACE_EVENT(mm_shrink_slab_start,
+ TP_PROTO(struct shrinker *shr, struct shrink_control *sc,
+ long nr_objects_to_shrink, unsigned long pgs_scanned,
+ unsigned long lru_pgs, unsigned long cache_items,
+ unsigned long long delta, unsigned long total_scan),
+
+ TP_ARGS(shr, sc, nr_objects_to_shrink, pgs_scanned, lru_pgs,
+ cache_items, delta, total_scan),
+
+ TP_STRUCT__entry(
+ __field(struct shrinker *, shr)
+ __field(void *, shrink)
+ __field(long, nr_objects_to_shrink)
+ __field(gfp_t, gfp_flags)
+ __field(unsigned long, pgs_scanned)
+ __field(unsigned long, lru_pgs)
+ __field(unsigned long, cache_items)
+ __field(unsigned long long, delta)
+ __field(unsigned long, total_scan)
+ ),
+
+ TP_fast_assign(
+ __entry->shr = shr;
+ __entry->shrink = shr->shrink;
+ __entry->nr_objects_to_shrink = nr_objects_to_shrink;
+ __entry->gfp_flags = sc->gfp_mask;
+ __entry->pgs_scanned = pgs_scanned;
+ __entry->lru_pgs = lru_pgs;
+ __entry->cache_items = cache_items;
+ __entry->delta = delta;
+ __entry->total_scan = total_scan;
+ ),
+
+ TP_printk("%pF %p: objects to shrink %ld gfp_flags %s pgs_scanned %ld lru_pgs %ld cache items %ld delta %lld total_scan %ld",
+ __entry->shrink,
+ __entry->shr,
+ __entry->nr_objects_to_shrink,
+ show_gfp_flags(__entry->gfp_flags),
+ __entry->pgs_scanned,
+ __entry->lru_pgs,
+ __entry->cache_items,
+ __entry->delta,
+ __entry->total_scan)
+);
+
+TRACE_EVENT(mm_shrink_slab_end,
+ TP_PROTO(struct shrinker *shr, int shrinker_retval,
+ long unused_scan_cnt, long new_scan_cnt),
+
+ TP_ARGS(shr, shrinker_retval, unused_scan_cnt, new_scan_cnt),
+
+ TP_STRUCT__entry(
+ __field(struct shrinker *, shr)
+ __field(void *, shrink)
+ __field(long, unused_scan)
+ __field(long, new_scan)
+ __field(int, retval)
+ __field(long, total_scan)
+ ),
+
+ TP_fast_assign(
+ __entry->shr = shr;
+ __entry->shrink = shr->shrink;
+ __entry->unused_scan = unused_scan_cnt;
+ __entry->new_scan = new_scan_cnt;
+ __entry->retval = shrinker_retval;
+ __entry->total_scan = new_scan_cnt - unused_scan_cnt;
+ ),
+
+ TP_printk("%pF %p: unused scan count %ld new scan count %ld total_scan %ld last shrinker return val %d",
+ __entry->shrink,
+ __entry->shr,
+ __entry->unused_scan,
+ __entry->new_scan,
+ __entry->total_scan,
+ __entry->retval)
+);

DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template,

@@ -189,7 +266,7 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template,
unsigned long nr_lumpy_taken,
unsigned long nr_lumpy_dirty,
unsigned long nr_lumpy_failed,
- int isolate_mode),
+ isolate_mode_t isolate_mode),

TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode),

@@ -201,7 +278,7 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template,
__field(unsigned long, nr_lumpy_taken)
__field(unsigned long, nr_lumpy_dirty)
__field(unsigned long, nr_lumpy_failed)
- __field(int, isolate_mode)
+ __field(isolate_mode_t, isolate_mode)
),

TP_fast_assign(
@@ -235,7 +312,7 @@ DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_lru_isolate,
unsigned long nr_lumpy_taken,
unsigned long nr_lumpy_dirty,
unsigned long nr_lumpy_failed,
- int isolate_mode),
+ isolate_mode_t isolate_mode),

TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode)

@@ -250,7 +327,7 @@ DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_memcg_isolate,
unsigned long nr_lumpy_taken,
unsigned long nr_lumpy_dirty,
unsigned long nr_lumpy_failed,
- int isolate_mode),
+ isolate_mode_t isolate_mode),

TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 9c9b754..b2e84bd 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -123,6 +123,19 @@ static inline struct cpuset *task_cs(struct task_struct *task)
struct cpuset, css);
}

+#ifdef CONFIG_NUMA
+static inline bool task_has_mempolicy(struct task_struct *task)
+{
+ return task->mempolicy;
+}
+#else
+static inline bool task_has_mempolicy(struct task_struct *task)
+{
+ return false;
+}
+#endif
+
+
/* bits in struct cpuset flags field */
typedef enum {
CS_CPU_EXCLUSIVE,
@@ -949,7 +962,8 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
static void cpuset_change_task_nodemask(struct task_struct *tsk,
nodemask_t *newmems)
{
-repeat:
+ bool need_loop;
+
/*
* Allow tasks that have access to memory reserves because they have
* been OOM killed to get memory anywhere.
@@ -960,46 +974,27 @@ repeat:
return;

task_lock(tsk);
- nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
- mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
-
-
/*
- * ensure checking ->mems_allowed_change_disable after setting all new
- * allowed nodes.
- *
- * the read-side task can see an nodemask with new allowed nodes and
- * old allowed nodes. and if it allocates page when cpuset clears newly
- * disallowed ones continuous, it can see the new allowed bits.
- *
- * And if setting all new allowed nodes is after the checking, setting
- * all new allowed nodes and clearing newly disallowed ones will be done
- * continuous, and the read-side task may find no node to alloc page.
+ * Determine if a loop is necessary if another thread is doing
+ * get_mems_allowed(). If at least one node remains unchanged and
+ * tsk does not have a mempolicy, then an empty nodemask will not be
+ * possible when mems_allowed is larger than a word.
*/
- smp_mb();
+ need_loop = task_has_mempolicy(tsk) ||
+ !nodes_intersects(*newmems, tsk->mems_allowed);

- /*
- * Allocation of memory is very fast, we needn't sleep when waiting
- * for the read-side.
- */
- while (ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
- task_unlock(tsk);
- if (!task_curr(tsk))
- yield();
- goto repeat;
- }
+ if (need_loop)
+ write_seqcount_begin(&tsk->mems_allowed_seq);

- /*
- * ensure checking ->mems_allowed_change_disable before clearing all new
- * disallowed nodes.
- *
- * if clearing newly disallowed bits before the checking, the read-side
- * task may find no node to alloc page.
- */
- smp_mb();
+ nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
+ mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);

mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
tsk->mems_allowed = *newmems;
+
+ if (need_loop)
+ write_seqcount_end(&tsk->mems_allowed_seq);
+
task_unlock(tsk);
}

diff --git a/kernel/fork.c b/kernel/fork.c
index 4712e3e..3d42aa3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -985,6 +985,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
#ifdef CONFIG_CGROUPS
init_rwsem(&sig->threadgroup_fork_lock);
#endif
+#ifdef CONFIG_CPUSETS
+ seqcount_init(&tsk->mems_allowed_seq);
+#endif

sig->oom_adj = current->signal->oom_adj;
sig->oom_score_adj = current->signal->oom_score_adj;
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index f1eb182..61fc450 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -375,7 +375,9 @@ int second_overflow(unsigned long secs)
time_state = TIME_DEL;
break;
case TIME_INS:
- if (secs % 86400 == 0) {
+ if (!(time_status & STA_INS))
+ time_state = TIME_OK;
+ else if (secs % 86400 == 0) {
leap = -1;
time_state = TIME_OOP;
time_tai++;
@@ -384,7 +386,9 @@ int second_overflow(unsigned long secs)
}
break;
case TIME_DEL:
- if ((secs + 1) % 86400 == 0) {
+ if (!(time_status & STA_DEL))
+ time_state = TIME_OK;
+ else if ((secs + 1) % 86400 == 0) {
leap = 1;
time_tai--;
time_state = TIME_WAIT;
diff --git a/mm/compaction.c b/mm/compaction.c
index adc5336..8ea7308 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -35,10 +35,6 @@ struct compact_control {
unsigned long migrate_pfn; /* isolate_migratepages search base */
bool sync; /* Synchronous migration */

- /* Account for isolated anon and file pages */
- unsigned long nr_anon;
- unsigned long nr_file;
-
unsigned int order; /* order a direct compactor needs */
int migratetype; /* MOVABLE, RECLAIMABLE etc */
struct zone *zone;
@@ -223,17 +219,13 @@ static void isolate_freepages(struct zone *zone,
static void acct_isolated(struct zone *zone, struct compact_control *cc)
{
struct page *page;
- unsigned int count[NR_LRU_LISTS] = { 0, };
+ unsigned int count[2] = { 0, };

- list_for_each_entry(page, &cc->migratepages, lru) {
- int lru = page_lru_base_type(page);
- count[lru]++;
- }
+ list_for_each_entry(page, &cc->migratepages, lru)
+ count[!!page_is_file_cache(page)]++;

- cc->nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
- cc->nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
- __mod_zone_page_state(zone, NR_ISOLATED_ANON, cc->nr_anon);
- __mod_zone_page_state(zone, NR_ISOLATED_FILE, cc->nr_file);
+ __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
+ __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
}

/* Similar to reclaim, but different enough that they don't share logic */
@@ -269,6 +261,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
unsigned long last_pageblock_nr = 0, pageblock_nr;
unsigned long nr_scanned = 0, nr_isolated = 0;
struct list_head *migratelist = &cc->migratepages;
+ isolate_mode_t mode = ISOLATE_ACTIVE|ISOLATE_INACTIVE;

/* Do not scan outside zone boundaries */
low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
@@ -378,8 +371,11 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
continue;
}

+ if (!cc->sync)
+ mode |= ISOLATE_ASYNC_MIGRATE;
+
/* Try isolate the page */
- if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0)
+ if (__isolate_lru_page(page, mode, 0) != 0)
continue;

VM_BUG_ON(PageTransCompound(page));
@@ -581,7 +577,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
nr_migrate = cc->nr_migratepages;
err = migrate_pages(&cc->migratepages, compaction_alloc,
(unsigned long)cc, false,
- cc->sync);
+ cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC);
update_nr_listpages(cc);
nr_remaining = cc->nr_migratepages;

diff --git a/mm/filemap.c b/mm/filemap.c
index b7d8603..10481eb 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -516,10 +516,13 @@ struct page *__page_cache_alloc(gfp_t gfp)
struct page *page;

if (cpuset_do_page_mem_spread()) {
- get_mems_allowed();
- n = cpuset_mem_spread_node();
- page = alloc_pages_exact_node(n, gfp, 0);
- put_mems_allowed();
+ unsigned int cpuset_mems_cookie;
+ do {
+ cpuset_mems_cookie = get_mems_allowed();
+ n = cpuset_mem_spread_node();
+ page = alloc_pages_exact_node(n, gfp, 0);
+ } while (!put_mems_allowed(cpuset_mems_cookie) && !page);
+
return page;
}
return alloc_pages(gfp, 0);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 05f8fd4..ae60a53 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -460,8 +460,10 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
struct zonelist *zonelist;
struct zone *zone;
struct zoneref *z;
+ unsigned int cpuset_mems_cookie;

- get_mems_allowed();
+retry_cpuset:
+ cpuset_mems_cookie = get_mems_allowed();
zonelist = huge_zonelist(vma, address,
htlb_alloc_mask, &mpol, &nodemask);
/*
@@ -488,10 +490,15 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
}
}
}
-err:
+
mpol_cond_put(mpol);
- put_mems_allowed();
+ if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+ goto retry_cpuset;
return page;
+
+err:
+ mpol_cond_put(mpol);
+ return NULL;
}

static void update_and_free_page(struct hstate *h, struct page *page)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ffb99b4..57cdf5a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1251,7 +1251,8 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
struct list_head *dst,
unsigned long *scanned, int order,
- int mode, struct zone *z,
+ isolate_mode_t mode,
+ struct zone *z,
struct mem_cgroup *mem_cont,
int active, int file)
{
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 740c4f5..6496748 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1464,7 +1464,7 @@ int soft_offline_page(struct page *page, int flags)
page_is_file_cache(page));
list_add(&page->lru, &pagelist);
ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
- 0, true);
+ 0, MIGRATE_SYNC);
if (ret) {
putback_lru_pages(&pagelist);
pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c46887b..ae5a3f2 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -747,7 +747,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
}
/* this function returns # of failed pages */
ret = migrate_pages(&source, hotremove_migrate_alloc, 0,
- true, true);
+ true, MIGRATE_SYNC);
if (ret)
putback_lru_pages(&source);
}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3dac2d1..cff919f 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -926,7 +926,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,

if (!list_empty(&pagelist)) {
err = migrate_pages(&pagelist, new_node_page, dest,
- false, true);
+ false, MIGRATE_SYNC);
if (err)
putback_lru_pages(&pagelist);
}
@@ -1810,18 +1810,24 @@ struct page *
alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
unsigned long addr, int node)
{
- struct mempolicy *pol = get_vma_policy(current, vma, addr);
+ struct mempolicy *pol;
struct zonelist *zl;
struct page *page;
+ unsigned int cpuset_mems_cookie;
+
+retry_cpuset:
+ pol = get_vma_policy(current, vma, addr);
+ cpuset_mems_cookie = get_mems_allowed();

- get_mems_allowed();
if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
unsigned nid;

nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
mpol_cond_put(pol);
page = alloc_page_interleave(gfp, order, nid);
- put_mems_allowed();
+ if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+ goto retry_cpuset;
+
return page;
}
zl = policy_zonelist(gfp, pol, node);
@@ -1832,7 +1838,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
struct page *page = __alloc_pages_nodemask(gfp, order,
zl, policy_nodemask(gfp, pol));
__mpol_put(pol);
- put_mems_allowed();
+ if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+ goto retry_cpuset;
return page;
}
/*
@@ -1840,7 +1847,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
*/
page = __alloc_pages_nodemask(gfp, order, zl,
policy_nodemask(gfp, pol));
- put_mems_allowed();
+ if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+ goto retry_cpuset;
return page;
}

@@ -1867,11 +1875,14 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
struct mempolicy *pol = current->mempolicy;
struct page *page;
+ unsigned int cpuset_mems_cookie;

if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
pol = &default_policy;

- get_mems_allowed();
+retry_cpuset:
+ cpuset_mems_cookie = get_mems_allowed();
+
/*
* No reference counting needed for current->mempolicy
* nor system default_policy
@@ -1882,7 +1893,10 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
page = __alloc_pages_nodemask(gfp, order,
policy_zonelist(gfp, pol, numa_node_id()),
policy_nodemask(gfp, pol));
- put_mems_allowed();
+
+ if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+ goto retry_cpuset;
+
return page;
}
EXPORT_SYMBOL(alloc_pages_current);
diff --git a/mm/migrate.c b/mm/migrate.c
index 14d0a6a..480714b 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -220,6 +220,56 @@ out:
pte_unmap_unlock(ptep, ptl);
}

+#ifdef CONFIG_BLOCK
+/* Returns true if all buffers are successfully locked */
+static bool buffer_migrate_lock_buffers(struct buffer_head *head,
+ enum migrate_mode mode)
+{
+ struct buffer_head *bh = head;
+
+ /* Simple case, sync compaction */
+ if (mode != MIGRATE_ASYNC) {
+ do {
+ get_bh(bh);
+ lock_buffer(bh);
+ bh = bh->b_this_page;
+
+ } while (bh != head);
+
+ return true;
+ }
+
+ /* async case, we cannot block on lock_buffer so use trylock_buffer */
+ do {
+ get_bh(bh);
+ if (!trylock_buffer(bh)) {
+ /*
+ * We failed to lock the buffer and cannot stall in
+ * async migration. Release the taken locks
+ */
+ struct buffer_head *failed_bh = bh;
+ put_bh(failed_bh);
+ bh = head;
+ while (bh != failed_bh) {
+ unlock_buffer(bh);
+ put_bh(bh);
+ bh = bh->b_this_page;
+ }
+ return false;
+ }
+
+ bh = bh->b_this_page;
+ } while (bh != head);
+ return true;
+}
+#else
+static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
+ enum migrate_mode mode)
+{
+ return true;
+}
+#endif /* CONFIG_BLOCK */
+
/*
* Replace the page in the mapping.
*
@@ -229,7 +279,8 @@ out:
* 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
*/
static int migrate_page_move_mapping(struct address_space *mapping,
- struct page *newpage, struct page *page)
+ struct page *newpage, struct page *page,
+ struct buffer_head *head, enum migrate_mode mode)
{
int expected_count;
void **pslot;
@@ -259,6 +310,20 @@ static int migrate_page_move_mapping(struct address_space *mapping,
}

/*
+ * In the async migration case of moving a page with buffers, lock the
+ * buffers using trylock before the mapping is moved. If the mapping
+ * was moved, we later failed to lock the buffers and could not move
+ * the mapping back due to an elevated page count, we would have to
+ * block waiting on other references to be dropped.
+ */
+ if (mode == MIGRATE_ASYNC && head &&
+ !buffer_migrate_lock_buffers(head, mode)) {
+ page_unfreeze_refs(page, expected_count);
+ spin_unlock_irq(&mapping->tree_lock);
+ return -EAGAIN;
+ }
+
+ /*
* Now we know that no one else is looking at the page.
*/
get_page(newpage); /* add cache reference */
@@ -415,13 +480,14 @@ EXPORT_SYMBOL(fail_migrate_page);
* Pages are locked upon entry and exit.
*/
int migrate_page(struct address_space *mapping,
- struct page *newpage, struct page *page)
+ struct page *newpage, struct page *page,
+ enum migrate_mode mode)
{
int rc;

BUG_ON(PageWriteback(page)); /* Writeback must be complete */

- rc = migrate_page_move_mapping(mapping, newpage, page);
+ rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode);

if (rc)
return rc;
@@ -438,28 +504,28 @@ EXPORT_SYMBOL(migrate_page);
* exist.
*/
int buffer_migrate_page(struct address_space *mapping,
- struct page *newpage, struct page *page)
+ struct page *newpage, struct page *page, enum migrate_mode mode)
{
struct buffer_head *bh, *head;
int rc;

if (!page_has_buffers(page))
- return migrate_page(mapping, newpage, page);
+ return migrate_page(mapping, newpage, page, mode);

head = page_buffers(page);

- rc = migrate_page_move_mapping(mapping, newpage, page);
+ rc = migrate_page_move_mapping(mapping, newpage, page, head, mode);

if (rc)
return rc;

- bh = head;
- do {
- get_bh(bh);
- lock_buffer(bh);
- bh = bh->b_this_page;
-
- } while (bh != head);
+ /*
+ * In the async case, migrate_page_move_mapping locked the buffers
+ * with an IRQ-safe spinlock held. In the sync case, the buffers
+ * need to be locked now
+ */
+ if (mode != MIGRATE_ASYNC)
+ BUG_ON(!buffer_migrate_lock_buffers(head, mode));

ClearPagePrivate(page);
set_page_private(newpage, page_private(page));
@@ -536,10 +602,14 @@ static int writeout(struct address_space *mapping, struct page *page)
* Default handling if a filesystem does not provide a migration function.
*/
static int fallback_migrate_page(struct address_space *mapping,
- struct page *newpage, struct page *page)
+ struct page *newpage, struct page *page, enum migrate_mode mode)
{
- if (PageDirty(page))
+ if (PageDirty(page)) {
+ /* Only writeback pages in full synchronous migration */
+ if (mode != MIGRATE_SYNC)
+ return -EBUSY;
return writeout(mapping, page);
+ }

/*
* Buffers may be managed in a filesystem specific way.
@@ -549,7 +619,7 @@ static int fallback_migrate_page(struct address_space *mapping,
!try_to_release_page(page, GFP_KERNEL))
return -EAGAIN;

- return migrate_page(mapping, newpage, page);
+ return migrate_page(mapping, newpage, page, mode);
}

/*
@@ -564,7 +634,7 @@ static int fallback_migrate_page(struct address_space *mapping,
* == 0 - success
*/
static int move_to_new_page(struct page *newpage, struct page *page,
- int remap_swapcache, bool sync)
+ int remap_swapcache, enum migrate_mode mode)
{
struct address_space *mapping;
int rc;
@@ -585,29 +655,18 @@ static int move_to_new_page(struct page *newpage, struct page *page,

mapping = page_mapping(page);
if (!mapping)
- rc = migrate_page(mapping, newpage, page);
- else {
+ rc = migrate_page(mapping, newpage, page, mode);
+ else if (mapping->a_ops->migratepage)
/*
- * Do not writeback pages if !sync and migratepage is
- * not pointing to migrate_page() which is nonblocking
- * (swapcache/tmpfs uses migratepage = migrate_page).
+ * Most pages have a mapping and most filesystems provide a
+ * migratepage callback. Anonymous pages are part of swap
+ * space which also has its own migratepage callback. This
+ * is the most common path for page migration.
*/
- if (PageDirty(page) && !sync &&
- mapping->a_ops->migratepage != migrate_page)
- rc = -EBUSY;
- else if (mapping->a_ops->migratepage)
- /*
- * Most pages have a mapping and most filesystems
- * should provide a migration function. Anonymous
- * pages are part of swap space which also has its
- * own migration function. This is the most common
- * path for page migration.
- */
- rc = mapping->a_ops->migratepage(mapping,
- newpage, page);
- else
- rc = fallback_migrate_page(mapping, newpage, page);
- }
+ rc = mapping->a_ops->migratepage(mapping,
+ newpage, page, mode);
+ else
+ rc = fallback_migrate_page(mapping, newpage, page, mode);

if (rc) {
newpage->mapping = NULL;
@@ -621,38 +680,18 @@ static int move_to_new_page(struct page *newpage, struct page *page,
return rc;
}

-/*
- * Obtain the lock on page, remove all ptes and migrate the page
- * to the newly allocated page in newpage.
- */
-static int unmap_and_move(new_page_t get_new_page, unsigned long private,
- struct page *page, int force, bool offlining, bool sync)
+static int __unmap_and_move(struct page *page, struct page *newpage,
+ int force, bool offlining, enum migrate_mode mode)
{
- int rc = 0;
- int *result = NULL;
- struct page *newpage = get_new_page(page, private, &result);
+ int rc = -EAGAIN;
int remap_swapcache = 1;
int charge = 0;
struct mem_cgroup *mem;
struct anon_vma *anon_vma = NULL;

- if (!newpage)
- return -ENOMEM;
-
- if (page_count(page) == 1) {
- /* page was freed from under us. So we are done. */
- goto move_newpage;
- }
- if (unlikely(PageTransHuge(page)))
- if (unlikely(split_huge_page(page)))
- goto move_newpage;
-
- /* prepare cgroup just returns 0 or -ENOMEM */
- rc = -EAGAIN;
-
if (!trylock_page(page)) {
- if (!force || !sync)
- goto move_newpage;
+ if (!force || mode == MIGRATE_ASYNC)
+ goto out;

/*
* It's not safe for direct compaction to call lock_page.
@@ -668,7 +707,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
* altogether.
*/
if (current->flags & PF_MEMALLOC)
- goto move_newpage;
+ goto out;

lock_page(page);
}
@@ -697,10 +736,12 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,

if (PageWriteback(page)) {
/*
- * For !sync, there is no point retrying as the retry loop
- * is expected to be too short for PageWriteback to be cleared
+ * Only in the case of a full syncronous migration is it
+ * necessary to wait for PageWriteback. In the async case,
+ * the retry loop is too short and in the sync-light case,
+ * the overhead of stalling is too much
*/
- if (!sync) {
+ if (mode != MIGRATE_SYNC) {
rc = -EBUSY;
goto uncharge;
}
@@ -771,7 +812,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,

skip_unmap:
if (!page_mapped(page))
- rc = move_to_new_page(newpage, page, remap_swapcache, sync);
+ rc = move_to_new_page(newpage, page, remap_swapcache, mode);

if (rc && remap_swapcache)
remove_migration_ptes(page, page);
@@ -785,27 +826,53 @@ uncharge:
mem_cgroup_end_migration(mem, page, newpage, rc == 0);
unlock:
unlock_page(page);
+out:
+ return rc;
+}

-move_newpage:
+/*
+ * Obtain the lock on page, remove all ptes and migrate the page
+ * to the newly allocated page in newpage.
+ */
+static int unmap_and_move(new_page_t get_new_page, unsigned long private,
+ struct page *page, int force, bool offlining,
+ enum migrate_mode mode)
+{
+ int rc = 0;
+ int *result = NULL;
+ struct page *newpage = get_new_page(page, private, &result);
+
+ if (!newpage)
+ return -ENOMEM;
+
+ if (page_count(page) == 1) {
+ /* page was freed from under us. So we are done. */
+ goto out;
+ }
+
+ if (unlikely(PageTransHuge(page)))
+ if (unlikely(split_huge_page(page)))
+ goto out;
+
+ rc = __unmap_and_move(page, newpage, force, offlining, mode);
+out:
if (rc != -EAGAIN) {
- /*
- * A page that has been migrated has all references
- * removed and will be freed. A page that has not been
- * migrated will have kepts its references and be
- * restored.
- */
- list_del(&page->lru);
+ /*
+ * A page that has been migrated has all references
+ * removed and will be freed. A page that has not been
+ * migrated will have kepts its references and be
+ * restored.
+ */
+ list_del(&page->lru);
dec_zone_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
putback_lru_page(page);
}
-
/*
* Move the new page to the LRU. If migration was not successful
* then this will free the page.
*/
putback_lru_page(newpage);
-
if (result) {
if (rc)
*result = rc;
@@ -835,7 +902,8 @@ move_newpage:
*/
static int unmap_and_move_huge_page(new_page_t get_new_page,
unsigned long private, struct page *hpage,
- int force, bool offlining, bool sync)
+ int force, bool offlining,
+ enum migrate_mode mode)
{
int rc = 0;
int *result = NULL;
@@ -848,7 +916,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
rc = -EAGAIN;

if (!trylock_page(hpage)) {
- if (!force || !sync)
+ if (!force || mode != MIGRATE_SYNC)
goto out;
lock_page(hpage);
}
@@ -859,7 +927,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);

if (!page_mapped(hpage))
- rc = move_to_new_page(new_hpage, hpage, 1, sync);
+ rc = move_to_new_page(new_hpage, hpage, 1, mode);

if (rc)
remove_migration_ptes(hpage, hpage);
@@ -902,7 +970,7 @@ out:
*/
int migrate_pages(struct list_head *from,
new_page_t get_new_page, unsigned long private, bool offlining,
- bool sync)
+ enum migrate_mode mode)
{
int retry = 1;
int nr_failed = 0;
@@ -923,7 +991,7 @@ int migrate_pages(struct list_head *from,

rc = unmap_and_move(get_new_page, private,
page, pass > 2, offlining,
- sync);
+ mode);

switch(rc) {
case -ENOMEM:
@@ -953,7 +1021,7 @@ out:

int migrate_huge_pages(struct list_head *from,
new_page_t get_new_page, unsigned long private, bool offlining,
- bool sync)
+ enum migrate_mode mode)
{
int retry = 1;
int nr_failed = 0;
@@ -970,7 +1038,7 @@ int migrate_huge_pages(struct list_head *from,

rc = unmap_and_move_huge_page(get_new_page,
private, page, pass > 2, offlining,
- sync);
+ mode);

switch(rc) {
case -ENOMEM:
@@ -1099,7 +1167,7 @@ set_status:
err = 0;
if (!list_empty(&pagelist)) {
err = migrate_pages(&pagelist, new_page_node,
- (unsigned long)pm, 0, true);
+ (unsigned long)pm, 0, MIGRATE_SYNC);
if (err)
putback_lru_pages(&pagelist);
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 947a7e9..9177aa3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1897,14 +1897,20 @@ static struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
- int migratetype, unsigned long *did_some_progress,
- bool sync_migration)
+ int migratetype, bool sync_migration,
+ bool *deferred_compaction,
+ unsigned long *did_some_progress)
{
struct page *page;

- if (!order || compaction_deferred(preferred_zone))
+ if (!order)
return NULL;

+ if (compaction_deferred(preferred_zone)) {
+ *deferred_compaction = true;
+ return NULL;
+ }
+
current->flags |= PF_MEMALLOC;
*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
nodemask, sync_migration);
@@ -1932,7 +1938,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
* but not enough to satisfy watermarks.
*/
count_vm_event(COMPACTFAIL);
- defer_compaction(preferred_zone);
+
+ /*
+ * As async compaction considers a subset of pageblocks, only
+ * defer if the failure was a sync compaction failure.
+ */
+ if (sync_migration)
+ defer_compaction(preferred_zone);

cond_resched();
}
@@ -1944,8 +1956,9 @@ static inline struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
- int migratetype, unsigned long *did_some_progress,
- bool sync_migration)
+ int migratetype, bool sync_migration,
+ bool *deferred_compaction,
+ unsigned long *did_some_progress)
{
return NULL;
}
@@ -2095,6 +2108,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
unsigned long pages_reclaimed = 0;
unsigned long did_some_progress;
bool sync_migration = false;
+ bool deferred_compaction = false;

/*
* In the slowpath, we sanity check order to avoid ever trying to
@@ -2175,12 +2189,22 @@ rebalance:
zonelist, high_zoneidx,
nodemask,
alloc_flags, preferred_zone,
- migratetype, &did_some_progress,
- sync_migration);
+ migratetype, sync_migration,
+ &deferred_compaction,
+ &did_some_progress);
if (page)
goto got_pg;
sync_migration = true;

+ /*
+ * If compaction is deferred for high-order allocations, it is because
+ * sync compaction recently failed. In this is the case and the caller
+ * has requested the system not be heavily disrupted, fail the
+ * allocation now instead of entering direct reclaim
+ */
+ if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD))
+ goto nopage;
+
/* Try direct reclaim and then allocating */
page = __alloc_pages_direct_reclaim(gfp_mask, order,
zonelist, high_zoneidx,
@@ -2243,8 +2267,9 @@ rebalance:
zonelist, high_zoneidx,
nodemask,
alloc_flags, preferred_zone,
- migratetype, &did_some_progress,
- sync_migration);
+ migratetype, sync_migration,
+ &deferred_compaction,
+ &did_some_progress);
if (page)
goto got_pg;
}
@@ -2268,8 +2293,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
{
enum zone_type high_zoneidx = gfp_zone(gfp_mask);
struct zone *preferred_zone;
- struct page *page;
+ struct page *page = NULL;
int migratetype = allocflags_to_migratetype(gfp_mask);
+ unsigned int cpuset_mems_cookie;

gfp_mask &= gfp_allowed_mask;

@@ -2288,15 +2314,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
if (unlikely(!zonelist->_zonerefs->zone))
return NULL;

- get_mems_allowed();
+retry_cpuset:
+ cpuset_mems_cookie = get_mems_allowed();
+
/* The preferred zone is used for statistics later */
first_zones_zonelist(zonelist, high_zoneidx,
nodemask ? : &cpuset_current_mems_allowed,
&preferred_zone);
- if (!preferred_zone) {
- put_mems_allowed();
- return NULL;
- }
+ if (!preferred_zone)
+ goto out;

/* First allocation attempt */
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
@@ -2306,9 +2332,19 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
page = __alloc_pages_slowpath(gfp_mask, order,
zonelist, high_zoneidx, nodemask,
preferred_zone, migratetype);
- put_mems_allowed();

trace_mm_page_alloc(page, order, gfp_mask, migratetype);
+
+out:
+ /*
+ * When updating a task's mems_allowed, it is possible to race with
+ * parallel threads in such a way that an allocation can fail while
+ * the mask is being updated. If a page allocation is about to fail,
+ * check if the cpuset changed during allocation and if so, retry.
+ */
+ if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+ goto retry_cpuset;
+
return page;
}
EXPORT_SYMBOL(__alloc_pages_nodemask);
@@ -2532,13 +2568,15 @@ void si_meminfo_node(struct sysinfo *val, int nid)
bool skip_free_areas_node(unsigned int flags, int nid)
{
bool ret = false;
+ unsigned int cpuset_mems_cookie;

if (!(flags & SHOW_MEM_FILTER_NODES))
goto out;

- get_mems_allowed();
- ret = !node_isset(nid, cpuset_current_mems_allowed);
- put_mems_allowed();
+ do {
+ cpuset_mems_cookie = get_mems_allowed();
+ ret = !node_isset(nid, cpuset_current_mems_allowed);
+ } while (!put_mems_allowed(cpuset_mems_cookie));
out:
return ret;
}
@@ -3418,25 +3456,33 @@ static void setup_zone_migrate_reserve(struct zone *zone)
if (page_to_nid(page) != zone_to_nid(zone))
continue;

- /* Blocks with reserved pages will never free, skip them. */
- block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
- if (pageblock_is_reserved(pfn, block_end_pfn))
- continue;
-
block_migratetype = get_pageblock_migratetype(page);

- /* If this block is reserved, account for it */
- if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) {
- reserve--;
- continue;
- }
+ /* Only test what is necessary when the reserves are not met */
+ if (reserve > 0) {
+ /*
+ * Blocks with reserved pages will never free, skip
+ * them.
+ */
+ block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
+ if (pageblock_is_reserved(pfn, block_end_pfn))
+ continue;

- /* Suitable for reserving if this block is movable */
- if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) {
- set_pageblock_migratetype(page, MIGRATE_RESERVE);
- move_freepages_block(zone, page, MIGRATE_RESERVE);
- reserve--;
- continue;
+ /* If this block is reserved, account for it */
+ if (block_migratetype == MIGRATE_RESERVE) {
+ reserve--;
+ continue;
+ }
+
+ /* Suitable for reserving if this block is movable */
+ if (block_migratetype == MIGRATE_MOVABLE) {
+ set_pageblock_migratetype(page,
+ MIGRATE_RESERVE);
+ move_freepages_block(zone, page,
+ MIGRATE_RESERVE);
+ reserve--;
+ continue;
+ }
}

/*
diff --git a/mm/slab.c b/mm/slab.c
index d96e223..a67f812 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3218,12 +3218,10 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
if (in_interrupt() || (flags & __GFP_THISNODE))
return NULL;
nid_alloc = nid_here = numa_mem_id();
- get_mems_allowed();
if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
nid_alloc = cpuset_slab_spread_node();
else if (current->mempolicy)
nid_alloc = slab_node(current->mempolicy);
- put_mems_allowed();
if (nid_alloc != nid_here)
return ____cache_alloc_node(cachep, flags, nid_alloc);
return NULL;
@@ -3246,14 +3244,17 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
enum zone_type high_zoneidx = gfp_zone(flags);
void *obj = NULL;
int nid;
+ unsigned int cpuset_mems_cookie;

if (flags & __GFP_THISNODE)
return NULL;

- get_mems_allowed();
- zonelist = node_zonelist(slab_node(current->mempolicy), flags);
local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);

+retry_cpuset:
+ cpuset_mems_cookie = get_mems_allowed();
+ zonelist = node_zonelist(slab_node(current->mempolicy), flags);
+
retry:
/*
* Look through allowed nodes for objects available
@@ -3306,7 +3307,9 @@ retry:
}
}
}
- put_mems_allowed();
+
+ if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj))
+ goto retry_cpuset;
return obj;
}

diff --git a/mm/slub.c b/mm/slub.c
index 10ab233..ae6e80e 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1457,6 +1457,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
struct zone *zone;
enum zone_type high_zoneidx = gfp_zone(flags);
struct page *page;
+ unsigned int cpuset_mems_cookie;

/*
* The defrag ratio allows a configuration of the tradeoffs between
@@ -1480,23 +1481,32 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
get_cycles() % 1024 > s->remote_node_defrag_ratio)
return NULL;

- get_mems_allowed();
- zonelist = node_zonelist(slab_node(current->mempolicy), flags);
- for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
- struct kmem_cache_node *n;
-
- n = get_node(s, zone_to_nid(zone));
-
- if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
- n->nr_partial > s->min_partial) {
- page = get_partial_node(n);
- if (page) {
- put_mems_allowed();
- return page;
+ do {
+ cpuset_mems_cookie = get_mems_allowed();
+ zonelist = node_zonelist(slab_node(current->mempolicy), flags);
+ for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+ struct kmem_cache_node *n;
+
+ n = get_node(s, zone_to_nid(zone));
+
+ if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
+ n->nr_partial > s->min_partial) {
+ page = get_partial_node(n);
+ if (page) {
+ /*
+ * Return the object even if
+ * put_mems_allowed indicated that
+ * the cpuset mems_allowed was
+ * updated in parallel. It's a
+ * harmless race between the alloc
+ * and the cpuset update.
+ */
+ put_mems_allowed(cpuset_mems_cookie);
+ return page;
+ }
}
}
- }
- put_mems_allowed();
+ } while (!put_mems_allowed(cpuset_mems_cookie));
#endif
return NULL;
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1b0ed36..5326f98 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -248,35 +248,66 @@ unsigned long shrink_slab(struct shrink_control *shrink,

list_for_each_entry(shrinker, &shrinker_list, list) {
unsigned long long delta;
- unsigned long total_scan;
- unsigned long max_pass;
+ long total_scan;
+ long max_pass;
+ int shrink_ret = 0;
+ long nr;
+ long new_nr;

max_pass = do_shrinker_shrink(shrinker, shrink, 0);
+ if (max_pass <= 0)
+ continue;
+
+ /*
+ * copy the current shrinker scan count into a local variable
+ * and zero it so that other concurrent shrinker invocations
+ * don't also do this scanning work.
+ */
+ do {
+ nr = shrinker->nr;
+ } while (cmpxchg(&shrinker->nr, nr, 0) != nr);
+
+ total_scan = nr;
delta = (4 * nr_pages_scanned) / shrinker->seeks;
delta *= max_pass;
do_div(delta, lru_pages + 1);
- shrinker->nr += delta;
- if (shrinker->nr < 0) {
+ total_scan += delta;
+ if (total_scan < 0) {
printk(KERN_ERR "shrink_slab: %pF negative objects to "
"delete nr=%ld\n",
- shrinker->shrink, shrinker->nr);
- shrinker->nr = max_pass;
+ shrinker->shrink, total_scan);
+ total_scan = max_pass;
}

/*
+ * We need to avoid excessive windup on filesystem shrinkers
+ * due to large numbers of GFP_NOFS allocations causing the
+ * shrinkers to return -1 all the time. This results in a large
+ * nr being built up so when a shrink that can do some work
+ * comes along it empties the entire cache due to nr >>>
+ * max_pass. This is bad for sustaining a working set in
+ * memory.
+ *
+ * Hence only allow the shrinker to scan the entire cache when
+ * a large delta change is calculated directly.
+ */
+ if (delta < max_pass / 4)
+ total_scan = min(total_scan, max_pass / 2);
+
+ /*
* Avoid risking looping forever due to too large nr value:
* never try to free more than twice the estimate number of
* freeable entries.
*/
- if (shrinker->nr > max_pass * 2)
- shrinker->nr = max_pass * 2;
+ if (total_scan > max_pass * 2)
+ total_scan = max_pass * 2;

- total_scan = shrinker->nr;
- shrinker->nr = 0;
+ trace_mm_shrink_slab_start(shrinker, shrink, nr,
+ nr_pages_scanned, lru_pages,
+ max_pass, delta, total_scan);

while (total_scan >= SHRINK_BATCH) {
long this_scan = SHRINK_BATCH;
- int shrink_ret;
int nr_before;

nr_before = do_shrinker_shrink(shrinker, shrink, 0);
@@ -292,7 +323,19 @@ unsigned long shrink_slab(struct shrink_control *shrink,
cond_resched();
}

- shrinker->nr += total_scan;
+ /*
+ * move the unused scan count back into the shrinker in a
+ * manner that handles concurrent updates. If we exhausted the
+ * scan, there is no need to do an update.
+ */
+ do {
+ nr = shrinker->nr;
+ new_nr = total_scan + nr;
+ if (total_scan <= 0)
+ break;
+ } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr);
+
+ trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
}
up_read(&shrinker_rwsem);
out:
@@ -683,7 +726,13 @@ static enum page_references page_check_references(struct page *page,
*/
SetPageReferenced(page);

- if (referenced_page)
+ if (referenced_page || referenced_ptes > 1)
+ return PAGEREF_ACTIVATE;
+
+ /*
+ * Activate file-backed executable pages after first usage.
+ */
+ if (vm_flags & VM_EXEC)
return PAGEREF_ACTIVATE;

return PAGEREF_KEEP;
@@ -972,23 +1021,27 @@ keep_lumpy:
*
* returns 0 on success, -ve errno on failure.
*/
-int __isolate_lru_page(struct page *page, int mode, int file)
+int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
{
+ bool all_lru_mode;
int ret = -EINVAL;

/* Only take pages on the LRU. */
if (!PageLRU(page))
return ret;

+ all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) ==
+ (ISOLATE_ACTIVE|ISOLATE_INACTIVE);
+
/*
* When checking the active state, we need to be sure we are
* dealing with comparible boolean values. Take the logical not
* of each.
*/
- if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
+ if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE))
return ret;

- if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file)
+ if (!all_lru_mode && !!page_is_file_cache(page) != file)
return ret;

/*
@@ -1001,6 +1054,43 @@ int __isolate_lru_page(struct page *page, int mode, int file)

ret = -EBUSY;

+ /*
+ * To minimise LRU disruption, the caller can indicate that it only
+ * wants to isolate pages it will be able to operate on without
+ * blocking - clean pages for the most part.
+ *
+ * ISOLATE_CLEAN means that only clean pages should be isolated. This
+ * is used by reclaim when it is cannot write to backing storage
+ *
+ * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants to pages
+ * that it is possible to migrate without blocking
+ */
+ if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) {
+ /* All the caller can do on PageWriteback is block */
+ if (PageWriteback(page))
+ return ret;
+
+ if (PageDirty(page)) {
+ struct address_space *mapping;
+
+ /* ISOLATE_CLEAN means only clean pages */
+ if (mode & ISOLATE_CLEAN)
+ return ret;
+
+ /*
+ * Only pages without mappings or that have a
+ * ->migratepage callback are possible to migrate
+ * without blocking
+ */
+ mapping = page_mapping(page);
+ if (mapping && !mapping->a_ops->migratepage)
+ return ret;
+ }
+ }
+
+ if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
+ return ret;
+
if (likely(get_page_unless_zero(page))) {
/*
* Be careful not to clear PageLRU until after we're
@@ -1036,7 +1126,8 @@ int __isolate_lru_page(struct page *page, int mode, int file)
*/
static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
struct list_head *src, struct list_head *dst,
- unsigned long *scanned, int order, int mode, int file)
+ unsigned long *scanned, int order, isolate_mode_t mode,
+ int file)
{
unsigned long nr_taken = 0;
unsigned long nr_lumpy_taken = 0;
@@ -1111,7 +1202,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
* anon page which don't already have a swap slot is
* pointless.
*/
- if (nr_swap_pages <= 0 && PageAnon(cursor_page) &&
+ if (nr_swap_pages <= 0 && PageSwapBacked(cursor_page) &&
!PageSwapCache(cursor_page))
break;

@@ -1161,8 +1252,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
static unsigned long isolate_pages_global(unsigned long nr,
struct list_head *dst,
unsigned long *scanned, int order,
- int mode, struct zone *z,
- int active, int file)
+ isolate_mode_t mode,
+ struct zone *z, int active, int file)
{
int lru = LRU_BASE;
if (active)
@@ -1408,6 +1499,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
unsigned long nr_taken;
unsigned long nr_anon;
unsigned long nr_file;
+ isolate_mode_t reclaim_mode = ISOLATE_INACTIVE;

while (unlikely(too_many_isolated(zone, file, sc))) {
congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1418,15 +1510,21 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
}

set_reclaim_mode(priority, sc, false);
+ if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
+ reclaim_mode |= ISOLATE_ACTIVE;
+
lru_add_drain();
+
+ if (!sc->may_unmap)
+ reclaim_mode |= ISOLATE_UNMAPPED;
+ if (!sc->may_writepage)
+ reclaim_mode |= ISOLATE_CLEAN;
+
spin_lock_irq(&zone->lru_lock);

if (scanning_global_lru(sc)) {
- nr_taken = isolate_pages_global(nr_to_scan,
- &page_list, &nr_scanned, sc->order,
- sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
- ISOLATE_BOTH : ISOLATE_INACTIVE,
- zone, 0, file);
+ nr_taken = isolate_pages_global(nr_to_scan, &page_list,
+ &nr_scanned, sc->order, reclaim_mode, zone, 0, file);
zone->pages_scanned += nr_scanned;
if (current_is_kswapd())
__count_zone_vm_events(PGSCAN_KSWAPD, zone,
@@ -1435,12 +1533,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
__count_zone_vm_events(PGSCAN_DIRECT, zone,
nr_scanned);
} else {
- nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
- &page_list, &nr_scanned, sc->order,
- sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
- ISOLATE_BOTH : ISOLATE_INACTIVE,
- zone, sc->mem_cgroup,
- 0, file);
+ nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list,
+ &nr_scanned, sc->order, reclaim_mode, zone,
+ sc->mem_cgroup, 0, file);
/*
* mem_cgroup_isolate_pages() keeps track of
* scanned pages on its own.
@@ -1542,19 +1637,26 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
struct page *page;
struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
unsigned long nr_rotated = 0;
+ isolate_mode_t reclaim_mode = ISOLATE_ACTIVE;

lru_add_drain();
+
+ if (!sc->may_unmap)
+ reclaim_mode |= ISOLATE_UNMAPPED;
+ if (!sc->may_writepage)
+ reclaim_mode |= ISOLATE_CLEAN;
+
spin_lock_irq(&zone->lru_lock);
if (scanning_global_lru(sc)) {
nr_taken = isolate_pages_global(nr_pages, &l_hold,
&pgscanned, sc->order,
- ISOLATE_ACTIVE, zone,
+ reclaim_mode, zone,
1, file);
zone->pages_scanned += pgscanned;
} else {
nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
&pgscanned, sc->order,
- ISOLATE_ACTIVE, zone,
+ reclaim_mode, zone,
sc->mem_cgroup, 1, file);
/*
* mem_cgroup_isolate_pages() keeps track of
@@ -1747,23 +1849,16 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
u64 fraction[2], denominator;
enum lru_list l;
int noswap = 0;
- int force_scan = 0;
+ bool force_scan = false;
unsigned long nr_force_scan[2];

-
- anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
- zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
- file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
- zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
-
- if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) {
- /* kswapd does zone balancing and need to scan this zone */
- if (scanning_global_lru(sc) && current_is_kswapd())
- force_scan = 1;
- /* memcg may have small limit and need to avoid priority drop */
- if (!scanning_global_lru(sc))
- force_scan = 1;
- }
+ /* kswapd does zone balancing and needs to scan this zone */
+ if (scanning_global_lru(sc) && current_is_kswapd() &&
+ zone->all_unreclaimable)
+ force_scan = true;
+ /* memcg may have small limit and need to avoid priority drop */
+ if (!scanning_global_lru(sc))
+ force_scan = true;

/* If we have no swap space, do not bother scanning anon pages. */
if (!sc->may_swap || (nr_swap_pages <= 0)) {
@@ -1776,6 +1871,11 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
goto out;
}

+ anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
+ zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
+ file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
+ zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
+
if (scanning_global_lru(sc)) {
free = zone_page_state(zone, NR_FREE_PAGES);
/* If we have very few page cache pages,
@@ -1912,8 +2012,9 @@ static inline bool should_continue_reclaim(struct zone *zone,
* inactive lists are large enough, continue reclaiming
*/
pages_for_compaction = (2UL << sc->order);
- inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) +
- zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
+ inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
+ if (nr_swap_pages > 0)
+ inactive_lru_pages += zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
if (sc->nr_reclaimed < pages_for_compaction &&
inactive_lru_pages > pages_for_compaction)
return true;
@@ -1985,6 +2086,42 @@ restart:
throttle_vm_writeout(sc->gfp_mask);
}

+/* Returns true if compaction should go ahead for a high-order request */
+static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
+{
+ unsigned long balance_gap, watermark;
+ bool watermark_ok;
+
+ /* Do not consider compaction for orders reclaim is meant to satisfy */
+ if (sc->order <= PAGE_ALLOC_COSTLY_ORDER)
+ return false;
+
+ /*
+ * Compaction takes time to run and there are potentially other
+ * callers using the pages just freed. Continue reclaiming until
+ * there is a buffer of free pages available to give compaction
+ * a reasonable chance of completing and allocating the page
+ */
+ balance_gap = min(low_wmark_pages(zone),
+ (zone->present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
+ KSWAPD_ZONE_BALANCE_GAP_RATIO);
+ watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
+ watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
+
+ /*
+ * If compaction is deferred, reclaim up to a point where
+ * compaction will have a chance of success when re-enabled
+ */
+ if (compaction_deferred(zone))
+ return watermark_ok;
+
+ /* If compaction is not ready to start, keep reclaiming */
+ if (!compaction_suitable(zone, sc->order))
+ return false;
+
+ return watermark_ok;
+}
+
/*
* This is the direct reclaim path, for page-allocating processes. We only
* try to reclaim pages from zones which will satisfy the caller's allocation
@@ -2000,14 +2137,20 @@ restart:
*
* If a zone is deemed to be full of pinned pages then just give it a light
* scan then give up on it.
+ *
+ * This function returns true if a zone is being reclaimed for a costly
+ * high-order allocation and compaction is ready to begin. This indicates to
+ * the caller that it should consider retrying the allocation instead of
+ * further reclaim.
*/
-static void shrink_zones(int priority, struct zonelist *zonelist,
+static bool shrink_zones(int priority, struct zonelist *zonelist,
struct scan_control *sc)
{
struct zoneref *z;
struct zone *zone;
unsigned long nr_soft_reclaimed;
unsigned long nr_soft_scanned;
+ bool aborted_reclaim = false;

for_each_zone_zonelist_nodemask(zone, z, zonelist,
gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -2022,6 +2165,21 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
continue;
if (zone->all_unreclaimable && priority != DEF_PRIORITY)
continue; /* Let kswapd poll it */
+ if (COMPACTION_BUILD) {
+ /*
+ * If we already have plenty of memory free for
+ * compaction in this zone, don't free any more.
+ * Even though compaction is invoked for any
+ * non-zero order, only frequent costly order
+ * reclamation is disruptive enough to become a
+ * noticable problem, like transparent huge page
+ * allocations.
+ */
+ if (compaction_ready(zone, sc)) {
+ aborted_reclaim = true;
+ continue;
+ }
+ }
/*
* This steals pages from memory cgroups over softlimit
* and returns the number of reclaimed pages and
@@ -2039,6 +2197,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist,

shrink_zone(priority, zone, sc);
}
+
+ return aborted_reclaim;
}

static bool zone_reclaimable(struct zone *zone)
@@ -2092,8 +2252,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
struct zoneref *z;
struct zone *zone;
unsigned long writeback_threshold;
+ bool aborted_reclaim;

- get_mems_allowed();
delayacct_freepages_start();

if (scanning_global_lru(sc))
@@ -2103,7 +2263,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
sc->nr_scanned = 0;
if (!priority)
disable_swap_token(sc->mem_cgroup);
- shrink_zones(priority, zonelist, sc);
+ aborted_reclaim = shrink_zones(priority, zonelist, sc);
+
/*
* Don't shrink slabs when reclaiming memory from
* over limit cgroups
@@ -2155,7 +2316,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,

out:
delayacct_freepages_end();
- put_mems_allowed();

if (sc->nr_reclaimed)
return sc->nr_reclaimed;
@@ -2168,6 +2328,10 @@ out:
if (oom_killer_disabled)
return 0;

+ /* Aborted reclaim to try compaction? don't OOM, then */
+ if (aborted_reclaim)
+ return 1;
+
/* top priority shrink_zones still had more to do? don't OOM, then */
if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc))
return 1;
@@ -2459,6 +2623,9 @@ loop_again:
high_wmark_pages(zone), 0, 0)) {
end_zone = i;
break;
+ } else {
+ /* If balanced, clear the congested flag */
+ zone_clear_flag(zone, ZONE_CONGESTED);
}
}
if (i < 0)
@@ -2695,7 +2862,10 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
* them before going back to sleep.
*/
set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
- schedule();
+
+ if (!kthread_should_stop())
+ schedule();
+
set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
} else {
if (remaining)
@@ -2722,7 +2892,9 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
static int kswapd(void *p)
{
unsigned long order, new_order;
+ unsigned balanced_order;
int classzone_idx, new_classzone_idx;
+ int balanced_classzone_idx;
pg_data_t *pgdat = (pg_data_t*)p;
struct task_struct *tsk = current;

@@ -2753,7 +2925,9 @@ static int kswapd(void *p)
set_freezable();

order = new_order = 0;
+ balanced_order = 0;
classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
+ balanced_classzone_idx = classzone_idx;
for ( ; ; ) {
int ret;

@@ -2762,7 +2936,8 @@ static int kswapd(void *p)
* new request of a similar or harder type will succeed soon
* so consider going to sleep on the basis we reclaimed at
*/
- if (classzone_idx >= new_classzone_idx && order == new_order) {
+ if (balanced_classzone_idx >= new_classzone_idx &&
+ balanced_order == new_order) {
new_order = pgdat->kswapd_max_order;
new_classzone_idx = pgdat->classzone_idx;
pgdat->kswapd_max_order = 0;
@@ -2777,9 +2952,12 @@ static int kswapd(void *p)
order = new_order;
classzone_idx = new_classzone_idx;
} else {
- kswapd_try_to_sleep(pgdat, order, classzone_idx);
+ kswapd_try_to_sleep(pgdat, balanced_order,
+ balanced_classzone_idx);
order = pgdat->kswapd_max_order;
classzone_idx = pgdat->classzone_idx;
+ new_order = order;
+ new_classzone_idx = classzone_idx;
pgdat->kswapd_max_order = 0;
pgdat->classzone_idx = pgdat->nr_zones - 1;
}
@@ -2794,7 +2972,9 @@ static int kswapd(void *p)
*/
if (!ret) {
trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
- order = balance_pgdat(pgdat, order, &classzone_idx);
+ balanced_classzone_idx = classzone_idx;
+ balanced_order = balance_pgdat(pgdat, order,
+ &balanced_classzone_idx);
}
}
return 0;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 20c18b7..6559013 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -78,7 +78,7 @@ void vm_events_fold_cpu(int cpu)
*
* vm_stat contains the global counters
*/
-atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
+atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
EXPORT_SYMBOL(vm_stat);

#ifdef CONFIG_SMP
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/