Re: [Approach #2] [RFC][PATCH] Remove cgroup member from struct page

From: KAMEZAWA Hiroyuki
Date: Tue Sep 09 2008 - 21:44:42 EST


On Tue, 9 Sep 2008 18:20:48 -0700
Balbir Singh <balbir@xxxxxxxxxxxxxxxxxx> wrote:

> * KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx> [2008-09-09 21:30:12]:
> OK, here is approach #2, it works for me and gives me really good
> performance (surpassing even the current memory controller). I am
> seeing almost a 7% increase
This number probably comes from the pre-allocation.
Do we really allocate all page_cgroups at boot? That seems like a big change.
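
Some rough arithmetic for the cost (my numbers, assuming 64-bit and 4K pages):
with this patch sizeof(struct page_cgroup) is about 40 bytes (16-byte list_head,
two pointers, unsigned long flags), so we pin roughly 40 bytes per page, i.e.
about 1% of memory, around 40MB on your 4GB test box.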

>
> Caveats
>
> 1. Uses more memory (since it allocates memory for each node based on
> spanned_pages. Ignores holes, so might not be the most efficient,
> but it is a tradeoff of complexity versus space. I propose refining it
> as we go along.
> 2. Does not currently handle alloc_bootmem failure
> 3. Needs lots of testing/tuning and polishing
>
If we can do alloc-at-boot, we can make memcg much simpler.
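
For reference, the kind of simplification I mean: with a boot-allocated per-node
array, the lookup collapses to plain indexing, with no pointer/lock games on
struct page. A minimal, untested sketch using the pcg_map from this patch (the
function name is mine):

/* Untested sketch: pfn-indexed lookup, no page->page_cgroup needed. */
static inline struct page_cgroup *lookup_page_cgroup(struct page *page)
{
	int nid = page_to_nid(page);
	unsigned long offset = page_to_pfn(page) - NODE_DATA(nid)->node_start_pfn;

	return &pcg_map[nid][offset];
}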



> I've tested it on an x86_64 box with 4G of memory
>
> Again, this is an early RFC patch, please review test.
>
> Comments/Reviews?
>
> Signed-off-by: Balbir Singh <balbir@xxxxxxxxxxxxxxxxxx>
> ---
>
> include/linux/memcontrol.h | 32 ++++++
> include/linux/mm_types.h | 4
> mm/memcontrol.c | 212 +++++++++++++++++++++++++++------------------
> mm/page_alloc.c | 10 --
> 4 files changed, 162 insertions(+), 96 deletions(-)
>
> diff -puN mm/memcontrol.c~memcg_move_to_radix_tree mm/memcontrol.c
> --- linux-2.6.27-rc5/mm/memcontrol.c~memcg_move_to_radix_tree 2008-09-04 03:15:54.000000000 -0700
> +++ linux-2.6.27-rc5-balbir/mm/memcontrol.c 2008-09-09 17:56:54.000000000 -0700
> @@ -18,6 +18,7 @@
> */
>
> #include <linux/res_counter.h>
> +#include <linux/bootmem.h>
> #include <linux/memcontrol.h>
> #include <linux/cgroup.h>
> #include <linux/mm.h>
> @@ -37,9 +38,10 @@
> #include <asm/uaccess.h>
>
> struct cgroup_subsys mem_cgroup_subsys __read_mostly;
> -static struct kmem_cache *page_cgroup_cache __read_mostly;
> #define MEM_CGROUP_RECLAIM_RETRIES 5
>
> +static struct page_cgroup *pcg_map[MAX_NUMNODES];
> +
> /*
> * Statistics for memory cgroup.
> */
> @@ -137,20 +139,6 @@ struct mem_cgroup {
> static struct mem_cgroup init_mem_cgroup;
>
> /*
> - * We use the lower bit of the page->page_cgroup pointer as a bit spin
> - * lock. We need to ensure that page->page_cgroup is at least two
> - * byte aligned (based on comments from Nick Piggin). But since
> - * bit_spin_lock doesn't actually set that lock bit in a non-debug
> - * uniprocessor kernel, we should avoid setting it here too.
> - */
> -#define PAGE_CGROUP_LOCK_BIT 0x0
> -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
> -#define PAGE_CGROUP_LOCK (1 << PAGE_CGROUP_LOCK_BIT)
> -#else
> -#define PAGE_CGROUP_LOCK 0x0
> -#endif
> -
> -/*
> * A page_cgroup page is associated with every page descriptor. The
> * page_cgroup helps us identify information about the cgroup
> */
> @@ -158,12 +146,26 @@ struct page_cgroup {
> struct list_head lru; /* per cgroup LRU list */
> struct page *page;
> struct mem_cgroup *mem_cgroup;
> - int flags;
> + unsigned long flags;
> };
> -#define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */
> -#define PAGE_CGROUP_FLAG_ACTIVE (0x2) /* page is active in this cgroup */
> -#define PAGE_CGROUP_FLAG_FILE (0x4) /* page is file system backed */
> -#define PAGE_CGROUP_FLAG_UNEVICTABLE (0x8) /* page is unevictableable */
> +
> +/*
> + * LOCK_BIT is 0, with value 1
> + */
> +#define PAGE_CGROUP_FLAG_LOCK_BIT (0x0) /* lock bit */
> +
> +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
> +#define PAGE_CGROUP_FLAG_LOCK (0x1) /* lock value */
> +#else
> +#define PAGE_CGROUP_FLAG_LOCK (0x0) /* lock value */
> +#endif
> +
> +#define PAGE_CGROUP_FLAG_CACHE (0x2) /* charged as cache */
> +#define PAGE_CGROUP_FLAG_ACTIVE (0x4) /* page is active in this cgroup */
> +#define PAGE_CGROUP_FLAG_FILE (0x8) /* page is file system backed */
> +#define PAGE_CGROUP_FLAG_UNEVICTABLE (0x10)/* page is unevictableable */
> +#define PAGE_CGROUP_FLAG_INUSE (0x20)/* pc is allocated and in use */
> +#define PAGE_CGROUP_FLAG_VALID (0x40)/* pc is allocated and in use */
>
> static int page_cgroup_nid(struct page_cgroup *pc)
> {
> @@ -248,35 +250,99 @@ struct mem_cgroup *mem_cgroup_from_task(
> struct mem_cgroup, css);
> }
>
> -static inline int page_cgroup_locked(struct page *page)
> +static inline void lock_page_cgroup(struct page_cgroup *pc)
> {
> - return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
> + bit_spin_lock(PAGE_CGROUP_FLAG_LOCK_BIT, &pc->flags);
> }
>
> -static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
> +static inline int trylock_page_cgroup(struct page_cgroup *pc)
> {
> - VM_BUG_ON(!page_cgroup_locked(page));
> - page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK);
> + return bit_spin_trylock(PAGE_CGROUP_FLAG_LOCK_BIT, &pc->flags);
> }
>
> -struct page_cgroup *page_get_page_cgroup(struct page *page)
> +static inline void unlock_page_cgroup(struct page_cgroup *pc)
> {
> - return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK);
> + bit_spin_unlock(PAGE_CGROUP_FLAG_LOCK_BIT, &pc->flags);
> }
>
> -static void lock_page_cgroup(struct page *page)
> +/*
> + * Called from memmap_init_zone(), has the advantage of dealing with
> + * memory_hotplug (Addition of memory)
> + */
> +int page_cgroup_alloc(int n)
> {
> - bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
> + struct pglist_data *pgdat;
> + unsigned long size, start, end;
> +
> + if (mem_cgroup_subsys.disabled)
> + return;
> +
> + pgdat = NODE_DATA(n);
> + /*
> + * Already allocated, leave
> + */
> + if (pcg_map[n])
> + return 0;
> +
> + start = pgdat->node_start_pfn;
> + end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
> + size = (end - start) * sizeof(struct page_cgroup);
> + printk("Allocating %lu bytes for node %d\n", size, n);
^^^^^

1. This is nonsense... do you know the memory map of IBM's (maybe ppc) machines?
A node's memory can be split into several pieces that are not ordered by node number.
Example:
Node 0 | Node 1 | Node 2 | Node 1 | Node 2 |

This may seem like a special case, but when I helped with SPARSEMEM and MEMORY_HOTPLUG
I saw many kinds of memory maps. As you wrote, this should be re-designed.

2. If pre-allocating everything is OK, I will stop my work; mine is of no use.
But you have to know that with pre-allocation we cannot use an
avoid-the-lru-lock-by-batching technique like pagevec: we cannot delay
uncharge, because a page can be reused soon.
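
To be concrete about the batching I mean, a hypothetical sketch (the per-cpu
vec, the drain function and MEMCG_UNCHARGE_BATCH below are made up for
illustration; page_cgroup_zoneinfo() and __mem_cgroup_remove_list() are the
existing helpers in this file): uncharged pages are queued per cpu and the
lru_lock is taken once per run of same-zone entries instead of once per page.
As I understand it, this only works if a page_cgroup cannot be reused for a new
charge before the drain, which is exactly the guarantee pre-allocation loses.

#define MEMCG_UNCHARGE_BATCH 16

struct memcg_uncharge_vec {
	int nr;
	struct page_cgroup *pcs[MEMCG_UNCHARGE_BATCH];
};
static DEFINE_PER_CPU(struct memcg_uncharge_vec, memcg_uncharge_vecs);

static void memcg_uncharge_drain(struct memcg_uncharge_vec *vec)
{
	struct mem_cgroup_per_zone *mz, *prev_mz = NULL;
	unsigned long flags;
	int i;

	for (i = 0; i < vec->nr; i++) {
		struct page_cgroup *pc = vec->pcs[i];

		/* Re-acquire the lru_lock only when the zone changes. */
		mz = page_cgroup_zoneinfo(pc);
		if (mz != prev_mz) {
			if (prev_mz)
				spin_unlock_irqrestore(&prev_mz->lru_lock, flags);
			spin_lock_irqsave(&mz->lru_lock, flags);
			prev_mz = mz;
		}
		__mem_cgroup_remove_list(mz, pc);
		res_counter_uncharge(&pc->mem_cgroup->res, PAGE_SIZE);
	}
	if (prev_mz)
		spin_unlock_irqrestore(&prev_mz->lru_lock, flags);
	vec->nr = 0;
}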




> + pcg_map[n] = alloc_bootmem_node(pgdat, size);
> + /*
> + * We can do smoother recovery
> + */
> + BUG_ON(!pcg_map[n]);
> + return 0;
> }
>
> -static int try_lock_page_cgroup(struct page *page)
> +void page_cgroup_init(int nid, unsigned long pfn)
> {
> - return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
> + unsigned long node_pfn;
> + struct page_cgroup *pc;
> +
> + if (mem_cgroup_subsys.disabled)
> + return;
> +
> + node_pfn = pfn - NODE_DATA(nid)->node_start_pfn;
> + pc = &pcg_map[nid][node_pfn];
> +
> + BUG_ON(!pc);
> + pc->flags = PAGE_CGROUP_FLAG_VALID;
> + INIT_LIST_HEAD(&pc->lru);
> + pc->page = NULL;
This NULL is unnecessary; pc->page == pfn_to_page(pfn) always.
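
i.e. the init could just be (sketch):

	pc->flags = PAGE_CGROUP_FLAG_VALID;
	INIT_LIST_HEAD(&pc->lru);
	pc->page = pfn_to_page(pfn);
	pc->mem_cgroup = NULL;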


> + pc->mem_cgroup = NULL;
> }
>
> -static void unlock_page_cgroup(struct page *page)
> +struct page_cgroup *__page_get_page_cgroup(struct page *page, bool lock,
> + bool trylock)
> {
> - bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
> + struct page_cgroup *pc;
> + int ret;
> + int node = page_to_nid(page);
> + unsigned long pfn;
> +
> + pfn = page_to_pfn(page) - NODE_DATA(node)->node_start_pfn;
> + pc = &pcg_map[node][pfn];
> + BUG_ON(!(pc->flags & PAGE_CGROUP_FLAG_VALID));
> + if (lock)
> + lock_page_cgroup(pc);
> + else if (trylock) {
> + ret = trylock_page_cgroup(pc);
> + if (!ret)
> + pc = NULL;
> + }
> +
> + return pc;
> +}
> +
> +/*
> + * Should be called with page_cgroup lock held. Any additions to pc->flags
> + * should be reflected here. This might seem ugly, refine it later.
> + */
> +void page_clear_page_cgroup(struct page_cgroup *pc)
> +{
> + pc->flags &= ~PAGE_CGROUP_FLAG_INUSE;
> }
>
> static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
> @@ -377,17 +443,15 @@ void mem_cgroup_move_lists(struct page *
> * safely get to page_cgroup without it, so just try_lock it:
> * mem_cgroup_isolate_pages allows for page left on wrong list.
> */
> - if (!try_lock_page_cgroup(page))
> + pc = page_get_page_cgroup_trylock(page);
> + if (!pc)
> return;
>
> - pc = page_get_page_cgroup(page);
> - if (pc) {
> - mz = page_cgroup_zoneinfo(pc);
> - spin_lock_irqsave(&mz->lru_lock, flags);
> - __mem_cgroup_move_lists(pc, lru);
> - spin_unlock_irqrestore(&mz->lru_lock, flags);
> - }
> - unlock_page_cgroup(page);
> + mz = page_cgroup_zoneinfo(pc);
> + spin_lock_irqsave(&mz->lru_lock, flags);
> + __mem_cgroup_move_lists(pc, lru);
> + spin_unlock_irqrestore(&mz->lru_lock, flags);
> + unlock_page_cgroup(pc);

What race is this lock/unlock_page_cgroup protecting against?

> }
>
> /*
> @@ -521,10 +585,6 @@ static int mem_cgroup_charge_common(stru
> unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
> struct mem_cgroup_per_zone *mz;
>
> - pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask);
> - if (unlikely(pc == NULL))
> - goto err;
> -
> /*
> * We always charge the cgroup the mm_struct belongs to.
> * The mm_struct's mem_cgroup changes on task migration if the
> @@ -567,43 +627,40 @@ static int mem_cgroup_charge_common(stru
> }
> }
>
> + pc = page_get_page_cgroup_locked(page);
> + if (pc->flags & PAGE_CGROUP_FLAG_INUSE) {
> + unlock_page_cgroup(pc);
> + res_counter_uncharge(&mem->res, PAGE_SIZE);
> + css_put(&mem->css);
> + goto done;
> + }
> +
Can this happen? Our direction should be
VM_BUG_ON(pc->flags & PAGE_CGROUP_FLAG_INUSE);
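
i.e., roughly, instead of the unlock/uncharge fallback (sketch):

	pc = page_get_page_cgroup_locked(page);
	VM_BUG_ON(pc->flags & PAGE_CGROUP_FLAG_INUSE);
	pc->mem_cgroup = mem;
	pc->page = page;
	pc->flags |= PAGE_CGROUP_FLAG_INUSE;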



> pc->mem_cgroup = mem;
> pc->page = page;
> + pc->flags |= PAGE_CGROUP_FLAG_INUSE;
> +
> /*
> * If a page is accounted as a page cache, insert to inactive list.
> * If anon, insert to active list.
> */
> if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) {
> - pc->flags = PAGE_CGROUP_FLAG_CACHE;
> + pc->flags |= PAGE_CGROUP_FLAG_CACHE;
> if (page_is_file_cache(page))
> pc->flags |= PAGE_CGROUP_FLAG_FILE;
> else
> pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
> } else
> - pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
> -
> - lock_page_cgroup(page);
> - if (unlikely(page_get_page_cgroup(page))) {
> - unlock_page_cgroup(page);
> - res_counter_uncharge(&mem->res, PAGE_SIZE);
> - css_put(&mem->css);
> - kmem_cache_free(page_cgroup_cache, pc);
> - goto done;
> - }
> - page_assign_page_cgroup(page, pc);
> + pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
>
> mz = page_cgroup_zoneinfo(pc);
> spin_lock_irqsave(&mz->lru_lock, flags);
> __mem_cgroup_add_list(mz, pc);
> spin_unlock_irqrestore(&mz->lru_lock, flags);
> -
> - unlock_page_cgroup(page);
> + unlock_page_cgroup(pc);

What kind of race is this lock/unlock_page_cgroup protecting against?

Thanks,
-Kame
