[PATCH 2.6.9-rc2-mm1 1/2] mm: memory policy for page cache allocation

From: Ray Bryant
Date: Mon Sep 20 2004 - 14:13:07 EST


This patch adds MPOL_ROUNDROBIN. It behaves like MPOL_INTERLEAVE, but
does not require a global offset or index to be specified: each task
keeps its own rotor (task_struct->rr_next), so successive allocations
simply rotate through the policy's node set.
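
As an illustration, here is how a task would select the new policy from
userspace. This is an untested sketch: it assumes the set_mempolicy(2)
wrapper from libnuma's <numaif.h> (link with -lnuma) and defines
MPOL_ROUNDROBIN locally, since no released header carries it.

/* Hypothetical usage sketch, not part of the patch. */
#include <numaif.h>		/* set_mempolicy() wrapper, assumed */
#include <stdio.h>

#ifndef MPOL_ROUNDROBIN
#define MPOL_ROUNDROBIN 4	/* value added by this patch */
#endif

int main(void)
{
	unsigned long nodemask = 0x3;	/* rotate over nodes 0 and 1 */

	if (set_mempolicy(MPOL_ROUNDROBIN, &nodemask,
			  sizeof(nodemask) * 8) < 0) {
		perror("set_mempolicy");
		return 1;
	}
	/* Page allocations that honor the task policy now rotate
	   across nodes 0 and 1 via the per-task rr_next rotor. */
	return 0;
}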

Index: linux-2.6.9-rc1-mm3-kdb-pagecache/include/linux/sched.h
===================================================================
--- linux-2.6.9-rc1-mm3-kdb-pagecache.orig/include/linux/sched.h 2004-09-03 09:45:42.000000000 -0700
+++ linux-2.6.9-rc1-mm3-kdb-pagecache/include/linux/sched.h 2004-09-03 09:47:42.000000000 -0700
@@ -596,6 +596,7 @@
 #ifdef CONFIG_NUMA
 	struct mempolicy *mempolicy;
 	short il_next;		/* could be shared with used_math */
+	short rr_next;
 #endif
 #ifdef CONFIG_CPUSETS
 	struct cpuset *cpuset;
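
The mempolicy.c changes below consume this field in roundrobin_nodes().
To show the intended rotor behavior in isolation, here is a minimal
userspace model (my illustration, not kernel code): a single unsigned
long stands in for the kernel's node bitmap, and a global stands in for
the per-task rr_next.

#include <stdio.h>

#define MAX_NODES (sizeof(unsigned long) * 8)

static unsigned rr_next;		/* per-task in the kernel */

/* Userspace stand-in for find_next_bit() on one word. */
static unsigned next_set_bit(unsigned long mask, unsigned from)
{
	unsigned i;

	for (i = from; i < MAX_NODES; i++)
		if (mask & (1UL << i))
			return i;
	return MAX_NODES;
}

/* Mirrors roundrobin_nodes(): return the current node, advance the
   rotor to the next set bit, wrapping back to the first set bit. */
static unsigned roundrobin_node(unsigned long nodemask)
{
	unsigned nid = rr_next;
	unsigned next = next_set_bit(nodemask, nid + 1);

	if (next >= MAX_NODES)
		next = next_set_bit(nodemask, 0);
	rr_next = next;
	return nid;
}

int main(void)
{
	unsigned long nodemask = 0xb;	/* nodes 0, 1 and 3 */
	int i;

	rr_next = next_set_bit(nodemask, 0);
	for (i = 0; i < 6; i++)
		printf("allocation %d -> node %u\n",
		       i, roundrobin_node(nodemask));
	return 0;	/* prints nodes 0 1 3 0 1 3 */
}
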
Index: linux-2.6.9-rc1-mm3-kdb-pagecache/mm/mempolicy.c
===================================================================
--- linux-2.6.9-rc1-mm3-kdb-pagecache.orig/mm/mempolicy.c 2004-09-03 09:45:40.000000000 -0700
+++ linux-2.6.9-rc1-mm3-kdb-pagecache/mm/mempolicy.c 2004-09-03 09:47:42.000000000 -0700
@@ -7,10 +7,17 @@
  * NUMA policy allows the user to give hints in which node(s) memory should
  * be allocated.
  *
- * Support four policies per VMA and per process:
+ * Support five policies per VMA and per process:
  *
  * The VMA policy has priority over the process policy for a page fault.
  *
+ * roundrobin     Allocate memory round-robined over a set of nodes,
+ *                with normal fallback if it fails. The round-robin
+ *                uses a per-thread rotor, both to make placement
+ *                predictable and to avoid cacheline contention on a
+ *                global rotor. It is distinct from interleave in that
+ *                it seeks to distribute allocations evenly across
+ *                nodes, whereas interleave seeks to maximize bandwidth.
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
@@ -117,6 +124,7 @@
 		break;
 	case MPOL_BIND:
 	case MPOL_INTERLEAVE:
+	case MPOL_ROUNDROBIN:
 		/* Preferred will only use the first bit, but allow
 		   more for now. */
 		if (empty)
@@ -215,6 +223,7 @@
 	atomic_set(&policy->refcnt, 1);
 	switch (mode) {
 	case MPOL_INTERLEAVE:
+	case MPOL_ROUNDROBIN:
 		bitmap_copy(policy->v.nodes, nodes, MAX_NUMNODES);
 		break;
 	case MPOL_PREFERRED:
@@ -406,6 +415,8 @@
 	current->mempolicy = new;
 	if (new && new->policy == MPOL_INTERLEAVE)
 		current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES);
+	if (new && new->policy == MPOL_ROUNDROBIN)
+		current->rr_next = find_first_bit(new->v.nodes, MAX_NUMNODES);
 	return 0;
 }

@@ -423,6 +434,7 @@
 	case MPOL_DEFAULT:
 		break;
 	case MPOL_INTERLEAVE:
+	case MPOL_ROUNDROBIN:
 		bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES);
 		break;
 	case MPOL_PREFERRED:
@@ -507,6 +519,9 @@
 		} else if (pol == current->mempolicy &&
 				pol->policy == MPOL_INTERLEAVE) {
 			pval = current->il_next;
+		} else if (pol == current->mempolicy &&
+				pol->policy == MPOL_ROUNDROBIN) {
+			pval = current->rr_next;
 		} else {
 			err = -EINVAL;
 			goto out;
@@ -585,6 +600,7 @@
 				return policy->v.zonelist;
 		/*FALL THROUGH*/
 	case MPOL_INTERLEAVE: /* should not happen */
+	case MPOL_ROUNDROBIN: /* should not happen */
 	case MPOL_DEFAULT:
 		nd = numa_node_id();
 		break;
@@ -595,6 +611,21 @@
 	return NODE_DATA(nd)->node_zonelists + (gfp & GFP_ZONEMASK);
 }

+/* Do dynamic round-robin for a process */
+static unsigned roundrobin_nodes(struct mempolicy *policy)
+{
+	unsigned nid, next;
+	struct task_struct *me = current;
+
+	nid = me->rr_next;
+	BUG_ON(nid >= MAX_NUMNODES);
+	next = find_next_bit(policy->v.nodes, MAX_NUMNODES, nid + 1);
+	if (next >= MAX_NUMNODES)
+		next = find_first_bit(policy->v.nodes, MAX_NUMNODES);
+	me->rr_next = next;
+	return nid;
+}
+
 /* Do dynamic interleaving for a process */
 static unsigned interleave_nodes(struct mempolicy *policy)
 {
@@ -646,6 +677,27 @@
 	return page;
 }

+/* Allocate a page in round-robin policy.
+   Own path because first fallback needs to round-robin. */
+static struct page *alloc_page_roundrobin(unsigned gfp, unsigned order, struct mempolicy *policy)
+{
+	struct zonelist *zl;
+	struct page *page;
+	unsigned nid;
+	int i, numnodes = bitmap_weight(policy->v.nodes, MAX_NUMNODES);
+
+	for (i = 0; i < numnodes; i++) {
+		nid = roundrobin_nodes(policy);
+		BUG_ON(!node_online(nid));
+		zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK);
+		page = __alloc_pages(gfp, order, zl);
+		if (page)
+			return page;
+	}
+
+	return NULL;
+}
+
 /**
  * alloc_page_vma - Allocate a page for a VMA.
  *
@@ -671,26 +723,30 @@
 struct page *
 alloc_page_vma(unsigned gfp, struct vm_area_struct *vma, unsigned long addr)
 {
+	unsigned nid;
 	struct mempolicy *pol = get_vma_policy(vma, addr);

 	cpuset_update_current_mems_allowed();

-	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
-		unsigned nid;
-		if (vma) {
-			unsigned long off;
-			BUG_ON(addr >= vma->vm_end);
-			BUG_ON(addr < vma->vm_start);
-			off = vma->vm_pgoff;
-			off += (addr - vma->vm_start) >> PAGE_SHIFT;
-			nid = offset_il_node(pol, vma, off);
-		} else {
-			/* fall back to process interleaving */
-			nid = interleave_nodes(pol);
-		}
-		return alloc_page_interleave(gfp, 0, nid);
+	switch (pol->policy) {
+	case MPOL_INTERLEAVE:
+		if (vma) {
+			unsigned long off;
+			BUG_ON(addr >= vma->vm_end);
+			BUG_ON(addr < vma->vm_start);
+			off = vma->vm_pgoff;
+			off += (addr - vma->vm_start) >> PAGE_SHIFT;
+			nid = offset_il_node(pol, vma, off);
+		} else {
+			/* fall back to process interleaving */
+			nid = interleave_nodes(pol);
+		}
+		return alloc_page_interleave(gfp, 0, nid);
+	case MPOL_ROUNDROBIN:
+		return alloc_page_roundrobin(gfp, 0, pol);
+	default:
+		return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
 	}
-	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
 }

 /**
@@ -716,8 +772,11 @@
 	cpuset_update_current_mems_allowed();
 	if (!pol || in_interrupt())
 		pol = &default_policy;
-	if (pol->policy == MPOL_INTERLEAVE)
+	if (pol->policy == MPOL_INTERLEAVE) {
 		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
+	} else if (pol->policy == MPOL_ROUNDROBIN) {
+		return alloc_page_roundrobin(gfp, order, pol);
+	}
 	return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
 }
 EXPORT_SYMBOL(alloc_pages_current);
@@ -754,6 +813,7 @@
 	case MPOL_DEFAULT:
 		return 1;
 	case MPOL_INTERLEAVE:
+	case MPOL_ROUNDROBIN:
 		return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES);
 	case MPOL_PREFERRED:
 		return a->v.preferred_node == b->v.preferred_node;
@@ -798,6 +858,8 @@
 		return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
 	case MPOL_INTERLEAVE:
 		return interleave_nodes(pol);
+	case MPOL_ROUNDROBIN:
+		return roundrobin_nodes(pol);
 	case MPOL_PREFERRED:
 		return pol->v.preferred_node >= 0 ?
 				pol->v.preferred_node : numa_node_id();
@@ -815,6 +877,7 @@
 	case MPOL_PREFERRED:
 	case MPOL_DEFAULT:
 	case MPOL_INTERLEAVE:
+	case MPOL_ROUNDROBIN:
 		return 1;
 	case MPOL_BIND: {
 		struct zone **z;
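
Because sys_get_mempolicy() above returns current->rr_next when
MPOL_F_NODE is passed, the rotor can be watched from userspace. Another
untested sketch, with the same caveats as before (libnuma wrappers
assumed, MPOL_ROUNDROBIN defined locally); note that unrelated kernel
allocations made on behalf of the task may advance the rotor between
samples.

#include <numaif.h>		/* set_mempolicy(), get_mempolicy() */
#include <sys/mman.h>
#include <stdio.h>

#ifndef MPOL_ROUNDROBIN
#define MPOL_ROUNDROBIN 4	/* value added by this patch */
#endif

int main(void)
{
	unsigned long nodemask = 0x3;	/* nodes 0 and 1 */
	int i, pos;

	if (set_mempolicy(MPOL_ROUNDROBIN, &nodemask,
			  sizeof(nodemask) * 8) < 0) {
		perror("set_mempolicy");
		return 1;
	}
	for (i = 0; i < 4; i++) {
		char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (p == MAP_FAILED)
			return 1;
		p[0] = 1;	/* fault in one page under the policy */
		if (get_mempolicy(&pos, NULL, 0, NULL, MPOL_F_NODE) < 0) {
			perror("get_mempolicy");
			return 1;
		}
		printf("rr_next is now %d\n", pos);
	}
	return 0;
}
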
Index: linux-2.6.9-rc1-mm3-kdb-pagecache/include/linux/mempolicy.h
===================================================================
--- linux-2.6.9-rc1-mm3-kdb-pagecache.orig/include/linux/mempolicy.h 2004-08-27 10:06:15.000000000 -0700
+++ linux-2.6.9-rc1-mm3-kdb-pagecache/include/linux/mempolicy.h 2004-09-16 09:27:08.000000000 -0700
@@ -13,8 +13,9 @@
 #define MPOL_PREFERRED	1
 #define MPOL_BIND	2
 #define MPOL_INTERLEAVE	3
+#define MPOL_ROUNDROBIN	4

-#define MPOL_MAX MPOL_INTERLEAVE
+#define MPOL_MAX MPOL_ROUNDROBIN

 /* Flags for get_mem_policy */
 #define MPOL_F_NODE	(1<<0)	/* return next IL mode instead of node mask */
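
Since mpol_check_policy() and mpol_new() accept the new mode, it should
also be usable per-VMA via mbind(2); as the alloc_page_vma() hunk shows,
per-VMA round-robin still draws from the per-task rotor rather than the
page's offset in the mapping. A final untested sketch under the same
assumptions:

#include <numaif.h>		/* mbind() wrapper, assumed */
#include <sys/mman.h>
#include <stdio.h>

#ifndef MPOL_ROUNDROBIN
#define MPOL_ROUNDROBIN 4	/* value added by this patch */
#endif

int main(void)
{
	size_t len = 16 * 4096;
	unsigned long nodemask = 0x3;	/* nodes 0 and 1 */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	/* Unlike MPOL_INTERLEAVE, placement here follows the task's
	   rr_next rotor, not the page offset within the VMA. */
	if (mbind(p, len, MPOL_ROUNDROBIN, &nodemask,
		  sizeof(nodemask) * 8, 0) < 0) {
		perror("mbind");
		return 1;
	}
	return 0;
}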

--
Best Regards,
Ray
-----------------------------------------------
Ray Bryant raybry@xxxxxxx
The box said: "Requires Windows 98 or better",
so I installed Linux.
-----------------------------------------------