[PATCH 3/7] x86/intel_rdt: Support cache bit mask for Intel CAT

From: Vikas Shivappa
Date: Fri May 01 2015 - 21:39:40 EST


Add support for cache bit mask manipulation. The change adds a file,
cache_mask, to the RDT cgroup which represents the CBM (cache bit mask)
for the cgroup.

Updates to the CBM are done by writing to IA32_L3_MASK_n.
The RDT cgroup follows the cgroup hierarchy; mkdir and adding tasks to the
cgroup never fail. When a child cgroup is created it inherits the
CLOSid and the cache_mask from its parent. When a user changes the
default CBM for a cgroup, a new CLOSid may be allocated if the
cache_mask was not used before. If the new CBM is one that is
already in use, the count for that CLOSid<->CBM mapping is incremented.
Changing the cache_mask may fail with -ENOSPC once the kernel runs out of
the CLOSids it can support.
A user can create as many cgroups as they want, but the number of different
CBMs in use at the same time is limited by the maximum number of CLOSids
(multiple cgroups can have the same CBM).
The kernel maintains a CLOSid<->CBM mapping which keeps a count
of the cgroups using each CLOSid.

The tasks in the CAT cgroup would get to fill the L3 cache represented
by the cgroup's cache_mask file.

Reuse of CLOSids for cgroups with the same bitmask also has the following
advantages:
- It helps to use the scarce CLOSids optimally.
- It also implies that during context switch, a write to the PQR MSR is done
only when a task with a different bitmask is scheduled in.

During cpu bringup due to a hotplug event, IA32_L3_MASK_n MSR is
synchronized from the clos cbm map if it is used by any cgroup for the
package.

Signed-off-by: Vikas Shivappa <vikas.shivappa@xxxxxxxxxxxxxxx>
---
arch/x86/include/asm/intel_rdt.h | 7 +-
arch/x86/kernel/cpu/intel_rdt.c | 364 ++++++++++++++++++++++++++++++++++++---
2 files changed, 346 insertions(+), 25 deletions(-)

diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h
index 87af1a5..9e9dbbe 100644
--- a/arch/x86/include/asm/intel_rdt.h
+++ b/arch/x86/include/asm/intel_rdt.h
@@ -4,6 +4,9 @@
#ifdef CONFIG_CGROUP_RDT

#include <linux/cgroup.h>
+#define MAX_CBM_LENGTH 32		/* bits considered in a cache bit mask */
+#define IA32_L3_CBM_BASE 0xc90		/* MSR used for CLOSid 0's mask */
+/* MSR holding the mask of CLOSid x; x is parenthesized so expressions work. */
+#define CBM_FROM_INDEX(x) (IA32_L3_CBM_BASE + (x))

struct rdt_subsys_info {
/* Clos Bitmap to keep track of available CLOSids.*/
@@ -17,8 +20,8 @@ struct intel_rdt {
};

struct clos_cbm_map {
- unsigned long cbm;
- unsigned int cgrp_count;
+ unsigned long cache_mask;
+ unsigned int clos_refcnt;
};

/*
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index eec57fe..58b39d6 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -24,16 +24,25 @@
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/spinlock.h>
+#include <linux/cpu.h>
#include <asm/intel_rdt.h>

/*
- * ccmap maintains 1:1 mapping between CLOSid and cbm.
+ * ccmap maintains 1:1 mapping between CLOSid and cache_mask.
*/
static struct clos_cbm_map *ccmap;
static struct rdt_subsys_info rdtss_info;
static DEFINE_MUTEX(rdt_group_mutex);
struct intel_rdt rdt_root_group;

+/*
+ * Mask of CPUs for writing CBM values. We only need one per-socket.
+ */
+static cpumask_t rdt_cpumask;
+
+#define rdt_for_each_child(pos_css, parent_ir) \
+ css_for_each_child((pos_css), &(parent_ir)->css)
+
static inline bool cat_supported(struct cpuinfo_x86 *c)
{
if (cpu_has(c, X86_FEATURE_CAT_L3))
@@ -42,22 +51,66 @@ static inline bool cat_supported(struct cpuinfo_x86 *c)
return false;
}

+static void __clos_init(unsigned int closid)
+{
+ struct clos_cbm_map *ccm = &ccmap[closid];
+
+ lockdep_assert_held(&rdt_group_mutex);
+
+ ccm->clos_refcnt = 1;
+}
+
/*
-* Called with the rdt_group_mutex held.
-*/
-static int rdt_free_closid(struct intel_rdt *ir)
+ * Allocates a new closid from unused closids.
+ */
+static int rdt_alloc_closid(struct intel_rdt *ir)
{
+ unsigned int id;
+ unsigned int maxid;

lockdep_assert_held(&rdt_group_mutex);

- WARN_ON(!ccmap[ir->clos].cgrp_count);
- ccmap[ir->clos].cgrp_count--;
- if (!ccmap[ir->clos].cgrp_count)
- clear_bit(ir->clos, rdtss_info.closmap);
+ maxid = boot_cpu_data.x86_cat_closs;
+ id = find_next_zero_bit(rdtss_info.closmap, maxid, 0);
+ if (id == maxid)
+ return -ENOSPC;
+
+ set_bit(id, rdtss_info.closmap);
+ __clos_init(id);
+ ir->clos = id;

return 0;
}

+static void rdt_free_closid(unsigned int clos)
+{
+
+ lockdep_assert_held(&rdt_group_mutex);
+
+ clear_bit(clos, rdtss_info.closmap);
+}
+
+static void __clos_get(unsigned int closid)
+{
+ struct clos_cbm_map *ccm = &ccmap[closid];
+
+ lockdep_assert_held(&rdt_group_mutex);
+
+ ccm->clos_refcnt += 1;
+}
+
+static void __clos_put(unsigned int closid)
+{
+ struct clos_cbm_map *ccm = &ccmap[closid];
+
+ lockdep_assert_held(&rdt_group_mutex);
+ WARN_ON(!ccm->clos_refcnt);
+
+ ccm->clos_refcnt -= 1;
+ if (!ccm->clos_refcnt)
+ rdt_free_closid(closid);
+}
+
static struct cgroup_subsys_state *
rdt_css_alloc(struct cgroup_subsys_state *parent_css)
{
@@ -77,27 +130,285 @@ rdt_css_alloc(struct cgroup_subsys_state *parent_css)

mutex_lock(&rdt_group_mutex);
ir->clos = parent->clos;
- ccmap[parent->clos].cgrp_count++;
+ __clos_get(ir->clos);
mutex_unlock(&rdt_group_mutex);

return &ir->css;
}

+static void rdt_css_free(struct cgroup_subsys_state *css)
+{
+ struct intel_rdt *ir = css_rdt(css);
+
+ mutex_lock(&rdt_group_mutex);
+ __clos_put(ir->clos);
+ kfree(ir);
+ mutex_unlock(&rdt_group_mutex);
+}
+
+static inline bool cbm_is_contiguous(unsigned long var)
+{
+ unsigned long first_bit, zero_bit;
+ unsigned long maxcbm = MAX_CBM_LENGTH;
+
+ if (!var)
+ return false;
+
+ first_bit = find_next_bit(&var, maxcbm, 0);
+ zero_bit = find_next_zero_bit(&var, maxcbm, first_bit);
+
+ if (find_next_bit(&var, maxcbm, zero_bit) < maxcbm)
+ return false;
+
+ return true;
+}
+
+static int cat_cbm_read(struct seq_file *m, void *v)
+{
+ struct intel_rdt *ir = css_rdt(seq_css(m));
+
+ seq_printf(m, "%08lx\n", ccmap[ir->clos].cache_mask);
+ return 0;
+}
+
+static int validate_cbm(struct intel_rdt *ir, unsigned long cbmvalue)
+{
+ struct intel_rdt *par, *c;
+ struct cgroup_subsys_state *css;
+ unsigned long *cbm_tmp;
+
+ if (!cbm_is_contiguous(cbmvalue)) {
+ pr_err("bitmask should have >= 1 bits and be contiguous\n");
+ return -EINVAL;
+ }
+
+ par = parent_rdt(ir);
+ cbm_tmp = &ccmap[par->clos].cache_mask;
+ if (!bitmap_subset(&cbmvalue, cbm_tmp, MAX_CBM_LENGTH))
+ return -EINVAL;
+
+ rcu_read_lock();
+ rdt_for_each_child(css, ir) {
+ c = css_rdt(css);
+ cbm_tmp = &ccmap[c->clos].cache_mask;
+ if (!bitmap_subset(cbm_tmp, &cbmvalue, MAX_CBM_LENGTH)) {
+ rcu_read_unlock();
+ pr_err("Children's mask not a subset\n");
+ return -EINVAL;
+ }
+ }
+
+ rcu_read_unlock();
+ return 0;
+}
+
+/*
+ * cbm_search() - Look for a CLOSid that is in use and already maps to @cbm.
+ * @cbm: cache bit mask to look up.
+ * @closid: set to the matching CLOSid when one is found.
+ *
+ * Entries whose clos_refcnt is zero are skipped: __clos_put() releases the
+ * CLOSid in the closmap without clearing cache_mask, so such an entry holds
+ * a stale mask and its id may be handed out again by rdt_alloc_closid().
+ *
+ * Return: true if a referenced CLOSid with an identical mask exists.
+ */
+static bool cbm_search(unsigned long cbm, int *closid)
+{
+	unsigned int maxid = boot_cpu_data.x86_cat_closs;
+	unsigned int i;
+
+	for (i = 0; i < maxid; i++) {
+		if (!ccmap[i].clos_refcnt)
+			continue;
+
+		if (bitmap_equal(&cbm, &ccmap[i].cache_mask, MAX_CBM_LENGTH)) {
+			*closid = i;
+			return true;
+		}
+	}
+	return false;
+}
+
+static void cbmmap_dump(void)
+{
+ int i;
+
+ pr_debug("CBMMAP\n");
+ for (i = 0; i < boot_cpu_data.x86_cat_closs; i++)
+ pr_debug("cache_mask: 0x%x,clos_refcnt: %u\n",
+ (unsigned int)ccmap[i].cache_mask, ccmap[i].clos_refcnt);
+}
+
+static void __cpu_cbm_update(void *info)
+{
+ unsigned int closid = *((unsigned int *)info);
+
+ wrmsrl(CBM_FROM_INDEX(closid), ccmap[closid].cache_mask);
+}
+
+/*
+ * cbm_update_all() - Update the cache bit mask for all packages.
+ */
+static inline void cbm_update_all(unsigned int closid)
+{
+ on_each_cpu_mask(&rdt_cpumask, __cpu_cbm_update, &closid, 1);
+}
+
+/*
+ * cbm_update_msrs() - Updates all the existing IA32_L3_MASK_n MSRs
+ * which are one per CLOSid, on the current package.
+ * @cpu : the cpu on which the mask is updated.
+ */
+static inline void cbm_update_msrs(int cpu)
+{
+ int maxid = boot_cpu_data.x86_cat_closs;
+ unsigned int i;
+
+ if (WARN_ON(cpu != smp_processor_id()))
+ return;
+
+ for (i = 1; i < maxid; i++) {
+ if (ccmap[i].clos_refcnt)
+ __cpu_cbm_update(&i);
+ }
+}
+
+/*
+ * cat_cbm_write() - Validate and write the cache bit mask (CBM) to
+ * IA32_L3_MASK_n and store the same value in the ccmap.
+ * @css: css of the cgroup whose cache_mask is written.
+ * @cft: file type of the written file (unused).
+ * @cbmvalue: requested mask; only the low x86_cat_cbmlength bits are kept.
+ *
+ * CLOSids are reused for cgroups which have the same bitmask.
+ * - This helps to use the scant CLOSids optimally.
+ * - This also implies that at context switch a write
+ *   to the PQR-MSR is done only when a task with a
+ *   different bitmask is scheduled in.
+ *
+ * Return: 0 on success or when the mask is unchanged, -EPERM for the root
+ * group, otherwise the error from validate_cbm() or rdt_alloc_closid().
+ */
+static int cat_cbm_write(struct cgroup_subsys_state *css,
+			 struct cftype *cft, u64 cbmvalue)
+{
+	struct intel_rdt *ir = css_rdt(css);
+	u32 max_cbm = boot_cpu_data.x86_cat_cbmlength;
+	unsigned long cache_mask = 0, max_mask = 0;
+	unsigned long *cbm_tmp;
+	unsigned int old_clos;
+	int closid;
+	int err = 0;
+
+	if (ir == &rdt_root_group)
+		return -EPERM;
+
+	/* bitmap_set() only ORs bits in, so max_mask must start out zeroed. */
+	bitmap_set(&max_mask, 0, max_cbm);
+
+	/*
+	 * Need global mutex as cbm write may allocate a closid.
+	 */
+	mutex_lock(&rdt_group_mutex);
+	bitmap_and(&cache_mask, (unsigned long *)&cbmvalue, &max_mask, max_cbm);
+	cbm_tmp = &ccmap[ir->clos].cache_mask;
+
+	if (bitmap_equal(&cache_mask, cbm_tmp, MAX_CBM_LENGTH))
+		goto out;
+
+	err = validate_cbm(ir, cache_mask);
+	if (err)
+		goto out;
+
+	/*
+	 * At this point we are sure to change the cache_mask. Hence release
+	 * the reference to the current CLOSid and try to get a reference for
+	 * a different CLOSid.
+	 */
+	old_clos = ir->clos;
+	__clos_put(old_clos);
+
+	/*
+	 * Only share a CLOSid that is still referenced: a released entry keeps
+	 * its old cache_mask in ccmap, but its id may be allocated again.
+	 */
+	if (cbm_search(cache_mask, &closid) && ccmap[closid].clos_refcnt) {
+		ir->clos = closid;
+		__clos_get(closid);
+	} else {
+		err = rdt_alloc_closid(ir);
+		if (err) {
+			/*
+			 * Allocation fails only when every CLOSid is taken,
+			 * so the put above cannot have freed old_clos and
+			 * ir->clos still names it. Take the reference back
+			 * so the group stays accounted for.
+			 */
+			__clos_get(old_clos);
+			goto out;
+		}
+
+		ccmap[ir->clos].cache_mask = cache_mask;
+		cbm_update_all(ir->clos);
+	}
+
+	cbmmap_dump();
+out:
+	mutex_unlock(&rdt_group_mutex);
+	return err;
+}
+
+static inline bool rdt_update_cpumask(int cpu)
+{
+ int phys_id = topology_physical_package_id(cpu);
+ struct cpumask *mask = &rdt_cpumask;
+ int i;
+
+ for_each_cpu(i, mask) {
+ if (phys_id == topology_physical_package_id(i))
+ return false;
+ }
+
+ cpumask_set_cpu(cpu, mask);
+ return true;
+}
+
+/*
+ * rdt_cpu_start() - If a new package has come up, update all
+ * the Cache bitmasks on the package.
+ */
+static inline void rdt_cpu_start(int cpu)
+{
+ mutex_lock(&rdt_group_mutex);
+ if (rdt_update_cpumask(cpu))
+ cbm_update_msrs(cpu);
+ mutex_unlock(&rdt_group_mutex);
+}
+
+static void rdt_cpu_exit(unsigned int cpu)
+{
+ int phys_id = topology_physical_package_id(cpu);
+ int i;
+
+ mutex_lock(&rdt_group_mutex);
+ if (!cpumask_test_and_clear_cpu(cpu, &rdt_cpumask)) {
+ mutex_unlock(&rdt_group_mutex);
+ return;
+ }
+
+ for_each_online_cpu(i) {
+ if (i == cpu)
+ continue;
+
+ if (phys_id == topology_physical_package_id(i)) {
+ cpumask_set_cpu(i, &rdt_cpumask);
+ break;
+ }
+ }
+ mutex_unlock(&rdt_group_mutex);
+}
+
+static int rdt_cpu_notifier(struct notifier_block *nb,
+ unsigned long action, void *hcpu)
+{
+ unsigned int cpu = (unsigned long)hcpu;
+
+ switch (action) {
+ case CPU_STARTING:
+ rdt_cpu_start(cpu);
+ break;
+ case CPU_DOWN_PREPARE:
+ rdt_cpu_exit(cpu);
+ break;
+ default:
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
static int __init rdt_late_init(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
static struct clos_cbm_map *ccm;
size_t sizeb;
- int maxid, cbm_len;
+ int maxid, cbm_len, i;

if (!cat_supported(c)) {
rdt_root_group.css.ss->disabled = 1;
return -ENODEV;
}
+
maxid = c->x86_cat_closs;
cbm_len = c->x86_cat_cbmlength;
- sizeb = BITS_TO_LONGS(maxid) * sizeof(long);

+ sizeb = BITS_TO_LONGS(maxid) * sizeof(long);
rdtss_info.closmap = kzalloc(sizeb, GFP_KERNEL);
if (!rdtss_info.closmap)
return -ENOMEM;
@@ -111,11 +422,17 @@ static int __init rdt_late_init(void)

set_bit(0, rdtss_info.closmap);
rdt_root_group.clos = 0;
-
ccm = &ccmap[0];
- ccm->cbm = (u32)((u64)(1 << cbm_len) - 1);
- ccm->cgrp_count++;
+ bitmap_set(&ccm->cache_mask, 0, cbm_len);
+ ccm->clos_refcnt = 1;
+
+ cpu_notifier_register_begin();
+ for_each_online_cpu(i)
+ rdt_update_cpumask(i);
+
+ __hotcpu_notifier(rdt_cpu_notifier, 0);

+ cpu_notifier_register_done();
pr_info("Max bitmask length:%u,Max ClosIds: %u\n", cbm_len, maxid);

return 0;
@@ -123,18 +440,19 @@ static int __init rdt_late_init(void)

late_initcall(rdt_late_init);

-static void rdt_css_free(struct cgroup_subsys_state *css)
-{
- struct intel_rdt *ir = css_rdt(css);
-
- mutex_lock(&rdt_group_mutex);
- rdt_free_closid(ir);
- kfree(ir);
- mutex_unlock(&rdt_group_mutex);
-}
+/*
+ * Control files of the RDT cgroup. cache_mask is world-readable but only
+ * owner-writable: a write goes through cat_cbm_write(), which can reprogram
+ * the cache bit mask MSRs, so it must not be open to every user.
+ */
+static struct cftype rdt_files[] = {
+	{
+		.name = "cache_mask",
+		.seq_show = cat_cbm_read,
+		.write_u64 = cat_cbm_write,
+		.mode = 0644,
+	},
+	{ }	/* terminate */
+};

struct cgroup_subsys rdt_cgrp_subsys = {
.css_alloc = rdt_css_alloc,
.css_free = rdt_css_free,
+ .legacy_cftypes = rdt_files,
.early_init = 0,
};
--
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/