[PATCH 6/9] x86/intel_rdt: Add support for cache bit mask management

From: Vikas Shivappa
Date: Thu Aug 06 2015 - 17:56:43 EST


Add a file, l3_cbm, to the intel_rdt cgroup which represents the cache
capacity bit mask (CBM) for the cgroup. Tasks in the cgroup are allowed
to fill the portion of the L3 cache represented by the cgroup's l3_cbm
file. The bit mask typically maps to ways in the cache, but the exact
mapping is hardware implementation specific.
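
For illustration, a minimal userspace sketch that sets a mask through
the cgroup filesystem. The mount point, the group name and the
"intel_rdt." file name prefix are assumptions about how the controller
is mounted, not something this patch defines:

	#include <stdio.h>

	int main(void)
	{
		/* Hypothetical path: legacy hierarchy mounted at
		 * /sys/fs/cgroup/intel_rdt, child group made with mkdir. */
		FILE *f = fopen("/sys/fs/cgroup/intel_rdt/group1/intel_rdt.l3_cbm", "w");

		if (!f)
			return 1;
		/* Restrict group1's tasks to the capacity covered by mask 0xf. */
		fputs("0xf\n", f);

		return fclose(f) ? 1 : 0;
	}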

The l3_cbm maps to one of the IA32_L3_MASK_n MSRs, whereby any update
to l3_cbm results in an MSR write to the appropriate IA32_L3_MASK_n.
The IA32_L3_MASK_n MSRs are per package, but l3_cbm represents the
global value of the MSR on all packages.
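
For example (illustrative values): if a cgroup holds CLOSid 2, writing
0xff to its l3_cbm ends up executing, on one CPU of each package:

	wrmsrl(CBM_FROM_INDEX(2), 0xff);	/* MSR 0xc90 + 2 = IA32_L3_MASK_2 */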

When a child cgroup is created it inherits the CLOSid and the l3_cbm
from its parent. When a user changes the default l3_cbm for a cgroup, a
new CLOSid may be allocated if that l3_cbm was not used before. If the
new l3_cbm is one that is already in use, the reference count for that
CLOSid <-> l3_cbm pairing is incremented. Changing l3_cbm may fail with
-ENOSPC once the kernel runs out of the maximum number of CLOSids it
can support.

Users can create as many cgroups as they want, but the number of
different l3_cbm values in use at the same time is restricted by the
maximum number of CLOSids. The kernel maintains a CLOSid <-> l3_cbm
mapping which counts the cgroups using each CLOSid.
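
Conceptually the mapping is an array indexed by CLOSid. A sketch of the
table entries this patch manipulates (the structure itself is
introduced earlier in the series):

	/* One entry per CLOSid; cctable[] is indexed by CLOSid. */
	struct clos_cbm_table {
		unsigned long l3_cbm;		/* cache capacity bit mask */
		unsigned int clos_refcnt;	/* cgroups using this CLOSid */
	};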

Reusing CLOSids for cgroups with the same bit mask also has the
following advantages:
- It helps to use the scant CLOSids optimally.
- It implies that during a context switch, the PQR MSR is written only
when a task with a different bit mask is scheduled in (see the sketch
below).
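
The second point refers to the per-CPU PQR assignment MSR
(IA32_PQR_ASSOC, 0xc8f), whose high word selects the active CLOSid. A
minimal sketch of the sched-in path this relies on, assuming the hook
added earlier in the series; pqr_closid and task_rdt() are illustrative
names, not the patch's own:

	static DEFINE_PER_CPU(u32, pqr_closid);

	static inline void intel_rdt_sched_in(struct task_struct *task)
	{
		u32 next_closid = task_rdt(task)->closid;

		/* Same bit mask as the CPU already has: skip the MSR write. */
		if (this_cpu_read(pqr_closid) == next_closid)
			return;

		this_cpu_write(pqr_closid, next_closid);
		wrmsr(MSR_IA32_PQR_ASSOC, 0, next_closid);	/* CLOSid in high word */
	}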

Signed-off-by: Vikas Shivappa <vikas.shivappa@xxxxxxxxxxxxxxx>
---
arch/x86/include/asm/intel_rdt.h | 3 +
arch/x86/kernel/cpu/intel_rdt.c | 202 ++++++++++++++++++++++++++++++++++++++-
2 files changed, 204 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h
index a887004..58bac91 100644
--- a/arch/x86/include/asm/intel_rdt.h
+++ b/arch/x86/include/asm/intel_rdt.h
@@ -4,6 +4,9 @@
#ifdef CONFIG_CGROUP_RDT

#include <linux/cgroup.h>
+#define MAX_CBM_LENGTH 32
+#define IA32_L3_CBM_BASE 0xc90
+#define CBM_FROM_INDEX(x) (IA32_L3_CBM_BASE + (x))

struct rdt_subsys_info {
unsigned long *closmap;
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index 52e1fd6..115f136 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -34,6 +34,13 @@ static struct clos_cbm_table *cctable;
static struct rdt_subsys_info rdtss_info;
static DEFINE_MUTEX(rdt_group_mutex);
struct intel_rdt rdt_root_group;
+/*
+ * Mask of CPUs for writing CBM values. We only need one CPU per package.
+ */
+static cpumask_t rdt_cpumask;
+
+#define rdt_for_each_child(pos_css, parent_ir) \
+ css_for_each_child((pos_css), &(parent_ir)->css)

static inline void closid_get(u32 closid)
{
@@ -117,12 +124,192 @@ static void intel_rdt_css_free(struct cgroup_subsys_state *css)
mutex_unlock(&rdt_group_mutex);
}

+static int intel_cache_alloc_cbm_read(struct seq_file *m, void *v)
+{
+ struct intel_rdt *ir = css_rdt(seq_css(m));
+
+ seq_printf(m, "%08lx\n", cctable[ir->closid].l3_cbm);
+
+ return 0;
+}
+
+static inline bool cbm_is_contiguous(unsigned long var)
+{
+ unsigned long maxcbm = MAX_CBM_LENGTH;
+ unsigned long first_bit, zero_bit;
+
+ if (!var)
+ return false;
+
+ first_bit = find_first_bit(&var, maxcbm);
+ zero_bit = find_next_zero_bit(&var, maxcbm, first_bit);
+
+ if (find_next_bit(&var, maxcbm, zero_bit) < maxcbm)
+ return false;
+
+ return true;
+}
+
+static int cbm_validate(struct intel_rdt *ir, unsigned long cbmvalue)
+{
+ struct cgroup_subsys_state *css;
+ struct intel_rdt *par, *c;
+ unsigned long *cbm_tmp;
+ int err = 0;
+
+ if (!cbm_is_contiguous(cbmvalue)) {
+ err = -EINVAL;
+ goto out_err;
+ }
+
+ par = parent_rdt(ir);
+ cbm_tmp = &cctable[par->closid].l3_cbm;
+ if (!bitmap_subset(&cbmvalue, cbm_tmp, MAX_CBM_LENGTH)) {
+ err = -EINVAL;
+ goto out_err;
+ }
+
+ rcu_read_lock();
+ rdt_for_each_child(css, ir) {
+ c = css_rdt(css);
+ cbm_tmp = &cctable[c->closid].l3_cbm;
+ if (!bitmap_subset(cbm_tmp, &cbmvalue, MAX_CBM_LENGTH)) {
+ rcu_read_unlock();
+ err = -EINVAL;
+ goto out_err;
+ }
+ }
+ rcu_read_unlock();
+out_err:
+
+ return err;
+}
+
+static bool cbm_search(unsigned long cbm, u32 *closid)
+{
+ u32 maxid = boot_cpu_data.x86_cache_max_closid;
+ u32 i;
+
+ for (i = 0; i < maxid; i++) {
+ if (bitmap_equal(&cbm, &cctable[i].l3_cbm, MAX_CBM_LENGTH)) {
+ *closid = i;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static void closcbm_map_dump(void)
+{
+ u32 i;
+
+ pr_debug("CBMMAP\n");
+ for (i = 0; i < boot_cpu_data.x86_cache_max_closid; i++) {
+ pr_debug("l3_cbm: 0x%x,clos_refcnt: %u\n",
+ (unsigned int)cctable[i].l3_cbm, cctable[i].clos_refcnt);
+ }
+}
+
+static void cbm_cpu_update(void *info)
+{
+ u32 closid = (u32)(unsigned long)info;
+
+ wrmsrl(CBM_FROM_INDEX(closid), cctable[closid].l3_cbm);
+}
+
+/*
+ * cbm_update_all() - Update the cache bit mask for all packages.
+ */
+static inline void cbm_update_all(u32 closid)
+{
+ on_each_cpu_mask(&rdt_cpumask, cbm_cpu_update, (void *)(unsigned long)closid, 1);
+}
+
+/*
+ * intel_cache_alloc_cbm_write() - Validates and writes the
+ * cache bit mask (CBM) to the appropriate IA32_L3_MASK_n MSR
+ * and also stores it in the cctable.
+ *
+ * CLOSids are reused for cgroups which have the same bit mask.
+ * This helps to use the scant CLOSids optimally. It also
+ * implies that at context switch, the PQR MSR is written only
+ * when a task with a different bit mask is scheduled in.
+ */
+static int intel_cache_alloc_cbm_write(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 cbmvalue)
+{
+ u32 max_cbm = boot_cpu_data.x86_cache_max_cbm_len;
+ struct intel_rdt *ir = css_rdt(css);
+ u64 max_mask;
+ int err = 0;
+ u32 closid;
+
+ if (ir == &rdt_root_group)
+ return -EPERM;
+
+ /*
+ * Need global mutex as cbm write may allocate a closid.
+ */
+ mutex_lock(&rdt_group_mutex);
+
+ max_mask = (1ULL << max_cbm) - 1;
+ if (cbmvalue & ~max_mask) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (cbmvalue == cctable[ir->closid].l3_cbm)
+ goto out;
+
+ err = cbm_validate(ir, cbmvalue);
+ if (err)
+ goto out;
+
+ /*
+ * Try to get a reference to a different CLOSid and release the
+ * reference to the current CLOSid.
+ * The reference must be dropped here and reacquired on failure:
+ * otherwise, when this cgroup holds the last available CLOSid,
+ * writing a new bit mask to this same cgroup would wrongly fail.
+ */
+ closid_put(ir->closid);
+ if (cbm_search(cbmvalue, &closid)) {
+ ir->closid = closid;
+ closid_get(closid);
+ } else {
+ closid = ir->closid;
+ err = closid_alloc(ir);
+ if (err) {
+ closid_get(ir->closid);
+ goto out;
+ }
+
+ cctable[ir->closid].l3_cbm = cbmvalue;
+ cbm_update_all(ir->closid);
+ }
+ closcbm_map_dump();
+out:
+ mutex_unlock(&rdt_group_mutex);
+
+ return err;
+}
+
+static inline void rdt_cpumask_update(int cpu)
+{
+ static cpumask_t tmp;
+
+ cpumask_and(&tmp, &rdt_cpumask, topology_core_cpumask(cpu));
+ if (cpumask_empty(&tmp))
+ cpumask_set_cpu(cpu, &rdt_cpumask);
+}
+
static int __init intel_rdt_late_init(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
static struct clos_cbm_table *cct;
u32 maxid, max_cbm_len;
- int err = 0, size;
+ int err = 0, size, i;

if (!cpu_has(c, X86_FEATURE_CAT_L3)) {
rdt_root_group.css.ss->disabled = 1;
@@ -152,6 +339,9 @@ static int __init intel_rdt_late_init(void)
cct->l3_cbm = (1ULL << max_cbm_len) - 1;
cct->clos_refcnt = 1;

+ for_each_online_cpu(i)
+ rdt_cpumask_update(i);
+
pr_info("Intel cache allocation enabled\n");
out_err:

@@ -160,8 +350,18 @@ out_err:

late_initcall(intel_rdt_late_init);

+static struct cftype rdt_files[] = {
+ {
+ .name = "l3_cbm",
+ .seq_show = intel_cache_alloc_cbm_read,
+ .write_u64 = intel_cache_alloc_cbm_write,
+ },
+ { } /* terminate */
+};
+
struct cgroup_subsys intel_rdt_cgrp_subsys = {
.css_alloc = intel_rdt_css_alloc,
.css_free = intel_rdt_css_free,
+ .legacy_cftypes = rdt_files,
.early_init = 0,
};
--
1.9.1
