RE: [PATCH v2] perf,x86: add Intel Memory Bandwidth Monitoring (MBM) PMU

From: Juvva, Kanaka D
Date: Mon Aug 03 2015 - 13:51:19 EST


Thomas,

Acknowledged. I'll update the patch and generate separate patch(s) wherever cqm changes
are required. One thing was checkpatch.pl gave errors in current cqm code. I had to break lines
as it complained about some line(s) exceeding 80 chars.

Regards.
-Kanaka

> -----Original Message-----
> From: Thomas Gleixner [mailto:tglx@xxxxxxxxxxxxx]
> Sent: Tuesday, July 28, 2015 12:59 AM
> To: Kanaka Juvva
> Cc: Juvva, Kanaka D; Williamson, Glenn P; Fleming, Matt; Auld, Will; Andi Kleen;
> LKML; Luck, Tony; Peter Zijlstra; Tejun Heo; Autee, Priya V; x86@xxxxxxxxxx;
> mingo@xxxxxxxxxx; H. Peter Anvin; Shivappa, Vikas
> Subject: Re: [PATCH v2] perf,x86: add Intel Memory Bandwidth Monitoring
> (MBM) PMU
>
> On Tue, 21 Jul 2015, Kanaka Juvva wrote:
> > diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
> > b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
> > index 1880761..dc1ce58 100644
> > --- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
> > +++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
> > @@ -12,10 +12,20 @@
> > #define MSR_IA32_PQR_ASSOC 0x0c8f
> > #define MSR_IA32_QM_CTR 0x0c8e
> > #define MSR_IA32_QM_EVTSEL 0x0c8d
> > +#define MAX_MBM_CNTR 0xffffff
>
> Can we please have a consistent name space MBM_... here?
>
> > +#define MBM_SOCKET_MAX 8
> > +#define MBM_TIME_DELTA_MAX 1000
> > +#define MBM_TIME_DELTA_MIN 1000
> > +#define MBM_SCALING_FACTOR 1000
> > +#define MBM_SCALING_HALF (MBM_SCALING_FACTOR/2)
> > +#define MBM_FIFO_SIZE_MIN 10
> > +#define MBM_FIFO_SIZE_MAX 300
> >
> > -static u32 cqm_max_rmid = -1;
> > -static unsigned int cqm_l3_scale; /* supposedly cacheline size */
> >
> > +static u32 cqm_max_rmid = -1;
> > +static unsigned int cqm_l3_scale;/* supposedly cacheline size */
>
> Pointless white space change
>
> > +static unsigned mbm_window_size = MBM_FIFO_SIZE_MIN; static bool
> > +cqm_llc_occ, cqm_mbm;
> > /**
> > * struct intel_pqr_state - State cache for the PQR MSR
> > * @rmid: The cached Resource Monitoring ID
> > @@ -42,6 +52,34 @@ struct intel_pqr_state {
> > * interrupts disabled, which is sufficient for the protection.
> > */
> > static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
> > +static DEFINE_PER_CPU(struct mbm_pmu *, mbm_pmu); static
> > +DEFINE_PER_CPU(struct mbm_pmu *, mbm_pmu_to_free);
> > +
> > +/*
> > + * mbm pmu is used for storing mbm_local and mbm_total
> > + * events
> > + */
> > +struct mbm_pmu {
> > + spinlock_t lock;
>
> What is this lock protecting and how does it nest into perf locking?
>
> > + int n_active; /* number of active events */
>
> If you want to document your struct members, please use kerneldoc and not
> these hard to read tail comments.
>
> > + struct list_head active_list;
> > + struct pmu *pmu; /* pointer to mbm perf_event */
> > + ktime_t timer_interval; /* in ktime_t unit */
> > + struct hrtimer hrtimer;
> > +};
> > +
> > +struct sample {
> > + u64 bytes; /* mbm counter value read*/
> > + u64 cum_avg; /* current running average of bandwidth */
> > + ktime_t prev_time;
> > + u64 index; /* current sample no */
> > + u32 mbmfifo[MBM_FIFO_SIZE_MAX]; /* window of last n bw values */
> > + u32 fifoin; /* mbmfifo in counter for sliding window */
> > + u32 fifoout; /* mbmfifo out counter for sliding window */
>
> So above you use a proper layout which is actualy readable (aside of the tail
> comments). Can you please stick with a consistent coding style?
>
> > +};
> > +
> > +struct sample *mbm_total; /* curent stats for mbm_total events */
> > +struct sample *mbm_local; /* current stats for mbm_local events */
>
> static ? And please get rid of these tail comments.
>
> > /*
> > * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru.
> > @@ -66,6 +104,9 @@ static cpumask_t cqm_cpumask;
> > #define RMID_VAL_UNAVAIL (1ULL << 62)
> >
> > #define QOS_L3_OCCUP_EVENT_ID (1 << 0)
> > +#define QOS_MBM_TOTAL_EVENT_ID (1 << 1)
>
> Please use tabs not spaces.
>
> > +#define QOS_MBM_LOCAL_EVENT_ID1 0x3
>
> What's ID1 for heavens sake? Looks like
>
> (QOS_L3_OCCUP_EVENT_ID | QOS_MBM_TOTAL_EVENT_ID)
>
> So this wants a descriptive ID name and a comment.
>
> > +#define QOS_MBM_LOCAL_EVENT_ID (1 << 2)
> >
> > #define QOS_EVENT_MASK QOS_L3_OCCUP_EVENT_ID
> >
> > @@ -90,7 +131,8 @@ static u32 intel_cqm_rotation_rmid;
> > */
> > static inline bool __rmid_valid(u32 rmid) {
> > - if (!rmid || rmid == INVALID_RMID)
> > + WARN_ON_ONCE(rmid > cqm_max_rmid);
> > + if (!rmid || (rmid == INVALID_RMID) || (rmid > cqm_max_rmid))
> > return false;
>
> How is that related to this patch? Looks like a fix or prerequisite change to the
> existing code.
>
> > return true;
> > @@ -125,6 +167,7 @@ struct cqm_rmid_entry {
> > enum rmid_recycle_state state;
> > struct list_head list;
> > unsigned long queue_time;
> > + bool config;
> > };
> >
> > /*
> > @@ -176,6 +219,17 @@ static inline struct cqm_rmid_entry
> *__rmid_entry(u32 rmid)
> > return entry;
> > }
> >
> > +static void mbm_reset_stats(u32 rmid) {
> > + u32 vrmid = topology_physical_package_id(smp_processor_id()) *
> > + cqm_max_rmid + rmid;
>
> Can you please explain that magic calculation? It's far from obvious why this
> would be correct.
>
> > +
> > + if ((!cqm_mbm) || (!mbm_local) || (!mbm_total))
> > + return;
> > + memset(&mbm_local[vrmid], 0, sizeof(struct sample));
> > + memset(&mbm_total[vrmid], 0, sizeof(struct sample)); }
> > +
> > /*
> > * Returns < 0 on fail.
> > *
> > @@ -190,8 +244,10 @@ static u32 __get_rmid(void)
> > if (list_empty(&cqm_rmid_free_lru))
> > return INVALID_RMID;
> >
> > - entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry,
> list);
> > + entry = list_first_entry(&cqm_rmid_free_lru,
> > + struct cqm_rmid_entry, list);
>
> And the point of this change is?
>
> > list_del(&entry->list);
> > + mbm_reset_stats(entry->rmid);
> >
> > return entry->rmid;
> > }
> > @@ -207,6 +263,7 @@ static void __put_rmid(u32 rmid)
> >
> > entry->queue_time = jiffies;
> > entry->state = RMID_YOUNG;
> > + mbm_reset_stats(rmid);
> >
> > list_add_tail(&entry->list, &cqm_rmid_limbo_lru); } @@ -232,6
> > +289,8 @@ static int intel_cqm_setup_rmid_cache(void)
> >
> > INIT_LIST_HEAD(&entry->list);
> > entry->rmid = r;
> > + entry->config = false;
> > +
> > cqm_rmid_ptrs[r] = entry;
> >
> > list_add_tail(&entry->list, &cqm_rmid_free_lru); @@ -254,6
> +313,8
> > @@ fail:
> > kfree(cqm_rmid_ptrs[r]);
> >
> > kfree(cqm_rmid_ptrs);
> > + kfree(mbm_local);
> > + kfree(mbm_total);
> > return -ENOMEM;
> > }
> >
> > @@ -403,9 +464,11 @@ static void __intel_cqm_event_count(void *info);
> > static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid) {
> > struct perf_event *event;
> > +
>
> Superfluous newline.
>
> > struct list_head *head = &group->hw.cqm_group_entry;
> > u32 old_rmid = group->hw.cqm_rmid;
> >
> > +
>
> Another one. Sigh!
>
> > lockdep_assert_held(&cache_mutex);
> >
> > /*
> > @@ -494,6 +557,166 @@ static bool intel_cqm_sched_in_event(u32 rmid)
> > return false;
> > }
> >
> > +static u32 mbm_fifo_sum_lastn_out(int rmid, bool is_localbw)
>
> Random space after u32
>
> > +{
> > + u32 val = 0, i, j, index;
> > +
> > + if (is_localbw) {
> > + if (++mbm_local[rmid].fifoout >= mbm_window_size)
> > + mbm_local[rmid].fifoout = 0;
> > + index = mbm_local[rmid].fifoout;
> > + for (i = 0; i < mbm_window_size - 1; i++) {
> > + if (index + i >= mbm_window_size)
> > + j = index + i - mbm_window_size;
> > + else
> > + j = index + i;
> > + val += mbm_local[rmid].mbmfifo[j];
> > + }
> > + return val;
> > + }
> > +
> > + if (++mbm_total[rmid].fifoout >= mbm_window_size)
> > + mbm_total[rmid].fifoout = 0;
> > + index = mbm_total[rmid].fifoout;
> > + for (i = 0; i < mbm_window_size - 1; i++) {
> > + if (index + i >= mbm_window_size)
> > + j = index + i - mbm_window_size;
> > + else
> > + j = index + i;
> > + val += mbm_total[rmid].mbmfifo[j];
> > + }
>
> The concept of helper functions to avoid code duplication is
> definitely applicable here.
>
> > + return val;
> > +}
> > +
> > +static int mbm_fifo_in(int rmid, u32 val, bool is_localbw)
>
> Random space after int.
>
> > +{
> > + if (is_localbw) {
> > + mbm_local[rmid].mbmfifo[mbm_local[rmid].fifoin] = val;
> > + if (++mbm_local[rmid].fifoin >= mbm_window_size)
> > + mbm_local[rmid].fifoin = 0;
> > + } else {
> > + mbm_total[rmid].mbmfifo[mbm_total[rmid].fifoin] = val;
> > + if (++mbm_total[rmid].fifoin >= mbm_window_size)
> > + mbm_total[rmid].fifoin = 0;
> > + }
>
> static void mbm_fifo_in(struct sample *, u32 val)
>
> would get rid of code duplication.
>
> > + return 0;
>
> What's the purpose of having a return value here?
>
> > +}
> > +
> > +/*
> > + * __rmid_read_mbm checks whether it is LOCAL or GLOBAL MBM event and
> reads
> > + * its MSR counter. if (MSR current value < MSR previous value) it is an
> > + * overflow and overflow is handled. If MSR is read within last 100ms,
> > + * then the value is ignored; this will suppress small deltas. We don't
> > + * process MBM samples that are within 100ms. Bandwidth is calculated as:
> > + * bandwidth = difference of last two msr counter values/time difference.
> > + * cum_avg = Running Average bandwidth of last 'n' samples that are
> processed
> > + * Sliding window is used to save the last 'n' samples. Hence,
> > + * n = sliding_window_size
> > + * cum_avg is scaled down by a factor MBM_SCALING_FACTOR and rounded
> to nearest
> > + * integer. User interface reads the BW in MB/sec.
> > + * Rounding algorithm : (X + 0.5):
> > + * where X is scaled BW value. To avoid floating point arithmetic :
> > + * BW- unscaled value
> > + * (BW + MBM_SCALING_HALF)/MBM_SCALING_FACTOR is computed which
> gives the
> > + * scaled bandwidth.
>
> This is completely unreadable.
>
> > + */
> > +static u64 __rmid_read_mbm(unsigned int rmid, bool read_mbm_local)
> > +{
> > + u64 val, tmp, diff_time, cma, bytes, index;
> > + bool overflow = false;
> > + ktime_t cur_time;
> > + u32 tmp32 = rmid;
> > + u32 vrmid = topology_physical_package_id(smp_processor_id()) *
> > + cqm_max_rmid + rmid;
> > +
> > + rmid = vrmid;
>
> This is completely backwards.
>
> tmp32 = rmid;
> rmid = vrmid;
> do_stuff(rmid);
> rmid = tmp32;
> do_other_stuff(rmid);
>
> Why can't you use vrmid for do_stuff() and leave rmid alone? Just
> because it would make the code simpler to read?
>
> > + cur_time = ktime_get_real();
>
> Why would this operate on wall time and not on clock monotonic time?
>
> > + if (read_mbm_local) {
> > + cma = mbm_local[rmid].cum_avg;
> > + diff_time = ktime_ms_delta(cur_time,
> > + mbm_local[rmid].prev_time);
> > + if (diff_time < 100)
> > + return cma;
> > + mbm_local[rmid].prev_time = ktime_set(0,
> > + (unsigned long)ktime_to_ns(cur_time));
> > + bytes = mbm_local[rmid].bytes;
> > + index = mbm_local[rmid].index;
> > + rmid = tmp32;
> > + wrmsr(MSR_IA32_QM_EVTSEL, QOS_MBM_LOCAL_EVENT_ID1,
> rmid);
>
> Why is this using ID1?
>
> > + } else {
> > + cma = mbm_total[rmid].cum_avg;
> > + diff_time = ktime_ms_delta(cur_time,
> > + mbm_total[rmid].prev_time);
> > + if (diff_time < 100)
> > + return cma;
> > + mbm_total[rmid].prev_time = ktime_set(0,
> > + (unsigned long)ktime_to_ns(cur_time));
> > + bytes = mbm_total[rmid].bytes;
> > + index = mbm_total[rmid].index;
> > + rmid = tmp32;
> > + wrmsr(MSR_IA32_QM_EVTSEL, QOS_MBM_TOTAL_EVENT_ID,
> rmid);
> > + }
>
> Random space after tab.
>
> > + rdmsrl(MSR_IA32_QM_CTR, val);
> > + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
> > + return val;
> > +
> > + tmp = val;
>
> You really have a faible for random storage. tmp gets assigned to
> mbm_*[rmid].bytes further down. This makes no sense at all.
>
> > + if (val < bytes) /* overflow handle it */ {
>
> These tail comments are crap.
>
> > + val = MAX_MBM_CNTR - bytes + val;
> > + overflow = true;
> > + } else
> > + val = val - bytes;
>
> That else clause is missing braces.
>
> > + if (diff_time < MBM_TIME_DELTA_MAX - MBM_TIME_DELTA_MIN)
> > + val = (val * MBM_TIME_DELTA_MAX) / diff_time;
>
> That resolves to
>
> if (diff_time < 0)
>
> Which is always false because diff_time is u64.
>
> What's the logic behind this?
>
> > +
> > + if ((diff_time > MBM_TIME_DELTA_MAX) && (!cma))
> > + /* First sample, we can't use the time delta */
> > + diff_time = MBM_TIME_DELTA_MAX;
>
> And this?
>
> > +
> > + rmid = vrmid;
>
> More obfuscation
>
> > + if ((diff_time <= MBM_TIME_DELTA_MAX + MBM_TIME_DELTA_MIN)
> ||
> > + overflow) {
>
> Again, that resolves to
>
> if (overflow)
>
> And even if MBM_TIME_DELTA_MAX would be not equal
> MBM_TIME_DELTA_MIN
> then this wants some explanation about
>
> MBM_TIME_DELTA_MAX - MBM_TIME_DELTA_MIN
>
> versus
>
> MBM_TIME_DELTA_MAX + MBM_TIME_DELTA_MIN
>
> and the whole logic behind this time delta magic, if there is any.
>
> > + int bw, ret;
> > +
> > + if (index & (index < mbm_window_size))
> > + val = cma * MBM_SCALING_FACTOR + val / index -
> > + cma / index;
> > +
> > + val = (val + MBM_SCALING_HALF) / MBM_SCALING_FACTOR;
> > + if (index >= mbm_window_size) {
>
> These conditionals along with the magic math are undocumented.
>
> > + /*
> > + * Compute the sum of recent n-1 samples and slide the
> > + * window by 1
> > + */
> > + ret = mbm_fifo_sum_lastn_out(rmid, read_mbm_local);
> > + /* recalculate the running average with current bw */
> > + ret = (ret + val) / mbm_window_size;
> > + if (ret < 0)
>
> The average of positive numbers becomes negative ?
>
> > + ret = 0;
> > + bw = val;
> > + val = ret;
>
> Your back and forth assignements of random variables is not making the
> code any more readable.
>
> > + } else
> > + bw = val;
> > + /* save the recent bw in fifo */
> > + mbm_fifo_in(rmid, bw, read_mbm_local);
> > +
> > + if (read_mbm_local) {
>
> Oh well. You have this read_mbm_local conditional 3 times in this
> function. Why don't you make the function oblivious of this and hand
> in a pointer to mbm_local or mbm_total? Would be too simple and not
> enough obfuscated, right?
>
> > + mbm_local[rmid].index++;
> > + mbm_local[rmid].cum_avg = val;
> > + mbm_local[rmid].bytes = tmp;
> > + mbm_local[rmid].prev_time = ktime_set(0,
> > + (unsigned long)ktime_to_ns(cur_time));
> > + } else {
> > + mbm_total[rmid].index++;
> > + mbm_total[rmid].cum_avg = val;
> > + mbm_total[rmid].bytes = tmp;
> > + mbm_total[rmid].prev_time = ktime_set(0,
> > + (unsigned long)ktime_to_ns(cur_time));
> > + }
> > + return val;
> > + }
> > + return cma;
> > +}
> > +
> > /*
> > * Initially use this constant for both the limbo queue time and the
> > * rotation timer interval, pmu::hrtimer_interval_ms.
> > @@ -568,7 +791,8 @@ static bool intel_cqm_rmid_stabilize(unsigned int
> *available)
> > /*
> > * Test whether an RMID is free for each package.
> > */
> > - on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true);
> > + if (entry->config)
>
> This entry->config change wants to be a separate patch. It's
> completely non obvious how this changes the behaviour of the existing code.
>
> > + on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL,
> true);
> >
> > list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) {
> > /*
> > @@ -846,8 +1070,9 @@ static void intel_cqm_setup_event(struct perf_event
> *event,
> > struct perf_event **group)
> > {
> > struct perf_event *iter;
> > - bool conflict = false;
> > +
> > u32 rmid;
> > + bool conflict = false;
>
> Fits the overall picture of this patch: Sloppy
>
> > list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
> > rmid = iter->hw.cqm_rmid;
> > @@ -875,6 +1100,40 @@ static void intel_cqm_setup_event(struct
> perf_event *event,
> > event->hw.cqm_rmid = rmid;
> > }
> >
> > +static void intel_cqm_event_update(struct perf_event *event)
> > +{
> > + unsigned int rmid;
> > + u64 val = 0;
> > +
> > + /*
> > + * Task events are handled by intel_cqm_event_count().
> > + */
> > + if (event->cpu == -1)
> > + return;
> > +
> > + rmid = event->hw.cqm_rmid;
> > + if (!__rmid_valid(rmid))
> > + return;
> > +
> > + switch (event->attr.config) {
> > + case QOS_MBM_TOTAL_EVENT_ID:
> > + val = __rmid_read_mbm(rmid, false);
> > + break;
> > + case QOS_MBM_LOCAL_EVENT_ID:
> > + val = __rmid_read_mbm(rmid, true);
> > + break;
> > + default:
> > + break;
>
> return?
>
> > + }
> > + /*
> > + * Ignore this reading on error states and do not update the value.
> > + */
> > + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
> > + return;
> > +
> > + local64_set(&event->count, val);
> > +}
> > +
> > static void intel_cqm_event_read(struct perf_event *event)
> > {
> > unsigned long flags;
> > @@ -887,6 +1146,12 @@ static void intel_cqm_event_read(struct perf_event
> *event)
> > if (event->cpu == -1)
> > return;
> >
> > + if ((event->attr.config & QOS_MBM_TOTAL_EVENT_ID) ||
> > + (event->attr.config & QOS_MBM_LOCAL_EVENT_ID))
> > + intel_cqm_event_update(event);
> > +
> > + if (!(event->attr.config & QOS_L3_OCCUP_EVENT_ID))
> > + return;
> > raw_spin_lock_irqsave(&cache_lock, flags);
> > rmid = event->hw.cqm_rmid;
> >
> > @@ -906,6 +1171,28 @@ out:
> > raw_spin_unlock_irqrestore(&cache_lock, flags);
> > }
> >
> > +static void __intel_cqm_event_total_bw_count(void *info)
> > +{
> > + struct rmid_read *rr = info;
> > + u64 val;
> > +
> > + val = __rmid_read_mbm(rr->rmid, false);
> > + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
> > + return;
> > + atomic64_add(val, &rr->value);
> > +}
> > +
> > +static void __intel_cqm_event_local_bw_count(void *info)
> > +{
> > + struct rmid_read *rr = info;
> > + u64 val;
> > +
> > + val = __rmid_read_mbm(rr->rmid, true);
> > + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
> > + return;
> > + atomic64_add(val, &rr->value);
>
> You're really a fan of copy and paste.
>
> > +}
> > +
> > static void __intel_cqm_event_count(void *info)
> > {
> > struct rmid_read *rr = info;
> > @@ -967,7 +1254,21 @@ static u64 intel_cqm_event_count(struct perf_event
> *event)
> > if (!__rmid_valid(rr.rmid))
> > goto out;
> >
> > - on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
> > + switch (event->attr.config) {
> > + case QOS_L3_OCCUP_EVENT_ID:
> > + on_each_cpu_mask(&cqm_cpumask,
> __intel_cqm_event_count, &rr, 1);
> > + break;
> > + case QOS_MBM_TOTAL_EVENT_ID:
> > + on_each_cpu_mask(&cqm_cpumask,
> __intel_cqm_event_total_bw_count,
> > + &rr, 1);
> > + break;
> > + case QOS_MBM_LOCAL_EVENT_ID:
> > + on_each_cpu_mask(&cqm_cpumask,
> __intel_cqm_event_local_bw_count,
> > + &rr, 1);
> > + break;
> > + default:
> > + break;
>
> So there is nothing to do and you happily proceed?
>
> > + }
> >
> > raw_spin_lock_irqsave(&cache_lock, flags);
> > if (event->hw.cqm_rmid == rr.rmid)
> > @@ -977,6 +1278,39 @@ out:
> > return __perf_event_count(event);
> > }
>
> > static void intel_cqm_event_start(struct perf_event *event, int mode)
> > {
> > struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
> > @@ -995,19 +1329,48 @@ static void intel_cqm_event_start(struct
> perf_event *event, int mode)
> > }
> >
> > state->rmid = rmid;
> > + if (event->attr.config & QOS_L3_OCCUP_EVENT_ID) {
> > + struct cqm_rmid_entry *entry;
> > +
> > + entry = __rmid_entry(rmid);
> > + entry->config = true;
> > + }
> > wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid);
> > +
> > + if (((event->attr.config & QOS_MBM_TOTAL_EVENT_ID) ||
> > + (event->attr.config & QOS_MBM_LOCAL_EVENT_ID)) &&
> (cqm_mbm)) {
> > + int cpu = get_cpu();
>
> pmu->start() is called with interrupts disabled. So why do you need
> get_cpu() here?
>
> > + struct mbm_pmu *pmu = per_cpu(mbm_pmu, cpu);
> > +
> > + if (pmu) {
> > + if (pmu->n_active == 0)
> > + mbm_hrtimer_init(pmu);
>
> That pmu timer wants to be initialized when the pmu is initialized.
>
> > + pmu->n_active++;
> > + list_add_tail(&event->active_entry,
> > + &pmu->active_list);
> > + if (pmu->n_active == 1)
> > + mbm_start_hrtimer(pmu);
> > + }
> > + put_cpu();
> > + }
> > }
> >
> > static void intel_cqm_event_stop(struct perf_event *event, int mode)
> > {
> > struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
> > + struct mbm_pmu *pmu = __this_cpu_read(mbm_pmu);
> >
> > if (event->hw.cqm_state & PERF_HES_STOPPED)
> > return;
> >
> > event->hw.cqm_state |= PERF_HES_STOPPED;
> >
> > - intel_cqm_event_read(event);
> > + if (event->attr.config & QOS_L3_OCCUP_EVENT_ID)
> > + intel_cqm_event_read(event);
> > +
> > + if ((event->attr.config & QOS_MBM_TOTAL_EVENT_ID) ||
> > + (event->attr.config & QOS_MBM_LOCAL_EVENT_ID))
> > + intel_cqm_event_update(event);
> >
> > if (!--state->rmid_usecnt) {
> > state->rmid = 0;
> > @@ -1015,8 +1378,18 @@ static void intel_cqm_event_stop(struct
> perf_event *event, int mode)
> > } else {
> > WARN_ON_ONCE(!state->rmid);
> > }
> > +
> > + if (pmu) {
> > + WARN_ON_ONCE(pmu->n_active <= 0);
> > + pmu->n_active--;
> > + if (pmu->n_active == 0)
> > + mbm_stop_hrtimer(pmu);
> > + list_del(&event->active_entry);
> > + }
> > +
> > }
> >
> > +
>
> Sigh
>
> > static int intel_cqm_event_add(struct perf_event *event, int mode)
> > {
>
> > +static ssize_t
> > +sliding_window_size_store(struct device *dev,
> > + struct device_attribute *attr,
> > + const char *buf, size_t count)
> > +{
> > + unsigned int bytes;
> > + int ret;
> > +
> > + ret = kstrtouint(buf, 0, &bytes);
> > + if (ret)
> > + return ret;
> > +
> > + mutex_lock(&cache_mutex);
> > + if (bytes > 0 && bytes <= MBM_FIFO_SIZE_MAX)
> > + mbm_window_size = bytes;
> > + else
> > + bytes = MBM_FIFO_SIZE_MIN;
>
> What kind of twisted logic is that?
>
> > +
> > + mutex_unlock(&cache_mutex);
> > +
> > + return count;
> > +}
> > +
> > static DEVICE_ATTR_RW(max_recycle_threshold);
> > +static DEVICE_ATTR_RW(sliding_window_size);
> >
> > static struct attribute *intel_cqm_attrs[] = {
> > &dev_attr_max_recycle_threshold.attr,
> > + &dev_attr_sliding_window_size.attr,
> > NULL,
> > };
> >
> > @@ -1241,16 +1699,17 @@ static inline void cqm_pick_event_reader(int cpu)
> >
> > for_each_cpu(i, &cqm_cpumask) {
> > if (phys_id == topology_physical_package_id(i))
> > - return; /* already got reader for this socket */
> > + return; /* already got reader for this socket */
>
> More random crap.
>
> > }
> >
> > cpumask_set_cpu(cpu, &cqm_cpumask);
> > }
> >
> > -static void intel_cqm_cpu_prepare(unsigned int cpu)
> > +static int intel_cqm_cpu_prepare(unsigned int cpu)
> > {
> > struct intel_pqr_state *state = &per_cpu(pqr_state, cpu);
> > struct cpuinfo_x86 *c = &cpu_data(cpu);
> > + struct mbm_pmu *pmu = per_cpu(mbm_pmu, cpu);
> >
> > state->rmid = 0;
> > state->closid = 0;
> > @@ -1258,12 +1717,27 @@ static void intel_cqm_cpu_prepare(unsigned int
> cpu)
> >
> > WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid);
> > WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale);
> > +
> > + if ((!pmu) && (cqm_mbm)) {
> > + pmu = kzalloc_node(sizeof(*mbm_pmu), GFP_KERNEL,
> NUMA_NO_NODE);
> > + if (!pmu)
> > + return -ENOMEM;
> > + spin_lock_init(&pmu->lock);
> > + INIT_LIST_HEAD(&pmu->active_list);
> > + pmu->pmu = &intel_cqm_pmu;
> > + pmu->n_active = 0;
> > + pmu->timer_interval = ms_to_ktime(MBM_TIME_DELTA_MAX);
> > + per_cpu(mbm_pmu, cpu) = pmu;
> > + per_cpu(mbm_pmu_to_free, cpu) = NULL;
>
> So that mbm_pmu_to_free per cpu variable is NULL already and never
> becomes anything else than NULL. What's the purpose of setting it NULL
> again? And what's the purpose of it in general?
>
> > + }
> > + return 0;
> > }
> >
> > static void intel_cqm_cpu_exit(unsigned int cpu)
> > {
> > int phys_id = topology_physical_package_id(cpu);
> > int i;
> > + struct mbm_pmu *pmu = per_cpu(mbm_pmu, cpu);
> >
> > /*
> > * Is @cpu a designated cqm reader?
> > @@ -1280,6 +1754,13 @@ static void intel_cqm_cpu_exit(unsigned int cpu)
> > break;
> > }
> > }
> > +
> > + /* cancel overflow polling timer for CPU */
> > + if (pmu)
> > + mbm_stop_hrtimer(pmu);
> > + kfree(mbm_local);
> > + kfree(mbm_total);
>
> So this frees the module global storage if one cpu is unplugged and
> leaves the pointer initialized so the rest of the code can happily
> dereference it.
>
> > +
> > }
> >
> > static int intel_cqm_cpu_notifier(struct notifier_block *nb,
> > @@ -1289,7 +1770,7 @@ static int intel_cqm_cpu_notifier(struct
> notifier_block *nb,
> >
> > switch (action & ~CPU_TASKS_FROZEN) {
> > case CPU_UP_PREPARE:
> > - intel_cqm_cpu_prepare(cpu);
> > + return intel_cqm_cpu_prepare(cpu);
> > break;
> > case CPU_DOWN_PREPARE:
> > intel_cqm_cpu_exit(cpu);
> > @@ -1305,17 +1786,74 @@ static int intel_cqm_cpu_notifier(struct
> notifier_block *nb,
> > static const struct x86_cpu_id intel_cqm_match[] = {
> > { .vendor = X86_VENDOR_INTEL, .feature =
> X86_FEATURE_CQM_OCCUP_LLC },
> > {}
> > + }, intel_mbm_match[] = {
>
> What's wrong with having a separate
>
> static const struct x86_cpu_id intel_mbm_match[] = {
>
> > + { .vendor = X86_VENDOR_INTEL, .feature =
> X86_FEATURE_CQM_MBM_LOCAL },
> > + {}
> > };
>
> That again would make the change obvious, but that's not on your agenda.
>
> >
> > static int __init intel_cqm_init(void)
> > {
> > char *str, scale[20];
> > - int i, cpu, ret;
> > + int i = 0, cpu, ret;
>
> Initializing i is required because?
>
> > - if (!x86_match_cpu(intel_cqm_match))
> > + if (!x86_match_cpu(intel_cqm_match) &&
> > + (!x86_match_cpu(intel_mbm_match)))
> > return -ENODEV;
> >
> > cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;
> > + cqm_max_rmid = boot_cpu_data.x86_cache_max_rmid;
> > +
> > + if (x86_match_cpu(intel_cqm_match)) {
> > + cqm_llc_occ = true;
> > + intel_cqm_events_group.attrs = intel_cmt_events_attr;
> > + } else
> > + cqm_llc_occ = false;
>
> I see you do not trust the compiler to put cqm_llc_occ into the BSS
> section and neither the kernel to zero it out proper.
>
> > +
> > + if (x86_match_cpu(intel_mbm_match)) {
> > + u32 mbm_scale_rounded = 0;
> > +
> > + cqm_mbm = true;
> > + cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;
> > + /*
> > + * MBM counter values are in bytes. To conver this to MB/sec,
> > + * we scale the MBM scale factor by 1000. Another 1000 factor
> > + * scaling down is done
> > + * after reading the counter val i.e. in the function
> > + * __rmid_read_mbm()
>
> So the event attribute string is in KB/sec and at some other place you
> convert it to MB/sec? There seems to be some hidden logic for this,
> but that's definitely not decodeable for me.
>
> > + */
> > + mbm_scale_rounded = (cqm_l3_scale + 500) / 1000;
>
> div_round_up
>
> > + cqm_max_rmid = boot_cpu_data.x86_cache_max_rmid;
> > + snprintf(scale, sizeof(scale), "%u", mbm_scale_rounded);
> > + str = kstrdup(scale, GFP_KERNEL);
> > + if (!str) {
> > + ret = -ENOMEM;
> > + goto out;
> > + }
> > +
> > + if (cqm_llc_occ)
> > + intel_cqm_events_group.attrs =
> > + intel_cmt_mbm_events_attr;
>
> And this line break is required because?
>
> > + else
> > + intel_cqm_events_group.attrs =
> intel_mbm_events_attr;
> > +
> > + event_attr_intel_cqm_llc_local_bw_scale.event_str
> > + = event_attr_intel_cqm_llc_total_bw_scale.event_str = str;
>
> You really love to write unreadable crap. This is neither perl nor the
> obfuscated c-code contest. What's wrong with two separate lines which
> assign 'str' to each of the event attributes?
>
> > + mbm_local = kzalloc_node(sizeof(struct sample) *
> > + (cqm_max_rmid + 1) * MBM_SOCKET_MAX,
>
> Calculating the size in a separate line with a proper variable would
> make it readable.
>
> > + GFP_KERNEL, NUMA_NO_NODE);
> > + if (!mbm_local) {
>
> Leaks str
>
> > + ret = -ENOMEM;
> > + goto out;
> > + }
> > + mbm_total = kzalloc_node(sizeof(struct sample) *
> > + (cqm_max_rmid + 1) * MBM_SOCKET_MAX,
> > + GFP_KERNEL, NUMA_NO_NODE);
> > + if (!mbm_total) {
> > + ret = -ENOMEM;
>
> Leaks str and mbm_local
>
> > + goto out;
> > + }
>
> What about sticking that mess into a separate function?
>
> > + } else
> > + cqm_mbm = false;
>
> Sigh.
>
> > /*
> > * It's possible that not all resources support the same number
> > @@ -1328,44 +1866,48 @@ static int __init intel_cqm_init(void)
> > */
> > cpu_notifier_register_begin();
> >
> > - for_each_online_cpu(cpu) {
> > - struct cpuinfo_x86 *c = &cpu_data(cpu);
> > + if (cqm_llc_occ) {
> > + for_each_online_cpu(cpu) {
> > + struct cpuinfo_x86 *c = &cpu_data(cpu);
> >
> > - if (c->x86_cache_max_rmid < cqm_max_rmid)
> > - cqm_max_rmid = c->x86_cache_max_rmid;
> > + if (c->x86_cache_max_rmid < cqm_max_rmid)
> > + cqm_max_rmid = c->x86_cache_max_rmid;
> >
> > - if (c->x86_cache_occ_scale != cqm_l3_scale) {
> > - pr_err("Multiple LLC scale values, disabling\n");
> > - ret = -EINVAL;
> > - goto out;
> > + if (c->x86_cache_occ_scale != cqm_l3_scale) {
> > + pr_err("Multiple LLC scale values, disabling\n");
> > + ret = -EINVAL;
>
> More memory leaks
>
> > + goto out;
> > + }
> > }
> > - }
> >
> > - /*
> > - * A reasonable upper limit on the max threshold is the number
> > - * of lines tagged per RMID if all RMIDs have the same number of
> > - * lines tagged in the LLC.
> > - *
> > - * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
> > - */
> > - __intel_cqm_max_threshold =
> > + /*
> > + * A reasonable upper limit on the max threshold is the number
> > + * of lines tagged per RMID if all RMIDs have the same number
> of
> > + * lines tagged in the LLC.
> > + *
> > + * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
> > + */
> > + __intel_cqm_max_threshold =
> > boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1);
> >
> > - snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
> > - str = kstrdup(scale, GFP_KERNEL);
> > - if (!str) {
> > - ret = -ENOMEM;
> > - goto out;
> > - }
> > + snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
> > + str = kstrdup(scale, GFP_KERNEL);
> > + if (!str) {
> > + ret = -ENOMEM;
>
> Ditto.
>
> > + goto out;
> > + }
> >
> > - event_attr_intel_cqm_llc_scale.event_str = str;
> > + event_attr_intel_cqm_llc_scale.event_str = str;
> > + }
> >
> > ret = intel_cqm_setup_rmid_cache();
> > if (ret)
> > goto out;
> >
> > for_each_online_cpu(i) {
> > - intel_cqm_cpu_prepare(i);
> > + ret = intel_cqm_cpu_prepare(i);
> > + if (ret)
> > + goto out;
>
> And that leaves even more half initialized stuff around.
>
> > cqm_pick_event_reader(i);
> > }
>
> I fear your assessment in
>
>
> http://lkml.kernel.org/r/06033C969873E840BDE9FC9B584F66B58BB89D@ORS
> MSX116.amr.corp.intel.com
>
> is slightly wrong.
>
> That's a completely convoluted mess. Aside of that its broken all over
> the place.
>
> This wants to be split up into preparatory patches which remodel the
> existing code and move out cqm stuff into helper functions, so the mbm
> stuff can be plugged in with its own helpers nicely.
>
> Thanks,
>
> tglx

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/