Re: [RFC] perf_events: ctx_flexible_sched_in() not maximizing PMU utilization

From: Peter Zijlstra
Date: Fri May 14 2010 - 10:55:22 EST


On Mon, 2010-05-10 at 11:41 +0200, Stephane Eranian wrote:
> Looks like a good solution, at least better than what is there right now.

Something like the below I guess, still need to make it an actual patch
though ;-)

The interesting bit is the schedule() function which tries to schedule
two queues fairly (cpu context and task context), and selects between
the two based on lag -- I'm not quite sure that works out as
expected though; I still have to do the actual math.
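
(For reference, the lag computed by flexible_event_lag() below works
out to

        lag(fe) = fq->min_service + fq->rel_avg / fq->nr_waiting - fe->service

i.e. how far an event's accumulated service has fallen behind the queue
average -- rel_avg, despite the name, accumulates the sum of the
relative service values, so dividing by nr_waiting gives the average.
The event with the largest lag gets picked first.)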

Also, I guess we should come up with a saner solution for the
per-task-per-cpu events (the unservicable stuff).

---


struct flexible_queue {
        struct rb_root waiting;
        struct rb_node *rb_leftmost;

        u64 min_service;
        u64 rel_avg;

        u64 clock;
        u64 running_stamp;

        struct list_head running;
        struct list_head unservicable;
        int unservicable_cpu;
        int nr_waiting;
};

enum flexible_type {
        flexible_waiting,
        flexible_running,
        flexible_unservicable,
};

struct flexible_event {
        union {
                struct rb_node rb_entry;
                struct list_head list_entry;
        };
        enum flexible_type type;

        u64 service;
};

static inline
s64 flexible_event_rel(struct flexible_queue *fq, struct flexible_event *fe)
{
        return fe->service - fq->min_service;
}

static void __enqueue_event(struct flexible_queue *fq, struct flexible_event *fe)
{
        struct rb_node **link = &fq->waiting.rb_node;
        struct rb_node *parent = NULL;
        struct flexible_event *entry;
        s64 rel = flexible_event_rel(fq, fe);
        int leftmost = 1;

        if (rel > 0) {
                fq->min_service += rel;
                fq->rel_avg -= fq->nr_waiting * rel;
                rel = 0;
        }

        fq->nr_waiting++;
        fq->rel_avg += rel;

        fe->type = flexible_waiting;

        while (*link) {
                parent = *link;
                entry = rb_entry(parent, struct flexible_event, rb_entry);
                /*
                 * We don't care about collisions. Nodes with
                 * the same rel stay together.
                 */
                if (rel < flexible_event_rel(fq, entry)) {
                        link = &parent->rb_left;
                } else {
                        link = &parent->rb_right;
                        leftmost = 0;
                }
        }

        if (leftmost)
                fq->rb_leftmost = &fe->rb_entry;

        rb_link_node(&fe->rb_entry, parent, link);
        rb_insert_color(&fe->rb_entry, &fq->waiting);
}

static void __dequeue_event(struct flexible_queue *fq, struct flexible_event *fe)
{
        if (fq->rb_leftmost == &fe->rb_entry) {
                struct rb_node *next_node;

                next_node = rb_next(&fe->rb_entry);
                fq->rb_leftmost = next_node;
        }

        rb_erase(&fe->rb_entry, &fq->waiting);

        fq->nr_waiting--;
        fq->rel_avg -= flexible_event_rel(fq, fe);
}

static void
flexible_event_add(struct flexible_queue *fq, struct flexible_event *fe)
{
        fe->service = fq->min_service + fq->rel_avg;
        __enqueue_event(fq, fe);
}

static void
flexible_event_del(struct flexible_queue *fq, struct flexible_event *fe)
{
        switch (fe->type) {
        case flexible_waiting:
                __dequeue_event(fq, fe);
                break;

        case flexible_running:
        case flexible_unservicable:
                list_del(&fe->list_entry);
                break;
        }
}

/*
 * Charge the time the running events spent on the PMU and put them
 * back on the waiting tree.
 */
static void flexible_unschedule(struct flexible_queue *fq)
{
        struct flexible_event *fe, *tmp;
        s64 delta = (s64)(fq->clock - fq->running_stamp);

        list_for_each_entry_safe(fe, tmp, &fq->running, list_entry) {
                list_del(&fe->list_entry);
                fe->service += delta;
                __enqueue_event(fq, fe);
        }
}

static
s64 flexible_event_lag(struct flexible_queue *fq, struct flexible_event *fe)
{
        return fq->min_service + (fq->rel_avg / fq->nr_waiting) - fe->service;
}

static struct flexible_event *__pick_event(struct flexible_queue *fq)
{
        struct flexible_event *fe;

        if (!fq->rb_leftmost)
                return NULL;

        fe = rb_entry(fq->rb_leftmost, struct flexible_event, rb_entry);

        return fe;
}

/*
 * If we got moved to another cpu, the per-task-per-cpu events that were
 * unservicable may be servicable again; put them back on the waiting
 * tree.
 */
static void __requeue_unservicable(struct flexible_queue *fq)
{
        struct flexible_event *fe, *tmp;

        if (fq->unservicable_cpu == smp_processor_id())
                return;

        list_for_each_entry_safe(fe, tmp, &fq->unservicable, list_entry) {
                list_del(&fe->list_entry);
                flexible_event_add(fq, fe);
        }

        fq->unservicable_cpu = smp_processor_id();
}

static
int flexible_schedule(struct flexible_queue *fq1, struct flexible_queue *fq2)
{
        struct flexible_event *tmp, *fe;
        struct flexible_queue *fq;
        s64 tmp_lag, max_lag;

        __requeue_unservicable(fq1);
        __requeue_unservicable(fq2);

again:
        max_lag = 0;
        fe = NULL;

        tmp = __pick_event(fq1);
        if (tmp) {
                tmp_lag = flexible_event_lag(fq1, tmp);
                if (tmp_lag > max_lag) {
                        fq = fq1;
                        fe = tmp;
                        max_lag = tmp_lag;
                }
        }

        tmp = __pick_event(fq2);
        if (tmp) {
                tmp_lag = flexible_event_lag(fq2, tmp);
                if (tmp_lag > max_lag) {
                        fq = fq2;
                        fe = tmp;
                        max_lag = tmp_lag;
                }
        }

        if (!fe)
                return 1; /* no more events to schedule */

        fq->running_stamp = fq->clock;

        if (event_of(fe)->cpu != -1 && event_of(fe)->cpu != smp_processor_id()) {
                __dequeue_event(fq, fe);
                fe->type = flexible_unservicable;
                list_add(&fe->list_entry, &fq->unservicable);
                goto again; /* can't run this event, try another */
        }

        if (group_sched_in(event_of(fe), ...) == 0) {
                __dequeue_event(fq, fe);
                fe->type = flexible_running;
                list_add(&fe->list_entry, &fq->running);
                return 0; /* success */
        }

        return -EBUSY; /* doesn't fit */
}
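
For completeness: the sketch uses an event_of() helper that isn't shown
above; presumably it maps a flexible_event back to its perf_event.
Something like the below, assuming the flexible_event gets embedded in
struct perf_event (the field name "flex" is made up for illustration):

struct perf_event {
        /* ... existing fields ... */
        struct flexible_event flex;
};

static inline struct perf_event *event_of(struct flexible_event *fe)
{
        return container_of(fe, struct perf_event, flex);
}

A caller like ctx_flexible_sched_in() would then presumably drive
flexible_schedule(cpu_fq, task_fq) in a loop until it returns non-zero
(1 once both queues are empty, -EBUSY once a group no longer fits on
the PMU).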

