Re: [RFC][PATCH] remove rq->lock from cpuacct cgroup (Was Re:[PATCH] cpuacct: add a branch prediction

From: Peter Zijlstra
Date: Mon Mar 02 2009 - 09:56:38 EST

Next message: scameron: "Re: [PATCH] hpsa: SCSI driver for HP Smart Array controllers"
Previous message: Pavel Machek: "Re: [PATCH 3/5] lis3: reorder functions to make forward declobsolete"
Next in thread: KAMEZAWA Hiroyuki: "Re: [RFC][PATCH] remove rq->lock from cpuacct cgroup (Was Re:[PATCH] cpuacct: add a branch prediction"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]

On Fri, 2009-02-27 at 12:22 +0900, KAMEZAWA Hiroyuki wrote:

Comments below..

> From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>
>
> cgroup/cpuacct subsystem counts cpu usage by 64bit coutnter in
> per-cpu object. In read-side (via cpuacct.usage file), for reading 64bit
> value in safe manner, it takes rq->lock of (other) cpus.
>
> In general, taking rq->lock of other cpus from codes not for scheduler
> is not good. This patch tries to remove rq->lock used in read-side.
>
> To read 64bit value in safe, this patch uses seqcounter.
>
> Pros.
> - rq->lock is not necessary.
> Cons.
> - When updating counter, sequence number must be updated.
> (I hope this per-cpu sequence number is on cache...)
> - not simple.
>
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>
> ---
> kernel/sched.c | 141 ++++++++++++++++++++++++++++++++++++++++++---------------
> 1 file changed, 105 insertions(+), 36 deletions(-)
>
> Index: mmotm-2.6.29-Feb24/kernel/sched.c
> ===================================================================
> --- mmotm-2.6.29-Feb24.orig/kernel/sched.c
> +++ mmotm-2.6.29-Feb24/kernel/sched.c
> @@ -9581,6 +9581,67 @@ struct cgroup_subsys cpu_cgroup_subsys =
>
> #ifdef CONFIG_CGROUP_CPUACCT
>
> +#ifndef CONFIG_64BIT
> +DEFINE_PER_CPU(struct seqcount, cpuacct_cgroup_seq);
> +
> +static inline void cpuacct_start_counter_update(void)
> +{
> + /* This is called under rq->lock and IRQ is off */
> + struct seqcount *s = &get_cpu_var(cpuacct_cgroup_seq);
> +
> + write_seqcount_begin(s);
> + put_cpu_var(cpuacct_cgroup_seq);
> +}
> +
> +static inline void cpuacct_end_counter_update(void)
> +{
> + struct seqcount *s = &get_cpu_var(cpuacct_cgroup_seq);
> +
> + write_seqcount_end(s);
> + put_cpu_var(cpuacct_cgroup_seq);
> +}

It seems odd we disable/enable preemption in both, I would expect for
start to disable preemption, and have end enable it again, or use
__get_cpu_var() and assume preemption is already disabled (callsites are
under rq->lock, right?)

> +static inline u64
> +cpuacct_read_counter(u64 *val, int cpu)
> +{
> + struct seqcount *s = &per_cpu(cpuacct_cgroup_seq, cpu);
> + unsigned int seq;
> + u64 data;
> +
> + do {
> + seq = read_seqcount_begin(s);
> + data = *val;
> + } while (read_seqcount_retry(s, seq));
> + return data;
> +}
> +/* This is a special funtion called against "offline" cpus. */
> +static inline void cpuacct_reset_offline_counter(u64 *val, int cpu)
> +{
> + struct seqcount *s = &per_cpu(cpuacct_cgroup_seq, cpu);
> +
> + preempt_disable();
> + write_seqcount_begin(s);
> + *val = 0;
> + write_seqcount_end(s);
> + preempt_enable();
> +}

And here you double disable preemption, quite useless if you take a
remote cpu's per-cpu data.

> +#else
> +static inline void cpuacct_start_counter_update(void)
> +{
> +}
> +static inline void cpuacct_end_counter_update(void)
> +{
> +}
> +static inline u64 cpuacct_read_counter(u64 *val, int cpu)
> +{
> + return *val;
> +}
> +static inline void cpuacct_reset_offline_counter(u64 *val, int cpu)
> +{
> + *val = 0;
> +}
> +#endif
> +
> /*
> * CPU accounting code for task groups.
> *
> @@ -9596,6 +9657,11 @@ struct cpuacct {
> struct cpuacct *parent;
> };
>
> +struct cpuacct_work {
> + struct work_struct work;
> + struct cpuacct *cpuacct;
> +};
> +
> struct cgroup_subsys cpuacct_subsys;
>
> /* return cpu accounting group corresponding to this container */
> @@ -9643,39 +9709,29 @@ cpuacct_destroy(struct cgroup_subsys *ss
> kfree(ca);
> }
>
> +/* In 32bit enviroment, seqcounter is used for reading 64bit in safe way */
> static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
> {
> u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
> u64 data;
>
> -#ifndef CONFIG_64BIT
> - /*
> - * Take rq->lock to make 64-bit read safe on 32-bit platforms.
> - */
> - spin_lock_irq(&cpu_rq(cpu)->lock);
> - data = *cpuusage;
> - spin_unlock_irq(&cpu_rq(cpu)->lock);
> -#else
> - data = *cpuusage;
> -#endif
> + data = cpuacct_read_counter(cpuusage, cpu);
>
> return data;
> }
>
> -static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
> +/* called by per-cpu workqueue */
> +static void cpuacct_cpuusage_reset_cpu(struct work_struct *work)
> {
> + struct cpuacct_work *cw = container_of(work, struct cpuacct_work, work);
> + struct cpuacct *ca = cw->cpuacct;
> + int cpu = get_cpu();
> u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
>
> -#ifndef CONFIG_64BIT
> - /*
> - * Take rq->lock to make 64-bit write safe on 32-bit platforms.
> - */
> - spin_lock_irq(&cpu_rq(cpu)->lock);
> - *cpuusage = val;
> - spin_unlock_irq(&cpu_rq(cpu)->lock);
> -#else
> - *cpuusage = val;
> -#endif
> + cpuacct_start_counter_update();
> + *cpuusage = 0;
> + cpuacct_end_counter_update();
> + put_cpu();
> }
>
> /* return total cpu usage (in nanoseconds) of a group */
> @@ -9691,23 +9747,34 @@ static u64 cpuusage_read(struct cgroup *
> return totalcpuusage;
> }
>
> -static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
> - u64 reset)
> +static int cpuacct_cpuusage_reset(struct cgroup *cgrp, unsigned int event)
> {
> struct cpuacct *ca = cgroup_ca(cgrp);
> - int err = 0;
> - int i;
> -
> - if (reset) {
> - err = -EINVAL;
> - goto out;
> + int cpu;
> + /*
> + * Reset All counters....doesn't need to be fast.
> + * "ca" will be stable while doing this. We are in write() syscall.
> + */
> + get_online_cpus();
> + /*
> + * Because we use alloc_percpu() for allocating counter, we have
> + * a counter per a possible cpu. Reset all online's by workqueue and
> + * reset offline cpu's directly.
> + */
> + for_each_possible_cpu(cpu) {
> + if (cpu_online(cpu)) {
> + struct cpuacct_work cw;
> + INIT_WORK(&cw.work, cpuacct_cpuusage_reset_cpu);
> + cw.cpuacct = ca;
> + schedule_work_on(cpu, &cw.work);
> + flush_work(&cw.work);
> + } else {
> + u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
> + cpuacct_reset_offline_counter(cpuusage, cpu);
> + }

I'm not particularly convinced this is the right way, schedule_work_on()
sounds way expensive for setting a variable to 0.

Furthermore, if you want something like schedule_work_on() for each cpu,
there's schedule_on_each_cpu().

> }
> -
> - for_each_present_cpu(i)
> - cpuacct_cpuusage_write(ca, i, 0);
> -
> -out:
> - return err;
> + put_online_cpus();
> + return 0;
> }
>
> static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
> @@ -9729,7 +9796,7 @@ static struct cftype files[] = {
> {
> .name = "usage",
> .read_u64 = cpuusage_read,
> - .write_u64 = cpuusage_write,
> + .trigger = cpuacct_cpuusage_reset,
> },
> {
> .name = "usage_percpu",
> @@ -9756,10 +9823,12 @@ static void cpuacct_charge(struct task_s
> cpu = task_cpu(tsk);
> ca = task_ca(tsk);
>
> + cpuacct_start_counter_update();
> for (; ca; ca = ca->parent) {
> u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
> *cpuusage += cputime;
> }
> + cpuacct_end_counter_update();
> }
>
> struct cgroup_subsys cpuacct_subsys = {
>

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Next message: scameron: "Re: [PATCH] hpsa: SCSI driver for HP Smart Array controllers"
Previous message: Pavel Machek: "Re: [PATCH 3/5] lis3: reorder functions to make forward declobsolete"
Next in thread: KAMEZAWA Hiroyuki: "Re: [RFC][PATCH] remove rq->lock from cpuacct cgroup (Was Re:[PATCH] cpuacct: add a branch prediction"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]