[patch v3 3/3] percpu_counter: use percpu data to track add start

From: Shaohua Li
Date: Tue May 17 2011 - 04:44:55 EST


add_start causes a lot of cache bouncing because it is updated by every
CPU. We can make it a percpu variable instead, which completely
eliminates that cache bouncing.

With this patch on top of the last one, the workload described in the
last patch runs about 7x faster; with the last patch alone it is only
about 4x faster.

This does not slow down _sum, because we already removed the lock from
_sum. In a stress test with 23 CPUs running _add and one CPU running
_sum: when _add takes the fast path (lock not held), _sum runs a little
slower (about 20% slower); when _add takes the slow path (lock held),
_sum runs much faster (about 9x faster).

V2: use a single percpu data structure per CPU, as Eric suggested.

Signed-off-by: Shaohua Li <shaohua.li@xxxxxxxxx>
---
include/linux/percpu_counter.h | 11 ++++++--
lib/percpu_counter.c | 51 ++++++++++++++++++++---------------------
2 files changed, 33 insertions(+), 29 deletions(-)
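
For anyone who wants to experiment with the add/sum handshake outside the
kernel, below is a minimal userspace model using C11 atomics and pthreads.
It is only an illustrative sketch under simplifying assumptions: threads
stand in for CPUs, seq_cst atomics stand in for the kernel barriers, and
all names (cpu_data, counter_add, counter_sum, NTHREADS, BATCH) are made
up for illustration; they are not part of this patch.

/* Userspace model of the _add/_sum handshake; illustrative only. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NTHREADS 4
#define ITERS    100000
#define BATCH    32

/* analog of struct percpu_counter_data, padded to avoid false sharing */
struct cpu_data {
	atomic_long counter;
	atomic_int  add_start;
} __attribute__((aligned(64)));

static struct cpu_data cpu[NTHREADS];
static atomic_long total;	/* analog of fbc->count */
static atomic_int  sum_start;	/* analog of fbc->sum_start */
static atomic_int  done;

static void counter_add(int id, long amount)
{
	long c = atomic_load_explicit(&cpu[id].counter, memory_order_relaxed) + amount;

	if (c >= BATCH || c <= -BATCH) {
		for (;;) {
			/* announce an in-flight fold, then check for a summer;
			   the seq_cst store+load act as the full barrier */
			atomic_store(&cpu[id].add_start, 1);
			if (atomic_load(&sum_start) == 0)
				break;
			/* a summer is active: back off until it finishes */
			atomic_store(&cpu[id].add_start, 0);
			while (atomic_load(&sum_start) != 0)
				;
		}
		atomic_fetch_add(&total, c);
		atomic_store_explicit(&cpu[id].counter, 0, memory_order_relaxed);
		atomic_store(&cpu[id].add_start, 0);
	} else {
		atomic_store_explicit(&cpu[id].counter, c, memory_order_relaxed);
	}
}

static long counter_sum(void)
{
	long ret;
	int i;

	atomic_fetch_add(&sum_start, 1);
	for (i = 0; i < NTHREADS; i++)	/* wait for in-flight folds */
		while (atomic_load(&cpu[i].add_start) != 0)
			;
	ret = atomic_load(&total);
	for (i = 0; i < NTHREADS; i++)
		ret += atomic_load_explicit(&cpu[i].counter, memory_order_relaxed);
	atomic_fetch_sub(&sum_start, 1);
	return ret;
}

static void *adder(void *arg)
{
	long i, id = (long)arg;

	for (i = 0; i < ITERS; i++)
		counter_add(id, 1);
	return NULL;
}

static void *summer(void *arg)
{
	while (!atomic_load(&done))
		counter_sum();	/* concurrent approximate reads */
	return arg;
}

int main(void)
{
	pthread_t t[NTHREADS], s;
	long i;

	pthread_create(&s, NULL, summer, NULL);
	for (i = 0; i < NTHREADS; i++)
		pthread_create(&t[i], NULL, adder, (void *)i);
	for (i = 0; i < NTHREADS; i++)
		pthread_join(t[i], NULL);
	atomic_store(&done, 1);
	pthread_join(s, NULL);

	printf("sum=%ld expected=%ld\n", counter_sum(), (long)NTHREADS * ITERS);
	return 0;
}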

Index: linux/include/linux/percpu_counter.h
===================================================================
--- linux.orig/include/linux/percpu_counter.h 2011-05-16 10:46:14.000000000 +0800
+++ linux/include/linux/percpu_counter.h 2011-05-17 14:06:23.000000000 +0800
@@ -15,13 +15,18 @@

#ifdef CONFIG_SMP

+struct percpu_counter_data {
+ s32 counter;
+ char add_start;
+};
+
struct percpu_counter {
- atomic_t sum_start, add_start;
+ atomic_t sum_start;
atomic64_t count;
#ifdef CONFIG_HOTPLUG_CPU
struct list_head list; /* All percpu_counters are on a list */
#endif
- s32 __percpu *counters;
+ struct percpu_counter_data __percpu *cpu_data;
};

extern int percpu_counter_batch;
@@ -71,7 +76,7 @@ static inline s64 percpu_counter_read_po

static inline int percpu_counter_initialized(struct percpu_counter *fbc)
{
- return (fbc->counters != NULL);
+ return (fbc->cpu_data != NULL);
}

#else
Index: linux/lib/percpu_counter.c
===================================================================
--- linux.orig/lib/percpu_counter.c 2011-05-16 10:46:14.000000000 +0800
+++ linux/lib/percpu_counter.c 2011-05-17 14:11:04.000000000 +0800
@@ -59,10 +59,8 @@ void percpu_counter_set(struct percpu_co
{
int cpu;

- for_each_possible_cpu(cpu) {
- s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
- *pcount = 0;
- }
+ for_each_possible_cpu(cpu)
+ per_cpu_ptr(fbc->cpu_data, cpu)->counter = 0;
atomic64_set(&fbc->count, amount);
}
EXPORT_SYMBOL(percpu_counter_set);
@@ -72,23 +70,25 @@ void __percpu_counter_add(struct percpu_
s64 count;

preempt_disable();
- count = __this_cpu_read(*fbc->counters) + amount;
+ count = __this_cpu_ptr(fbc->cpu_data)->counter + amount;
if (count >= batch || count <= -batch) {
while (1) {
- atomic_inc_return(&fbc->add_start);
+ __this_cpu_ptr(fbc->cpu_data)->add_start = 1;
+ /* Make add_start visible before we read sum_start (needs a full barrier) */
+ smp_mb();
if (atomic_read(&fbc->sum_start) == 0)
break;
- atomic_dec(&fbc->add_start);
+ __this_cpu_ptr(fbc->cpu_data)->add_start = 0;
while (atomic_read(&fbc->sum_start) != 0)
cpu_relax();
}

atomic64_add(count, &fbc->count);
- __this_cpu_write(*fbc->counters, 0);
+ __this_cpu_ptr(fbc->cpu_data)->counter = 0;

- atomic_dec(&fbc->add_start);
+ __this_cpu_ptr(fbc->cpu_data)->add_start = 0;
} else {
- __this_cpu_write(*fbc->counters, count);
+ __this_cpu_ptr(fbc->cpu_data)->counter = count;
}
preempt_enable();
}
@@ -104,15 +104,15 @@ s64 __percpu_counter_sum(struct percpu_c
int cpu;

atomic_inc_return(&fbc->sum_start);
- while (atomic_read(&fbc->add_start) != 0)
- cpu_relax();
-
- ret = atomic64_read(&fbc->count);
for_each_online_cpu(cpu) {
- s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
- ret += *pcount;
+ while (per_cpu_ptr(fbc->cpu_data, cpu)->add_start != 0)
+ cpu_relax();
}

+ ret = atomic64_read(&fbc->count);
+ for_each_online_cpu(cpu)
+ ret += per_cpu_ptr(fbc->cpu_data, cpu)->counter;
+
atomic_dec(&fbc->sum_start);
return ret;
}
@@ -122,9 +122,8 @@ int percpu_counter_init(struct percpu_co
{
atomic64_set(&fbc->count, amount);
atomic_set(&fbc->sum_start, 0);
- atomic_set(&fbc->add_start, 0);
- fbc->counters = alloc_percpu(s32);
- if (!fbc->counters)
+ fbc->cpu_data = alloc_percpu(struct percpu_counter_data);
+ if (!fbc->cpu_data)
return -ENOMEM;

debug_percpu_counter_activate(fbc);
@@ -141,7 +140,7 @@ EXPORT_SYMBOL(percpu_counter_init);

void percpu_counter_destroy(struct percpu_counter *fbc)
{
- if (!fbc->counters)
+ if (!fbc->cpu_data)
return;

debug_percpu_counter_deactivate(fbc);
@@ -151,8 +150,8 @@ void percpu_counter_destroy(struct percp
list_del(&fbc->list);
mutex_unlock(&percpu_counters_lock);
#endif
- free_percpu(fbc->counters);
- fbc->counters = NULL;
+ free_percpu(fbc->cpu_data);
+ fbc->cpu_data = NULL;
}
EXPORT_SYMBOL(percpu_counter_destroy);

@@ -180,11 +179,11 @@ static int __cpuinit percpu_counter_hotc
cpu = (unsigned long)hcpu;
mutex_lock(&percpu_counters_lock);
list_for_each_entry(fbc, &percpu_counters, list) {
- s32 *pcount;
+ struct percpu_counter_data *data =
+ per_cpu_ptr(fbc->cpu_data, cpu);

- pcount = per_cpu_ptr(fbc->counters, cpu);
- atomic64_add(*pcount, &fbc->count);
- *pcount = 0;
+ atomic64_add(data->counter, &fbc->count);
+ data->counter = 0;
}
mutex_unlock(&percpu_counters_lock);
#endif

--