[PATCH 4/5] locking/lockdep: Make class->ops a percpu counter

From: Waiman Long
Date: Fri Sep 28 2018 - 13:53:47 EST


A sizable portion of the CPU cycles spent in __lock_acquire() is
consumed by the atomic increment of the class->ops stat counter. By
changing it to a per-cpu counter, we can reduce the cacheline contention
on the class structure when multiple CPUs are trying to acquire locks
of the same class simultaneously.
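For reference, this is the usual per-cpu stat counter pattern: a
lock-free increment on the fast path, and a sum over all CPUs on the
(rare) read path. A minimal standalone sketch, with hypothetical
example_ops names that are not part of this patch:

#include <linux/percpu.h>

/* One private copy of the counter per CPU. */
static DEFINE_PER_CPU(unsigned long, example_ops);

/* Fast path: increment this CPU's copy without an atomic RMW, so
 * no shared cacheline bounces between CPUs. */
static inline void example_ops_inc(void)
{
	__this_cpu_inc(example_ops);
}

/* Slow (reporting) path: sum the per-cpu copies. The total is only
 * approximate while updates are in flight, which is acceptable for
 * a statistics counter. */
static unsigned long example_ops_read(void)
{
	unsigned long sum = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		sum += per_cpu(example_ops, cpu);
	return sum;
}

The read side is O(nr_cpus), but it only runs when lockdep prints a
report, so the trade-off strongly favors the increment path.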

This patch also fixes a bug in the increment code: the counter is of
type unsigned long, but atomic_inc(), which operates on a 32-bit
atomic_t, was used to increment it.
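To make the bug concrete, here is a hypothetical standalone
illustration (not lockdep code). atomic_t wraps a 32-bit int, so on a
64-bit kernel the old cast performed a 32-bit read-modify-write on a
64-bit object:

unsigned long ops = 0;	/* 8 bytes on a 64-bit kernel */

/* Buggy: only the low 32 bits of the counter are updated (and the
 * wrong 32 bits altogether on a big-endian machine). */
atomic_inc((atomic_t *)&ops);

The per-cpu counter sidesteps this entirely, since __this_cpu_inc()
operates on the full unsigned long.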

Signed-off-by: Waiman Long <longman@xxxxxxxxxx>
---
include/linux/lockdep.h | 2 +-
kernel/locking/lockdep.c | 18 ++++++++++++++----
2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index b0d0b51..f8bf705 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -102,7 +102,7 @@ struct lock_class {
/*
* Statistics counter:
*/
- unsigned long ops;
+ unsigned long __percpu *pops;

const char *name;
int name_version;
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index ca002c0..7a0ed1d 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -139,6 +139,7 @@ static inline int debug_locks_off_graph_unlock(void)
*/
unsigned long nr_lock_classes;
static struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
+static DEFINE_PER_CPU(unsigned long [MAX_LOCKDEP_KEYS], lock_class_ops);

static inline struct lock_class *hlock_class(struct held_lock *hlock)
{
@@ -784,11 +785,14 @@ static bool assign_lock_key(struct lockdep_map *lock)
dump_stack();
return NULL;
}
- class = lock_classes + nr_lock_classes++;
+ class = lock_classes + nr_lock_classes;
debug_atomic_inc(nr_unused_locks);
class->key = key;
class->name = lock->name;
class->subclass = subclass;
+ class->pops = &lock_class_ops[nr_lock_classes];
+ nr_lock_classes++;
+
INIT_LIST_HEAD(&class->lock_entry);
INIT_LIST_HEAD(&class->locks_before);
INIT_LIST_HEAD(&class->locks_after);
@@ -1387,11 +1391,15 @@ static inline int usage_match(struct lock_list *entry, void *bit)

static void print_lock_class_header(struct lock_class *class, int depth)
{
- int bit;
+ int bit, cpu;
+ unsigned long ops = 0UL;
+
+ for_each_possible_cpu(cpu)
+ ops += *per_cpu_ptr(class->pops, cpu);

printk("%*s->", depth, "");
print_lock_name(class);
- printk(KERN_CONT " ops: %lu", class->ops);
+ printk(KERN_CONT " ops: %lu", ops);
printk(KERN_CONT " {\n");

for (bit = 0; bit < LOCK_USAGE_STATES; bit++) {
@@ -3226,7 +3234,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
if (!class)
return 0;
}
- atomic_inc((atomic_t *)&class->ops);
+
+ __this_cpu_inc(*class->pops);
+
if (very_verbose(class)) {
printk("\nacquire class [%px] %s", class->key, class->name);
if (class->name_version > 1)
--
1.8.3.1