tracepoints for kernel/mutex.c

From: Jason Baron
Date: Thu Oct 16 2008 - 17:05:44 EST


hi,

Below are 3 tracepoints I've been playing with in kernel/mutex.c using
a SystemTap script. The idea is to detect and determine the cause of
lock contention. Currently I get the following output:

<contended mutex nam> <process name and pid of the contention> <time of
contention> <pid that woke me up(caused the contention?)>

for example:

name: &inode->i_mutex, pid name: udevd, pid: 26567, jiffies: 1 cause:
26566


Using SystemTap I can now modify my script to figure out the inode number, the
file and pathname, or the backtrace and get to the bottom of the
contention.

I think this approach has a number of advantages. It has low
overhead in the off case, since its based on tracepoints. It is
minimally invasive in the code path (3 tracepoints). It also allows me
to explore data structures and parts of the kernel by simply modifying
the SystemTap script. I do not need to re-compile the kernel and reboot.

You can run the script via: $stap mutex.stp

thanks,

-Jason


diff --git a/include/trace/mutex.h b/include/trace/mutex.h
new file mode 100644
index 0000000..8d365d6
--- /dev/null
+++ b/include/trace/mutex.h
@@ -0,0 +1,10 @@
+#ifndef _TRACE_MUTEX_H
+#define _TRACE_MUTEX_H
+
+#include <linux/tracepoint.h>
+
+DEFINE_TRACE(mutex_lock_contention, TPPROTO(int pid, unsigned long time), TPARGS(pid, time));
+DEFINE_TRACE(mutex_lock_contention_length, TPPROTO(char *name, int pid, unsigned long time), TPARGS(name, pid, time));
+DEFINE_TRACE(mutex_wakeup, TPPROTO(int target_pid, int sender_pid), TPARGS(target_pid, sender_pid));
+
+#endif
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index 1d94160..847a9fb 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -97,6 +97,7 @@ void debug_mutex_init(struct mutex *lock, const char *name,
#endif
lock->owner = NULL;
lock->magic = lock;
+ lock->name = name;
}

/***
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 12c779d..62c2aab 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -18,6 +18,7 @@
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/debug_locks.h>
+#include <trace/mutex.h>

/*
* In the DEBUG case we are using the "NULL fastpath" for mutexes,
@@ -147,6 +148,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
goto done;

lock_contended(&lock->dep_map, ip);
+ trace_mutex_lock_contention((int)task->pid, jiffies);

for (;;) {
/*
@@ -182,7 +184,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
schedule();
spin_lock_mutex(&lock->wait_lock, flags);
}
-
+ trace_mutex_lock_contention_length(lock->name, (int)task->pid, jiffies);
done:
lock_acquired(&lock->dep_map);
/* got the lock - rejoice! */
@@ -256,7 +258,7 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
struct mutex_waiter, list);

debug_mutex_wake_waiter(lock, waiter);
-
+ trace_mutex_wakeup(waiter->task->pid, current->pid);
wake_up_process(waiter->task);
}

diff --git a/ltt/probes/Makefile b/ltt/probes/Makefile
index 2ee8a46..0c54d01 100644
--- a/ltt/probes/Makefile
+++ b/ltt/probes/Makefile
@@ -10,7 +10,8 @@ CFLAGS_REMOVE_rcu-trace.o = -pg
endif

obj-$(CONFIG_LTT_TRACEPROBES) += kernel-trace.o mm-trace.o fs-trace.o \
- ipc-trace.o lockdep-trace.o rcu-trace.o
+ ipc-trace.o lockdep-trace.o rcu-trace.o \
+ mutex-trace.o

ifeq ($(CONFIG_NET),y)
ifdef CONFIG_FTRACE
diff --git a/ltt/probes/mutex-trace.c b/ltt/probes/mutex-trace.c
new file mode 100644
index 0000000..3b1ebeb
--- /dev/null
+++ b/ltt/probes/mutex-trace.c
@@ -0,0 +1,43 @@
+/*
+ * ltt/probes/fs-trace.c
+ *
+ * Mutex tracepoint probes.
+ */
+
+#include <linux/module.h>
+#include <trace/mutex.h>
+
+static unsigned long mutex_times[PID_MAX_DEFAULT+1];
+static int mutex_wakeup[PID_MAX_DEFAULT+1];
+
+void probe_mutex_lock_contention(int pid, unsigned long time)
+{
+ mutex_times[pid] = (unsigned long)time;
+
+ trace_mark_tp(mutex_lock_contention, mutex_lock_contention,
+ probe_mutex_lock_contention, "pid %d time %lu", pid, time);
+}
+
+void probe_mutex_lock_contention_length(char *name, int pid, unsigned long time)
+{
+ unsigned long delay = time - mutex_times[pid];
+ int reason = mutex_wakeup[pid];
+
+ /* FIXME: why is name sometimes null? */
+ if (!name)
+ return;
+
+ trace_mark_tp(mutex_lock_contention_length, mutex_lock_contention_length,
+ probe_mutex_lock_contention_length,
+ "name %s pid %d delay %lu cause %d",
+ name, pid, delay, reason);
+}
+
+void probe_mutex_wakeup(int target_pid, int sender_pid)
+{
+ mutex_wakeup[target_pid] = sender_pid;
+
+ trace_mark_tp(mutex_wakeup, mutex_wakeup,
+ probe_mutex_wakeup, "pid %d pid %d", target_pid, sender_pid);
+}
+


# mutex.stp

probe kernel.mark("mutex_lock_contention")
{
}

probe kernel.mark("mutex_wakeup")
{
}

probe kernel.mark("mutex_lock_contention_length")
{
name = $arg1
pid = $arg2
delay = $arg3
cause = $arg4

printf("name: %s, pid name: %s, pid: %d, jiffies: %d cause: %d\n",
name, execname(), pid, delay, cause);
}
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/