[PACTH v9] stacktrace: Eliminate task stack trace duplication

From: robert . foss
Date: Tue Aug 16 2016 - 19:13:23 EST


From: Ying Han <yinghan@xxxxxxxxxx>

The problem with small dmesg ring buffer like 512k is that only limited number
of task traces will be logged. Sometimes we lose important information only
because of too many duplicated stack traces. This problem occurs when dumping
lots of stacks in a single operation, such as sysrq-T.

This patch tries to reduce the duplication of task stack trace in the dump
message by hashing the task stack. The hashtable is a 32k pre-allocated buffer
during bootup. Each time if we find the identical task trace in the task stack,
we dump only the pid of the task which has the task trace dumped. So it is easy
to back track to the full stack with the pid.

When we do the hashing, we eliminate garbage entries from stack traces. Those
entries are still being printed in the dump to provide more debugging
informations.

[ 53.510162] kworker/0:0 S ffffffff8161d820 0 4 2 0x00000000
[ 53.517237] ffff88027547de60 0000000000000046 ffffffff812ab840 0000000000000000
[ 53.524663] ffff880275460080 ffff88027547dfd8 ffff88027547dfd8 ffff88027547dfd8
[ 53.532092] ffffffff81813020 ffff880275460080 0000000000000000 ffff8808758670c0
[ 53.539521] Call Trace:
[ 53.541974] [<ffffffff812ab840>] ? cfq_init_queue+0x350/0x350
[ 53.547791] [<ffffffff81524d49>] schedule+0x29/0x70
[ 53.552761] [<ffffffff810945a3>] worker_thread+0x233/0x380
[ 53.558318] [<ffffffff81094370>] ? manage_workers.isra.28+0x230/0x230
[ 53.564839] [<ffffffff81099a73>] kthread+0x93/0xa0
[ 53.569714] [<ffffffff8152e6d4>] kernel_thread_helper+0x4/0x10
[ 53.575628] [<ffffffff810999e0>] ? kthread_worker_fn+0x140/0x140
[ 53.581714] [<ffffffff8152e6d0>] ? gs_change+0xb/0xb
[ 53.586762] kworker/u:0 S ffffffff8161d820 0 5 2 0x00000000
[ 53.593858] ffff88027547fe60 0000000000000046 ffffffffa005cc70 0000000000000000
[ 53.601307] ffff8802754627d0 ffff88027547ffd8 ffff88027547ffd8 ffff88027547ffd8
[ 53.608788] ffffffff81813020 ffff8802754627d0 0000000000011fc0 ffff8804758670c0
[ 53.616232] Call Trace:
[ 53.618676] <Same stack as pid 4>

Signed-off-by: Ying Han <yinghan@xxxxxxxxxx>
Tested-by: Robert Foss <robert.foss@xxxxxxxxxxxxx>
Signed-off-by: Robert Foss <robert.foss@xxxxxxxxxxxxx>

---

This is a resubmission of v9.

This series has previously had 8 versions submitted and v8 was acked, after
which the series was dropped on the floor.

https://lkml.org/lkml/2012/4/6/255

changelog v9..v8:
1. rebase on v4.8-trunk.
2. change return type of save_dup_stack_address

changelog v8..v7:
1. rebase on v3.4-rc1.

changelog v7..v6:
1. rebase on v3.3_rc2, the only change is moving changes from kernel/sched.c
to kernel/sched/core.c

changelog v6..v5:
1. clear saved stack trace before printing a set of stacks. this ensures the printed
stack traces are not omitted messages.
2. add log level in printing duplicate stack.
3. remove the show_stack() API change, and non-x86 arch won't need further change.
4. add more inline documentations.

changelog v5..v4:
1. removed changes to Kconfig file
2. changed hashtable to keep only hash value and length of stack
3. simplified hashtable lookup

changelog v4..v3:
1. improve de-duplication by eliminating garbage entries from stack traces.
with this change 793/825 stack traces were recognized as duplicates. in v3
only 482/839 were duplicates.

changelog v3..v2:
1. again better documentation on the patch description.
2. make the stack_hash_table to be allocated at compile time.
3. have better name of variable index
4. move save_dup_stack_trace() in kernel/stacktrace.c

changelog v2..v1:
1. better documentation on the patch description
2. move the spinlock inside the hash lockup, so reducing the holding time.

arch/x86/include/asm/stacktrace.h | 11 ++-
arch/x86/kernel/dumpstack.c | 12 ++--
arch/x86/kernel/dumpstack_32.c | 7 +-
arch/x86/kernel/dumpstack_64.c | 7 +-
arch/x86/kernel/stacktrace.c | 137 ++++++++++++++++++++++++++++++++++++++
include/linux/stacktrace.h | 8 +++
kernel/sched/core.c | 33 ++++++++-
kernel/stacktrace.c | 22 ++++++
8 files changed, 222 insertions(+), 15 deletions(-)

diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index 0944218..d100e69 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -81,13 +81,20 @@ stack_frame(struct task_struct *task, struct pt_regs *regs)
}
#endif

+/*
+ * The parameter dup_stack_pid is used for task stack deduplication.
+ * The non-zero value of dup_stack_pid indicates the pid of the
+ * task with the same stack trace.
+ */
extern void
show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp, char *log_lvl);
+ unsigned long *stack, unsigned long bp, char *log_lvl,
+ pid_t dup_stack_pid);

extern void
show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *sp, unsigned long bp, char *log_lvl);
+ unsigned long *sp, unsigned long bp, char *log_lvl,
+ pid_t dup_stack_pid);

extern unsigned int code_bytes;

diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index ef8017c..ab15613 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -168,16 +168,20 @@ static const struct stacktrace_ops print_trace_ops = {

void
show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp, char *log_lvl)
+ unsigned long *stack, unsigned long bp, char *log_lvl,
+ pid_t dup_stack_pid)
{
printk("%sCall Trace:\n", log_lvl);
- dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
+ if (dup_stack_pid)
+ printk("%s<Same stack as pid %d>", log_lvl, dup_stack_pid);
+ else
+ dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
}

void show_trace(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, unsigned long bp)
{
- show_trace_log_lvl(task, regs, stack, bp, "");
+ show_trace_log_lvl(task, regs, stack, bp, "", 0);
}

void show_stack(struct task_struct *task, unsigned long *sp)
@@ -194,7 +198,7 @@ void show_stack(struct task_struct *task, unsigned long *sp)
bp = stack_frame(current, NULL);
}

- show_stack_log_lvl(task, NULL, sp, bp, "");
+ show_stack_log_lvl(task, NULL, sp, bp, "", 0);
}

static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED;
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index fef917e..049de0c 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -90,7 +90,8 @@ EXPORT_SYMBOL(dump_trace);

void
show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *sp, unsigned long bp, char *log_lvl)
+ unsigned long *sp, unsigned long bp, char *log_lvl,
+ pid_t dup_stack_pid)
{
unsigned long *stack;
int i;
@@ -115,7 +116,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
touch_nmi_watchdog();
}
pr_cont("\n");
- show_trace_log_lvl(task, regs, sp, bp, log_lvl);
+ show_trace_log_lvl(task, regs, sp, bp, log_lvl, dup_stack_pid);
}


@@ -137,7 +138,7 @@ void show_regs(struct pt_regs *regs)
u8 *ip;

pr_emerg("Stack:\n");
- show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG);
+ show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG, 0);

pr_emerg("Code:");

diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index d558a8a..6c390ef 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -245,7 +245,8 @@ EXPORT_SYMBOL(dump_trace);

void
show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *sp, unsigned long bp, char *log_lvl)
+ unsigned long *sp, unsigned long bp, char *log_lvl,
+ pid_t dup_stack_pid)
{
unsigned long *irq_stack_end;
unsigned long *irq_stack;
@@ -292,7 +293,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
preempt_enable();

pr_cont("\n");
- show_trace_log_lvl(task, regs, sp, bp, log_lvl);
+ show_trace_log_lvl(task, regs, sp, bp, log_lvl, dup_stack_pid);
}

void show_regs(struct pt_regs *regs)
@@ -316,7 +317,7 @@ void show_regs(struct pt_regs *regs)

printk(KERN_DEFAULT "Stack:\n");
show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
- 0, KERN_DEFAULT);
+ 0, KERN_DEFAULT, 0);

printk(KERN_DEFAULT "Code: ");

diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 9ee98ee..c9a6623 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -7,6 +7,7 @@
#include <linux/stacktrace.h>
#include <linux/module.h>
#include <linux/uaccess.h>
+#include <linux/jhash.h>
#include <asm/stacktrace.h>

static int save_stack_stack(void *data, char *name)
@@ -85,6 +86,142 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
}
EXPORT_SYMBOL_GPL(save_stack_trace_tsk);

+/*
+ * The implementation of stack trace dedup.
+ *
+ * It tries to reduce the duplication of task stack trace in the dump by hashing
+ * the stack trace. Each time if an identical trace is found in the stack, we
+ * dump only the pid of previous task. So it is easy to back track to the full
+ * stack with the pid.
+ *
+ * Note this chould be moved out of x86-specific code for all architectures
+ * use.
+ */
+
+/*
+ * DEDUP_STACK_HASH: pre-allocated buffer size of the hashtable.
+ * DEDUP_STACK_ENTRIES: number of task stack entries in hashtable.
+ * DEDUP_HASH_MAX_ITERATIONS: in hashtable lookup, retry several entries if
+ * there is a collision.
+ */
+#define DEDUP_STACK_HASH 32768
+#define DEDUP_STACK_ENTRIES (DEDUP_STACK_HASH/sizeof(struct task_stack))
+#define DEDUP_HASH_MAX_ITERATIONS 10
+
+/*
+ * The data structure of each hashtable entry
+ */
+struct task_stack {
+ /* the pid of the task of the stack trace */
+ pid_t pid;
+
+ /* the length of the stack entries */
+ int len;
+
+ /* the hash value of the stack trace*/
+ unsigned long hash;
+};
+
+static struct task_stack stack_hash_table[DEDUP_STACK_ENTRIES];
+static struct task_stack cur_stack;
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(stack_hash_lock);
+
+/*
+ * The stack hashtable uses linear probing to resolve collisions.
+ * We consider two stacks to be the same if their hash values and lengths
+ * are equal.
+ */
+static unsigned int stack_trace_lookup(void)
+{
+ int j;
+ int index;
+ unsigned int ret = 0;
+ struct task_stack *stack;
+
+ index = cur_stack.hash % DEDUP_STACK_ENTRIES;
+
+ for (j = 0; j < DEDUP_HASH_MAX_ITERATIONS; j++) {
+ stack = stack_hash_table + (index + j) % DEDUP_STACK_ENTRIES;
+ if (stack->hash == 0) {
+ *stack = cur_stack;
+ ret = 0;
+ break;
+ } else {
+ if (stack->hash == cur_stack.hash &&
+ stack->len == cur_stack.len) {
+ ret = stack->pid;
+ break;
+ }
+ }
+ }
+ if (j == DEDUP_HASH_MAX_ITERATIONS)
+ stack_hash_table[index] = cur_stack;
+
+ memset(&cur_stack, 0, sizeof(cur_stack));
+
+ return ret;
+}
+
+static int save_dup_stack_stack(void *data, char *name)
+{
+ return 0;
+}
+
+static int save_dup_stack_address(void *data, unsigned long addr, int reliable)
+{
+ /*
+ * To improve de-duplication, we'll only record reliable entries
+ * in the stack trace.
+ */
+ if (!reliable)
+ return 0;
+ cur_stack.hash = jhash(&addr, sizeof(addr), cur_stack.hash);
+ cur_stack.len++;
+
+ return 0;
+}
+
+static const struct stacktrace_ops save_dup_stack_ops = {
+ .stack = save_dup_stack_stack,
+ .address = save_dup_stack_address,
+ .walk_stack = print_context_stack,
+};
+
+/*
+ * Clear previously saved stack traces to ensure that later printed stacks do
+ * not reference previously printed stacks.
+ */
+void clear_dup_stack_traces(void)
+{
+ memset(stack_hash_table, 0, sizeof(stack_hash_table));
+}
+
+unsigned int save_dup_stack_trace(struct task_struct *tsk)
+{
+ unsigned int ret = 0;
+ unsigned int dummy = 0;
+
+ spin_lock(&stack_hash_lock);
+ dump_trace(tsk, NULL, NULL, 0, &save_dup_stack_ops, &dummy);
+ cur_stack.pid = tsk->pid;
+ ret = stack_trace_lookup();
+ spin_unlock(&stack_hash_lock);
+
+ return ret;
+}
+
+/*
+ * Similar to show_stack except accepting the dup_stack_pid parameter.
+ * The parameter indicates whether or not the caller side tries to do
+ * a stack dedup, and the non-zero value indicates the pid of the
+ * task with the same stack trace.
+ */
+void show_stack_dedup(struct task_struct *task, unsigned long *sp,
+ pid_t dup_stack_pid)
+{
+ show_stack_log_lvl(task, NULL, sp, 0, "", dup_stack_pid);
+}
+
/* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */

struct stack_frame_user {
diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h
index 0a34489..928830b 100644
--- a/include/linux/stacktrace.h
+++ b/include/linux/stacktrace.h
@@ -23,6 +23,11 @@ extern void print_stack_trace(struct stack_trace *trace, int spaces);
extern int snprint_stack_trace(char *buf, size_t size,
struct stack_trace *trace, int spaces);

+extern void clear_dup_stack_traces(void);
+extern unsigned int save_dup_stack_trace(struct task_struct *tsk);
+extern void show_stack_dedup(struct task_struct *task, unsigned long *sp,
+ pid_t dup_stack_pid);
+
#ifdef CONFIG_USER_STACKTRACE_SUPPORT
extern void save_stack_trace_user(struct stack_trace *trace);
#else
@@ -35,6 +40,9 @@ extern void save_stack_trace_user(struct stack_trace *trace);
# define save_stack_trace_user(trace) do { } while (0)
# define print_stack_trace(trace, spaces) do { } while (0)
# define snprint_stack_trace(buf, size, trace, spaces) do { } while (0)
+# define clear_dup_stack_traces() do { } while (0)
+# define save_dup_stack_trace(tsk) (0)
+# define show_stack_dedup(task, sp, dup_stack_pid) do { } while (0)
#endif

#endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 97ee9ac..fc8a38e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -71,6 +71,8 @@
#include <linux/ftrace.h>
#include <linux/slab.h>
#include <linux/init_task.h>
+#include <linux/stacktrace.h>
+#include <linux/binfmts.h>
#include <linux/context_tracking.h>
#include <linux/compiler.h>
#include <linux/frame.h>
@@ -5096,11 +5098,12 @@ out_unlock:

static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;

-void sched_show_task(struct task_struct *p)
+void _sched_show_task(struct task_struct *p, int dedup)
{
unsigned long free = 0;
int ppid;
unsigned long state = p->state;
+ pid_t dup_stack_pid = 0;

if (state)
state = __ffs(state) + 1;
@@ -5130,13 +5133,37 @@ void sched_show_task(struct task_struct *p)
(unsigned long)task_thread_info(p)->flags);

print_worker_info(KERN_INFO, p);
- show_stack(p, NULL);
+ if (dedup) {
+ dup_stack_pid = save_dup_stack_trace(p);
+ show_stack_dedup(p, NULL, dup_stack_pid);
+ } else
+ show_stack(p, NULL);
+}
+
+void sched_show_task(struct task_struct *p)
+{
+ _sched_show_task(p, 0);
+}
+
+/*
+ * Eliminate task stack trace duplication in multi-task stackdump.
+ * Note only x86-specific code now implements the feature.
+ */
+void sched_show_task_dedup(struct task_struct *p)
+{
+ _sched_show_task(p, 1);
}

void show_state_filter(unsigned long state_filter)
{
struct task_struct *g, *p;

+ /*
+ * Prevent below printed stack traces from referring to previously
+ * printed ones.
+ */
+ clear_dup_stack_traces();
+
#if BITS_PER_LONG == 32
printk(KERN_INFO
" task PC stack pid father\n");
@@ -5156,7 +5183,7 @@ void show_state_filter(unsigned long state_filter)
touch_nmi_watchdog();
touch_all_softlockup_watchdogs();
if (!state_filter || (p->state & state_filter))
- sched_show_task(p);
+ sched_show_task_dedup(p);
}

#ifdef CONFIG_SCHED_DEBUG
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index b6e4c16..2286ff3 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -73,3 +73,25 @@ save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace)
{
WARN_ONCE(1, KERN_INFO "save_stack_trace_regs() not implemented yet.\n");
}
+
+/*
+ * Architectures that do not implement the task stack dedup will fallback to
+ * the default functionality.
+ */
+__weak void
+clear_dup_stack_traces(void)
+{
+}
+
+__weak unsigned int
+save_dup_stack_trace(struct task_struct *tsk)
+{
+ return 0;
+}
+
+__weak void
+show_stack_dedup(struct task_struct *task, unsigned long *sp,
+ pid_t dup_stack_pid)
+{
+ show_stack(task, sp);
+}
--
2.7.4