[RFC] [Patch 4/4] timer_stats slimmed down: using statisticsinfrastucture

From: Martin Peschke
Date: Sat Apr 14 2007 - 06:11:40 EST


This patch implements timer_stats based on the statistics
infrastructure.

Here is some sample output, which even somewhat
resembles /proc/timer_stats.
(>80 char lines because of symbol names, sorry for line breaks).


It reads:
<statistic name> <basket> <hits> <label>

with label being:
<pid> <task name> <start_fn>(<stop_fn>)


[root@t2930041 timer_stats]# cat data
expired missed 0x0
expired - 5381 1019 multipathd do_nanosleep (hrtimer_wakeup)
expired - 2707 1 swapper queue_delayed_work_on
(delayed_work_timer_fn)
expired - 2707 1 swapper queue_delayed_work_on
(delayed_work_timer_fn)
expired - 2707 1 swapper queue_delayed_work_on
(delayed_work_timer_fn)
expired - 2707 1 swapper queue_delayed_work_on
(delayed_work_timer_fn)
expired - 1353 1 swapper neigh_table_init_no_netlink
(neigh_periodic_timer)
expired - 1083 0 swapper page_writeback_init (wb_timer_fn)
expired - 1083 1021 multipathd schedule_timeout
(process_timeout)
expired - 1081 1 init schedule_timeout
(process_timeout)
expired - 677 1 swapper neigh_table_init_no_netlink
(neigh_periodic_timer)
expired - 278 0 swapper sk_reset_timer (tcp_delack_timer)
expired - 193 1 swapper nfsd_export_init
(delayed_work_timer_fn)
expired - 181 975 syslogd do_setitimer (it_real_fn)
expired - 104 1296 sshd sk_reset_timer (tcp_write_timer)
expired - 90 1 swapper ip_rt_init (rt_check_expire)
expired - 90 1244 crond do_nanosleep (hrtimer_wakeup)
expired - 54 896 ip __netdev_watchdog_up
(dev_watchdog)
expired - 44 1 swapper inet_initpeers
(peer_check_expire)
expired - 44 0 swapper addrconf_verify (addrconf_verify)
.
.
.
expired - 1 1368 sadc schedule_timeout
(process_timeout)
expired - 1 1370 sadc blk_plug_device
(blk_unplug_timeout)
expired - 1 1370 sadc schedule_timeout
(process_timeout)


# cat /debug/timer_stats/definition
name=expired state=on units=timer/occurrence type=sparse entries=1024
data=[6920660.417313] started=[6920660.417313] stopped=[ 0.000000]


Signed-off-by: Martin Peschke <mp3@xxxxxxxxxx>
---

time/timer_stats.c | 385 ++++++++---------------------------------------------
timer.c | 2
2 files changed, 59 insertions(+), 328 deletions(-)

Index: linux/kernel/time/timer_stats.c
===================================================================
--- linux.orig/kernel/time/timer_stats.c
+++ linux/kernel/time/timer_stats.c
@@ -26,197 +26,95 @@
* the pid and cmdline from the owner process if applicable.
*
* Start/stop data collection:
- * # echo 1[0] >/proc/timer_stats
+ * # echo state=on[off] > /debug/timer_stats/definition
*
* Display the information collected so far:
- * # cat /proc/timer_stats
+ * # cat /debug/timer_stats/data
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/

-#include <linux/proc_fs.h>
#include <linux/module.h>
-#include <linux/spinlock.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/kallsyms.h>
-
-#include <asm/uaccess.h>
+#include <linux/statistic.h>

/*
* This is our basic unit of interest: a timer expiry event identified
* by the timer, its start/expire functions and the PID of the task that
* started the timer. We count the number of times an event happens:
*/
-struct entry {
- /*
- * Hash list:
- */
- struct entry *next;
-
- /*
- * Hash keys:
- */
- void *timer;
- void *start_func;
- void *expire_func;
- pid_t pid;
-
- /*
- * Number of timeout events:
- */
- unsigned long count;
-
+struct tstat_key {
+ void *timer;
+ void *start_func;
+ void *expire_func;
+ pid_t pid;
/*
* We save the command-line string to preserve
* this information past task exit:
*/
- char comm[TASK_COMM_LEN + 1];
-
-} ____cacheline_aligned_in_smp;
-
-/*
- * Spinlock protecting the tables - not taken during lookup:
- */
-static DEFINE_SPINLOCK(table_lock);
-
-/*
- * Per-CPU lookup locks for fast hash lookup:
- */
-static DEFINE_PER_CPU(spinlock_t, lookup_lock);
-
-/*
- * Mutex to serialize state changes with show-stats activities:
- */
-static DEFINE_MUTEX(show_mutex);
-
-/*
- * Collection status, active/inactive:
- */
-static int __read_mostly active;
-
-/*
- * Beginning/end timestamps of measurement:
- */
-static ktime_t time_start, time_stop;
-
-/*
- * tstat entry structs only get allocated while collection is
- * active and never freed during that time - this simplifies
- * things quite a bit.
- *
- * They get freed when a new collection period is started.
- */
-#define MAX_ENTRIES_BITS 10
-#define MAX_ENTRIES (1UL << MAX_ENTRIES_BITS)
-
-static unsigned long nr_entries;
-static struct entry entries[MAX_ENTRIES];
-
-static atomic_t overflow_count;
-
-static void reset_entries(void)
-{
- nr_entries = 0;
- memset(entries, 0, sizeof(entries));
- atomic_set(&overflow_count, 0);
-}
+ char comm[TASK_COMM_LEN + 1];
+};

-static struct entry *alloc_entry(void)
+static void tstat_print_symbol(struct seq_file *m, unsigned long addr)
{
- if (nr_entries >= MAX_ENTRIES)
- return NULL;
-
- return entries + nr_entries++;
-}
-
-/*
- * The entries are in a hash-table, for fast lookup:
- */
-#define TSTAT_HASH_BITS (MAX_ENTRIES_BITS - 1)
-#define TSTAT_HASH_SIZE (1UL << TSTAT_HASH_BITS)
-#define TSTAT_HASH_MASK (TSTAT_HASH_SIZE - 1)
-
-#define __tstat_hashfn(entry) \
- (((unsigned long)(entry)->timer ^ \
- (unsigned long)(entry)->start_func ^ \
- (unsigned long)(entry)->expire_func ^ \
- (unsigned long)(entry)->pid ) & TSTAT_HASH_MASK)
-
-#define tstat_hashentry(entry) (tstat_hash_table + __tstat_hashfn(entry))
-
-static struct entry *tstat_hash_table[TSTAT_HASH_SIZE] __read_mostly;
+ char symname[KSYM_NAME_LEN+1];

-static int match_entries(struct entry *entry1, struct entry *entry2)
-{
- return entry1->timer == entry2->timer &&
- entry1->start_func == entry2->start_func &&
- entry1->expire_func == entry2->expire_func &&
- entry1->pid == entry2->pid;
+ if (lookup_symbol_name(addr, symname) < 0)
+ seq_printf(m, "<%p>", (void *)addr);
+ else
+ seq_printf(m, "%s", symname);
}

-/*
- * Look up whether an entry matching this item is present
- * in the hash already. Must be called with irqs off and the
- * lookup lock held:
- */
-static struct entry *tstat_lookup(struct entry *entry, char *comm)
+static void tstat_label(struct statistic_interface *interface, int i,
+ void *_key, struct seq_file *m)
{
- struct entry **head, *curr, *prev;
+ struct tstat_key *key = _key;

- head = tstat_hashentry(entry);
- curr = *head;
+ seq_printf(m, "\t%5d %-16s ", key->pid, key->comm);
+ tstat_print_symbol(m, (unsigned long)key->start_func);
+ seq_puts(m, " (");
+ tstat_print_symbol(m, (unsigned long)key->expire_func);
+ seq_puts(m, ")");
+}

- /*
- * The fastpath is when the entry is already hashed,
- * we do this with the lookup lock held, but with the
- * table lock not held:
- */
- while (curr) {
- if (match_entries(curr, entry))
- return curr;
+enum tstat_enum {
+ TSTAT_EXPIRED,
+ _TSTAT_NUMBER,
+};

- curr = curr->next;
+static struct statistic_info tstat_info[] = {
+ [TSTAT_EXPIRED] = {
+ .name = "expired",
+ .x_unit = "timer",
+ .y_unit = "occurrence",
+ .defaults = "type=sparse entries=1024",
+ .flags = STATISTIC_FLAGS_KEY,
+ .key_size = sizeof(struct tstat_key)
}
- /*
- * Slowpath: allocate, set up and link a new hash entry:
- */
- prev = NULL;
- curr = *head;
-
- spin_lock(&table_lock);
- /*
- * Make sure we have not raced with another CPU:
- */
- while (curr) {
- if (match_entries(curr, entry))
- goto out_unlock;
+};

- prev = curr;
- curr = curr->next;
- }
+struct statistic tstat_stat[_TSTAT_NUMBER];

- curr = alloc_entry();
- if (curr) {
- *curr = *entry;
- curr->count = 0;
- memcpy(curr->comm, comm, TASK_COMM_LEN);
- if (prev)
- prev->next = curr;
- else
- *head = curr;
- curr->next = NULL;
- }
- out_unlock:
- spin_unlock(&table_lock);
+struct statistic_interface tstat_interface = {
+ .stat = tstat_stat,
+ .info = tstat_info,
+ .number = _TSTAT_NUMBER,
+ .label = tstat_label,
+};

- return curr;
+static int __init tstat_init(void)
+{
+ statistic_create(&tstat_interface, "timer_stats");
+ return 0;
}
+__initcall(tstat_init);

/**
- * timer_stats_update_stats - Update the statistics for a timer.
+ * tstat_inc - Update the statistics for a timer.
* @timer: pointer to either a timer_list or a hrtimer
* @pid: the pid of the task which set up the timer
* @startf: pointer to the function which did the timer setup
@@ -229,179 +127,14 @@ static struct entry *tstat_lookup(struct
void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
void *timerf, char * comm)
{
- /*
- * It doesnt matter which lock we take:
- */
- spinlock_t *lock = &per_cpu(lookup_lock, raw_smp_processor_id());
- struct entry *entry, input;
- unsigned long flags;
-
- input.timer = timer;
- input.start_func = startf;
- input.expire_func = timerf;
- input.pid = pid;
-
- spin_lock_irqsave(lock, flags);
- if (!active)
- goto out_unlock;
-
- entry = tstat_lookup(&input, comm);
- if (likely(entry))
- entry->count++;
- else
- atomic_inc(&overflow_count);
-
- out_unlock:
- spin_unlock_irqrestore(lock, flags);
-}
-
-static void print_name_offset(struct seq_file *m, unsigned long addr)
-{
- char symname[KSYM_NAME_LEN+1];
-
- if (lookup_symbol_name(addr, symname) < 0)
- seq_printf(m, "<%p>", (void *)addr);
- else
- seq_printf(m, "%s", symname);
-}
-
-static int tstats_show(struct seq_file *m, void *v)
-{
- struct timespec period;
- struct entry *entry;
- unsigned long ms;
- long events = 0;
- ktime_t time;
- int i;
-
- mutex_lock(&show_mutex);
- /*
- * If still active then calculate up to now:
- */
- if (active)
- time_stop = ktime_get();
-
- time = ktime_sub(time_stop, time_start);
-
- period = ktime_to_timespec(time);
- ms = period.tv_nsec / 1000000;
-
- seq_puts(m, "Timer Stats Version: v0.1\n");
- seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms);
- if (atomic_read(&overflow_count))
- seq_printf(m, "Overflow: %d entries\n",
- atomic_read(&overflow_count));
-
- for (i = 0; i < nr_entries; i++) {
- entry = entries + i;
- seq_printf(m, "%4lu, %5d %-16s ",
- entry->count, entry->pid, entry->comm);
-
- print_name_offset(m, (unsigned long)entry->start_func);
- seq_puts(m, " (");
- print_name_offset(m, (unsigned long)entry->expire_func);
- seq_puts(m, ")\n");
-
- events += entry->count;
- }
-
- ms += period.tv_sec * 1000;
- if (!ms)
- ms = 1;
-
- if (events && period.tv_sec)
- seq_printf(m, "%ld total events, %ld.%ld events/sec\n", events,
- events / period.tv_sec, events * 1000 / ms);
- else
- seq_printf(m, "%ld total events\n", events);
-
- mutex_unlock(&show_mutex);
-
- return 0;
-}
-
-/*
- * After a state change, make sure all concurrent lookup/update
- * activities have stopped:
- */
-static void sync_access(void)
-{
- unsigned long flags;
- int cpu;
-
- for_each_online_cpu(cpu) {
- spin_lock_irqsave(&per_cpu(lookup_lock, cpu), flags);
- /* nothing */
- spin_unlock_irqrestore(&per_cpu(lookup_lock, cpu), flags);
- }
-}
-
-static ssize_t tstats_write(struct file *file, const char __user *buf,
- size_t count, loff_t *offs)
-{
- char ctl[2];
+ struct tstat_key key;

- if (count != 2 || *offs)
- return -EINVAL;
+ memset(&key, 0, sizeof(struct tstat_key)); /* FIXME: get rid of it? */
+ key.timer = timer;
+ key.start_func = startf;
+ key.expire_func = timerf;
+ key.pid = pid;
+ memcpy(key.comm, comm, TASK_COMM_LEN);

- if (copy_from_user(ctl, buf, count))
- return -EFAULT;
-
- mutex_lock(&show_mutex);
- switch (ctl[0]) {
- case '0':
- if (active) {
- active = 0;
- time_stop = ktime_get();
- sync_access();
- }
- break;
- case '1':
- if (!active) {
- reset_entries();
- time_start = ktime_get();
- active = 1;
- }
- break;
- default:
- count = -EINVAL;
- }
- mutex_unlock(&show_mutex);
-
- return count;
-}
-
-static int tstats_open(struct inode *inode, struct file *filp)
-{
- return single_open(filp, tstats_show, NULL);
-}
-
-static struct file_operations tstats_fops = {
- .open = tstats_open,
- .read = seq_read,
- .write = tstats_write,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
-void __init init_timer_stats(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu)
- spin_lock_init(&per_cpu(lookup_lock, cpu));
-}
-
-static int __init init_tstats_procfs(void)
-{
- struct proc_dir_entry *pe;
-
- pe = create_proc_entry("timer_stats", 0644, NULL);
- if (!pe)
- return -ENOMEM;
-
- pe->proc_fops = &tstats_fops;
-
- return 0;
+ statistic_inc_as_key(STAT_SPARSE, tstat_stat, TSTAT_EXPIRED, &key);
}
-__initcall(init_tstats_procfs);
Index: linux/kernel/timer.c
===================================================================
--- linux.orig/kernel/timer.c
+++ linux/kernel/timer.c
@@ -1315,8 +1315,6 @@ void __init init_timers(void)
int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
(void *)(long)smp_processor_id());

- init_timer_stats();
-
BUG_ON(err == NOTIFY_BAD);
register_cpu_notifier(&timers_nb);
open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL);


-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/