Re: [Patch 5/8] generic netlink interface for delay accounting

From: Andrew Morton
Date: Thu Mar 30 2006 - 00:03:01 EST


Shailabh Nagar <nagar@xxxxxxxxxxxxxx> wrote:
>
> delayacct-genetlink.patch
>
> Create a generic netlink interface (NETLINK_GENERIC family),
> called "taskstats", for getting delay and cpu statistics of
> tasks and thread groups during their lifetime and when they exit.
>
>

It's be best to have a netlink person review the netlinkisms here.

> +static inline int delayacct_add_tsk(struct taskstats *d,
> + struct task_struct *tsk)
> +{
> + if (!tsk->delays)
> + return -EINVAL;
> + return __delayacct_add_tsk(d, tsk);
> +}

hm. It's a worry that this can return an error if delay accounting simply
isn't enabled.

> +struct taskstats {
> + /* Maintain 64-bit alignment while extending */
> + /* Version 1 */
> +
> + /* XXX_count is number of delay values recorded.
> + * XXX_total is corresponding cumulative delay in nanoseconds
> + */
> +
> +#define TASKSTATS_NOCPUSTATS 1
> + __u64 cpu_count;
> + __u64 cpu_delay_total; /* wait, while runnable, for cpu */
> + __u64 blkio_count;
> + __u64 blkio_delay_total; /* sync,block io completion wait*/
> + __u64 swapin_count;
> + __u64 swapin_delay_total; /* swapin page fault wait*/
> +
> + __u64 cpu_run_total; /* cpu running time
> + * no count available/provided */
> +};

What locking is used to make updates to these u64's appear to be atomic?
Maybe it's deliberately nonatomic. Either way, it needs a comment.

> void __delayacct_tsk_exit(struct task_struct *tsk)
> {
> + /*
> + * Protect against racing thread group exits
> + */
> + mutex_lock(&delayacct_exit_mutex);
> + taskstats_exit_pid(tsk);
> if (tsk->delays) {
> kmem_cache_free(delayacct_cache, tsk->delays);
> tsk->delays = NULL;
> }
> + mutex_unlock(&delayacct_exit_mutex);
> }

hm, I wonder how contended that lock is likely to be.

The kmem_cache_free() can happen outside the lock.

> +#ifdef CONFIG_TASKSTATS
> +int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
> +{
> + nsec_t tmp;
> + struct timespec ts;
> + unsigned long t1,t2;
> +
> + /* zero XXX_total,non-zero XXX_count implies XXX stat overflowed */
> +
> + tmp = (nsec_t)d->cpu_run_total ;

stray space before semicolon.

> + tmp += (u64)(tsk->utime+tsk->stime)*TICK_NSEC;
> + d->cpu_run_total = (tmp < (nsec_t)d->cpu_run_total)? 0: tmp;

Missing space before ?, missing space before :

> + d->blkio_delay_total = (tmp < d->blkio_delay_total)? 0: tmp;
> + d->swapin_delay_total = (tmp < d->swapin_delay_total)? 0: tmp;

dittos.

> ===================================================================
> --- /dev/null 1970-01-01 00:00:00.000000000 +0000
> +++ linux-2.6.16/kernel/taskstats.c 2006-03-29 18:13:18.000000000 -0500
> @@ -0,0 +1,292 @@
> +/*
> + * taskstats.c - Export per-task statistics to userland
> + *
> + * Copyright (C) Shailabh Nagar, IBM Corp. 2006
> + * (C) Balbir Singh, IBM Corp. 2006
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/taskstats.h>
> +#include <linux/delayacct.h>
> +#include <net/genetlink.h>
> +#include <asm/atomic.h>
> +
> +const int taskstats_version = TASKSTATS_VERSION;

Odd. It'd be better to put TASKSTATS_VERSION in the header file, use that
directly.

> +
> +static inline int fill_pid(pid_t pid, struct task_struct *pidtsk,
> + struct taskstats *stats)
> +{
> + int rc;
> + struct task_struct *tsk = pidtsk;
> +
> + if (!pidtsk) {
> + read_lock(&tasklist_lock);
> + tsk = find_task_by_pid(pid);
> + if (!tsk) {
> + read_unlock(&tasklist_lock);
> + return -ESRCH;
> + }
> + get_task_struct(tsk);
> + read_unlock(&tasklist_lock);
> + } else
> + get_task_struct(tsk);
> +
> + rc = delayacct_add_tsk(stats, tsk);
> + put_task_struct(tsk);
> +
> + return rc;
> +
> +}

Has two callsites, should be uninlined.

> +static inline int fill_tgid(pid_t tgid, struct task_struct *tgidtsk,
> + struct taskstats *stats)
> +{
> + int rc;
> + struct task_struct *tsk, *first;
> +
> + first = tgidtsk;
> + read_lock(&tasklist_lock);
> + if (!first) {
> + first = find_task_by_pid(tgid);
> + if (!first) {
> + read_unlock(&tasklist_lock);
> + return -ESRCH;
> + }
> + }
> + tsk = first;
> + do {
> + rc = delayacct_add_tsk(stats, tsk);
> + if (rc)
> + break;
> + } while_each_thread(first, tsk);
> + read_unlock(&tasklist_lock);
> +
> + return rc;
> +}

Ditto.

It's somewhat similar to fill_pid() - perhaps they can be combined, halving
the overhead?

> +static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info)
> +{
> + int rc = 0;
> + struct sk_buff *rep_skb;
> + struct taskstats stats;
> + void *reply;
> + size_t size;
> + struct nlattr *na;
> +
> + /*
> + * Size includes space for nested attribute as well
> + * The returned data is of the format
> + * TASKSTATS_TYPE_AGGR_PID/TGID
> + * --> TASKSTATS_TYPE_PID/TGID
> + * --> TASKSTATS_TYPE_STATS
> + */
> + size = nla_total_size(sizeof(u32)) +
> + nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
> +
> + memset(&stats, 0, sizeof(stats));
> + rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
> + if (rc < 0)
> + return rc;
> +
> + if (info->attrs[TASKSTATS_CMD_ATTR_PID]) {
> + u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
> + rc = fill_pid((pid_t)pid, NULL, &stats);

We shouldn't have a typecast here. If it generates a warning then we need
to get in there and find out why.

> + if (rc < 0)
> + goto err;
> +
> + na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID);
> + NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, pid);
> + } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) {
> + u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
> + rc = fill_tgid((pid_t)tgid, NULL, &stats);

Ditto.

> + if (rc < 0)
> + goto err;
> +
> + na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID);
> + NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, tgid);
> + } else {
> + rc = -EINVAL;
> + goto err;
> + }
> +
> + NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, stats);
> + nla_nest_end(rep_skb, na);
> +
> + return send_reply(rep_skb, info->snd_pid, TASKSTATS_MSG_UNICAST);
> +
> +nla_put_failure:
> + return genlmsg_cancel(rep_skb, reply);

Extra space.

> +err:
> + nlmsg_free(rep_skb);
> + return rc;
> +}
> +
> +
> +/* Send pid data out on exit */
> +void taskstats_exit_pid(struct task_struct *tsk)
> +{
> + int rc = 0;
> + struct sk_buff *rep_skb;
> + void *reply;
> + struct taskstats stats;
> + size_t size;
> + int is_thread_group = !thread_group_empty(tsk);
> + struct nlattr *na;
> +
> + /*
> + * tasks can start to exit very early. Ensure that the family
> + * is registered before notifications are sent out
> + */
> + if (!family_registered)
> + return;

This code risks evaluating thread_group_empty() even if !family_registered.
The compiler will most likely sort that out in this case, but it's a risk
when using these initialisers.

> +
> +static int __init taskstats_init(void)
> +{
> + if (genl_register_family(&family))
> + return -EFAULT;

EFAULT?

> + family_registered = 1;

whitespace broke.

> +
> + if (genl_register_ops(&family, &taskstats_ops))
> + goto err;
> +
> + return 0;
> +err:
> + genl_unregister_family(&family);
> + family_registered = 0;
> + return -EFAULT;
> +}
> +
> +late_initcall(taskstats_init);

Why late_initcall()? (A comment would be appropriate)

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/