[PATCH] tiobench on UP and ptg-D3-mm1

From: Ed Tomlinson (tomlins@cam.org)
Date: Sat Mar 01 2003 - 10:04:49 EST

Next message: Jan-Benedict Glaw: "Re: syslog full of kernel BUGS, frequent intermittent instability"
Previous message: John Bradford: "Re: syslog full of kernel BUGS, frequent intermittent instability"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]

Andrew,

You mentioned problems with tiobench on UP. This message was partly
composed with this script running:

for dir in /pool{a,e,g}/tio
do
        ( cd $dir
                tiobench --size 128 --threads 16 > /dev/null 2>&1 &
        )
done

Response was slow but usable. It's actually a fairly good example showing
what the ptg patch can do. Here is a "vmstat -a 5" of the run.

procs -----------memory---------- ---swap-- -----io---- --system-- ----cpu----
r b swpd free inact active si so bi bo in cs us sy id wa
3 0 72188 255068 39132 180080 0 0 0 67 1052 621 4 3 93 0
2 0 72188 254868 39236 180144 0 0 0 66 1062 639 5 3 92 0
39 0 72188 250604 39324 183796 0 0 2 82 1201 1163 27 13 60 0
49 0 72188 250196 39416 183924 0 0 0 65 1200 1053 92 8 0 0
52 3 72188 129660 159300 184104 0 0 0 12670 1228 782 27 73 0 0
52 2 72188 10364 275964 185144 0 0 0 16706 1304 970 16 84 0 0
58 6 74248 5292 275584 190256 13 394 21 19348 1401 925 15 85 0 0
22 29 74248 2248 277480 191160 36 0 65 19530 1543 1249 37 55 0 8
31 27 74248 2284 277472 191124 0 0 19 8378 1277 686 87 13 0 0
11 34 74248 4308 275360 191124 0 0 6 5119 1576 1174 54 19 0 28
3 51 74248 4164 275036 191544 0 0 51 1805 1603 1005 44 11 0 45
1 49 74248 3524 274308 192388 0 0 133 1690 1613 1694 21 9 0 69
2 38 74248 3484 274664 193212 0 0 56 1755 1485 831 7 6 0 87
19 11 74248 3300 273792 194276 0 0 204 1741 1502 955 24 7 0 69
16 7 74248 3584 216036 252272 0 0 10351 1333 1716 1456 32 33 0 35
14 25 74248 128772 147112 196100 39 0 5041 376 1413 1565 70 29 0 1
3 16 74248 57316 156176 259012 0 0 14367 0 1698 1393 51 49 0 0
4 4 74248 150240 83964 238672 0 0 7649 722 1396 1096 66 34 0 0
9 3 74248 142896 85368 244680 0 0 1466 12 1286 1053 90 10 0 0
8 0 74248 220180 33184 219640 82 0 917 77 1263 985 86 14 0 0
2 0 74248 270764 9512 193160 0 0 0 58 1057 665 69 6 25 0
4 0 74248 270788 9576 193220 0 0 0 60 1056 720 15 4 81 0

This is using cfq on a K6-III 400 with 512M; all impacted filesystems are reiserfs.

What this does is detect thread groups (defined as processes sharing both mm
and FDs, or processes tagged as members of a kernel thread group) and reduce
the timeslices given to these processes when too many processes are active in a
group. This allows other tasks to get CPU IF there is a demand. There is also a
governor set for user tasks - in this case it will not affect the test.

The patch has been tested on UP and compiles for SMP. It should be OK on SMP. On
NUMA boxes it would really benefit from a dynamic way to allocate per-node storage.
The ptgroup->active[] and user->active[] arrays should really point to atomic_t(s)
in per-node storage.

I have been using variants of this patch since the beginning of Jan - it lets me run
a java freenet server, which is heavily threaded, without it impacting my interactive
response much.

Ed Tomlinson

PS. The patch applies to 2.5.63-mm1; with a little twiddling it should also be
applicable to .63 (sched.c) or .63bk (sched.c, fork.c).

---------------
# This is a BitKeeper generated patch for the following project:
# Project Name: Linux kernel tree
# This patch format is intended for GNU patch command version 2.5 or higher.
# This patch includes the following deltas:
# ChangeSet 1.1026 -> 1.1028
# include/linux/sched.h 1.139 -> 1.140
# kernel/fork.c 1.111 -> 1.113
# kernel/user.c 1.8 -> 1.9
# kernel/sched.c 1.164 -> 1.165
#
# The following is the BitKeeper ChangeSet Log
# --------------------------------------------
# 03/02/28 ed@oscar.et.ca 1.1027
# Add user and thread group governors to prevent either from monopolizing
# the system. The governors work by limiting the sum of the timeslices
# of active tasks in a group to <n> timeslices. The defaults set <n> to
# 1.5 for thread groups and to 30 for user tasks. For numa systems the
# governors are per node.
# --------------------------------------------
#
diff -Nru a/include/linux/sched.h b/include/linux/sched.h
--- a/include/linux/sched.h Fri Feb 28 07:33:49 2003
+++ b/include/linux/sched.h Fri Feb 28 07:33:49 2003
@@ -195,6 +195,11 @@

#include <linux/aio.h>

+struct ptg_struct { /* pseudo thread groups */
+ atomic_t active[MAX_NUMNODES];
+ atomic_t count; /* number of refs */
+};
+
struct mm_struct {
         struct vm_area_struct * mmap; /* list of VMAs */
         struct rb_root mm_rb;
@@ -295,6 +300,7 @@
struct user_struct {
         atomic_t __count; /* reference count */
         atomic_t processes; /* How many processes does this user have? */
+ atomic_t active[MAX_NUMNODES];
         atomic_t files; /* How many open files does this user have? */

         /* Hash table maintenance information */
@@ -361,6 +367,8 @@
         struct list_head ptrace_list;

         struct mm_struct *mm, *active_mm;
+ struct ptg_struct * ptgroup; /* pseudo thread group for this task */
+ atomic_t *governor; /* the atomic_t that governs this task */

/* task state */
         struct linux_binfmt *binfmt;
diff -Nru a/kernel/fork.c b/kernel/fork.c
--- a/kernel/fork.c Fri Feb 28 07:33:49 2003
+++ b/kernel/fork.c Fri Feb 28 07:33:49 2003
@@ -72,12 +72,24 @@
         return total;
}

+void free_ptgroup(struct task_struct *tsk)
+{
+ if (tsk->ptgroup && atomic_sub_and_test(1,&tsk->ptgroup->count)) {
+ kfree(tsk->ptgroup);
+ tsk->ptgroup = NULL;
+ tsk->governor = &tsk->user->active[cpu_to_node(task_cpu(tsk))];
+ if (tsk == current)
+ atomic_inc(tsk->governor);
+ }
+}
+
void __put_task_struct(struct task_struct *tsk)
{
         WARN_ON(!(tsk->state & (TASK_DEAD | TASK_ZOMBIE)));
         WARN_ON(atomic_read(&tsk->usage));
         WARN_ON(tsk == current);

+ free_ptgroup(tsk);
         security_task_free(tsk);
         free_uid(tsk->user);

@@ -465,6 +477,7 @@

         tsk->mm = NULL;
         tsk->active_mm = NULL;
+ tsk->ptgroup = NULL;

         /*
          * Are we cloning a kernel thread?
@@ -730,6 +743,32 @@
         p->flags = new_flags;
}

+static inline int setup_governor(unsigned long clone_flags, struct task_struct *p)
+{
+ if ( ((clone_flags & CLONE_VM) && (clone_flags & CLONE_FILES)) ||
+ (clone_flags & CLONE_THREAD)) {
+ if (current->ptgroup)
+ atomic_inc(&current->ptgroup->count);
+ else {
+ int i;
+ current->ptgroup = kmalloc(sizeof(struct ptg_struct), GFP_ATOMIC);
+ if (!current->ptgroup)
+ return 1;
+ /* printk(KERN_INFO "ptgroup - pid %u\n",current->pid); */
+ atomic_set(&current->ptgroup->count,2);
+ for(i=0; i < MAX_NUMNODES; i++)
+ atomic_set(&current->ptgroup->active[i], 0);
+ atomic_set(&current->ptgroup->active[numa_node_id()], 1);
+ atomic_dec(current->governor);
+ current->governor = &current->ptgroup->active[numa_node_id()];
+ }
+ p->ptgroup = current->ptgroup;
+ p->governor = &p->ptgroup->active[numa_node_id()];
+ } else
+ p->governor = &p->user->active[numa_node_id()];
+ return 0;
+}
+
asmlinkage int sys_set_tid_address(int *tidptr)
{
         current->clear_child_tid = tidptr;
@@ -872,6 +911,12 @@
                 goto bad_fork_cleanup_mm;
         retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
         if (retval)
+ goto bad_fork_cleanup_namespace;
+ /*
+ * Setup the governor pointer for the new process, allocating a new ptg as
+ * required if the process is a thread.
+ */
+ if (setup_governor(clone_flags, p))
                 goto bad_fork_cleanup_namespace;

         if (clone_flags & CLONE_CHILD_SETTID)
diff -Nru a/kernel/sched.c b/kernel/sched.c
--- a/kernel/sched.c Fri Feb 28 07:33:49 2003
+++ b/kernel/sched.c Fri Feb 28 07:33:49 2003
@@ -69,6 +69,9 @@
#define STARVATION_LIMIT (2*HZ)
#define AGRESSIVE_IDLE_STEAL 1
#define NODE_THRESHOLD 125
+#define THREAD_GOVERNOR 15 /* allow threads groups 1.5 full timeslices */
+#define USER_GOVERNOR 300 /* allow user 30 full timeslices */
+

/*
  * If a task is 'interactive' then we reinsert it in the active
@@ -124,7 +127,26 @@

static inline unsigned int task_timeslice(task_t *p)
{
- return BASE_TIMESLICE(p);
+ int slice = BASE_TIMESLICE(p);
+ int threads = atomic_read(p->governor) * 10;
+ int govern = threads;
+ if (p->user->uid)
+ govern = (p->ptgroup) ? THREAD_GOVERNOR : USER_GOVERNOR;
+ if (threads > govern) {
+ slice = (slice * govern) / threads;
+ slice = (slice > MIN_TIMESLICE) ? slice : MIN_TIMESLICE;
+ }
+#if 1
+ {
+ static int next;
+ if (time_after(jiffies, next)) {
+ printk(KERN_INFO "uid %d pid %d nod %d ptg %x gov %x threads %d lim %d slice %d\n",
+ p->uid, p->pid, numa_node_id(), p->ptgroup, p->governor, threads/10, govern, slice);
+ next = jiffies + HZ*300;
+ }
+ }
+#endif
+ return slice;
}

/*
@@ -251,16 +273,18 @@
         rq->node_nr_running = &node_nr_running[0];
}

-static inline void nr_running_inc(runqueue_t *rq)
+static inline void nr_running_inc(task_t *p, runqueue_t *rq)
{
         atomic_inc(rq->node_nr_running);
         rq->nr_running++;
+ atomic_inc(p->governor);
}

-static inline void nr_running_dec(runqueue_t *rq)
+static inline void nr_running_dec(task_t *p, runqueue_t *rq)
{
         atomic_dec(rq->node_nr_running);
         rq->nr_running--;
+ atomic_dec(p->governor);
}

__init void node_nr_running_init(void)
@@ -274,8 +298,8 @@
#else /* !CONFIG_NUMA */

# define nr_running_init(rq) do { } while (0)
-# define nr_running_inc(rq) do { (rq)->nr_running++; } while (0)
-# define nr_running_dec(rq) do { (rq)->nr_running--; } while (0)
+# define nr_running_inc(p, rq) do { (rq)->nr_running++; atomic_inc((p)->governor); } while (0)
+# define nr_running_dec(p, rq) do { (rq)->nr_running--; atomic_dec((p)->governor); } while (0)

#endif /* CONFIG_NUMA */

@@ -380,7 +404,7 @@
static inline void __activate_task(task_t *p, runqueue_t *rq)
{
         enqueue_task(p, rq->active);
- nr_running_inc(rq);
+ nr_running_inc(p, rq);
}

static inline void activate_task(task_t *p, runqueue_t *rq)
@@ -408,7 +432,7 @@
  */
static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
{
- nr_running_dec(rq);
+ nr_running_dec(p, rq);
         if (p->state == TASK_UNINTERRUPTIBLE)
                 rq->nr_uninterruptible++;
         dequeue_task(p, p->array);
@@ -1068,9 +1092,15 @@
static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, runqueue_t *this_rq, int this_cpu)
{
         dequeue_task(p, src_array);
- nr_running_dec(src_rq);
+ nr_running_dec(p, src_rq);
         set_task_cpu(p, this_cpu);
- nr_running_inc(this_rq);
+#ifdef CONFIG_NUMA
+ if (p->ptgroup)
+ p->governor = &p->ptgroup->active[cpu_to_node(this_cpu)];
+ else
+ p->governor = &p->user->active[cpu_to_node(this_cpu)];
+#endif
+ nr_running_inc(p, this_rq);
         enqueue_task(p, this_rq->active);
         wake_up_cpu(this_rq, this_cpu, p);
}
@@ -2729,6 +2759,8 @@
         cpu_idle_ptr(smp_processor_id()) = current;

         set_task_cpu(current, smp_processor_id());
+ current->governor = &current->user->active[numa_node_id()];
+ atomic_inc(current->governor);
         wake_up_forked_process(current);

         init_timers();
diff -Nru a/kernel/user.c b/kernel/user.c
--- a/kernel/user.c Fri Feb 28 07:33:49 2003
+++ b/kernel/user.c Fri Feb 28 07:33:49 2003
@@ -30,6 +30,7 @@
struct user_struct root_user = {
         .__count = ATOMIC_INIT(1),
         .processes = ATOMIC_INIT(1),
+ .active = {[0 ...MAX_NUMNODES-1] = ATOMIC_INIT(0)},
         .files = ATOMIC_INIT(0)
};

@@ -89,6 +90,7 @@

         if (!up) {
                 struct user_struct *new;
+ int i;

                 new = kmem_cache_alloc(uid_cachep, SLAB_KERNEL);
                 if (!new)
@@ -96,6 +98,8 @@
                 new->uid = uid;
                 atomic_set(&new->__count, 1);
                 atomic_set(&new->processes, 0);
+ for(i=0; i < MAX_NUMNODES; i++)
+ atomic_set(&new->active[i], 0);
                 atomic_set(&new->files, 0);

                 /*
@@ -130,6 +134,11 @@
         atomic_inc(&new_user->processes);
         atomic_dec(&old_user->processes);
         current->user = new_user;
+ if (!current->ptgroup) {
+ atomic_dec(current->governor);
+ current->governor = &current->user->active[numa_node_id()];
+ atomic_inc(current->governor);
+ }
         free_uid(old_user);
}

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Next message: Jan-Benedict Glaw: "Re: syslog full of kernel BUGS, frequent intermittent instability"
Previous message: John Bradford: "Re: syslog full of kernel BUGS, frequent intermittent instability"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]

This archive was generated by hypermail 2b29 : Fri Mar 07 2003 - 22:00:16 EST