Re: [PATCH] sched: Add SCHED_BGND (background) scheduling policy

From: Peter Williams
Date: Tue Jul 04 2006 - 21:15:01 EST


Con Kolivas wrote:
some quick comments within code below.

On Wednesday 05 July 2006 09:35, Peter Williams wrote:
---
include/linux/init_task.h | 6 -
include/linux/sched.h | 11 ++
kernel/fork.c | 1
kernel/mutex.c | 28 ++++++-
kernel/sched.c | 183
++++++++++++++++++++++++++++++++++++++-------- 5 files changed, 192
insertions(+), 37 deletions(-)

Index: MM-2.6.17-mm6/include/linux/init_task.h
===================================================================
--- MM-2.6.17-mm6.orig/include/linux/init_task.h 2006-07-04
14:37:42.000000000 +1000 +++
MM-2.6.17-mm6/include/linux/init_task.h 2006-07-04 14:38:12.000000000 +1000
@@ -99,9 +99,9 @@ extern struct group_info init_groups;
.usage = ATOMIC_INIT(2), \
.flags = 0, \
.lock_depth = -1, \
- .prio = MAX_PRIO-20, \
- .static_prio = MAX_PRIO-20, \
- .normal_prio = MAX_PRIO-20, \
+ .prio = MAX_RT_PRIO+20, \
+ .static_prio = MAX_RT_PRIO+20, \
+ .normal_prio = MAX_RT_PRIO+20, \
.policy = SCHED_NORMAL, \
.cpus_allowed = CPU_MASK_ALL, \
.mm = NULL, \
Index: MM-2.6.17-mm6/include/linux/sched.h
===================================================================
--- MM-2.6.17-mm6.orig/include/linux/sched.h 2006-07-04 14:37:43.000000000
+1000 +++ MM-2.6.17-mm6/include/linux/sched.h 2006-07-04 14:38:12.000000000
+1000 @@ -34,6 +34,8 @@
#define SCHED_FIFO 1
#define SCHED_RR 2
#define SCHED_BATCH 3
+/* Scheduler class for background tasks */
+#define SCHED_BGND 4

#ifdef __KERNEL__

@@ -503,13 +505,16 @@ struct signal_struct {
#define MAX_USER_RT_PRIO 100
#define MAX_RT_PRIO MAX_USER_RT_PRIO

-#define MAX_PRIO (MAX_RT_PRIO + 40)
+#define BGND_PRIO (MAX_RT_PRIO + 40)
+/* add another slot for SCHED_BGND tasks */
+#define MAX_PRIO (BGND_PRIO + 1)

#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO)
#define rt_task(p) rt_prio((p)->prio)
#define batch_task(p) (unlikely((p)->policy == SCHED_BATCH))
#define has_rt_policy(p) \
- unlikely((p)->policy != SCHED_NORMAL && (p)->policy != SCHED_BATCH)
+ unlikely((p)->policy != SCHED_NORMAL && (p)->policy < SCHED_BATCH)

idleprio tasks should be able to get rt_policy as well

I don't understand what you mean here. A task can only have one scheduling policy. The simple (direct) definition of has_rt_policy() is (p->policy == SCHED_FIFO || p->policy == SCHED_RR) and the one defined is just a rearrangement of that with a view to minimizing overhead in the majority of invocations.


+#define bgnd_task(p) (unlikely((p)->policy == SCHED_BGND))

/*
* Some day this will be a full-fledged user tracking system..
@@ -810,6 +815,7 @@ struct task_struct {
unsigned long sleep_avg;
unsigned long long timestamp, last_ran;
unsigned long long sched_time; /* sched_clock time spent running */
+ unsigned int mutexes_held; /* for knowing when it's safe to repress
SCHED_BGND tasks */ enum sleep_type sleep_type;

unsigned long policy;
@@ -1090,6 +1096,7 @@ static inline void put_task_struct(struc
#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
#define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */
#define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset
*/ +#define PF_UIWAKE 0x08000000 /* Just woken from uninterrptible sleep */
#define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */
#define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex
tester */

Index: MM-2.6.17-mm6/kernel/sched.c
===================================================================
--- MM-2.6.17-mm6.orig/kernel/sched.c 2006-07-04 14:37:43.000000000 +1000
+++ MM-2.6.17-mm6/kernel/sched.c 2006-07-04 14:38:12.000000000 +1000
@@ -59,7 +59,7 @@

/*
* Convert user-nice values [ -20 ... 0 ... 19 ]
- * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
+ * to static priority [ MAX_RT_PRIO..BGND_PRIO-1 ],
* and back.
*/
#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
@@ -73,7 +73,7 @@
*/
#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
-#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
+#define MAX_USER_PRIO (USER_PRIO(BGND_PRIO))

/*
* Some helpers for converting nanosecond timing to jiffy resolution
@@ -171,7 +171,7 @@
*/

#define SCALE_PRIO(x, prio) \
- max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
+ max(x * (BGND_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)

static unsigned int static_prio_timeslice(int static_prio)
{
@@ -186,6 +186,11 @@ static inline unsigned int task_timeslic
return static_prio_timeslice(p->static_prio);
}

+#define task_in_background(p) unlikely((p)->prio == BGND_PRIO)
+#define safe_to_background(p) \
+ (!((p)->mutexes_held || \
+ (p)->flags & (PF_FREEZE | PF_UIWAKE | PF_EXITING)))
+
/*
* These are the runqueue data structures:
*/
@@ -715,13 +720,17 @@ static inline int __normal_prio(struct t
{
int bonus, prio;

+ /* Ensure that background tasks stay at BGND_PRIO */
+ if (bgnd_task(p) && safe_to_background(p))
+ return BGND_PRIO;
+
bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;

prio = p->static_prio - bonus;
if (prio < MAX_RT_PRIO)
prio = MAX_RT_PRIO;
- if (prio > MAX_PRIO-1)
- prio = MAX_PRIO-1;
+ if (prio > BGND_PRIO-1)
+ prio = BGND_PRIO-1;
return prio;
}

@@ -761,8 +770,18 @@ static void set_load_weight(struct task_
else
#endif
p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
- } else
- p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
+ } else {
+ /*
+ * Reduce the probability of a task escaping the background
+ * due to load balancing leaving it on a lighly used CPU
+ * Can't use zero as that would kill load balancing when only
+ * background tasks are running.
+ */
+ if (bgnd_task(p))
+ p->load_weight = LOAD_WEIGHT(MIN_TIMESLICE / 2 ? : 1);

Why not just set it to 1 for all idleprio tasks? The granularity will be lost at anything lower anyway and it avoids a more complex calculation.

+ else
+ p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
+ }
}

static inline void
@@ -834,7 +853,10 @@ static void __activate_task(struct task_
{
struct prio_array *target = rq->active;

- if (batch_task(p))
+ /* Don't punish batch tasks just tasks actually in the background

An extra line here for multiline comments such as:
+ /*
+ * Don't punish batch tasks just tasks actually in the background


+ * as anything else is counter productive from a system wide aspect
+ */
+ if (task_in_background(p))
target = rq->expired;
enqueue_task(p, target);
inc_nr_running(p, rq);
@@ -942,6 +964,8 @@ static void activate_task(struct task_st
if (!rt_task(p))
p->prio = recalc_task_prio(p, now);

+ p->flags &= ~PF_UIWAKE;
+
/*
* This checks to make sure it's not an uninterruptible task
* that is now waking up.
@@ -1484,6 +1508,7 @@ out_activate:
* sleep_avg beyond just interactive state.
*/
p->sleep_type = SLEEP_NONINTERACTIVE;
+ p->flags |= PF_UIWAKE;
} else

/*
@@ -3024,6 +3049,48 @@ void scheduler_tick(void)
}
goto out_unlock;
}
+
+ if (bgnd_task(p)) {
+ /*
+ * Do this even if there's only one task on the queue as
+ * we want to set the priority low so that any waking tasks
+ * can preempt.
+ */
+ if (task_in_background(p)) {
+ /*
+ * Tasks currently in the background will be
+ * at BGND_PRIO priority and preemption
+ * should be enough to keep them in check provided we
+ * don't let them adversely effect tasks on the expired

ok I'm going to risk a lart and say "affect" ?

I have to refer you to the Oxford English Dictionary. According to it (when used as a verb):

affect: 1. like, love 2. like to use, practice or wear 3. aim at, seek 4. use or display ostentatiously 5. assume a false appearance 6. attack as a disease 7. move or touch.

effect: 1. bring about (an event or result) 2. produce (a state or condition) 3. make, construct or build


+ * array.
+ */
+ if (!safe_to_background(p)) {
+ dequeue_task(p, rq->active);
+ p->prio = effective_prio(p);
+ enqueue_task(p, rq->active);
+ } else if (rq->expired->nr_active &&
+ rq->best_expired_prio < p->prio) {
+ dequeue_task(p, rq->active);
+ enqueue_task(p, rq->expired);
+ set_tsk_need_resched(p);
+ goto out_unlock;
+ }
+ }
+ else if (safe_to_background(p)) {
+ dequeue_task(p, rq->active);
+ p->normal_prio = BGND_PRIO;
+ /* this should be safe for PI purposes */
+ p->prio = p->normal_prio;
+ enqueue_task(p, rq->expired);
+ /*
+ * think about making this conditional to reduce
+ * context switch rate
+ */
+ set_tsk_need_resched(p);
+ goto out_unlock;
+ }
+ }
+
if (!--p->time_slice) {
dequeue_task(p, rq->active);
set_tsk_need_resched(p);
@@ -3033,6 +3100,11 @@ void scheduler_tick(void)

if (!rq->expired_timestamp)
rq->expired_timestamp = jiffies;
+ /*
+ * No need to do anything special for background tasks here
+ * as TASK_INTERACTIVE() should fail when they're in the
+ * background.
+ */
if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
enqueue_task(p, rq->expired);
if (p->static_prio < rq->best_expired_prio)
@@ -3122,6 +3194,33 @@ smt_slice(struct task_struct *p, struct
}

/*
+ * task time slice for SMT dependent idle purposes
+ */
+static unsigned int smt_timeslice(struct task_struct *p)
+{
+ if (task_in_background(p))
+ return 0;
+
+ return task_timeslice(p);
+}
+
+/*
+ * Is the thisp a higher priority task than thatp for SMT dependent idle
+ * purposes?
+ */
+static int task_priority_gt(const struct task_struct *thisp,
+ const struct task_struct *thatp)
+{
+ if (task_in_background(thisp))
+ return !task_in_background(thatp);
+
+ if (task_in_background(thatp))
+ return 1;
+
+ return thisp->static_prio < thatp->static_prio;
+}
+
+/*
* To minimise lock contention and not have to drop this_rq's runlock we
only * trylock the sibling runqueues and bypass those runqueues if we fail
to * acquire their lock. As we only trylock the normal locking order does
not @@ -3180,9 +3279,9 @@ dependent_sleeper(int this_cpu, struct r
(sd->per_cpu_gain * DEF_TIMESLICE / 100))
ret = 1;
} else {
- if (smt_curr->static_prio < p->static_prio &&
+ if (task_priority_gt(smt_curr, p) &&
!TASK_PREEMPTS_CURR(p, smt_rq) &&
- smt_slice(smt_curr, sd) > task_timeslice(p))
+ smt_slice(smt_curr, sd) > smt_timeslice(p))
ret = 1;
}
unlock:
@@ -3245,6 +3344,22 @@ static inline int interactive_sleep(enum
}

/*
+ * Switch the active and expired arrays.
+ */
+static struct prio_array *switch_arrays(struct rq *rq, int
best_active_prio) +{

In the fast path this should be inlined even if it is large.

OK.


+ struct prio_array *array = rq->active;
+
+ schedstat_inc(rq, sched_switch);
+ rq->active = rq->expired;
+ rq->expired = array;
+ rq->expired_timestamp = 0;
+ rq->best_expired_prio = best_active_prio;
+
+ return rq->active;
+}
+
+/*
* schedule() is the main scheduler function.
*/
asmlinkage void __sched schedule(void)
@@ -3332,23 +3447,25 @@ need_resched_nonpreemptible:
}

array = rq->active;
- if (unlikely(!array->nr_active)) {
- /*
- * Switch the active and expired arrays.
- */
- schedstat_inc(rq, sched_switch);
- rq->active = rq->expired;
- rq->expired = array;
- array = rq->active;
- rq->expired_timestamp = 0;
- rq->best_expired_prio = MAX_PRIO;
- }
+ if (unlikely(!array->nr_active))
+ array = switch_arrays(rq, MAX_PRIO);

idx = sched_find_first_bit(array->bitmap);
+get_next:
queue = array->queue + idx;
next = list_entry(queue->next, struct task_struct, run_list);
+ /* very strict backgrounding */
+ if (unlikely(task_in_background(next) && rq->expired->nr_active)) {
+ int tmp = sched_find_first_bit(rq->expired->bitmap);
+
+ if (likely(tmp < idx)) {
+ array = switch_arrays(rq, idx);
+ idx = tmp;
+ goto get_next;
+ }
+ }

- if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
+ if (!rt_task(next) && interactive_sleep(next->sleep_type) &&
!bgnd_task(next)) { unsigned long long delta = now - next->timestamp;
if (unlikely((long long)(now - next->timestamp) < 0))
delta = 0;
@@ -4052,7 +4169,8 @@ recheck:
if (policy < 0)
policy = oldpolicy = p->policy;
else if (policy != SCHED_FIFO && policy != SCHED_RR &&
- policy != SCHED_NORMAL && policy != SCHED_BATCH)
+ policy != SCHED_NORMAL && policy != SCHED_BATCH &&
+ policy != SCHED_BGND)

how about a wrapper for all these policies? (see my sched_range patch)

I must admit that when I was doing this I wished it was less messy. But I figured that it was best to do it in a way that was easy to show as being correct and then do simplification later. Especially, as simplification would also effect the other policies.


return -EINVAL;
/*
* Valid priorities for SCHED_FIFO and SCHED_RR are
@@ -4063,8 +4181,8 @@ recheck:
(p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
(!p->mm && param->sched_priority > MAX_RT_PRIO-1))
return -EINVAL;
- if ((policy == SCHED_NORMAL || policy == SCHED_BATCH)
- != (param->sched_priority == 0))
+ if ((policy == SCHED_NORMAL || policy == SCHED_BATCH ||
+ policy == SCHED_BGND) != (param->sched_priority == 0))
return -EINVAL;

same

/*
@@ -4072,15 +4190,20 @@ recheck:
*/
if (!capable(CAP_SYS_NICE)) {
/*
- * can't change policy, except between SCHED_NORMAL
- * and SCHED_BATCH:
+ * can't change policy, except between SCHED_NORMAL,
+ * SCHED_BATCH or SCHED_BGND:
*/
- if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH) &&
- (policy != SCHED_BATCH && p->policy != SCHED_NORMAL)) &&
+ if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH &&
+ p->policy != SCHED_BGND) &&
+ (policy != SCHED_BATCH && p->policy != SCHED_NORMAL &&
+ p->policy != SCHED_BGND) &&
+ (policy != SCHED_BGND && p->policy != SCHED_NORMAL &&
+ p->policy != SCHED_BATCH)) &&

same

!p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
return -EPERM;
/* can't increase priority */
- if ((policy != SCHED_NORMAL && policy != SCHED_BATCH) &&
+ if ((policy != SCHED_NORMAL && policy != SCHED_BATCH &&
+ policy != SCHED_BGND) &&
param->sched_priority > p->rt_priority &&
param->sched_priority >
p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
Index: MM-2.6.17-mm6/kernel/mutex.c
===================================================================
--- MM-2.6.17-mm6.orig/kernel/mutex.c 2006-07-04 14:37:43.000000000 +1000
+++ MM-2.6.17-mm6/kernel/mutex.c 2006-07-04 14:38:12.000000000 +1000
@@ -51,6 +51,16 @@ __mutex_init(struct mutex *lock, const c

EXPORT_SYMBOL(__mutex_init);

+static inline void inc_mutex_count(void)
+{
+ current->mutexes_held++;
+}
+
+static inline void dec_mutex_count(void)
+{
+ current->mutexes_held--;
+}
+
/*
* We split the mutex lock/unlock logic into separate fastpath and
* slowpath functions, to reduce the register pressure on the fastpath.
@@ -89,6 +99,7 @@ void inline fastcall __sched mutex_lock(
* 'unlocked' into 'locked' state.
*/
__mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath);
+ inc_mutex_count();
}

EXPORT_SYMBOL(mutex_lock);
@@ -114,6 +125,7 @@ void fastcall __sched mutex_unlock(struc
* into 'unlocked' state:
*/
__mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath);
+ dec_mutex_count();
}

EXPORT_SYMBOL(mutex_unlock);
@@ -274,9 +286,16 @@ __mutex_lock_interruptible_slowpath(atom
*/
int fastcall __sched mutex_lock_interruptible(struct mutex *lock)
{
+ int ret;
+
might_sleep();
- return __mutex_fastpath_lock_retval
+ ret = __mutex_fastpath_lock_retval
(&lock->count, __mutex_lock_interruptible_slowpath);
+
+ if (likely(!ret))
+ inc_mutex_count();
+
+ return ret;
}

EXPORT_SYMBOL(mutex_lock_interruptible);
@@ -331,8 +350,13 @@ static inline int __mutex_trylock_slowpa
*/
int fastcall __sched mutex_trylock(struct mutex *lock)
{
- return __mutex_fastpath_trylock(&lock->count,
+ int ret = __mutex_fastpath_trylock(&lock->count,
__mutex_trylock_slowpath);
+
+ if (likely(ret))
+ inc_mutex_count();
+
+ return ret;
}

EXPORT_SYMBOL(mutex_trylock);
Index: MM-2.6.17-mm6/kernel/fork.c
===================================================================
--- MM-2.6.17-mm6.orig/kernel/fork.c 2006-07-04 14:37:43.000000000 +1000
+++ MM-2.6.17-mm6/kernel/fork.c 2006-07-04 14:38:12.000000000 +1000
@@ -1029,6 +1029,7 @@ static struct task_struct *copy_process(
p->wchar = 0; /* I/O counter: bytes written */
p->syscr = 0; /* I/O counter: read syscalls */
p->syscw = 0; /* I/O counter: write syscalls */
+ p->mutexes_held = 0;
acct_clear_integrals(p);

p->it_virt_expires = cputime_zero;



--
Peter Williams pwil3058@xxxxxxxxxxxxxx

"Learning, n. The kind of ignorance distinguishing the studious."
-- Ambrose Bierce
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/