Re: [PATCH V5 1/6] Sched: Scheduler time slice extension
From: Steven Rostedt
Date: Mon Jun 09 2025 - 17:51:38 EST
On Mon, 9 Jun 2025 16:55:32 -0400
Steven Rostedt <rostedt@xxxxxxxxxxx> wrote:
> So I applied your patches and fixed up my "extend-sched.c" program to use
> your method. I booted on bare-metal PREEMPT_RT and ran:
In case anyone else wants to play, I'm attaching the source of extend-sched.c.
I ran it with: sleep 5; ./extend-sched
Then I switched over to cyclictest, counted to five, and it was pretty
noticeable when it triggered.
To build, simply do:
$ cd linux.git
$ mkdir /tmp/extend
$ cp tools/testing/selftests/rseq/rseq-abi.h /tmp/extend
$ cd /tmp/extend
[ download extend-sched.c here ]
$ gcc extend-sched.c -o extend-sched
-- Steve
// Run with: GLIBC_TUNABLES=glibc.pthread.rseq=0
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <stdbool.h>
#include <pthread.h>
#include <time.h>
#include <unistd.h>
#include <sys/time.h>
#ifdef ENABLE_TRACEFS
#include <tracefs.h>
#else
/*
 * No-op stand-ins for the two tracefs calls this program makes, used
 * when it is built without ENABLE_TRACEFS.
 */
static inline void tracefs_printf(void *inst, const char *fmt, ...)
{
	/* Tracing is compiled out: the message is discarded. */
	(void)inst;
	(void)fmt;
}

static inline void tracefs_print_init(void *inst)
{
	/* Tracing is compiled out: nothing to set up. */
	(void)inst;
}
#endif
#include <sys/rseq.h>
#include "rseq-abi.h"
/* Set by -d: extend(), unextend() and init_extend_map() return without touching rseq. */
static bool no_rseq;
/* Set by -w: extend() is called once before the cmpxchg retry loop instead of before every attempt. */
static bool extend_wait;
/* In case we want to play with priorities */
/* Values handed to nice() by the busy threads and the lock threads respectively. */
static int busy_prio = 0;
static int lock_prio = 0;
/* Number of wmb() iterations executed while the lock is held. */
static int loop_spin = 15000;
//#define barrier() asm volatile ("" ::: "memory")
/* x86 fence instructions; the "memory" clobber also stops the compiler from caching memory across them. */
#define rmb() asm volatile ("lfence" ::: "memory")
#define wmb() asm volatile ("sfence" ::: "memory")
/* Busy threads created per CPU, in addition to that CPU's lock-grabbing thread. */
#define NR_BUSY_THREADS 5
/* Start barrier waited on by every run_thread() and by main(). */
static pthread_barrier_t pbarrier;
/* Per-thread pointer to the thread's rseq area; assigned in init_extend_map(). */
static __thread struct rseq_abi *rseq_map;
/*
 * Point the calling thread's rseq_map at the address __rseq_offset bytes
 * from the thread pointer. Does nothing when rseq use is disabled
 * (no_rseq).
 */
static void init_extend_map(void)
{
	char *thread_base;

	if (!no_rseq) {
		thread_base = __builtin_thread_pointer();
		rseq_map = (void *)(thread_base + __rseq_offset);
	}
}
struct data;
/*
 * Per lock-grabbing-thread statistics, filled in by grab_lock().
 * All times are in microseconds (the unit returned by get_time()).
 */
struct thread_data {
/* Number of completed lock/unlock cycles. */
unsigned long long x_count;
/* Sum, max and min of (time after unlock - time of the winning cmpxchg attempt). */
unsigned long long total;
unsigned long long max;
unsigned long long min;
/* Sum, max and min of (time the lock was obtained - time grab_lock() was entered). */
unsigned long long total_wait;
unsigned long long max_wait;
unsigned long long min_wait;
/* Number of grab_lock() calls that saw the lock held by someone else. */
unsigned long long contention;
/* Number of times unextend() returned 1. */
unsigned long long extended;
/* Back pointer to the shared state. */
struct data *data;
/* Index of the CPU this thread was created on. */
int cpu;
};
/* State shared by all threads. */
struct data {
/* Counter incremented once per lock acquisition, while the lock is held. */
unsigned long long x;
/* Lock word: 0 when free, 1 when held (changed only through cmpxchg()). */
unsigned long lock;
/* Array of per-lock-thread statistics, one entry per CPU. */
struct thread_data *tdata;
/* Set by main() to tell every thread to stop. */
bool done;
};
/*
 * Atomically compare *ptr with @old and, if they are equal, store @new.
 *
 * Returns the value *ptr held before the operation: @old when the swap
 * took place, the conflicting value otherwise.
 *
 * The compiler builtin operates on the full width of the unsigned long.
 * The previous hand-written asm used a byte-sized operand, so only the
 * low 8 bits were compared and stored. The sequentially consistent
 * ordering also acts as a compiler barrier, as the asm's "memory"
 * clobber did.
 */
static inline unsigned long
cmpxchg(volatile unsigned long *ptr, unsigned long old, unsigned long new)
{
	unsigned long prev = old;

	/* On failure the builtin writes the current value of *ptr into prev. */
	__atomic_compare_exchange_n(ptr, &prev, new, 0,
				    __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
	return prev;
}
/*
 * Set bit 3 in the calling thread's rseq flags word; a no-op when rseq
 * use is disabled.
 *
 * NOTE(review): bit 3 is presumably the "delay resched" request of the
 * time slice extension series - confirm against rseq-abi.h.
 */
static void extend(void)
{
	const int req_bit = 1 << 3;

	if (!no_rseq)
		rseq_map->flags = rseq_map->flags | req_bit;
}
/*
 * Clear bits 3 and 4 in the calling thread's rseq flags word.
 *
 * Returns 1 after calling sched_yield() if bit 4 was set before the
 * clear, 0 otherwise (including when rseq use is disabled).
 *
 * NOTE(review): bit 4 is presumably set by the kernel when it granted
 * extra time - confirm against rseq-abi.h.
 */
static int unextend(void)
{
	const int req_bit = 1 << 3;
	const int yield_bit = 1 << 4;
	int seen;

	if (no_rseq)
		return 0;

	/* Snapshot first: the test below needs the value from before the clear. */
	seen = rseq_map->flags;
	rseq_map->flags &= ~(req_bit | yield_bit);

	if (seen & yield_bit) {
		tracefs_printf(NULL, "Yield!\n");
		sched_yield();
		return 1;
	}

	return 0;
}
/*
 * Seconds <-> microseconds conversions, done in unsigned long long.
 * usec2sec() truncates. Arguments are parenthesized so that expressions
 * such as sec2usec(a + b) convert the whole sum.
 */
#define sec2usec(sec) ((sec) * 1000000ULL)
#define usec2sec(usec) ((usec) / 1000000ULL)
/*
 * Return the current time in microseconds.
 *
 * Every caller only subtracts two readings, so the monotonic clock is
 * used: unlike the wall clock it cannot step backwards and make an
 * unsigned difference wrap around. Exits the program if the clock cannot
 * be read.
 */
static unsigned long long get_time(void)
{
	struct timespec ts;

	if (clock_gettime(CLOCK_MONOTONIC, &ts) < 0) {
		perror("clock_gettime");
		exit(-1);
	}

	return (unsigned long long)ts.tv_sec * 1000000ULL +
	       (unsigned long long)ts.tv_nsec / 1000ULL;
}
/*
 * Sleep for @usecs microseconds.
 *
 * The value is split into whole seconds and a nanosecond remainder so
 * that tv_nsec stays below one second (nanosleep() rejects anything
 * larger) and the multiplication cannot overflow. The sleep is not
 * restarted if a signal interrupts it.
 */
static void do_sleep(unsigned usecs)
{
	struct timespec ts;

	ts.tv_sec = usecs / 1000000;
	ts.tv_nsec = (long)(usecs % 1000000) * 1000L;
	nanosleep(&ts, NULL);
}
/*
 * Take data->lock once, increment the shared counter, spin for loop_spin
 * fence iterations while holding the lock, release it, and record the
 * wait and hold times (microseconds, from get_time()) in @tdata.
 *
 * Returns early, without recording wait/hold statistics, once data->done
 * is seen set.
 */
static void grab_lock(struct thread_data *tdata, struct data *data)
{
unsigned long long start_wait, start, end, delta;
unsigned long long end_wait;
unsigned long prev;
bool contention = false;
start_wait = get_time();
rmb();
/* Read-only spin until the lock looks free or we are told to stop. */
while (data->lock && !data->done) {
contention = true;
rmb();
}
tracefs_printf(NULL, "Grab lock\n");
/* With -w the extension is requested once, before the retry loop. */
if (extend_wait)
extend();
do {
/* Without -w the request is made again right before every attempt. */
if (!extend_wait)
extend();
/* Re-sampled on every attempt, so the hold time starts at the attempt that wins. */
start = get_time();
prev = cmpxchg(&data->lock, 0, 1);
if (prev) {
/* Lost the race: drop the request (unless -w) and wait for the lock to look free again. */
contention = true;
if (!extend_wait && unextend())
tdata->extended++;
while (data->lock && !data->done)
rmb();
}
} while (prev && !data->done);
if (contention)
tdata->contention++;
/*
 * NOTE(review): this return happens after extend() and possibly after a
 * successful cmpxchg, without a matching unextend() or unlock. It looks
 * harmless because every waiter also stops on data->done - confirm.
 */
if (data->done)
return;
end_wait = get_time();
tracefs_printf(NULL, "Have lock!\n");
delta = end_wait - start_wait;
/* A zero running total is treated as "no sample yet" and seeds max/min. */
if (!tdata->total_wait || tdata->max_wait < delta)
tdata->max_wait = delta;
if (!tdata->total_wait || tdata->min_wait > delta)
tdata->min_wait = delta;
tdata->total_wait += delta;
/* Shared counter, updated only while the lock is held. */
data->x++;
/* Sanity check: the lock word must still read as held. */
if (data->lock != 1) {
printf("Failed locking\n");
exit(-1);
}
/* Keep the lock held for loop_spin fence iterations. */
for (int i = 0; i < loop_spin; i++)
wmb();
/* Release; prev is checked further down to verify the lock word was 1. */
prev = cmpxchg(&data->lock, 1, 0);
end = get_time();
tracefs_printf(NULL, "released lock!\n");
/* Clear the request bits; unextend() returns 1 (after yielding) if bit 4 had been set. */
if (unextend())
tdata->extended++;
if (prev != 1) {
printf("Failed unlocking\n");
exit(-1);
}
/* Hold time: from the winning cmpxchg attempt to just after the release. */
delta = end - start;
if (!tdata->total || tdata->max < delta) {
tracefs_printf(NULL, "New max: %lld\n", delta);
tdata->max = delta;
}
if (!tdata->total || tdata->min > delta)
tdata->min = delta;
tdata->total += delta;
tdata->x_count++;
}
/*
 * Thread body for the busy threads: until the shared done flag is set,
 * alternate between a burst of 100 wmb() calls and a 10us sleep.
 *
 * @d: pointer to the shared struct data.
 * Always returns NULL.
 */
static void *busy_thread(void *d)
{
	struct data *shared = d;
	int spins;

	nice(busy_prio);

	while (!shared->done) {
		spins = 0;
		while (spins < 100) {
			wmb();
			spins++;
		}

		do_sleep(10);
		/* Fence with a "memory" clobber, so done is re-read from memory. */
		rmb();
	}

	return NULL;
}
/*
 * Thread body for the lock-grabbing threads: set up the rseq pointer,
 * wait at the start barrier, then repeatedly take the lock and sleep
 * until the shared done flag is set.
 *
 * @d: pointer to this thread's struct thread_data.
 * Always returns NULL.
 */
static void *run_thread(void *d)
{
	struct thread_data *self = d;
	struct data *shared = self->data;

	init_extend_map();
	nice(lock_prio);

	/* Hold here until main() and all the other lock threads have arrived. */
	pthread_barrier_wait(&pbarrier);

	for (;;) {
		if (shared->done)
			break;

		grab_lock(self, shared);

		/* Make slightly different waits: 100us + cpu * 27us */
		do_sleep(100 + self->cpu * 27);
		rmb();
	}

	return NULL;
}
/*
 * Parse the options, start one lock-grabbing thread and NR_BUSY_THREADS
 * busy threads pinned to every configured CPU, let them run for five
 * seconds, then stop them and print the collected statistics.
 *
 * Options:
 *   -d  disable rseq
 *   -w  extend while trying to get the lock
 *   -v  verbose (per-thread) output
 *
 * Returns 0 on success; any setup failure exits with -1.
 */
int main (int argc, char **argv)
{
	unsigned long long total_wait = 0;
	unsigned long long total_held = 0;
	unsigned long long total_contention = 0;
	unsigned long long total_extended = 0;
	unsigned long long max_wait = 0;
	unsigned long long max = 0;
	unsigned long long secs;
	unsigned long long avg_wait;
	unsigned long long avg_secs;
	unsigned long long avg_held;
	unsigned long long avg_held_secs;
	unsigned long long total_count = 0;
	bool verbose = false;
	pthread_t *threads;
	cpu_set_t *save_affinity;
	cpu_set_t *set_affinity;
	size_t cpu_size;
	struct data data;
	int cpus;
	int ret;
	int ch;
	int i;

	while ((ch = getopt(argc, argv, "dwv")) >= 0) {
		switch (ch) {
		case 'd':
			no_rseq = true;
			break;
		case 'w':
			extend_wait = true;
			break;
		case 'v':
			verbose = true;
			break;
		default:
			fprintf(stderr, "usage: extend-sched [-d|-w|-v]\n"
				" -d: disable rseq\n"
				" -w: extend while trying to get lock\n"
				" -v: verbose output\n");
			exit(-1);
		}
	}

	memset(&data, 0, sizeof(data));

	cpus = sysconf(_SC_NPROCESSORS_CONF);
	if (cpus <= 0) {
		fprintf(stderr, "Could not determine the number of CPUs\n");
		exit(-1);
	}

	cpu_size = CPU_ALLOC_SIZE(cpus);
	save_affinity = CPU_ALLOC(cpus);
	set_affinity = CPU_ALLOC(cpus);
	if (!save_affinity || !set_affinity) {
		perror("Allocating CPU sets");
		exit(-1);
	}

	/* Save current affinity so that it can be restored after the threads are created */
	if (sched_getaffinity(0, cpu_size, save_affinity) < 0) {
		perror("Getting affinity");
		exit(-1);
	}

	/*
	 * For every CPU: one thread grabbing the lock plus NR_BUSY_THREADS
	 * busy tasks. threads[i] is the lock thread of CPU i and
	 * threads[i + cpus * n] is its n-th busy thread (n starts at 1).
	 */
	threads = calloc(cpus * (NR_BUSY_THREADS + 1), sizeof(*threads));
	if (!threads) {
		perror("threads");
		exit(-1);
	}

	/* Allocate the data for the lock grabbers */
	data.tdata = calloc(cpus, sizeof(*data.tdata));
	if (!data.tdata) {
		perror("Allocating tdata");
		exit(-1);
	}

	tracefs_print_init(NULL);

	/* All the lock threads plus main() meet at the barrier */
	ret = pthread_barrier_init(&pbarrier, NULL, cpus + 1);
	if (ret) {
		/* pthread functions return the error number; errno is not set */
		fprintf(stderr, "initializing barrier: %s\n", strerror(ret));
		exit(-1);
	}

	for (i = 0; i < cpus; i++) {
		/* Set the affinity to this CPU as threads will inherit it */
		CPU_ZERO_S(cpu_size, set_affinity);
		CPU_SET_S(i, cpu_size, set_affinity);
		if (sched_setaffinity(0, cpu_size, set_affinity) < 0) {
			perror("Setting affinity");
			fprintf(stderr, " Setting cpu %d\n", i);
			exit(-1);
		}

		data.tdata[i].data = &data;
		data.tdata[i].cpu = i;

		/* pthread_create() returns a positive error number on failure */
		ret = pthread_create(&threads[i], NULL, run_thread, &data.tdata[i]);
		if (ret) {
			fprintf(stderr, "creating lock threads: %s\n", strerror(ret));
			exit(-1);
		}

		for (int n = 1; n <= NR_BUSY_THREADS; n++) {
			ret = pthread_create(&threads[i + cpus * n], NULL, busy_thread, &data);
			if (ret) {
				fprintf(stderr, "creating busy threads: %s\n", strerror(ret));
				exit(-1);
			}
		}
	}

	if (sched_setaffinity(0, cpu_size, save_affinity) < 0) {
		perror("Setting saved affinity");
		exit(-1);
	}

	/* Release the lock threads and let everything run for five seconds */
	pthread_barrier_wait(&pbarrier);
	sleep(5);

	printf("Finish up\n");
	data.done = true;
	wmb();

	for (i = 0; i < cpus; i++) {
		for (int n = 1; n <= NR_BUSY_THREADS; n++)
			pthread_join(threads[i + cpus * n], NULL);
	}

	for (i = 0; i < cpus; i++) {
		pthread_join(threads[i], NULL);
		if (verbose) {
			printf("thread %i:\n", i);
			printf(" count:\t%llu\n", data.tdata[i].x_count);
			printf(" total:\t%llu\n", data.tdata[i].total);
			printf(" max:\t%llu\n", data.tdata[i].max);
			printf(" min:\t%llu\n", data.tdata[i].min);
			printf(" total wait:\t%llu\n", data.tdata[i].total_wait);
			printf(" max wait:\t%llu\n", data.tdata[i].max_wait);
			printf(" min wait:\t%llu\n", data.tdata[i].min_wait);
			printf(" contention:\t%llu\n", data.tdata[i].contention);
			printf(" extended:\t%llu\n", data.tdata[i].extended);
		}
		total_count += data.tdata[i].x_count;
		total_wait += data.tdata[i].total_wait;
		total_contention += data.tdata[i].contention;
		total_held += data.tdata[i].total;
		total_extended += data.tdata[i].extended;
		if (data.tdata[i].max_wait > max_wait)
			max_wait = data.tdata[i].max_wait;
		if (data.tdata[i].max > max)
			max = data.tdata[i].max;
	}

	/* Times are in microseconds; split them into seconds and a fraction for printing */
	secs = usec2sec(total_wait);
	avg_wait = total_count ? total_wait / total_count : 0;
	avg_secs = usec2sec(avg_wait);
	avg_held = total_count ? total_held / total_count : 0;
	avg_held_secs = usec2sec(avg_held);

	printf("Ran for %llu times\n", data.x);
	printf("Total wait time: %llu.%06llu (avg: %llu.%06llu)\n", secs, total_wait - sec2usec(secs),
	       avg_secs, avg_wait - sec2usec(avg_secs));
	printf("Total contetion: %llu\n", total_contention);
	printf("Total extended: %llu\n", total_extended);
	printf(" max wait: %llu\n", max_wait);
	printf(" max: %llu (avg: %llu.%06llu)\n", max, avg_held_secs, avg_held - sec2usec(avg_held_secs));

	pthread_barrier_destroy(&pbarrier);
	free(data.tdata);
	free(threads);
	CPU_FREE(set_affinity);
	CPU_FREE(save_affinity);

	return 0;
}