Re: [tip:sched/core] sched: Lower chances of cputime scaling overflow

From: Stanislaw Gruszka
Date: Sat Apr 13 2013 - 10:49:05 EST


On Fri, Apr 12, 2013 at 09:55:56AM +0200, Peter Zijlstra wrote:
> > The above is totally untested, but each step is pretty damn simple and
> > fairly cheap. Sure, it's a loop, but it's bounded to 32 (cheap)
> > iterations, and the normal case is that it's not done at all, or done
> > only a few times.
>
> Right it gets gradually heavier the bigger the numbers get; which is
> more and more unlikely.
>
> > And the advantage is that the end result is always that simple
> > 32x32/32 case that we started out with as the common case.
> >
> > I dunno. Maybe I'm overlooking something, and the above is horrible,
> > but the above seems reasonably efficient if not optimal, and
> > *understandable*.
>
> I suppose that entirely matters on what one is used to ;-) I had to
> stare rather hard at it for a little while.
>
> But yes, you take it one step further and are willing to ditch rtime
> bits too and I suppose that's fine.
>
> Should work,.. Stanislaw could you stick this into your userspace
> thingy and verify the numbers are sane enough?

It works fine - gives relative error less than 0.1% for very big
numbers.

For the record I'm attaching test program and script.

Thanks
Stanislaw

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <strings.h>
#include <stdint.h>

typedef uint64_t u64;
typedef uint32_t u32;

static u64 div_u64_u32(u64 a, u32 b)
{
return a / b;
}

static u64 scale_stime(u64 stime, u64 rtime, u64 total)
{

/* We know one of the values has a bit set in the high 32 bits */
for (;;) {
/* Make sure "rtime" is the bigger of stime/rtime */
if (stime > rtime) {
u64 tmp = rtime; rtime = stime; stime = tmp;
}

/* Do we need to balance stime/rtime bits? */
if (rtime >> 32) {
if (stime >> 31)
goto drop_precision;

/* We can grow rtime and shrink stime and try to make them both fit */
stime <<= 1;
rtime >>= 1;
continue;
}

/* stime/rtime fits in 32 bits, how about total? */
if (!(total >> 32))
break;

drop_precision:
/* We drop from rtime, it has more bits than stime */
rtime >>= 1;
total >>= 1;
}

if (!total)
return stime;

/* Make sure gcc understands that this is a 32x32->64 multiply,
* followed by a 64/32->64 divide */
return div_u64_u32((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
}

int main(int argc, char *argv[])
{
u64 rtime, total, stime, scaled;

if (argc != 4)
return;

rtime = strtoll(argv[1], NULL, 10);
total = strtoll(argv[2], NULL, 10);
stime = strtoll(argv[3], NULL, 10);

assert (total >= stime);

scaled = scale_stime(stime, rtime, total);
printf("%llu\n", scaled);

return 0;
}
#!/usr/bin/python

import subprocess
import random
import math

def kernel_scale (rtime, total, stime):
p = subprocess.Popen("./scale_stime5 " + str(rtime) + " " + str(total) + " " + str(stime) , shell=True, stdout=subprocess.PIPE)
return int(p.stdout.read())

def python_scale (rtime, total, stime):
return (stime * rtime) / total

max_rtime = 10*4096*364*24*60*60*1000; # 10 years for 4096 threads

fail=False
K=10000
for i in range(0, K):
rtime = random.randrange(max_rtime)
total = int(random.uniform(0.1, 1.9) * rtime)

for n in range(1, 100):
stime = (n * total / 100)
r1 = kernel_scale(rtime, total, stime)
r2 = python_scale(rtime, total, stime)
if (float(abs(r1 - r2)) / float(r2)) > 0.001:
print "FAIL!"
print "rtime: " + str(rtime)
print "total: " + str(total)
print "stime: " + str(stime)
print "kernel: " + str(r1)
print "python: " + str(r2)

fail=True
break
if fail:
break;
if (i % 100) == 99:
print str(i/100) + "/" + str(K/100) + " OK"