mmap/munmap 20ms latencies, 2.6.37

From: Andy Isaacson
Date: Tue Jan 11 2011 - 03:19:36 EST


I'm seeing high latencies (up to 20 ms with no sign of CPU contention)
from mmap/munmap, using the enclosed test program on various Core i7
processors. For example on a "Intel(R) Core(TM) i7 CPU 930@xxxxxxx"
with 12GB RAM running 0c21e3a,

% /tmp/munmapslow -t 6
...
[1294732507.539851] <8636> free(0x7f2608f08000) len=57344 took 4041 us
[1294732507.559841] <8632> free(0x7f260e810000) len=32768 took 8032 us
[1294732507.570254] <8632> malloc(81920) took 2452 us
[1294732507.583834] <8635> malloc(73728) took 8035 us
[1294732507.595840] <8631> malloc(65536) took 4048 us
[1294732507.599838] <8635> malloc(45056) took 8041 us
[1294732507.603834] <8631> malloc(20480) took 4029 us
...
[1294732510.603845] <8634> realloc(0x756170, 23408) took 4035 us
[1294732510.647850] <8634> malloc(73728) took 28047 us
[1294732510.675831] <8631> malloc(53248) took 28028 us
[1294732510.731835] <8631> malloc(81920) took 28035 us
...
[1294732543.685480] <8633> 5244182 heap events, 3739 allocated, 4425 max, 153.57 MiB total
[1294732543.685503] <8633> 78060.59 ms total for 5244182 allocations (15 us per allocation)
[1294732543.685509] <8633> 13779.82 ms total for 5240443 frees (3 us per free)

Most of the time, mmap takes about 15 us and munmap takes less than 5 us
for these small buffers (<20 pages).

Of course if a significant background job fires, a lot of "slow
operations" are seen; but I'm seeing cases where preemption doesn't
appear to be the cause. (I suppose I could parse /proc/self/sched to
confirm this...)

I'll try trace-cmd tomorrow, but I wonder if anyone already knows what's
going on?

(As you may guess, this is an attempt to isolate a problem seen in a
larger system that's not easy to reduce to a simple testcase. We're
seeing a 3x higher system time reported via munin than previously, even
though the CPUs went from 20%-40% idle to 100% pegged and overall
throughput *dropped* somewhat due to being CPU limited. It's not clear
what changed to trigger the increased system time -- but munmap popped
out of strace -c.)

Thanks,
-andy

/*
* Attempt to reproduce munmap slowness seen on
* 2.6.35-23-generic
* Intel(R) Xeon(R) CPU E5504 @ 2.00GHz
* with HT turned on.
*
* strace -c shows that munmap is taking up to 11 ms per call and
* that's where we are spending 95% of our time.
*
* Plan for testcase:
*
* fork N processes.
* In each process, repeatedly allocate and free various buffer sizes.
*/

#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>
#include <errno.h>

#include <sys/time.h>

#include <unistd.h>
#include <sys/wait.h>
#include <sys/mman.h>

#define MAX_TIME (1000 * 1e-6)

typedef unsigned int u32;

static int pgsz;

double rtc(void)
{
struct timeval t;

gettimeofday(&t, 0);
return t.tv_sec + t.tv_usec/1e6;
}

void die(char *fmt, ...) __attribute__((noreturn));

void die(char *fmt, ...)
{
va_list ap;

fprintf(stderr, "[%.6f] <%d> ", rtc(), (int)getpid());
va_start(ap, fmt);
vfprintf(stderr, fmt, ap);
va_end(ap);
exit(1);
}

void msg(char *fmt, ...)
{
va_list ap;

printf("[%.6f] <%d> ", rtc(), (int)getpid());
va_start(ap, fmt);
vfprintf(stdout, fmt, ap);
va_end(ap);
}

/*
* returns a random integer chosen from a uniform distribution on [0, range).
*/
u32 myrand(u32 range)
{
u32 x, mask;
int i, MAX = 1000;

if (range == 1) return 0;

mask = range - 1;
while (mask & (mask - 1))
mask &= (mask - 1);
mask |= (mask - 1);

for (i = 0; i < MAX; i++) {
x = lrand48() & mask;
if (x < range)
return x;
}
die("myrand failed after %d iter, range = %lld mask = 0x%llx x = 0x%llx\n",
i, (long long)range, (long long) mask, (long long)x);
}

static void **heap;
static int num_heap, max_num_heap;
static long long totsz;

static double elapsed_alloc, elapsed_free;
int num_alloc, num_free;

void heap_stats(void)
{
msg("%d heap events, %d allocated, %d max, %.2f MiB total\n",
num_alloc, num_heap, max_num_heap, totsz / 1024. / 1024);
msg("%.2f ms total for %d allocations (%.0f us per allocation)\n",
elapsed_alloc * 1e3, num_alloc, elapsed_alloc * 1e6 / num_alloc);
msg("%.2f ms total for %d frees (%.0f us per free)\n",
elapsed_free * 1e3, num_free, elapsed_free * 1e6 / num_alloc);
}

void *myalloc(size_t sz)
{
double t0, t1, e;
void *p;

t0 = rtc();
p = mmap(0, sz, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
if (p == MAP_FAILED)
die("mmap(%d): %s\n", (int)sz, strerror(errno));
t1 = rtc();

e = t1 - t0;

if (e > MAX_TIME)
msg("malloc(%d) took %.0f us\n", (int)sz, e * 1e6);

elapsed_alloc += e;
num_alloc++;

return p;
}

void myfree(void *p)
{
double t0, t1, e;
int sz = *(int *)p;

t0 = rtc();
munmap(p, sz);
t1 = rtc();

e = t1 - t0;
if (e > MAX_TIME)
msg("free(%p) len=%d took %.0f us\n", p, sz, e * 1e6);
elapsed_free += e;
num_free++;
}

void heap_allocate(int sz)
{
void *n;
void **h2;
double t1, t2, e;

n = myalloc(sz);
if (!n) die("malloc(%d) failed.\n", sz);

*(int *)n = sz;

t1 = rtc();
h2 = realloc(heap, sizeof(*heap) * ++num_heap);
t2 = rtc();
e = t2 - t1;
if (!h2)
die("realloc(%p, %d) failed.\n",
heap, (int)(sizeof(*heap) * num_heap));

if ((t2 - t1) > MAX_TIME)
msg("realloc(%p, %d) took %.0f us\n",
heap, (int)(sizeof(*heap) * num_heap), e * 1e6);

if (num_heap > max_num_heap) max_num_heap = num_heap;
totsz += sz;

h2[num_heap-1] = n;
heap = h2;
}

void heap_free(void)
{
int i;

if (num_heap <= 0)
return;

i = myrand(num_heap--);

totsz -= *(int *)heap[i];

if (heap[i])
myfree(heap[i]);

for (; i < num_heap; i++)
heap[i] = heap[i+1];
}

void send_message(void)
{
}

void receive_message(void)
{
}

void worker(void)
{
long long i;
long long counts[4] = { 0 };
int ITERCOUNT = 10 * 1024 * 1024;

for (i=0; ; i++) {
int action = myrand(4);

if (i < 10000 && action == 1 && myrand(2) == 1) action = 0;
counts[action]++;
switch(action) {
case 0:
heap_allocate(pgsz * (myrand(20) + 1));
break;
case 1:
heap_free();
break;
case 2:
send_message();
break;
case 3:
receive_message();
break;
}
if (i % ITERCOUNT == ITERCOUNT - 1) {
heap_stats();
}
}
}

void usage(const char *cmd)
{
die("Usage: %s [-t numthreads]\n", cmd);
}

int main(int argc, char **argv)
{
int c, i;
int nproc = sysconf(_SC_NPROCESSORS_ONLN);

pgsz = getpagesize();

while ((c = getopt(argc, argv, "t:")) != EOF) {
switch (c) {
case 't':
nproc = strtol(optarg, 0, 0);
break;
default:
usage(argv[0]);
}
}

printf("forking %d children\n", nproc);
for (i=0; i < nproc; i++) {
switch(fork()) {
case -1:
die("fork: %s\n", strerror(errno));
case 0: /* child */
worker();
exit(0);
default: /* parent */
/* nothing */
break;
}
}

for (i=0; i < nproc; i++) {
int x, p;

p = wait(&x);
msg("child %d exited\n", p);
}

return 0;
}
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/