Re: 2.6.37: Multi-second I/O latency while untarring

From: Andrew Lutomirski
Date: Mon Mar 28 2011 - 23:50:46 EST


On Mon, Feb 14, 2011 at 10:22 AM, Chris Mason <chris.mason@xxxxxxxxxx> wrote:
> Excerpts from Andrew Lutomirski's message of 2011-02-11 19:35:02 -0500:
>> On Fri, Feb 11, 2011 at 10:44 AM, Chris Mason <chris.mason@xxxxxxxxxx> wrote:

>> >
>> > We can tell more if you post the full traces from latencytop.  I have a
>> > patch here for latencytop that adds a -c mode, which dumps the traces
>> > out to a text files.
>> >
>> > http://oss.oracle.com/~mason/latencytop.patch
>> >
>> > Based on what you have here, I think it's probably a latency problem
>> > between btrfs and the dm-crypt stuff.  How easily can setup a test
>> > partition without dm-crypt?
>>
>> Done, on the same physical disk as before.  The latency is just as
>> bad.  On this test, I wrote a total of 3.1G, which is under half of my
>> RAM.  That should rule out lots of VM issues.  latencytop trace below.
>
> Just to confirm, you say on a physical disk you mean without dm-crypt?

Sorry for the exceedingly slow reply.

This problem is really bad with 2.6.38.1. To make it a little easier
to demonstrate, I wrote a tool that shows off the problem.

I made a test btrfs partition on a plain disk partition (same disk as
my dm-crypt but an unencrypted partition). Now clone a kernel tree
there and run make -j8. Wait until the disk starts to write data out
in earnest (takes awhile to dirty enough pages). Watch crap like this
happen (with nr_requests = 2048, scheduler = deadline).

io_latency_watch read <1M file on test partition>

read took 0.000 seconds (worst = 0.963s)
read took 0.000 seconds (worst = 0.963s)
read took 0.022 seconds (worst = 0.963s)
read took 0.000 seconds (worst = 0.963s)
read took 0.028 seconds (worst = 0.963s)
read took 1.430 seconds (worst = 1.430s)
read took 0.270 seconds (worst = 1.430s)
read took 1.237 seconds (worst = 1.430s)
read took 0.282 seconds (worst = 1.430s)
read took 0.131 seconds (worst = 1.430s)

io_latency_watch read <1M file on other partition on same disk> is
similar, and io_latency_test write <dir on other partition> is even
worse.

The cfq scheduler is similar.

--Andy
/* io_latency_test.c
* Copyright (c) 2011 Andy Lutomirski
* Licensed under GPLv2.
*
* Compile with gcc -O2 -std=gnu99 -lrt
*/

#define _FILE_OFFSET_BITS 64
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <stdbool.h>
#include <time.h>
#include <stdint.h>
#include <string.h>
#include <signal.h>
#include <inttypes.h>
#include <fcntl.h>

volatile const char *file_to_unlink;

void handler(int x)
{
if (file_to_unlink)
unlink((char*)file_to_unlink);

_exit(0);
}

void do_read(const char *name)
{
int fd = open(name, O_RDONLY | O_DIRECT);
if (fd < 0) {
perror("open");
exit(1);
}

uint64_t worst = 0;
off_t size = lseek(fd, 0, SEEK_END);
if (size == (off_t)-1) {
perror("lseek");
abort();
}

size -= (size % 4096);

if (size < 4096) {
printf("File is smaller than 4k\n");
exit(1);
}

printf("File size is %" PRIu64 " bytes -- bigger is better\n", (uint64_t)size);

while(true)
{
uint64_t pos = 4096 * (random() % (size / 4096));

struct timespec start;
clock_gettime(CLOCK_MONOTONIC, &start);

unsigned char x[4096];
if (pread(fd, x, 4096, pos) != 4096) {
perror("pread");
abort();
}

struct timespec end;
clock_gettime(CLOCK_MONOTONIC, &end);

uint64_t ns = (end.tv_nsec - start.tv_nsec) + 1000000000ULL * (end.tv_sec - start.tv_sec);

if (ns > worst)
worst = ns;

printf("read took %.3f seconds (worst = %.3fs)\n",
1e-9 * ns, 1e-9 * worst);

if (posix_fadvise(fd, 0, size, POSIX_FADV_DONTNEED) != 0)
perror("posix_fadvise");

usleep(1000000);
}
}

void do_write(const char *dir)
{
char *name;
if (asprintf(&name, "%s/tmp.XXXXXX", dir) == -1)
abort();

int fd = mkstemp(name);
if (fd == -1) {
perror("mkstemp");
abort();
}

file_to_unlink = name;

uint64_t worst = 0;

unsigned char x;
while(true)
{
x++;
struct timespec start;
clock_gettime(CLOCK_MONOTONIC, &start);

if (pwrite(fd, &x, 1, 0) != 1) {
perror("pwrite");
abort();
}

if (fdatasync(fd) != 0) {
perror("fdatasync");
abort();
}

struct timespec end;
clock_gettime(CLOCK_MONOTONIC, &end);

uint64_t ns = (end.tv_nsec - start.tv_nsec) + 1000000000ULL * (end.tv_sec - start.tv_sec);

if (ns > worst)
worst = ns;

printf("write + fsync took %.3f seconds (worst = %.3fs)\n",
1e-9 * ns, 1e-9 * worst);

usleep(1000000);
}
}

int main(int argc, char **argv)
{
if (argc != 3) {
printf("Usage: %s write <dir> or %s read <file>\n", argv[0], argv[0]);
return 1;
}

bool write;
if (!strcmp(argv[1], "write")) {
write = true;
} else if (!strcmp(argv[1], "read")) {
write = false;
} else {
printf("Bad mode\n");
return 1;
}

struct sigaction sa;
sa.sa_handler = handler;
sigemptyset(&sa.sa_mask);
sa.sa_flags = 0;
if (sigaction(SIGINT, &sa, 0) != 0) {
perror("sigaction");
exit(1);
}

if (write)
do_write(argv[2]);
else
do_read(argv[2]);

return 0;
}