2.6.35-rc3: System unresponsive under load
From: Manfred Spraul
Date: Sat Jun 26 2010 - 11:46:31 EST
Hi Luca,
On 06/26/2010 02:52 PM, Luca Tettamanti wrote:
They don't seem really hung as before, I see two different behaviours:
* Near the end of the run ab is frozen for a few seconds, but in the
end all requests are processed; however I see a few "length" errors,
meaning that the received page does not match the expected content
(I'm testing a static page):
That's consistent with what I see:
If I run:
#./semtimedop 100 100&
#./semtimedop 100 100&
#./semtimedop 100 100&
#./semtimedop 100 100&
(i.e.: 4 times the attached test app concurrently), then the system
sometimes locks up for 10..20 seconds:
The keyboard is unresponsive, not even the numlock key is processed
(i.e.: the LED does not change anymore).
After 10 or 20 seconds, the keyboard reacts again (both to <enter> and
to Num Lock)
The stock Fedora 13 kernel (2.6.33.5) does not exhibit this behavior
The load average is 300 or so, that's expected.
I have no idea why this happens, or how to debug it.
# CONFIG_PREEMPT_NONE is not set
CONFIG_PREEMPT_VOLUNTARY=y
# CONFIG_PREEMPT is not set
strace on apache shows:
[pid 3787] restart_syscall(<... resuming interrupted call ...> <unfinished ...>
[pid 3789] restart_syscall(<... resuming interrupted call ...> <unfinished ...>
[pid 3788] restart_syscall(<... resuming interrupted call ...> <unfinished ...>
[pid 3784] restart_syscall(<... resuming interrupted call ...> <unfinished ...>
[pid 3783] restart_syscall(<... resuming interrupted call ...> <unfinished ...>
[pid 3782] restart_syscall(<... resuming interrupted call ...> <unfinished ...>
[pid 3239] restart_syscall(<... resuming interrupted call ...> <unfinished ...>
[pid 3233] restart_syscall(<... resuming interrupted call ...> <unfinished ...>
[pid 3238] restart_syscall(<... resuming interrupted call ...> <unfinished ...>
[pid 3237] restart_syscall(<... resuming interrupted call ...>
That can't be semop:
SysV semaphores and message queues are among the (broken) parts of the
kernel that do not honor SA_RESTART.
--
Manfred
/*
* semscale.cpp - sysv scaling test
*
* Copyright (C) 1999, 2001, 2005, 2008, 2010 by Manfred Spraul.
* All rights reserved except the rights granted by the GPL.
*
* Redistribution of this file is permitted under the terms of the GNU
* General Public License (GPL) version 2 or later.
* $Header$
*/
#ifndef _GNU_SOURCE
#define _GNU_SOURCE /* glibc declares semtimedop() only with _GNU_SOURCE */
#endif
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <getopt.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/sem.h>
#include <pthread.h>
#ifdef __sun
#include <sys/pset.h> /* P_PID, processor_bind() */
#endif
/* One millisecond, expressed in the microsecond units that usleep() takes. */
#define DELAY_1MS (1000)
/*
 * Test phase, written by the control thread in do_psem() and polled by
 * every worker thread:
 *   WAITING - workers spin until the control thread releases them,
 *   RUNNING - workers repeatedly call semtimedop(),
 *   STOPPED - workers leave their main loop.
 */
static enum {
WAITING,
RUNNING,
STOPPED,
} volatile g_state = WAITING;
/* Identifier of the SysV semaphore array created by semget() in do_psem(). */
int g_svsem_id;
/* Heap-allocated array of worker thread handles, one entry per thread. */
pthread_t *g_threads;
/*
 * worker_thread - body of every test thread.
 *
 * Spins until the control thread switches g_state away from WAITING, then
 * repeatedly performs a wait-for-zero operation on semaphore 0 of
 * g_svsem_id with a 1 ms timeout for as long as g_state is RUNNING.
 *
 * @arg: unused.
 *
 * Returns NULL.  Any unexpected semtimedop() failure terminates the whole
 * process with exit status 1.
 */
void* worker_thread(void *arg)
{
	(void)arg;

	/* Busy-wait so that all workers start hammering the array together. */
	while (g_state == WAITING) {
#ifdef __GNUC__
#if defined(__i386__) || defined (__x86_64__)
		__asm__ __volatile__("pause": : :"memory");
#else
		__asm__ __volatile__("": : :"memory");
#endif
#endif
	}

	while (g_state == RUNNING) {
		struct sembuf sop[1];
		struct timespec t;
		int ret;
		int err;

		/* 1) check if the semaphore value is 0 */
		sop[0].sem_num = 0;
		sop[0].sem_op = 0;
		sop[0].sem_flg = 0;

		t.tv_sec = 0;
		t.tv_nsec = 1*1000*1000;

		ret = semtimedop(g_svsem_id, sop, 1, &t);
		if (ret == 0)
			continue;

		/* Save errno before any library call can overwrite it. */
		err = errno;
		if (ret == -1) {
			/* Timeout expired, or the sleep was interrupted: retry. */
			if (err == EAGAIN || err == EINTR)
				continue;
			/* The array was removed while we were sleeping on it. */
			if (err == EIDRM)
				break;
			/*
			 * The array was removed between our g_state check and
			 * the system call: the id is simply no longer valid.
			 * Only accept this once the test is shutting down.
			 */
			if (err == EINVAL && g_state != RUNNING)
				break;
		}

		printf("main semop failed, ret %d errno %d.\n", ret, err);
		fflush(stdout);
		exit(1);
	}

	return NULL;
}
/*
 * init_thread - start worker number i.
 *
 * Creates a thread running worker_thread() with default attributes and
 * stores its handle in slot i of g_threads.  If the thread cannot be
 * created, the error code is printed and the process exits with status 1.
 */
void init_thread(int i)
{
	const int err = pthread_create(g_threads + i, NULL, worker_thread, NULL);

	if (err == 0)
		return;

	printf(" pthread_create failed with error code %d\n", err);
	exit(1);
}
//////////////////////////////////////////////////////////////////////////////
/*
 * Each control round sleeps for 2 ms, so this many rounds take roughly
 * one second.
 */
enum { CONTROL_ROUNDS_PER_SEC = 500 };

/*
 * Remove the semaphore array g_svsem_id.  A failure is reported but is
 * not fatal.
 */
static void remove_sem_array(void)
{
	/* The semaphore number is ignored for IPC_RMID. */
	if (semctl(g_svsem_id, 0, IPC_RMID) < 0) {
		printf("semctl(IPC_RMID) failed for %d, errno %d.\n",
				g_svsem_id, errno);
	}
}

/*
 * Apply one operation to semaphore 0 on behalf of the control thread.
 * On failure the error is printed, the semaphore array is removed so it
 * does not outlive the process, and the process exits with status 2.
 */
static void control_semop(short op)
{
	struct sembuf sop[1];
	int res;

	sop[0].sem_num = 0;
	sop[0].sem_op = op;
	sop[0].sem_flg = 0;

	res = semop(g_svsem_id, sop, 1);
	if (res != 0) {
		printf("control semop(,%d,) failed, ret %d errno %d.\n",
				(int)op, res, errno);
		g_state = STOPPED;
		remove_sem_array();
		exit(2);
	}
}

/*
 * do_psem - run the semtimedop() stress test.
 *
 * @threads: number of worker threads to start; also the number of
 *           semaphores in the array that is created.
 * @secs:    approximate run time in seconds.
 *
 * Creates a private semaphore array, starts the workers, then toggles
 * semaphore 0 between 1 and 0 every 2 ms so that the workers alternate
 * between sleeping (until their 1 ms timeout) and succeeding immediately.
 * Finally stops the workers, removes the array and joins all threads.
 *
 * Exits with status 1 if setup fails and with status 2 if a control
 * semop() fails.
 */
static void do_psem(int threads, int secs)
{
	/* Computed in 64 bits: secs * 500 can overflow an int. */
	const long long rounds = (long long)secs * CONTROL_ROUNDS_PER_SEC;
	long long r;
	int res;
	int i;

	g_state = WAITING;

	/*
	 * Create the array first: an unusable thread count is rejected here
	 * before it is used as an allocation size.
	 */
	g_svsem_id = semget(IPC_PRIVATE, threads, 0777|IPC_CREAT);
	if (g_svsem_id == -1) {
		printf("sem array create failed.\n");
		exit(1);
	}

	/* The cast keeps the file compilable as C++ as well. */
	g_threads = (pthread_t*)malloc(sizeof(*g_threads) * (size_t)threads);
	if (g_threads == NULL) {
		printf("thread array allocation failed.\n");
		remove_sem_array();
		exit(1);
	}

	for (i = 0; i < threads; i++)
		init_thread(i);

	/* Give the workers a moment to reach their start barrier. */
	usleep(DELAY_1MS);
	g_state = RUNNING;

	for (r = 0; r < rounds; r++) {
		/* Non-zero value: wait-for-zero callers sleep until timeout. */
		control_semop(1);
		usleep(2*DELAY_1MS);
		/* Back to zero: wait-for-zero callers succeed immediately. */
		control_semop(-1);
	}

	g_state = STOPPED;
	usleep(DELAY_1MS);

	/* Removing the array wakes any worker still sleeping on it. */
	remove_sem_array();

	for (i = 0; i < threads; i++) {
		res = pthread_join(g_threads[i], NULL);
		if (res != 0)
			printf("pthread_join failed with error code %d\n", res);
	}

	free(g_threads);
	g_threads = NULL;
}
//////////////////////////////////////////////////////////////////////////////
/*
 * Parse a decimal command-line argument into an int.
 *
 * @text: the argument string.
 * @min:  smallest accepted value.
 * @out:  receives the parsed value on success.
 *
 * Returns 0 on success, -1 if the text is not a complete decimal number
 * or the value lies outside [min, INT_MAX].
 */
static int parse_int_arg(const char *text, int min, int *out)
{
	char *end;
	long value;

	errno = 0;
	value = strtol(text, &end, 10);
	if (end == text || *end != '\0' || errno == ERANGE)
		return -1;
	if (value < min || value > INT_MAX)
		return -1;

	*out = (int)value;
	return 0;
}

/*
 * Usage: semtimedop [timeout] [threads]
 *
 * Runs the semtimedop() stress test with the given number of worker
 * threads for roughly the given number of seconds.  Returns 0 on
 * success and 1 when the command line is invalid.
 */
int main(int argc, char **argv)
{
	int timeout, threads;

	printf("semtimedop [timeout] [threads]\n");

	if (argc != 3 ||
	    parse_int_arg(argv[1], 0, &timeout) != 0 ||
	    parse_int_arg(argv[2], 1, &threads) != 0) {
		printf(" Invalid parameters.\n");
		return 1;
	}

	printf("Running %d threads for %d seconds...\n", threads, timeout);
	do_psem(threads, timeout);

	return 0;
}