memory management in 3.7

From: Nikolay S.
Date: Mon Jan 21 2013 - 07:11:38 EST



Hello there,

I have recently upgraded from 3.2 to 3.7.3, and I am seeing that the
behavior of kswapd is strange, to say the least.

The machine is core2duo e7200 with 4G RAM, running 3.7.3 kernel. It has
compaction and THP (always) enabled.

The machine is serving files over the network, so it is constantly under
memory pressure from page cache. The network is slow, and average disk
read rate is between 2 and 8 megabytes per second.

In normal state, when page cache is filled, the free memory (according
to free and vmstat) is fluctuating between 100 and 150 megabytes, with
kswapd stepping in at 100M, quickly freeing to 150M and going to sleep
again.

On 3.7.3, several hours after the page cache has filled up, kswapd enters
a permanent D state, with free memory staying around 150M (the high
watermark, I presume?). I have captured diffs of /proc/vmstat:

$ ./diffshow 5
----8<----
nr_free_pages: 38327 -> 38467 (140)
nr_active_anon: 110014 -> 110056 (42)
nr_inactive_file: 526153 -> 526297 (144)
nr_active_file: 98802 -> 98864 (62)
nr_anon_pages: 103475 -> 103512 (37)
nr_file_pages: 627957 -> 628160 (203)
nr_dirty: 15 -> 17 (2)
nr_page_table_pages: 2142 -> 2146 (4)
nr_kernel_stack: 251 -> 253 (2)
nr_dirtied: 1169312 -> 1169317 (5)
nr_written: 1211979 -> 1211982 (3)
nr_dirty_threshold: 159540 -> 159617 (77)
nr_dirty_background_threshold: 79770 -> 79808 (38)
pgpgin: 564650577 -> 564673241 (22664)
pgpgout: 5117612 -> 5117668 (56)
pgalloc_dma32: 105487556 -> 105491067 (3511)
pgalloc_normal: 84026173 -> 84029309 (3136)
pgfree: 190134573 -> 190141394 (6821)
pgactivate: 2750244 -> 2750283 (39)
pgfault: 67214984 -> 67216222 (1238)
pgsteal_kswapd_dma32: 45793109 -> 45795077 (1968)
pgsteal_kswapd_normal: 61391466 -> 61394464 (2998)
pgscan_kswapd_dma32: 45812628 -> 45814596 (1968)
pgscan_kswapd_normal: 61465283 -> 61468281 (2998)
slabs_scanned: 30783104 -> 30786432 (3328)
pageoutrun: 2936967 -> 2937033 (66)

vmstat:
procs -----------memory---------- ---swap-- -----io---- -system-- ----cpu----
r b swpd free buff cache si so bi bo in cs us sy id wa
1 1 296924 153064 6936 2479664 0 0 5408 0 11711 1350 1 2 44 53
0 1 296924 152448 6928 2480048 0 0 6760 0 9723 1127 1 4 47 48
0 1 296924 152948 6916 2479464 0 0 3512 16 10392 1231 1 2 48 49
0 1 296924 153616 6916 2478804 0 0 2724 0 10279 1078 0 2 48 49
0 1 296924 152972 6916 2480132 0 0 3584 0 11289 1252 1 3 49 48
0 1 296924 155348 6916 2478396 0 0 6472 0 11285 1132 1 2 45 53
0 1 296924 152988 6916 2481024 0 0 5112 20 10039 1257 0 2 46 52
0 1 296924 152968 6916 2481016 0 0 3244 0 9586 1127 1 3 46 51
0 1 296924 153500 6916 2481196 0 0 3516 0 10899 1127 1 1 48 49
0 1 296924 152860 6916 2481688 0 0 4240 0 10418 1245 1 3 47 49
0 2 296924 153016 6912 2478584 0 0 5632 0 12136 1516 2 3 46 49
0 2 296924 153292 6912 2480984 0 0 4668 0 10872 1248 1 2 49 48
0 1 296924 152420 6916 2481844 0 0 4764 56 11236 1402 1 3 45 51
0 1 296924 152652 6916 2481204 0 0 4628 0 9422 1208 0 3 46 51

buddyinfo:
$ cat /proc/buddyinfo; sleep 1; cat /proc/buddyinfo
Node 0, zone DMA 0 0 0 1 2 1 1 0 1 1 3
Node 0, zone DMA32 515 205 242 201 1384 116 21 8 1 0 0
Node 0, zone Normal 1779 0 0 18 11 3 1 3 0 0 0
Node 0, zone DMA 0 0 0 1 2 1 1 0 1 1 3
Node 0, zone DMA32 480 197 227 176 1384 116 21 8 1 0 0
Node 0, zone Normal 1792 9 0 18 11 3 1 3 0 0 0

Also, from time to time the situation switches to one where free memory
is pinned at some random value, fluctuating around it by +-1 megabyte.
Here is the vmstat output:
procs -----------memory---------- ---swap-- -----io---- -system-- ----cpu----
r b swpd free buff cache si so bi bo in cs us sy id wa
0 0 296480 381052 9732 2481324 1 2 2022 19 45 44 1 2 81 16
0 0 296480 382040 9732 2481180 0 0 2324 0 6505 825 1 2 96 1
0 0 296480 382500 9732 2481060 0 0 3824 0 5941 1046 1 2 96 1
0 0 296480 382092 9740 2480976 0 0 2048 16 7701 862 0 2 97 1
0 0 296480 382160 9740 2481896 0 0 5008 0 6443 1017 1 2 93 5
0 0 296480 382484 9740 2481668 0 0 2764 0 6972 799 0 2 97 1
0 0 296480 381912 9740 2481620 0 0 3780 0 7632 1036 1 2 96 1
0 0 296480 382240 9744 2481632 0 0 2796 0 7533 981 1 2 95 3
1 0 296480 382372 9748 2481756 0 0 2940 0 6565 1048 2 2 95 2
0 0 296480 383064 9748 2480320 0 0 5980 0 6352 979 0 3 92 5
0 0 296480 381380 9748 2481752 0 0 2732 0 6322 999 1 2 96 1
0 0 296480 381640 9748 2481992 0 0 2468 0 5640 849 0 2 97 2
0 0 296480 381684 9748 2481856 0 0 2760 0 7064 944 2 2 95 1
0 0 296480 381908 9748 2481664 0 0 2608 0 6797 952 0 2 94 4
0 0 296480 384024 9748 2479424 0 0 4804 0 6342 2767 1 2 94 4
0 0 296480 381948 9748 2481080 0 0 1868 0 6428 803 0 2 97 2
0 0 296480 382088 9748 2481524 0 0 3252 0 6464 990 1 1 98 1
0 0 296480 381884 9748 2481816 0 0 2892 0 7880 858 1 2 94 3
0 0 296480 382120 9748 2481848 0 0 2500 0 6207 905 1 1 96 2
0 1 296480 381976 9748 2479876 0 0 5188 0 6691 908 1 2 94 4
0 0 296480 381708 9748 2481584 0 0 2692 0 7904 1030 1 2 94 3
0 0 296480 382196 9748 2481704 0 0 2092 0 6715 722 1 1 97 1


The /proc/vmstat diff is like this:

$ ./diffshow 5
----8<----
nr_free_pages: 94999 -> 95630 (631)
nr_inactive_anon: 47076 -> 47196 (120)
nr_inactive_file: 347048 -> 347080 (32)
nr_active_file: 270128 -> 270462 (334)
nr_file_pages: 619886 -> 620314 (428)
nr_dirty: 10 -> 109 (99)
nr_kernel_stack: 248 -> 249 (1)
nr_isolated_file: 0 -> 10 (10)
nr_dirtied: 1147486 -> 1147659 (173)
nr_written: 1189947 -> 1190013 (66)
nr_dirty_threshold: 168770 -> 168974 (204)
nr_dirty_background_threshold: 84385 -> 84487 (102)
pgpgin: 528729753 -> 528750521 (20768)
pgpgout: 5013688 -> 5014216 (528)
pswpin: 77715 -> 77827 (112)
pgalloc_dma32: 95912002 -> 95912631 (629)
pgalloc_normal: 82241808 -> 82247860 (6052)
pgfree: 178827810 -> 178834939 (7129)
pgactivate: 2644761 -> 2645104 (343)
pgfault: 63365808 -> 63369261 (3453)
pgmajfault: 23571 -> 23591 (20)
pgsteal_kswapd_normal: 60067802 -> 60072006 (4204)
pgscan_kswapd_normal: 60141548 -> 60145753 (4205)
slabs_scanned: 28914432 -> 28915456 (1024)
kswapd_low_wmark_hit_quickly: 589343 -> 589376 (33)
kswapd_high_wmark_hit_quickly: 763703 -> 763752 (49)
pageoutrun: 2852120 -> 2852305 (185)
compact_blocks_moved: 10852682 -> 10852847 (165)
compact_pagemigrate_failed: 39862700 -> 39865324 (2624)

kswapd is stuck on normal zone!

Also there is raw vmstat:
nr_free_pages 95343
nr_inactive_anon 47196
nr_active_anon 114110
nr_inactive_file 348142
nr_active_file 272638
nr_unevictable 552
nr_mlock 552
nr_anon_pages 100386
nr_mapped 6158
nr_file_pages 623530
nr_dirty 0
nr_writeback 0
nr_slab_reclaimable 21356
nr_slab_unreclaimable 15570
nr_page_table_pages 2045
nr_kernel_stack 244
nr_unstable 0
nr_bounce 0
nr_vmscan_write 149405
nr_vmscan_immediate_reclaim 13896
nr_writeback_temp 0
nr_isolated_anon 0
nr_isolated_file 4
nr_shmem 48
nr_dirtied 1147666
nr_written 1190129
nr_anon_transparent_hugepages 116
nr_free_cma 0
nr_dirty_threshold 169553
nr_dirty_background_threshold 84776
pgpgin 529292001
pgpgout 5014788
pswpin 77827
pswpout 148890
pgalloc_dma 0
pgalloc_dma32 95940824
pgalloc_normal 82395157
pgalloc_movable 0
pgfree 179010711
pgactivate 2647284
pgdeactivate 2513412
pgfault 63427189
pgmajfault 23606
pgrefill_dma 0
pgrefill_dma32 1915983
pgrefill_normal 430939
pgrefill_movable 0
pgsteal_kswapd_dma 0
pgsteal_kswapd_dma32 39927548
pgsteal_kswapd_normal 60180622
pgsteal_kswapd_movable 0
pgsteal_direct_dma 0
pgsteal_direct_dma32 14062458
pgsteal_direct_normal 1894412
pgsteal_direct_movable 0
pgscan_kswapd_dma 0
pgscan_kswapd_dma32 39946808
pgscan_kswapd_normal 60254407
pgscan_kswapd_movable 0
pgscan_direct_dma 0
pgscan_direct_dma32 14260652
pgscan_direct_normal 1895350
pgscan_direct_movable 0
pgscan_direct_throttle 0
pginodesteal 25301
slabs_scanned 28931968
kswapd_inodesteal 26119
kswapd_low_wmark_hit_quickly 591050
kswapd_high_wmark_hit_quickly 766006
kswapd_skip_congestion_wait 15
pageoutrun 2858733
allocstall 156938
pgrotated 161518
compact_blocks_moved 10860505
compact_pages_moved 411760
compact_pagemigrate_failed 39987369
compact_stall 29399
compact_fail 23718
compact_success 5681
htlb_buddy_alloc_success 0
htlb_buddy_alloc_fail 0
unevictable_pgs_culled 6416
unevictable_pgs_scanned 0
unevictable_pgs_rescued 5337
unevictable_pgs_mlocked 6672
unevictable_pgs_munlocked 6120
unevictable_pgs_cleared 0
unevictable_pgs_stranded 0
thp_fault_alloc 41
thp_fault_fallback 302
thp_collapse_alloc 507
thp_collapse_alloc_failed 3704
thp_split 111

Buddyinfo:
$ cat /proc/buddyinfo; sleep 1; cat /proc/buddyinfo
Node 0, zone DMA 0 0 0 1 2 1 1 0 1 1 3
Node 0, zone DMA32 29527 26916 489 221 40 5 0 0 0 0 0
Node 0, zone Normal 3158 0 0 2 1 1 1 1 0 0 0
Node 0, zone DMA 0 0 0 1 2 1 1 0 1 1 3
Node 0, zone DMA32 29527 26909 489 211 41 5 0 0 0 0 0
Node 0, zone Normal 2790 29 0 8 1 1 1 1 0 0 0

Zoneinfo:
$ cat /proc/zoneinfo
Node 0, zone DMA
pages free 3976
min 64
low 80
high 96
scanned 0
spanned 4080
present 3912
nr_free_pages 3976
nr_inactive_anon 0
nr_active_anon 0
nr_inactive_file 0
nr_active_file 0
nr_unevictable 0
nr_mlock 0
nr_anon_pages 0
nr_mapped 0
nr_file_pages 0
nr_dirty 0
nr_writeback 0
nr_slab_reclaimable 0
nr_slab_unreclaimable 0
nr_page_table_pages 0
nr_kernel_stack 0
nr_unstable 0
nr_bounce 0
nr_vmscan_write 0
nr_vmscan_immediate_reclaim 0
nr_writeback_temp 0
nr_isolated_anon 0
nr_isolated_file 0
nr_shmem 0
nr_dirtied 0
nr_written 0
nr_anon_transparent_hugepages 0
nr_free_cma 0
protection: (0, 3503, 4007, 4007)
pagesets
cpu: 0
count: 0
high: 0
batch: 1
vm stats threshold: 8
cpu: 1
count: 0
high: 0
batch: 1
vm stats threshold: 8
all_unreclaimable: 1
start_pfn: 16
inactive_ratio: 1
Node 0, zone DMA32
pages free 87395
min 14715
low 18393
high 22072
scanned 0
spanned 1044480
present 896960
nr_free_pages 87395
nr_inactive_anon 18907
nr_active_anon 92242
nr_inactive_file 325044
nr_active_file 267577
nr_unevictable 0
nr_mlock 0
nr_anon_pages 51703
nr_mapped 4369
nr_file_pages 593009
nr_dirty 17
nr_writeback 0
nr_slab_reclaimable 14988
nr_slab_unreclaimable 11515
nr_page_table_pages 1305
nr_kernel_stack 133
nr_unstable 0
nr_bounce 0
nr_vmscan_write 140220
nr_vmscan_immediate_reclaim 62
nr_writeback_temp 0
nr_isolated_anon 0
nr_isolated_file 0
nr_shmem 10
nr_dirtied 810741
nr_written 862763
nr_anon_transparent_hugepages 116
nr_free_cma 0
protection: (0, 0, 504, 504)
pagesets
cpu: 0
count: 123
high: 186
batch: 31
vm stats threshold: 24
cpu: 1
count: 29
high: 186
batch: 31
vm stats threshold: 24
all_unreclaimable: 0
start_pfn: 4096
inactive_ratio: 5
Node 0, zone Normal
pages free 3200
min 2116
low 2645
high 3174
scanned 0
spanned 131072
present 129024
nr_free_pages 3200
nr_inactive_anon 25943
nr_active_anon 24590
nr_inactive_file 23132
nr_active_file 10275
nr_unevictable 552
nr_mlock 552
nr_anon_pages 49050
nr_mapped 2088
nr_file_pages 35785
nr_dirty 3
nr_writeback 0
nr_slab_reclaimable 2340
nr_slab_unreclaimable 3926
nr_page_table_pages 786
nr_kernel_stack 114
nr_unstable 0
nr_bounce 0
nr_vmscan_write 9297
nr_vmscan_immediate_reclaim 13835
nr_writeback_temp 0
nr_isolated_anon 0
nr_isolated_file 10
nr_shmem 38
nr_dirtied 338110
nr_written 328638
nr_anon_transparent_hugepages 0
nr_free_cma 0
protection: (0, 0, 0, 0)
pagesets
cpu: 0
count: 152
high: 186
batch: 31
vm stats threshold: 12
cpu: 1
count: 172
high: 186
batch: 31
vm stats threshold: 12
all_unreclaimable: 0
start_pfn: 1048576
inactive_ratio: 1

I have tried disabling compaction
(echo 1000 > /proc/sys/vm/extfrag_threshold), and the symptoms do change:
kswapd no longer gets stuck in D state, but instead the page cache is
almost completely emptied from time to time.

I use this simple script to get difference for /proc/vmstat
$ cat diffshow
#!/bin/sh
# diffshow: repeatedly print the /proc/vmstat counters that increased since
# the previous pass, as "name: old -> new (delta)".
# Usage: diffshow <seconds-between-passes>
# Counters whose value did not grow are not printed. Runs until interrupted.

# Delay between passes; handed to sleep unchanged.
sleep_int=$1
# Stays 1 during the first pass, which only records baseline values.
first_pass=1

# `[ 0 ]` tests a non-empty string and is therefore always true: loop forever.
while [ 0 ]; do
echo '----8<----'

# Each input line is split into a (counter name) and b (counter value).
while read a b; do
if [ $first_pass -eq 0 ]; then
# The previous value is kept in a shell variable named "<counter>_last";
# eval is used because that variable name is only known at run time.
eval "diff=\$((b - ${a}_last))"

[ $diff -gt 0 ] && \
eval "printf \"%s:\t%d -> %d (%d)\n\" $a \$${a}_last $b $diff"
fi

# Remember the current value for the next pass.
eval "${a}_last=$b"
done < /proc/vmstat

first_pass=0
sleep $sleep_int
done

I also have a piece of code which can reproduce the first problem (kswapd
stuck in D state) on another amd64 system, which has its normal zone
artificially limited to the same ratio against the dma32 zone. It needs a
large file, at least twice as large as system RAM (the larger the
better):
dd if=/dev/zero of=tf bs=1M count=$((1024*8))

Then start something like this:
./a.out tf 32
and let it run for some time to fill the page cache.

The code randomly reads the file in fixed-size chunks at a fixed rate in
two "streams": one stream, at 1/3 of the rate, is scattered across the
whole file and marks pages with WILLNEED. The other stream, at 2/3 of the
rate, is confined to 1/10 of the file and does not pass any hints.

#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>
#include <limits.h>
#include <errno.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <time.h>
#include <sys/time.h>
#include <fcntl.h>

/*
 * Report the current errno on stderr, prefixed by the string literal `a`,
 * and terminate with EXIT_FAILURE. `a` is pasted into the format string, so
 * it must be a literal that contains no '%'.
 */
#define ERR(a) do { fprintf (stderr, "System error in " a ": %d (%s)\n", errno, strerror (errno)); exit (EXIT_FAILURE); } while (0)
#define READ_CHUNK 16384 /* Bytes moved per read()/memcpy() call */
#define READ_RATE (6 * 1024 * 1024) /* Bytes per second */
#define GIGA 1000000000 /* Nanoseconds per second */

/*
 * Smaller of a and b; each argument is evaluated exactly once.
 * Every temporary has the type of the argument it holds, so a wide value is
 * never squeezed into the narrower operand's type before the comparison.
 * Relies on the GNU statement-expression and __typeof__ extensions.
 */
#define min(a,b) ({ \
    __typeof__(a) min_a_ = (a); \
    __typeof__(b) min_b_ = (b); \
    min_a_ < min_b_ ? min_a_ : min_b_; \
})

/* Kind of block handed to my_read_block(). */
enum block_type_e {
    BLOCK_HOT = 0,  /* read without any readahead hint */
    BLOCK_COLD = 1  /* read() build: POSIX_FADV_WILLNEED is issued before each chunk */
};

/* Page size in bytes; zero until main() stores the sysconf() result. */
static size_t pagesize;

/*
 * Read one block of `size` bytes from `fd` and throw the data away.
 *
 * fd       - open, readable file descriptor
 * offset   - block index, NOT a byte offset: reading starts at offset * size
 * size     - block length in bytes; the call is a no-op when size <= 0
 * blk_type - BLOCK_COLD issues POSIX_FADV_WILLNEED before each chunk
 *            (read() build only); BLOCK_HOT reads without a hint
 *
 * The block is consumed in READ_CHUNK pieces through a static scratch
 * buffer, so the function is not reentrant. When USE_MMAP is defined the
 * block is mapped and copied with memcpy() instead of being read().
 * A failing mmap(), lseek() or read() terminates the program through ERR().
 */
void my_read_block (int fd, off_t offset, ssize_t size, enum block_type_e blk_type) {
    static char buf[READ_CHUNK];
    ssize_t to_read;
#ifdef USE_MMAP
    off_t map_start;
    size_t in_page;
    size_t map_size;
    char *map;
#endif

    if (size <= 0)
        return;

    /* Turn the block index into a byte offset. */
    offset *= size;

#ifdef USE_MMAP
    (void) blk_type;

    /*
     * mmap() needs a page-aligned file offset, so map from the start of the
     * page containing `offset` and make the mapping long enough to cover the
     * leading in-page slack as well as the block itself.
     */
    in_page = (size_t) (offset % (off_t) pagesize);
    map_start = offset - (off_t) in_page;
    map_size = (in_page + (size_t) size + pagesize - 1) / pagesize * pagesize;

    map = mmap (NULL, map_size, PROT_READ, MAP_PRIVATE, fd, map_start);
    /* mmap() reports failure with MAP_FAILED, never with NULL. */
    if (map == MAP_FAILED) ERR ("mmap");

    /* From here on `offset` is relative to the start of the mapping. */
    offset = (off_t) in_page;
#else
    if (lseek (fd, offset, SEEK_SET) == (off_t) -1) ERR ("lseek");
#endif

    for (to_read = size; to_read > 0; to_read -= READ_CHUNK, offset += READ_CHUNK) {
        size_t chunk = to_read < READ_CHUNK ? (size_t) to_read : READ_CHUNK;
#ifdef USE_MMAP
        memcpy (buf, map + offset, chunk);
#else
        ssize_t got;

        /* The hint is best effort; its result is deliberately ignored. */
        if (blk_type == BLOCK_COLD)
            posix_fadvise (fd, offset, (off_t) chunk, POSIX_FADV_WILLNEED);

        got = read (fd, buf, chunk);
        if (got < 0) ERR ("read");
        if (got == 0)
            break; /* end of file: nothing more to fetch for this block */
#endif
    }

#ifdef USE_MMAP
    munmap (map, map_size);
#endif
}

/*
 * Page-cache load generator: reads randomly chosen blocks of a file at a
 * fixed pace, forever.
 *
 * Usage: a.out <file> <block size in KiB>
 *
 * Two of every three reads are BLOCK_COLD reads picked from the whole file;
 * the third is a BLOCK_HOT read picked from the first tenth of the file.
 * NOTE(review): the accompanying description states the opposite 1/3 : 2/3
 * split - confirm which one is intended before touching the ratio.
 *
 * Each block is allotted read_block / READ_RATE seconds, measured from the
 * moment the previous sleep ended, so the time spent reading counts towards
 * the interval.
 *
 * Never returns normally; exits with EXIT_FAILURE on bad arguments or on a
 * failing system call.
 */
int main (int argc, char *argv[]) {
    /*
     * random() keeps working on the buffer handed to initstate(), so the
     * buffer has to stay valid for the rest of the program.
     */
    static char rng_state[64];
    /* POSIX random() returns values in [0, 2^31 - 1]. */
    const unsigned long long random_span = 2147483648ULL;
    struct timespec now, read_next = {0};
    struct stat f_stat;
    unsigned int seed, urandom_seed;
    size_t read_block;
    off_t file_size_blocks;
    const char *file;
    char *end;
    long block_kib, page;
    time_t step_sec;
    long step_nsec;
    int fd, rc, i = 0;

    if (argc < 3) {
        fprintf (stderr, "Usage: %s <file> <block size in KiB>\n", argc > 0 ? argv[0] : "a.out");
        exit (EXIT_FAILURE);
    }
    file = argv[1];

    /* A zero or negative block size would later divide by zero. */
    errno = 0;
    block_kib = strtol (argv[2], &end, 10);
    if (end == argv[2] || *end != '\0' || errno == ERANGE ||
        block_kib <= 0 || block_kib > LONG_MAX / 1024) {
        fprintf (stderr, "Invalid block size \"%s\": expected a positive number of KiB\n", argv[2]);
        exit (EXIT_FAILURE);
    }
    read_block = (size_t) block_kib * 1024;

    /* sysconf() returns -1 on failure; test it before storing into a size_t. */
    page = sysconf (_SC_PAGESIZE);
    pagesize = page > 1 ? (size_t) page : 4096;

    clock_gettime (CLOCK_MONOTONIC, &now);
    seed = (unsigned int) now.tv_nsec;

    fd = open ("/dev/urandom", O_RDONLY);
    if (fd < 0)
        ERR ("open /dev/urandom");

    /* Mix in kernel entropy when available; the clock alone is the fallback. */
    if (read (fd, &urandom_seed, sizeof urandom_seed) == (ssize_t) sizeof urandom_seed)
        seed ^= urandom_seed;
    close (fd);

    /*
     * initstate() installs rng_state and returns the PREVIOUS state array;
     * passing that return value to setstate() would undo the seeding.
     */
    if (!initstate (seed, rng_state, sizeof rng_state)) ERR ("initstate");

    fd = open (file, O_RDONLY);
    if (fd < 0)
        ERR ("open");

    if (fstat (fd, &f_stat) != 0)
        ERR ("stat");

    file_size_blocks = (off_t) ((unsigned long long) f_stat.st_size / read_block);
    printf ("File has %llu blocks of size %zu\n", (unsigned long long) file_size_blocks, read_block);

    /*
     * Time budget per block, split into seconds and nanoseconds so that the
     * multiplication by GIGA cannot overflow and no precision is lost to an
     * early integer division.
     */
    step_sec = (time_t) (read_block / READ_RATE);
    step_nsec = (long) ((unsigned long long) (read_block % READ_RATE) * GIGA / READ_RATE);

    clock_gettime (CLOCK_MONOTONIC, &now);

    while (1) {
        unsigned long long r = (unsigned long long) random ();
        enum block_type_e read_type;
        off_t read_off;

        /* Dividing by the full span keeps the index strictly below the block count. */
        i = (i + 1) % 3;
        if (i != 0) {
            read_type = BLOCK_COLD;
            read_off = (off_t) (r * file_size_blocks / random_span);
        } else {
            read_type = BLOCK_HOT;
            read_off = (off_t) (r * file_size_blocks / random_span / 10);
        }

        my_read_block (fd, read_off, read_block, read_type);

        read_next.tv_sec = now.tv_sec + step_sec;
        read_next.tv_nsec = now.tv_nsec + step_nsec;
        if (read_next.tv_nsec >= GIGA) {
            read_next.tv_sec += 1;
            read_next.tv_nsec -= GIGA;
        }

        /* clock_nanosleep() returns the error number; only EINTR is worth retrying. */
        do {
            rc = clock_nanosleep (CLOCK_MONOTONIC, TIMER_ABSTIME, &read_next, NULL);
        } while (rc == EINTR);
        if (rc != 0) {
            errno = rc;
            ERR ("clock_nanosleep");
        }

        clock_gettime (CLOCK_MONOTONIC, &now);
    }

    return 0;
}


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/