Re: [PATCH v2 1/2] mm, thp: check page mapping when truncating page cache

From: Rongwei Wang
Date: Fri Sep 24 2021 - 03:12:56 EST




On 9/24/21 10:43 AM, Andrew Morton wrote:
On Thu, 23 Sep 2021 01:04:54 +0800 Rongwei Wang <rongwei.wang@xxxxxxxxxxxxxxxxx> wrote:



On Sep 22, 2021, at 7:37 PM, Matthew Wilcox <willy@xxxxxxxxxxxxx> wrote:

On Wed, Sep 22, 2021 at 03:06:44PM +0800, Rongwei Wang wrote:
Transparent huge page has supported read-only non-shmem files. The file-
backed THP is collapsed by khugepaged and truncated when written (for
shared libraries).

However, there is a race in two possible places.

1) multiple writers truncate the same page cache concurrently;
2) collapse_file rolls back when writer truncates the page cache;

As I've said before, the bug here is that somehow there is a writable fd
to a file with THPs. That's what we need to track down and fix.
Hi, Matthew
I am not sure I understand what you mean. We know that "mm, thp: relax the VM_DENYWRITE constraint on file-backed THPs"
introduced file-backed THPs for DSOs. It is possible (though very rare) for a DSO to be opened for writing.

...

https://lore.kernel.org/linux-mm/YUdL3lFLFHzC80Wt@xxxxxxxxxxxxxxxxxxxx/
All in all, what you mean is that we should solve this race at the source?

Matthew is being pretty clear here: we shouldn't be permitting
userspace to get a writeable fd for a thp-backed file.

Why are we permitting the DSO to be opened writeably? If there's a
legitimate case for doing this then presumably "mm, thp: relax the
There is a test case for stress-testing file-backed THP in the attachment.
I tested this case on a system with CONFIG_READ_ONLY_THP_FOR_FS enabled:

$ gcc -Wall -g -o stress_madvise_dso stress_madvise_dso.c
$ ulimit -s unlimited
$ ./stress_madvise_dso 10000 <libtest.so>

The meaning of the above parameters:
10000: the maximum test time;
<libtest.so>: the DSO that will be mapped as a file-backed THP by madvise. It is recommended that the text segment of the DSO under test be larger than 2M.

The crash is triggered immediately on the latest kernel. This test
case can also be used to trigger the bug mentioned in our other patch.

VM_DENYWRITE constraint on file-backed THPs: should be fixed or
reverted.

If there is no legitimate use case for returning a writeable fd for a
thp-backed file then we should fail such an attempt at open(). This
approach has back-compatibility issues which need to be thought about.
Perhaps we should permit the open-writeably attempt to appear to
succeed, but to really return a read-only fd?
/*
* case: stress file-backed THP
*/
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include <assert.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <sched.h>
#include <time.h>
#include <string.h>
#include <fcntl.h>
#include <signal.h> /* for signal */
#include <sys/mman.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include <errno.h>

/*
 * Buffer sizes used by get_memory_map(): PATH_MAX bounds the /proc path
 * buffer and the mapped-file name stored in struct procmap; BUFF_MAX bounds
 * one line read from the maps file.
 * NOTE(review): PATH_MAX is also a name that system headers may define with
 * a different value -- confirm this local definition does not clash.
 */
#define PATH_MAX 1024
#define BUFF_MAX 1024
/* Presumably a default test time; not referenced in the visible code. */
#define TIME_DFL 180 /* seconds */

/*
 * Write @len bytes of @val to the tunable at @path.
 *
 * Only open/write/close are used because this runs inside a signal handler,
 * where system() and stdio are not async-signal-safe. Failures are ignored:
 * there is nothing safe to do about them at this point.
 */
static void restore_thp_knob(const char *path, const char *val, size_t len)
{
	int fd = open(path, O_WRONLY);

	if (fd == -1)
		return;
	if (write(fd, val, len) < 0) {
		/* best effort */
	}
	close(fd);
}

/*
 * Handler for SIGINT/SIGSEGV/SIGABRT in the parent: put the THP settings
 * back to "never" / 10000 ms, report what was restored, and terminate.
 *
 * @signo: signal number (unused).
 *
 * Terminates with _exit() rather than exit() so that no atexit handlers or
 * stdio flushing run in signal context. The exit status is the same (-1).
 */
void signal_handler(int signo)
{
	static const char msg[] =
		"\nrestore env:\n"
		" echo never > /sys/kernel/mm/transparent_hugepage/enabled\n"
		" echo 10000 > /sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs\n";

	(void)signo;

	/* Restore env */
	restore_thp_knob("/sys/kernel/mm/transparent_hugepage/enabled",
			 "never\n", 6);
	restore_thp_knob("/sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs",
			 "10000\n", 6);

	if (write(STDOUT_FILENO, msg, sizeof(msg) - 1) < 0) {
		/* best effort */
	}
	_exit(-1);
}

/*
 * in KB: 14UL << 10 == 14336.
 * Not referenced anywhere in the visible code.
 */
#define text_size (14UL << 10)

/* Number of tokens get_memory_map() expects on one /proc/<pid>/maps line. */
#define PROCMAP_SZ 8
/*
 * One entry of /proc/<pid>/maps, as returned through get_memory_map().
 */
struct procmap {
/* Start address of the mapping (first column, hexadecimal). */
uint64_t vm_start;
/* End address of the mapping (second half of the first column). */
uint64_t vm_end;
/* Offset column of the mapping. */
uint64_t pgoff;
/* Presumably the device major/minor columns -- TODO confirm. */
uint32_t maj;
uint32_t min;
/* Inode column. */
uint32_t ino;
/* Room for a 4-character permission string such as "r-xp" plus its NUL. */
#define PROT_SZ 5
char prot[PROT_SZ];
/* Path column of the mapping. */
char fname[PATH_MAX];
};

/*
 * Set from argv[1] in main(); thread_write() stops once its iteration
 * counter reaches this value.
 */
unsigned long sleep_secs = 0;

/*
 * Routines of procmap, i.e., /proc/pid/(s)maps
 */

/*
 * Convert the whole of @tok to an unsigned integer in @base.
 *
 * Returns 0 and stores the value in @out on success; returns -1 if @tok
 * has no digits, has trailing characters, or the conversion overflows.
 */
static int procmap_parse_u64(const char *tok, int base, uint64_t *out)
{
	char *end = NULL;

	errno = 0;
	*out = strtoull(tok, &end, base);
	if (end == tok || *end != '\0' || errno != 0)
		return -1;

	return 0;
}

/*
 * get_memory_map - find the "r-xp" mapping of a file in /proc/<pid>/maps.
 *
 * @pid:     process whose maps file is read
 * @procmap: output, written only when a matching line parses completely
 * @fname:   substring searched for in the path column
 *
 * Every line is split into the PROCMAP_SZ columns
 *     start-end perms offset major:minor inode path
 * and lines whose permissions are not "r-xp" or whose path does not contain
 * @fname are skipped. If several lines match, the last one is kept.
 *
 * Return: 0 when a matching mapping was stored in @procmap; -1 on NULL
 * arguments, failure to open the maps file, a malformed matching line, or
 * when no line matches.
 */
static int get_memory_map(pid_t pid, struct procmap *procmap,
			  const char *fname)
{
	/* Delimiter that terminates each of the PROCMAP_SZ columns, in order. */
	static const char *const delims[PROCMAP_SZ] = {
		"-", " ", " ", " ", ":", " ", " ", "\n"
	};
	char path[PATH_MAX];
	char line[BUFF_MAX];
	FILE *fp;
	int found = 0;

	if (procmap == NULL || fname == NULL) {
		/* errno is not meaningful here, so do not use perror(). */
		fprintf(stderr, "fail: procmap or fname is NULL\n");
		return -1;
	}

	snprintf(path, sizeof(path), "/proc/%d/maps", (int)pid);

	fp = fopen(path, "r");
	if (fp == NULL) {
		printf("fopen: %s: %s\n", path, strerror(errno));
		return -1;
	}

	while (fgets(line, sizeof(line), fp)) {
		char *in[PROCMAP_SZ];
		char *pos = line;
		char *sp = NULL;
		const char *mapped;
		uint64_t start, end, pgoff, dev_major, dev_minor, ino;
		int i;

		/* Split line into fields */
		for (i = 0; i < PROCMAP_SZ; i++) {
			in[i] = strtok_r(pos, delims[i], &sp);
			if (in[i] == NULL)
				break;
			pos = NULL;
		}

		/* Lines without a path column (anonymous mappings) are skipped */
		if (i != PROCMAP_SZ)
			continue;

		/* The path column is padded with blanks on the left */
		mapped = in[7] + strspn(in[7], " \t");

		/* Find the target entry */
		if (strcmp(in[2], "r-xp") != 0 || strstr(mapped, fname) == NULL)
			continue;

		/* Addresses, offset and device are hexadecimal; the inode is decimal */
		if (procmap_parse_u64(in[0], 16, &start) != 0 ||
		    procmap_parse_u64(in[1], 16, &end) != 0 ||
		    procmap_parse_u64(in[3], 16, &pgoff) != 0 ||
		    procmap_parse_u64(in[4], 16, &dev_major) != 0 ||
		    procmap_parse_u64(in[5], 16, &dev_minor) != 0 ||
		    procmap_parse_u64(in[6], 10, &ino) != 0)
			goto failed;

		/* Commit only after the whole line parsed cleanly */
		procmap->vm_start = start;
		procmap->vm_end = end;
		procmap->pgoff = pgoff;
		procmap->maj = (uint32_t)dev_major;
		procmap->min = (uint32_t)dev_minor;
		/* The struct field is 32 bits wide; larger inode numbers are truncated */
		procmap->ino = (uint32_t)ino;
		snprintf(procmap->prot, sizeof(procmap->prot), "%s", in[2]);
		snprintf(procmap->fname, sizeof(procmap->fname), "%s", mapped);
		found = 1;
	}

	fclose(fp);

	if (!found) {
		printf("fail: no r-xp mapping of %s in %s\n", fname, path);
		return -1;
	}

	return 0;

failed:
	fclose(fp);
	printf("fail: exit\n");

	return -1;
}

#define NR_CPU 32
/* Current wall-clock time, truncated to whole seconds. */
uint64_t gettimeofday_sec(void);
inline uint64_t gettimeofday_sec(void)
{
	struct timeval now;
	uint64_t seconds;

	gettimeofday(&now, NULL);
	seconds = now.tv_sec;

	return seconds;
}

/*
 * Reader task: map the DSO read-only/executable, ask for huge pages on the
 * mapping with MADV_HUGEPAGE, then re-read the whole window once a second.
 *
 * @cpu:  CPU to pin this process to (a warning is printed if that fails)
 * @args: path of the DSO
 *
 * On the normal path this function never returns: it loops until the
 * process is killed. It returns only when setup fails, after restoring the
 * THP settings.
 */
void thread_read(int cpu, char *args)
{
	const size_t map_len = 0x800000;
	char *dso_path = args;
	struct procmap maps;
	pid_t pid = getpid();
	char *buf = NULL;
	void *p = MAP_FAILED;
	int fd = -1;
	cpu_set_t mask;

	/* A lookup that fills nothing must be detectable below */
	memset(&maps, 0, sizeof(maps));

	CPU_ZERO(&mask);
	CPU_SET(cpu, &mask);
	if (sched_setaffinity(0, sizeof(cpu_set_t), &mask) == -1) {
		printf("warning: can not set CPU affinity\n");
	}

	printf("read %s\n", dso_path);

	/*
	 * An 8M array does not fit on a default-sized stack, so take the
	 * scratch buffer from the heap instead.
	 */
	buf = malloc(map_len);
	if (buf == NULL) {
		perror("malloc");
		goto out;
	}

	fd = open(dso_path, O_RDONLY);
	if (fd == -1) {
		perror("open");
		goto out;
	}

	/* The start addr must be alignment with 2M */
	p = mmap((void *)0x40000dc00000UL, map_len, PROT_READ | PROT_EXEC,
		 MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		goto out;
	}

	/* get the mapping address (ONLY r-xp) of the DSO */
	if (get_memory_map(pid, &maps, dso_path) != 0 ||
	    maps.vm_end <= maps.vm_start) {
		/* Fall back to the range mmap() just returned */
		printf("warning: r-xp mapping of %s not found, using the mmap range\n",
		       dso_path);
		maps.vm_start = (uint64_t)(uintptr_t)p;
		maps.vm_end = maps.vm_start + map_len;
	}
	printf("pid: %d\n", pid);
	printf("text vm_start: 0x%lx\n", (unsigned long)maps.vm_start);
	printf("text vm_end: 0x%lx\n", (unsigned long)maps.vm_end);
	if (madvise((void *)(uintptr_t)maps.vm_start,
		    maps.vm_end - maps.vm_start, MADV_HUGEPAGE) == -1)
		perror("madvise");
	lseek(fd, 0, SEEK_SET);

	/* Keep touching the mapping; runs until the process is killed */
	for (;;) {
		memcpy(buf, p, map_len - 1);
		sleep(1);
	}

out:
	if (p != MAP_FAILED)
		munmap(p, map_len);
	if (fd != -1)
		close(fd);
	free(buf);

	/* Restore env */
	system("echo never > /sys/kernel/mm/transparent_hugepage/enabled");
	system("echo 10000 > /sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs");

	printf("read exit %s\n", dso_path);
}

/*
 * Writer task: repeatedly open the DSO read-write, map it MAP_SHARED and
 * overwrite 16 bytes at offset 0x10, then unmap and close it again.
 *
 * @cpu:  CPU to pin this process to (a warning is printed if that fails)
 * @args: path of the DSO; NOTE: the file contents are modified
 *
 * The loop ends after sleep_secs iterations or on the first open/mmap
 * failure.
 */
void thread_write(int cpu, char *args)
{
	const size_t map_len = 0x800000;
	/* Zero-filled so the bytes written to the file are defined */
	char buf[32] = {0};
	uint64_t count = 0;
	char *dso_path = args;
	cpu_set_t mask;

	CPU_ZERO(&mask);
	CPU_SET(cpu, &mask);
	if (sched_setaffinity(0, sizeof(cpu_set_t), &mask) == -1) {
		printf("warning: can not set CPU affinity\n");
	}

	sleep(3);
	printf("write %s\n", dso_path);
	for (;;) {
		/*
		 * Busy-wait until the wall-clock second is 0, 1 or 2 modulo 10;
		 * presumably this lines the writer processes up in the same
		 * time window.
		 */
		uint64_t sec = gettimeofday_sec();
		while ((sec % 10) >= 3) {
			sec = gettimeofday_sec();
		}

		int fd = open(dso_path, O_RDWR);
		if (fd == -1) {
			perror("open");
			goto out; /* fail */
		}

		void *p = mmap(NULL, map_len, PROT_READ | PROT_WRITE,
			       MAP_SHARED, fd, 0);
		if (p == MAP_FAILED) {
			perror("mmap");
			close(fd);
			goto out; /* fail */
		}

		lseek(fd, 0x1600, SEEK_SET);
		for (long i = 1; i <= 2; i++) {
			/* char * cast: arithmetic on void * is not standard C */
			memcpy((char *)p + 0x10, buf, 16);
		}

		munmap(p, map_len);
		close(fd);

		sleep(2);
		count++;
		if (count >= sleep_secs) {
			printf("test finish: %lu\n", (unsigned long)count);
			break;
		}
	} /* end for */

out:
	printf("write exit %s\n", dso_path);
}

/*
 * usage:
 * stress_madvise_dso <test time> <libtest.so>
 *
 * Enables THP in "madvise" mode, forks NR_CPU children (child 0 reads the
 * DSO, the others write it) and waits for them.
 *
 * Return: 0 when every child that was waited for exited with status 0,
 * -1 when fork() failed or a child exited abnormally or with a non-zero
 * status. The elapsed time is printed only on success.
 */
int main(int argc, char *argv[])
{
	struct timeval start, end;
	char *dso_path = NULL;
	int ret = 0;
	pid_t pid;

	if (argc > 2) {
		sleep_secs = strtoul(argv[1], NULL, 10);
		/*
		 * Let realpath() allocate the result: a caller-supplied buffer
		 * must hold a full system PATH_MAX bytes.
		 */
		dso_path = realpath(argv[2], NULL);
		if (dso_path == NULL) {
			printf("realpath: %s: %s\n", argv[2], strerror(errno));
			exit(-1);
		}
	} else {
		printf("usage error:\n"
		       " stress_madvise_dso <test time> <libtest.so>\n"
		       " e.g. stress_madvise_dso 10000 libtest.so\n");
		exit(-1);
	}

	/* Set env */
	/*
	 * NOTE: system() runs this in a child shell, so it does not change the
	 * stack limit of this process; run "ulimit -s unlimited" in the
	 * invoking shell instead.
	 */
	system("ulimit -s unlimited");
	system("echo madvise > /sys/kernel/mm/transparent_hugepage/enabled");
	system("echo 1000 > /sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs");

	gettimeofday(&start, NULL);

	/*
	 * fork 32 task to read and write the same DSO:
	 * task 0: read dso;
	 * task 1 - 31: write dso;
	 */
	for (int i = 0; i < NR_CPU; ++i) {
		pid = fork();
		if (pid == -1) {
			perror("fork");
			ret = -1;
			break;
		}
		if (pid == 0) {
			if (i == 0)
				thread_read(i, dso_path);
			else
				thread_write(i, dso_path);
			/*
			 * A child must neither fork further nor run the
			 * parent's wait loop and timing report below.
			 */
			free(dso_path);
			exit(0);
		}
	}

	/* Only the parent gets here; handlers are installed after forking */
	signal(SIGINT, signal_handler);
	signal(SIGSEGV, signal_handler);
	signal(SIGABRT, signal_handler);

	/* wait */
	for (;;) {
		int status;
		pid_t done = wait(&status);

		if (done == -1) {
			if (errno == ECHILD)
				break; /* No more child processes */
			continue;
		}
		if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
			printf("Pid:%d failed\n", done);
			ret = -1;
			break;
		}
	}

	if (ret == 0) {
		gettimeofday(&end, NULL);
		printf("time to collapse file thp: %ld ms\n",
		       (long)(1000 * (end.tv_sec - start.tv_sec) +
			      (end.tv_usec - start.tv_usec) / 1000));
	}

	free(dso_path);
	return ret;
}