[PATCH linux-next] initramfs: create a mount point for rootfs, so docker on rootfs can use pivot_root

From: cgel . zte
Date: Fri May 06 2022 - 04:57:51 EST


From: Zhang Yunkai <zhang.yunkai@xxxxxxxxxx>

Container platforms such as Docker have two ways to change the root
directory to a specified path: pivot_root or chroot.

Docker uses pivot_root by default, which can be handled very cleanly.
But it only works on a disk or block device, not on rootfs, because
there the specified directory does not have a parent mount.

So if we want to use Docker on rootfs, we need to specify
DOCKER_RAMDISK=yes. Docker will then change the root directory with
chroot instead of pivot_root.

There are at least two reasons why we still need pivot_root on rootfs.
Chroot merely changes the root directory, which leads to resource
leakage. For example, suppose a USB device is connected on the host
before a container is created and is disconnected after the container
is created. If the USB device was mounted in containers but has already
been removed and unmounted on the host, the mount point will not go away
until all containers unmount the USB device. Containers will have the
mount point even if they never performed a mount themselves.

Another reason for Docker to use pivot_root is that, upon initialization,
the network namespace is mounted under /var/run/docker/netns/ on the host
by dockerd. Without pivot_root, Docker must either wait to create the
network namespace prior to the creation of containers or simply deal
with leaking this to each container.

This patch creates a parent mount point for rootfs to support
pivot_root. The main steps are:
mkdir /root
cd /root
mount tmpfs to /root
decompress initramfs and initrd to tmpfs
mount --move . /
chroot . (via init_chroot)

In addition, because there is now an extra mount layer, the way
init_eaccess() looks up files during kernel initialization has to be
modified slightly.

When tmpfs is mounted on /root, 'rootflags' is passed along, which means
that mount options for rootfs can now be set on the boot command line.
For example, the size of tmpfs can be set with 'rootflags=size=1024M'.

Tested-by: Zeal Robot <zealci@xxxxxxxxxx>
Signed-off-by: Zhang Yunkai <zhang.yunkai@xxxxxxxxxx>
---
fs/init.c | 10 ++++++++--
include/linux/init.h | 1 +
init/do_mounts.c | 45 ++++++++++++++++++++++++++++++++++++++++++++
init/do_mounts.h | 14 ++++++++++++++
init/initramfs.c | 16 ++++++++++++++--
init/main.c | 6 +++++-
usr/Kconfig | 10 ++++++++++
7 files changed, 97 insertions(+), 5 deletions(-)

diff --git a/fs/init.c b/fs/init.c
index 5c36adaa9b44..4974f19bf645 100644
--- a/fs/init.c
+++ b/fs/init.c
@@ -112,14 +112,20 @@ int __init init_chmod(const char *filename, umode_t mode)

int __init init_eaccess(const char *filename)
{
- struct path path;
+ struct path path, root;
int error;

- error = kern_path(filename, LOOKUP_FOLLOW, &path);
+ error = kern_path("/", LOOKUP_DOWN, &root);
if (error)
return error;
+ error = vfs_path_lookup(root.dentry, root.mnt, filename,
+ LOOKUP_FOLLOW, &path);
+ if (error)
+ goto on_err;
error = path_permission(&path, MAY_ACCESS);
path_put(&path);
+on_err:
+ path_put(&root);
return error;
}

diff --git a/include/linux/init.h b/include/linux/init.h
index baf0b29a7010..6eddd3730ce8 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -149,6 +149,7 @@ extern unsigned int reset_devices;
void setup_arch(char **);
void prepare_namespace(void);
void __init init_rootfs(void);
+bool ramdisk_exec_exist(void);
extern struct file_system_type rootfs_fs_type;

#if defined(CONFIG_STRICT_KERNEL_RWX) || defined(CONFIG_STRICT_MODULE_RWX)
diff --git a/init/do_mounts.c b/init/do_mounts.c
index 7058e14ad5f7..c28a5792ddc3 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -649,6 +649,50 @@ void __init prepare_namespace(void)
}

static bool is_tmpfs;
+#ifdef CONFIG_ROOTFS_MOUNT
+
+/*
+ * Give systems running from the rootfs and making use of pivot_root a
+ * proper mount so it can be umounted during pivot_root.
+ */
+int __init prepare_mount_rootfs(void)
+{
+ char *rootfs = "ramfs";
+
+ if (is_tmpfs)
+ rootfs = "tmpfs";
+
+ init_mkdir("/root", 0700);
+ return do_mount_root(rootfs, rootfs,
+ root_mountflags & ~MS_RDONLY,
+ root_mount_data);
+}
+
+/*
+ * Revert to previous mount by chdir to '/' and unmounting the second
+ * mount.
+ */
+void __init revert_mount_rootfs(void)
+{
+ init_chdir("/");
+ init_umount(".", MNT_DETACH);
+}
+
+/*
+ * Change root to the new rootfs that was mounted in prepare_mount_rootfs()
+ * if the cpio is unpacked successfully and 'ramdisk_execute_command' exists.
+ */
+void __init finish_mount_rootfs(void)
+{
+ init_mount(".", "/", NULL, MS_MOVE, NULL);
+ if (likely(ramdisk_exec_exist()))
+ init_chroot(".");
+ else
+ revert_mount_rootfs();
+}
+
+#define rootfs_init_fs_context ramfs_init_fs_context
+#else
static int rootfs_init_fs_context(struct fs_context *fc)
{
if (IS_ENABLED(CONFIG_TMPFS) && is_tmpfs)
@@ -656,6 +700,7 @@ static int rootfs_init_fs_context(struct fs_context *fc)

return ramfs_init_fs_context(fc);
}
+#endif

struct file_system_type rootfs_fs_type = {
.name = "rootfs",
diff --git a/init/do_mounts.h b/init/do_mounts.h
index 7a29ac3e427b..6bc954b84015 100644
--- a/init/do_mounts.h
+++ b/init/do_mounts.h
@@ -14,6 +14,20 @@ void mount_block_root(char *name, int flags);
void mount_root(void);
extern int root_mountflags;

+#ifdef CONFIG_ROOTFS_MOUNT
+
+int prepare_mount_rootfs(void);
+void finish_mount_rootfs(void);
+void revert_mount_rootfs(void);
+
+#else
+
+static inline int prepare_mount_rootfs(void) { return 0; }
+static inline void finish_mount_rootfs(void) { }
+static inline void revert_mount_rootfs(void) { }
+
+#endif
+
static inline __init int create_dev(char *name, dev_t dev)
{
init_unlink(name);
diff --git a/init/initramfs.c b/init/initramfs.c
index 2f3d96dc3db6..7b68c5aeff7d 100644
--- a/init/initramfs.c
+++ b/init/initramfs.c
@@ -17,6 +17,8 @@
#include <linux/init_syscalls.h>
#include <linux/umh.h>

+#include "do_mounts.h"
+
static ssize_t __init xwrite(struct file *file, const char *p, size_t count,
loff_t *pos)
{
@@ -671,12 +673,19 @@ static void __init populate_initrd_image(char *err)
static void __init do_populate_rootfs(void *unused, async_cookie_t cookie)
{
/* Load the built in initramfs */
- char *err = unpack_to_rootfs(__initramfs_start, __initramfs_size);
+ char *err;
+
+ if (prepare_mount_rootfs())
+ panic("Failed to mount rootfs\n");
+
+ err = unpack_to_rootfs(__initramfs_start, __initramfs_size);
if (err)
panic_show_mem("%s", err); /* Failed to decompress INTERNAL initramfs */

- if (!initrd_start || IS_ENABLED(CONFIG_INITRAMFS_FORCE))
+ if (!initrd_start || IS_ENABLED(CONFIG_INITRAMFS_FORCE)) {
+ finish_mount_rootfs();
goto done;
+ }

if (IS_ENABLED(CONFIG_BLK_DEV_RAM))
printk(KERN_INFO "Trying to unpack rootfs image as initramfs...\n");
@@ -685,11 +694,14 @@ static void __init do_populate_rootfs(void *unused, async_cookie_t cookie)

err = unpack_to_rootfs((char *)initrd_start, initrd_end - initrd_start);
if (err) {
+ revert_mount_rootfs();
#ifdef CONFIG_BLK_DEV_RAM
populate_initrd_image(err);
#else
printk(KERN_EMERG "Initramfs unpacking failed: %s\n", err);
#endif
+ } else {
+ finish_mount_rootfs();
}

done:
diff --git a/init/main.c b/init/main.c
index 98182c3c2c4b..2e4875834f97 100644
--- a/init/main.c
+++ b/init/main.c
@@ -1580,6 +1580,10 @@ void __init console_on_rootfs(void)
fput(file);
}

+bool __init ramdisk_exec_exist(void)
+{
+ return init_eaccess(ramdisk_execute_command) == 0;
+}
static noinline void __init kernel_init_freeable(void)
{
/* Now the scheduler is fully set up and can do blocking allocations */
@@ -1621,7 +1625,7 @@ static noinline void __init kernel_init_freeable(void)
* check if there is an early userspace init. If yes, let it do all
* the work
*/
- if (init_eaccess(ramdisk_execute_command) != 0) {
+ if (!ramdisk_exec_exist()) {
ramdisk_execute_command = NULL;
prepare_namespace();
}
diff --git a/usr/Kconfig b/usr/Kconfig
index 8bbcf699fe3b..03dbb22e95f9 100644
--- a/usr/Kconfig
+++ b/usr/Kconfig
@@ -52,6 +52,16 @@ config INITRAMFS_ROOT_GID

If you are not sure, leave it set to "0".

+config ROOTFS_MOUNT
+ bool "Create mount point for rootfs to make pivot_root() supported"
+ default n
+ help
+ Before unpacking cpio, create a mount point and make it become
+ the root filesystem. Therefore, rootfs will be supported by
+ pivot_root().
+
+ If container platforms are used with rootfs, say Y.
+
config RD_GZIP
bool "Support initial ramdisk/ramfs compressed using gzip"
default y
--
2.25.1