[PATCH v4 01/12] mm/shmem: Introduce F_SEAL_INACCESSIBLE

From: Chao Peng
Date: Tue Jan 18 2022 - 08:22:15 EST


From: "Kirill A. Shutemov" <kirill.shutemov@xxxxxxxxxxxxxxx>

Introduce a new seal, F_SEAL_INACCESSIBLE, indicating that the content
of the file is inaccessible from userspace through ordinary MMU access
(e.g., read/write/mmap). However, the file content can still be accessed
indirectly via a different mechanism (e.g., the KVM MMU).

This provides the semantics required for KVM guest private memory
support: a file descriptor with this seal set is going to be used as
the source of guest memory in confidential computing environments such
as Intel TDX/AMD SEV, but must not be accessible from host userspace.

At this time only shmem implements this seal.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@xxxxxxxxxxxxxxx>
Signed-off-by: Chao Peng <chao.p.peng@xxxxxxxxxxxxxxx>
---
include/uapi/linux/fcntl.h | 1 +
mm/shmem.c | 40 ++++++++++++++++++++++++++++++++++++--
2 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 2f86b2ad6d7e..09ef34754dfa 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -43,6 +43,7 @@
#define F_SEAL_GROW 0x0004 /* prevent file from growing */
#define F_SEAL_WRITE 0x0008 /* prevent writes */
#define F_SEAL_FUTURE_WRITE 0x0010 /* prevent future writes while mapped */
+#define F_SEAL_INACCESSIBLE 0x0020 /* prevent ordinary MMU access (e.g. read/write/mmap) to file content */
/* (1U << 31) is reserved for signed error codes */

/*
diff --git a/mm/shmem.c b/mm/shmem.c
index 18f93c2d68f1..72185630e7c4 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1098,6 +1098,13 @@ static int shmem_setattr(struct user_namespace *mnt_userns,
(newsize > oldsize && (info->seals & F_SEAL_GROW)))
return -EPERM;

+ if (info->seals & F_SEAL_INACCESSIBLE) {
+ if(i_size_read(inode))
+ return -EPERM;
+ if (newsize & ~PAGE_MASK)
+ return -EINVAL;
+ }
+
if (newsize != oldsize) {
error = shmem_reacct_size(SHMEM_I(inode)->flags,
oldsize, newsize);
@@ -1364,6 +1371,8 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
goto redirty;
if (!total_swap_pages)
goto redirty;
+ if (info->seals & F_SEAL_INACCESSIBLE)
+ goto redirty;

/*
* Our capabilities prevent regular writeback or sync from ever calling
@@ -2262,6 +2271,9 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
if (ret)
return ret;

+ if (info->seals & F_SEAL_INACCESSIBLE)
+ return -EPERM;
+
/* arm64 - allow memory tagging on RAM-based files */
vma->vm_flags |= VM_MTE_ALLOWED;

@@ -2459,12 +2471,15 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
pgoff_t index = pos >> PAGE_SHIFT;

/* i_rwsem is held by caller */
- if (unlikely(info->seals & (F_SEAL_GROW |
- F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) {
+ if (unlikely(info->seals & (F_SEAL_GROW | F_SEAL_WRITE |
+ F_SEAL_FUTURE_WRITE |
+ F_SEAL_INACCESSIBLE))) {
if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))
return -EPERM;
if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size)
return -EPERM;
+ if (info->seals & F_SEAL_INACCESSIBLE)
+ return -EPERM;
}

return shmem_getpage(inode, index, pagep, SGP_WRITE);
@@ -2538,6 +2553,21 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
end_index = i_size >> PAGE_SHIFT;
if (index > end_index)
break;
+
+ /*
+ * inode_lock protects setting up seals as well as writes to
+ * i_size. Setting F_SEAL_INACCESSIBLE is only allowed with
+ * i_size == 0.
+ *
+ * Check F_SEAL_INACCESSIBLE after i_size. It effectively
+ * serializes read vs. setting F_SEAL_INACCESSIBLE without
+ * taking inode_lock in the read path.
+ */
+ if (SHMEM_I(inode)->seals & F_SEAL_INACCESSIBLE) {
+ error = -EPERM;
+ break;
+ }
+
if (index == end_index) {
nr = i_size & ~PAGE_MASK;
if (nr <= offset)
@@ -2663,6 +2693,12 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
goto out;
}

+ if ((info->seals & F_SEAL_INACCESSIBLE) &&
+ (offset & ~PAGE_MASK || len & ~PAGE_MASK)) {
+ error = -EINVAL;
+ goto out;
+ }
+
shmem_falloc.waitq = &shmem_falloc_waitq;
shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT;
shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
--
2.17.1