[RFC PATCH 1/3] Introduce per thread user-kernel shared structure

From: Prakash Sangappa
Date: Fri Aug 27 2021 - 19:34:51 EST


A per-thread structure is allocated from a page that is shared-mapped
between user space and the kernel, as a means of faster communication.
This facilitates sharing information, e.g. per-thread stats, between the
kernel and user space; applications can read such stats without making
frequent system calls in latency-sensitive code paths.

A new system call is added, which allocates the shared structure and
returns its mapped user address. Multiple such structures are allocated
from a single page to accommodate the threads of a multithreaded process;
available space on a page is managed using a bitmap. When a thread exits,
its shared structure is freed and can be reused by another thread that
requests one. More pages are allocated as needed, based on the number of
threads requesting shared structures, and they are all freed when the
process exits.
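
For illustration, the slot accounting described above amounts to the
following (a sketch only, not code from this patch, and the names here
are made up; with 4K pages and 128-byte slots a page holds 32 structures):

    #include <stdint.h>

    #define SLOT_SIZE      128
    #define SLOTS_PER_PAGE (4096 / SLOT_SIZE)  /* 32 slots per 4K page */

    static uint64_t bitmap;  /* bit n set => slot n in use */

    /* allocate: take the first free slot, or -1 if the page is full */
    static int slot_alloc(void)
    {
        int slot;

        for (slot = 0; slot < SLOTS_PER_PAGE; slot++) {
            if (!(bitmap & (1ULL << slot))) {
                bitmap |= 1ULL << slot;
                return slot;  /* struct addr = page base + slot * SLOT_SIZE */
            }
        }
        return -1;  /* caller then allocates a fresh page */
    }

    /* free on thread exit: clear the bit so the slot can be reused */
    static void slot_free(int slot)
    {
        bitmap &= ~(1ULL << slot);
    }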

Each shared structure is rounded up to 128 bytes. Available space in the
structure can accommodate additional per-thread stats, state, etc. as
needed. If more than 128 bytes are needed in the future, multiple shared
structures per thread could be allocated and managed by the kernel. Space
in the shared structure should nevertheless be used sparingly for stats
or state, so its layout is not exposed to user space; instead, the system
call returns the mapped user address of the specific member or nested
structure corresponding to the stats requested. This allows future
enhancements and changes without breaking the API.
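
For example, a user-space thread could obtain and read its shared
structure as follows (a sketch against this patch: the syscall number 448
and the task_ushared layout come from this series, opt and flags are
unused here and passed as 0, and the error handling is only illustrative):

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    #ifndef __NR_task_getshared
    #define __NR_task_getshared 448
    #endif

    struct task_ushared {
        long version;
    };

    int main(void)
    {
        struct task_ushared *tu;

        /* the kernel writes the mapped user address into &tu */
        if (syscall(__NR_task_getshared, 0, 0, &tu))
            return 1;

        /* subsequent reads are plain loads; no system call needed */
        printf("version: %ld\n", tu->version);
        return 0;
    }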

Signed-off-by: Prakash Sangappa <prakash.sangappa@xxxxxxxxxx>
---
arch/x86/entry/syscalls/syscall_32.tbl | 1 +
arch/x86/entry/syscalls/syscall_64.tbl | 1 +
include/linux/mm_types.h | 2 +
include/linux/sched.h | 3 +
include/linux/syscalls.h | 2 +
include/linux/task_shared.h | 57 +++++++
include/uapi/asm-generic/unistd.h | 5 +-
kernel/fork.c | 7 +
kernel/sys_ni.c | 3 +
mm/Makefile | 2 +-
mm/task_shared.c | 294 +++++++++++++++++++++++++++++++++
11 files changed, 375 insertions(+), 2 deletions(-)
create mode 100644 include/linux/task_shared.h
create mode 100644 mm/task_shared.c

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index ce763a1..a194581 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -452,3 +452,4 @@
445 i386 landlock_add_rule sys_landlock_add_rule
446 i386 landlock_restrict_self sys_landlock_restrict_self
447 i386 memfd_secret sys_memfd_secret
+448 i386 task_getshared sys_task_getshared
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index f6b5779..9dda907 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -369,6 +369,7 @@
445 common landlock_add_rule sys_landlock_add_rule
446 common landlock_restrict_self sys_landlock_restrict_self
447 common memfd_secret sys_memfd_secret
+448 common task_getshared sys_task_getshared

#
# Due to a historical design error, certain syscalls are numbered differently
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 52bbd2b..5ec26ed 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -572,6 +572,8 @@ struct mm_struct {
#ifdef CONFIG_IOMMU_SUPPORT
u32 pasid;
#endif
+ /* user shared pages */
+ void *usharedpg;
} __randomize_layout;

/*
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ec8d07d..237aa21 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1400,6 +1400,9 @@ struct task_struct {
struct llist_head kretprobe_instances;
#endif

+ /* user shared struct */
+ void *task_ushrd;
+
/*
* New fields for task_struct should be added above here, so that
* they are included in the randomized portion of task_struct.
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 69c9a70..09680b7 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -1052,6 +1052,8 @@ asmlinkage long sys_landlock_add_rule(int ruleset_fd, enum landlock_rule_type ru
asmlinkage long sys_landlock_restrict_self(int ruleset_fd, __u32 flags);
asmlinkage long sys_memfd_secret(unsigned int flags);

+asmlinkage long sys_task_getshared(long opt, long flags, void __user *uaddr);
+
/*
* Architecture-specific system calls
*/
diff --git a/include/linux/task_shared.h b/include/linux/task_shared.h
new file mode 100644
index 0000000..de17849
--- /dev/null
+++ b/include/linux/task_shared.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __TASK_SHARED_H__
+#define __TASK_SHARED_H__
+
+#include <linux/mm_types.h>
+
+/*
+ * Track user-kernel shared pages referenced by mm_struct
+ */
+struct ushared_pages {
+ struct list_head plist;
+ struct list_head frlist;
+ unsigned long pcount;
+};
+
+/*
+ * Following is the per-task struct shared with the kernel for
+ * fast communication.
+ */
+struct task_ushared {
+ long version;
+};
+
+/*
+ * Following is used for cacheline-aligned 128-byte allocations in a page.
+ */
+union task_shared {
+ struct task_ushared tu;
+ char s[128];
+};
+
+/*
+ * Struct to track per page slots
+ */
+struct ushared_pg {
+ struct list_head list;
+ struct list_head fr_list;
+ struct page *pages[2];
+ u64 bitmap; /* free slots */
+ int slot_count;
+ unsigned long kaddr;
+ unsigned long vaddr; /* user address */
+ struct vm_special_mapping ushrd_mapping;
+};
+
+/*
+ * Following struct is referenced by task_struct
+ */
+struct task_ushrd_struct {
+ struct task_ushared *kaddr; /* kernel address */
+ struct task_ushared *uaddr; /* user address */
+ struct ushared_pg *upg;
+};
+
+extern void task_ushared_free(struct task_struct *t);
+extern void mm_ushared_clear(struct mm_struct *mm);
+#endif /* __TASK_SHARED_H__ */
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index a9d6fcd..7c985b1 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -878,8 +878,11 @@ __SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self)
__SYSCALL(__NR_memfd_secret, sys_memfd_secret)
#endif

+#define __NR_task_getshared 448
+__SYSCALL(__NR_task_getshared, sys_task_getshared)
+
#undef __NR_syscalls
-#define __NR_syscalls 448
+#define __NR_syscalls 449

/*
* 32 bit systems traditionally used different
diff --git a/kernel/fork.c b/kernel/fork.c
index bc94b2c..f84bac0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -97,6 +97,7 @@
#include <linux/scs.h>
#include <linux/io_uring.h>
#include <linux/bpf.h>
+#include <linux/task_shared.h>

#include <asm/pgalloc.h>
#include <linux/uaccess.h>
@@ -903,6 +904,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
if (err)
goto free_stack;

+ /* task's ushared struct not inherited across fork */
+ tsk->task_ushrd = NULL;
+
#ifdef CONFIG_SECCOMP
/*
* We must handle setting up seccomp filters once we're under
@@ -1049,6 +1053,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
mm->pmd_huge_pte = NULL;
#endif
+ mm->usharedpg = NULL;
mm_init_uprobes_state(mm);

if (current->mm) {
@@ -1099,6 +1104,7 @@ static inline void __mmput(struct mm_struct *mm)
ksm_exit(mm);
khugepaged_exit(mm); /* must run before exit_mmap */
exit_mmap(mm);
+ mm_ushared_clear(mm);
mm_put_huge_zero_page(mm);
set_mm_exe_file(mm, NULL);
if (!list_empty(&mm->mmlist)) {
@@ -1308,6 +1314,7 @@ static int wait_for_vfork_done(struct task_struct *child,
static void mm_release(struct task_struct *tsk, struct mm_struct *mm)
{
uprobe_free_utask(tsk);
+ task_ushared_free(tsk);

/* Get rid of any cached register state */
deactivate_mm(tsk, mm);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 30971b1..8fbdc55 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -481,3 +481,6 @@ COND_SYSCALL(setuid16);

/* restartable sequence */
COND_SYSCALL(rseq);
+
+/* task shared */
+COND_SYSCALL(task_getshared);
diff --git a/mm/Makefile b/mm/Makefile
index e343674..03f88fe 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -52,7 +52,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
mm_init.o percpu.o slab_common.o \
compaction.o vmacache.o \
interval_tree.o list_lru.o workingset.o \
- debug.o gup.o mmap_lock.o $(mmu-y)
+ debug.o gup.o mmap_lock.o task_shared.o $(mmu-y)

# Give 'page_alloc' its own module-parameter namespace
page-alloc-y := page_alloc.o
diff --git a/mm/task_shared.c b/mm/task_shared.c
new file mode 100644
index 0000000..3ec5eb6
--- /dev/null
+++ b/mm/task_shared.c
@@ -0,0 +1,294 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/mm.h>
+#include <linux/uio.h>
+#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/highmem.h>
+#include <linux/ptrace.h>
+#include <linux/slab.h>
+#include <linux/syscalls.h>
+#include <linux/task_shared.h>
+
+/* Shared page */
+
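+/* number of 128-byte slots in a page: 32 with 4K pages, 64 with 8K pages */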
+#define TASK_USHARED_SLOTS (PAGE_SIZE/sizeof(union task_shared))
+
+/*
+ * Called once to init struct ushared_pages pointer.
+ */
+static int init_mm_ushared(struct mm_struct *mm)
+{
+ struct ushared_pages *usharedpg;
+
+ usharedpg = kmalloc(sizeof(struct ushared_pages), GFP_KERNEL);
+ if (usharedpg == NULL)
+ return 1;
+
+ INIT_LIST_HEAD(&usharedpg->plist);
+ INIT_LIST_HEAD(&usharedpg->frlist);
+ usharedpg->pcount = 0;
+ mmap_write_lock(mm);
+ if (mm->usharedpg == NULL) {
+ mm->usharedpg = usharedpg;
+ usharedpg = NULL;
+ }
+ mmap_write_unlock(mm);
+ if (usharedpg != NULL)
+ kfree(usharedpg);
+ return 0;
+}
+
+static int init_task_ushrd(struct task_struct *t)
+{
+ struct task_ushrd_struct *ushrd;
+
+ ushrd = kzalloc(sizeof(struct task_ushrd_struct), GFP_KERNEL);
+ if (ushrd == NULL)
+ return 1;
+
+ mmap_write_lock(t->mm);
+ if (t->task_ushrd == NULL) {
+ t->task_ushrd = ushrd;
+ ushrd = NULL;
+ }
+ mmap_write_unlock(t->mm);
+ if (ushrd != NULL)
+ kfree(ushrd);
+ return 0;
+}
+
+/*
+ * Called from __mmput(), mm is going away
+ */
+void mm_ushared_clear(struct mm_struct *mm)
+{
+ struct ushared_pg *upg;
+ struct ushared_pg *tmp;
+ struct ushared_pages *usharedpg;
+
+ if (mm == NULL || mm->usharedpg == NULL)
+ return;
+
+ usharedpg = mm->usharedpg;
+ if (list_empty(&usharedpg->frlist))
+ goto out;
+
+ list_for_each_entry_safe(upg, tmp, &usharedpg->frlist, fr_list) {
+ list_del(&upg->fr_list);
+ put_page(upg->pages[0]);
+ kfree(upg);
+ }
+out:
+ kfree(mm->usharedpg);
+ mm->usharedpg = NULL;
+
+}
+
+void task_ushared_free(struct task_struct *t)
+{
+ struct task_ushrd_struct *ushrd = t->task_ushrd;
+ struct mm_struct *mm = t->mm;
+ struct ushared_pages *usharedpg;
+ int slot;
+
+ if (mm == NULL || mm->usharedpg == NULL || ushrd == NULL)
+ return;
+
+ usharedpg = mm->usharedpg;
+ mmap_write_lock(mm);
+
+ if (ushrd->upg == NULL)
+ goto out;
+
+ slot = ((unsigned long)ushrd->uaddr - ushrd->upg->vaddr) /
+ sizeof(union task_shared);
+ clear_bit(slot, (unsigned long *)(&ushrd->upg->bitmap));
+
+ /* page was full; it has a free slot again, move it to the freelist head */
+ if (ushrd->upg->slot_count == 0) {
+ list_del(&ushrd->upg->fr_list);
+ list_add(&ushrd->upg->fr_list, &usharedpg->frlist);
+ }
+
+ ushrd->upg->slot_count++;
+
+ ushrd->uaddr = ushrd->kaddr = NULL;
+ ushrd->upg = NULL;
+
+out:
+ t->task_ushrd = NULL;
+ mmap_write_unlock(mm);
+ kfree(ushrd);
+}
+
+/* map shared page */
+static int task_shared_add_vma(struct ushared_pg *pg)
+{
+ struct vm_area_struct *vma;
+ struct mm_struct *mm = current->mm;
+ unsigned long ret = 1;
+
+ if (!pg->vaddr) {
+ /* Try to map as high as possible, this is only a hint. */
+ pg->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE,
+ PAGE_SIZE, 0, 0);
+ if (pg->vaddr & ~PAGE_MASK) {
+ ret = 0;
+ goto fail;
+ }
+ }
+
+ vma = _install_special_mapping(mm, pg->vaddr, PAGE_SIZE,
+ VM_SHARED|VM_READ|VM_MAYREAD|VM_DONTCOPY,
+ &pg->ushrd_mapping);
+ if (IS_ERR(vma)) {
+ ret = 0;
+ pg->vaddr = 0;
+ goto fail;
+ }
+
+ pg->kaddr = (unsigned long)page_address(pg->pages[0]);
+fail:
+ return ret;
+}
+
+/*
+ * Allocate a page, map user address and add to freelist
+ */
+static struct ushared_pg *ushared_allocpg(void)
+{
+ struct ushared_pg *pg;
+ struct mm_struct *mm = current->mm;
+ struct ushared_pages *usharedpg = mm->usharedpg;
+
+ if (usharedpg == NULL)
+ return NULL;
+ pg = kzalloc(sizeof(*pg), GFP_KERNEL);
+
+ if (unlikely(!pg))
+ return NULL;
+ pg->ushrd_mapping.name = "[task_shared]";
+ pg->ushrd_mapping.fault = NULL;
+ pg->ushrd_mapping.pages = pg->pages;
+ pg->pages[0] = alloc_page(GFP_KERNEL);
+ if (!pg->pages[0])
+ goto out;
+ pg->pages[1] = NULL;
+ pg->bitmap = 0;
+
+ /*
+ * the u64 bitmap limits the page size to 8192 (64 slots of 128 bytes)
+ */
+ pg->slot_count = TASK_USHARED_SLOTS;
+
+ mmap_write_lock(mm);
+ if (task_shared_add_vma(pg)) {
+ list_add(&pg->fr_list, &usharedpg->frlist);
+ usharedpg->pcount++;
+ mmap_write_unlock(mm);
+ return pg;
+ }
+ mmap_write_unlock(mm);
+
+out:
+ if (pg->pages[0])
+ __free_page(pg->pages[0]);
+ kfree(pg);
+ return NULL;
+}
+
+/*
+ * Allocate task_ushared struct for calling thread.
+ */
+static int task_ushared_alloc(void)
+{
+ struct mm_struct *mm = current->mm;
+ struct ushared_pg *ent = NULL;
+ struct task_ushrd_struct *ushrd;
+ struct ushared_pages *usharedpg;
+ int tryalloc = 0;
+ int slot = -1;
+ int ret = -ENOMEM;
+
+ if (mm->usharedpg == NULL && init_mm_ushared(mm))
+ return ret;
+
+ if (current->task_ushrd == NULL && init_task_ushrd(current))
+ return ret;
+
+ usharedpg = mm->usharedpg;
+ ushrd = current->task_ushrd;
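+ /* take a slot from the freelist head; allocate a new page (once) if none */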
+repeat:
+ if (mmap_write_lock_killable(mm))
+ return -EINTR;
+
+ ent = list_empty(&usharedpg->frlist) ? NULL :
+ list_entry(usharedpg->frlist.next,
+ struct ushared_pg, fr_list);
+
+ if (ent == NULL || ent->slot_count == 0) {
+ if (tryalloc == 0) {
+ mmap_write_unlock(mm);
+ (void)ushared_allocpg();
+ tryalloc = 1;
+ goto repeat;
+ } else {
+ ent = NULL;
+ }
+ }
+
+ if (ent) {
+ slot = find_first_zero_bit((unsigned long *)(&ent->bitmap),
+ TASK_USHARED_SLOTS);
+ BUG_ON(slot >= TASK_USHARED_SLOTS);
+
+ set_bit(slot, (unsigned long *)(&ent->bitmap));
+
+ ushrd->uaddr = (struct task_ushared *)(ent->vaddr +
+ (slot * sizeof(union task_shared)));
+ ushrd->kaddr = (struct task_ushared *)(ent->kaddr +
+ (slot * sizeof(union task_shared)));
+ ushrd->upg = ent;
+ ent->slot_count--;
+ /* page is now full; move it to the freelist tail */
+ if (ent->slot_count == 0) {
+ list_del(&ent->fr_list);
+ list_add_tail(&ent->fr_list, &usharedpg->frlist);
+ }
+
+ ret = 0;
+ }
+
+ mmap_write_unlock(mm);
+ return ret;
+}
+
+/*
+ * Task Shared : allocate if needed, and return address of shared struct for
+ * this thread/task.
+ */
+static long task_getshared(u64 opt, u64 flags, void __user *uaddr)
+{
+ struct task_ushrd_struct *ushrd = current->task_ushrd;
+
+ /* allocate this thread's shared struct on first use */
+ if (ushrd == NULL || ushrd->upg == NULL) {
+ task_ushared_alloc();
+ ushrd = current->task_ushrd;
+ }
+ if (ushrd == NULL || ushrd->upg == NULL)
+ return -ENOMEM;
+
+ /* return the mapped user address of the shared struct */
+ if (copy_to_user(uaddr, &ushrd->uaddr,
+ sizeof(struct task_ushared *)))
+ return -EFAULT;
+ return 0;
+}
+
+SYSCALL_DEFINE3(task_getshared, u64, opt, u64, flags, void __user *, uaddr)
+{
+ return task_getshared(opt, flags, uaddr);
+}
--
2.7.4