Re: [RFC][PATCH] uprobe: support for private hugetlb mappings

From: Guillaume Morin
Date: Thu May 02 2024 - 00:00:15 EST


On 30 Apr 21:25, David Hildenbrand wrote:
> > I tried to get the hugepd stuff right but this was the first I heard
> > about it :-) Afaict follow_huge_pmd and friends were already DTRT
>
> I'll have to have a closer look at some details (the hugepd writability
> check looks a bit odd), but it's mostly what I would have expected!

OK, in the meantime here is the uprobe change on top of your current
uprobes_cow, trying to address the comments you made in your previous
message. Some of them were not 100% clear to me, so it's a best-effort
patch :-) Again, only lightly tested.
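
To illustrate the arithmetic the hugetlb walker below relies on (this
snippet is not part of the patch, and assumes a 2 MiB huge page size
purely for the example): the probed address is rounded down to the huge
page boundary for the PTE/TLB operations, while the opcode itself is
copied into the 4 KiB subpage selected by the offset within the huge
page.

/* Illustration only, not part of the patch; assumes 2 MiB huge pages. */
#include <stdio.h>

#define PAGE_SHIFT 12
#define HPAGE_SIZE (2UL << 20)
#define HPAGE_MASK (~(HPAGE_SIZE - 1))

int main(void)
{
	unsigned long vaddr = 0x7f0000233456UL; /* hypothetical probe address */

	printf("huge page start: %#lx\n", vaddr & HPAGE_MASK);
	printf("subpage index:   %lu\n",
	       (vaddr & (HPAGE_SIZE - 1)) >> PAGE_SHIFT);
	return 0;
}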

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -83,6 +83,11 @@ static const struct fs_parameter_spec hugetlb_fs_parameters[] = {
{}
};

+bool hugetlbfs_mapping(struct address_space *mapping)
+{
+ return mapping->a_ops == &hugetlbfs_aops;
+}
+
/*
* Mask used when checking the page offset value passed in via system
* calls. This value will be converted to a loff_t which is signed.
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -511,6 +511,8 @@ struct hugetlbfs_sb_info {
umode_t mode;
};

+bool hugetlbfs_mapping(struct address_space *mapping);
+
static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
{
return sb->s_fs_info;
@@ -557,6 +559,8 @@ static inline struct hstate *hstate_inode(struct inode *i)
{
return NULL;
}
+
+static inline bool hugetlbfs_mapping(struct address_space *mapping) { return false; }
#endif /* !CONFIG_HUGETLBFS */

#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -11,6 +11,7 @@

#include <linux/kernel.h>
#include <linux/highmem.h>
+#include <linux/hugetlb.h>
#include <linux/pagemap.h> /* read_mapping_page */
#include <linux/slab.h>
#include <linux/sched.h>
@@ -120,7 +121,7 @@ struct xol_area {
*/
static bool valid_vma(struct vm_area_struct *vma, bool is_register)
{
- vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;
+ vm_flags_t flags = VM_MAYEXEC | VM_MAYSHARE;

if (is_register)
flags |= VM_WRITE;
@@ -177,6 +178,19 @@ static void copy_to_page(struct page *page, unsigned long vaddr, const void *src
kunmap_atomic(kaddr);
}

+static bool compare_pages(struct page *page1, struct page *page2, unsigned long page_size)
+{
+ char *addr1, *addr2;
+ int ret;
+
+ addr1 = kmap_local_page(page1);
+ addr2 = kmap_local_page(page2);
+ ret = memcmp(addr1, addr2, page_size);
+ kunmap_local(addr2);
+ kunmap_local(addr1);
+ return ret == 0;
+}
+
static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode)
{
uprobe_opcode_t old_opcode;
@@ -366,7 +380,9 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm,
}

static bool orig_page_is_identical(struct vm_area_struct *vma,
- unsigned long vaddr, struct page *page, bool *large)
+ unsigned long vaddr, struct page *page,
+ unsigned long page_size,
+ bool *large)
{
const pgoff_t index = vaddr_to_offset(vma, vaddr) >> PAGE_SHIFT;
struct page *orig_page = find_get_page(vma->vm_file->f_inode->i_mapping,
@@ -380,7 +396,7 @@ static bool orig_page_is_identical(struct vm_area_struct *vma,

*large = folio_test_large(orig_folio);
identical = folio_test_uptodate(orig_folio) &&
- pages_identical(page, orig_page);
+ compare_pages(page, orig_page, page_size);
folio_put(orig_folio);
return identical;
}
@@ -396,6 +412,81 @@ struct uwo_data {
uprobe_opcode_t opcode;
};

+static int __write_opcode_hugetlb(pte_t *ptep, unsigned long page_mask,
+ unsigned long vaddr,
+ unsigned long next, struct mm_walk *walk)
+{
+ struct uwo_data *data = walk->private;
+ const bool is_register = !!is_swbp_insn(&data->opcode);
+ pte_t pte = huge_ptep_get(ptep);
+ struct folio *folio;
+ struct page *page;
+ bool large;
+ struct hstate *h = hstate_vma(walk->vma);
+ unsigned int subpage_index = (vaddr & (huge_page_size(h) - 1)) >>
+ PAGE_SHIFT;
+
+ if (!pte_present(pte))
+ return UWO_RETRY;
+ page = vm_normal_page(walk->vma, vaddr, pte);
+ if (!page)
+ return UWO_RETRY;
+ folio = page_folio(page);
+
+ /* When unregistering and there is no anon folio anymore, we're done. */
+ if (!folio_test_anon(folio))
+ return is_register ? UWO_RETRY_WRITE_FAULT : UWO_DONE;
+
+ /*
+ * See can_follow_write_pte(): we'd actually prefer requiring a
+ * writable PTE here, but when unregistering we might no longer have
+ * VM_WRITE ...
+ */
+ if (!huge_pte_write(pte)) {
+ if (!PageAnonExclusive(page))
+ return UWO_RETRY_WRITE_FAULT;
+ if (unlikely(userfaultfd_wp(walk->vma) && huge_pte_uffd_wp(pte)))
+ return UWO_RETRY_WRITE_FAULT;
+ /* SOFTDIRTY is handled via pte_mkdirty() below. */
+ }
+
+ /* Unmap + flush the TLB, such that we can write atomically. */
+ flush_cache_page(walk->vma, vaddr & page_mask, pte_pfn(pte));
+ pte = huge_ptep_clear_flush(walk->vma, vaddr & page_mask, ptep);
+ copy_to_page(nth_page(page, subpage_index), data->opcode_vaddr,
+ &data->opcode, UPROBE_SWBP_INSN_SIZE);
+
+ /*
+ * When unregistering, we may only zap a PTE if uffd is disabled and
+ * the folio is not pinned ...
+ */
+ if (is_register || userfaultfd_missing(walk->vma) ||
+ folio_maybe_dma_pinned(folio))
+ goto remap;
+
+ /*
+ * ... the mapped anon page is identical to the original page (that
+ * will get faulted in on next access), and we don't have GUP pins.
+ */
+ if (!orig_page_is_identical(walk->vma, vaddr & page_mask, page,
+ huge_page_size(h), &large))
+ goto remap;
+
+ hugetlb_remove_rmap(folio);
+ folio_put(folio);
+ return UWO_DONE;
+remap:
+ /*
+ * Make sure that our copy_to_page() changes become visible before the
+ * set_huge_pte_at() write.
+ */
+ smp_wmb();
+ /* We modified the page. Make sure to mark the PTE dirty. */
+ set_huge_pte_at(walk->mm, vaddr & page_mask, ptep,
+ huge_pte_mkdirty(pte), huge_page_size(h));
+ return UWO_DONE;
+}
+
static int __write_opcode_pte(pte_t *ptep, unsigned long vaddr,
unsigned long next, struct mm_walk *walk)
{
@@ -447,7 +538,7 @@ static int __write_opcode_pte(pte_t *ptep, unsigned long vaddr,
* ... the mapped anon page is identical to the original page (that
* will get faulted in on next access), and we don't have GUP pins.
*/
- if (!orig_page_is_identical(walk->vma, vaddr, page, &large))
+ if (!orig_page_is_identical(walk->vma, vaddr, page, PAGE_SIZE, &large))
goto remap;

/* Zap it and try to reclaim swap space. */
@@ -473,6 +564,7 @@ static int __write_opcode_pte(pte_t *ptep, unsigned long vaddr,
}

static const struct mm_walk_ops write_opcode_ops = {
+ .hugetlb_entry = __write_opcode_hugetlb,
.pte_entry = __write_opcode_pte,
.walk_lock = PGWALK_WRLOCK,
};
@@ -510,6 +602,8 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
struct mmu_notifier_range range;
int ret, ref_ctr_updated = 0;
struct page *page;
+ unsigned long page_size = PAGE_SIZE;
+ unsigned long page_mask = PAGE_MASK;

if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags)))
return -EINVAL;
@@ -528,6 +622,11 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
if (ret != 1)
goto out;

+ if (is_vm_hugetlb_page(vma)) {
+ struct hstate *h = hstate_vma(vma);
+ page_size = huge_page_size(h);
+ page_mask = huge_page_mask(h);
+ }
ret = verify_opcode(page, opcode_vaddr, &opcode);
put_page(page);
if (ret <= 0)
@@ -547,8 +646,9 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
* unregistering. So trigger MMU notifiers now, as we won't
* be able to do it under PTL.
*/
+ const unsigned long start = vaddr & page_mask;
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
- vaddr, vaddr + PAGE_SIZE);
+ start, start + page_size);
mmu_notifier_invalidate_range_start(&range);
}

@@ -830,8 +930,16 @@ static int __copy_insn(struct address_space *mapping, struct file *filp,
*/
if (mapping->a_ops->read_folio)
page = read_mapping_page(mapping, offset >> PAGE_SHIFT, filp);
- else
+ else if (!is_file_hugepages(filp))
page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT);
+ else {
+ struct hstate *h = hstate_file(filp);
+ unsigned long mask = huge_page_mask(h);
+ page = find_get_page(mapping, (offset & mask) >> PAGE_SHIFT);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+ page = nth_page(page, (offset & (huge_page_size(h) - 1)) >> PAGE_SHIFT);
+ }
if (IS_ERR(page))
return PTR_ERR(page);

@@ -1182,9 +1290,12 @@ static int __uprobe_register(struct inode *inode, loff_t offset,
if (!uc->handler && !uc->ret_handler)
return -EINVAL;

- /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */
+ /* copy_insn() uses read_mapping_page() or shmem/hugetlbfs-specific
+ * logic.
+ */
if (!inode->i_mapping->a_ops->read_folio &&
- !shmem_mapping(inode->i_mapping))
+ !shmem_mapping(inode->i_mapping) &&
+ !hugetlbfs_mapping(inode->i_mapping))
return -EIO;
/* Racy, just to catch the obvious mistakes */
if (offset > i_size_read(inode))

--
Guillaume Morin <guillaume@xxxxxxxxxxx>