[RFC PATCH v2 4/5] hugetlb_cgroup: Add accounting for shared mappings

From: Mina Almasry
Date: Thu Aug 08 2019 - 19:14:10 EST


For shared mappings, the pointer to the hugetlb_cgroup to uncharge lives
in the resv_map entries, in file_region->reservation_counter.

When a file_region entry is added to the resv_map via region_add, we
also charge the appropriate hugetlb_cgroup and put the pointer to that
in file_region->reservation_counter. This is slightly delicate since we
need to not modify the resv_map until we know that charging the
reservation has succeeded. If charging doesn't succeed, we report the
error to the caller, so that the kernel fails the reservation.

On region_del, which is when the hugetlb memory is unreserved, we delete
the file_region entry in the resv_map, but also uncharge the
file_region->reservation_counter.

---
mm/hugetlb.c | 208 +++++++++++++++++++++++++++++++++++++++++----------
1 file changed, 170 insertions(+), 38 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 235996aef6618..d76e3137110ab 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -242,8 +242,72 @@ struct file_region {
struct list_head link;
long from;
long to;
+#ifdef CONFIG_CGROUP_HUGETLB
+ /*
+ * On shared mappings, each reserved region appears as a struct
+ * file_region in resv_map. These fields hold the info needed to
+ * uncharge each reservation.
+ */
+ struct page_counter *reservation_counter;
+ unsigned long pages_per_hpage;
+#endif
};

+/* Must be called with resv->lock held. Calling this with dry_run == true will
+ * count the number of pages added but will not modify the linked list.
+ */
+static long consume_regions_we_overlap_with(struct file_region *rg,
+ struct list_head *head, long f, long *t,
+ struct hugetlb_cgroup *h_cg,
+ struct hstate *h,
+ bool dry_run)
+{
+ long add = 0;
+ struct file_region *trg = NULL, *nrg = NULL;
+
+ /* Consume any regions we now overlap with. */
+ nrg = rg;
+ list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
+ if (&rg->link == head)
+ break;
+ if (rg->from > *t)
+ break;
+
+ /* If this area reaches higher then extend our area to
+ * include it completely. If this is not the first area
+ * which we intend to reuse, free it.
+ */
+ if (rg->to > *t)
+ *t = rg->to;
+ if (rg != nrg) {
+ /* Decrement return value by the deleted range.
+ * Another range will span this area so that by
+ * end of routine add will be >= zero
+ */
+ add -= (rg->to - rg->from);
+ if (!dry_run) {
+ list_del(&rg->link);
+ kfree(rg);
+ }
+ }
+ }
+
+ add += (nrg->from - f); /* Added to beginning of region */
+ add += *t - nrg->to; /* Added to end of region */
+
+ if (!dry_run) {
+ nrg->from = f;
+ nrg->to = *t;
+#ifdef CONFIG_CGROUP_HUGETLB
+ nrg->reservation_counter =
+ &h_cg->reserved_hugepage[hstate_index(h)];
+ nrg->pages_per_hpage = pages_per_huge_page(h);
+#endif
+ }
+
+ return add;
+}
+
/*
* Add the huge page range represented by [f, t) to the reserve
* map. In the normal case, existing regions will be expanded
@@ -258,11 +322,13 @@ struct file_region {
* Return the number of new huge pages added to the map. This
* number is greater than or equal to zero.
*/
-static long region_add(struct resv_map *resv, long f, long t)
+static long region_add(struct hstate *h, struct resv_map *resv, long f, long t)
{
struct list_head *head = &resv->regions;
- struct file_region *rg, *nrg, *trg;
- long add = 0;
+ struct file_region *rg, *nrg;
+ long add = 0, newadd = 0;
+ struct hugetlb_cgroup *h_cg = NULL;
+ int ret = 0;

spin_lock(&resv->lock);
/* Locate the region we are either in or before. */
@@ -277,6 +343,23 @@ static long region_add(struct resv_map *resv, long f, long t)
* from the cache and use it for this range.
*/
if (&rg->link == head || t < rg->from) {
+#ifdef CONFIG_CGROUP_HUGETLB
+ /*
+ * If res->reservation_counter is NULL, then it means this is
+ * a shared mapping, and hugetlb cgroup accounting should be
+ * done on the file_region entries inside resv_map.
+ */
+ if (!resv->reservation_counter) {
+ ret = hugetlb_cgroup_charge_cgroup(
+ hstate_index(h),
+ (t - f) * pages_per_huge_page(h),
+ &h_cg, true);
+ }
+
+ if (ret)
+ goto out_locked;
+#endif
+
VM_BUG_ON(resv->region_cache_count <= 0);

resv->region_cache_count--;
@@ -286,6 +369,15 @@ static long region_add(struct resv_map *resv, long f, long t)

nrg->from = f;
nrg->to = t;
+
+#ifdef CONFIG_CGROUP_HUGETLB
+ if (h_cg) {
+ nrg->reservation_counter =
+ &h_cg->reserved_hugepage[hstate_index(h)];
+ nrg->pages_per_hpage = pages_per_huge_page(h);
+ }
+#endif
+
list_add(&nrg->link, rg->link.prev);

add += t - f;
@@ -296,38 +388,37 @@ static long region_add(struct resv_map *resv, long f, long t)
if (f > rg->from)
f = rg->from;

- /* Check for and consume any regions we now overlap with. */
- nrg = rg;
- list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
- if (&rg->link == head)
- break;
- if (rg->from > t)
- break;
-
- /* If this area reaches higher then extend our area to
- * include it completely. If this is not the first area
- * which we intend to reuse, free it. */
- if (rg->to > t)
- t = rg->to;
- if (rg != nrg) {
- /* Decrement return value by the deleted range.
- * Another range will span this area so that by
- * end of routine add will be >= zero
- */
- add -= (rg->to - rg->from);
- list_del(&rg->link);
- kfree(rg);
- }
+#ifdef CONFIG_CGROUP_HUGETLB
+ /* Count any regions we now overlap with. */
+ add = consume_regions_we_overlap_with(rg, head, f, &t, NULL, NULL,
+ true);
+
+ if (!resv->reservation_counter) {
+ ret = hugetlb_cgroup_charge_cgroup(
+ hstate_index(h),
+ add * pages_per_huge_page(h),
+ &h_cg, true);
}

- add += (nrg->from - f); /* Added to beginning of region */
- nrg->from = f;
- add += t - nrg->to; /* Added to end of region */
- nrg->to = t;
+ if (ret)
+ goto out_locked;
+#endif
+
+ /* Check for and consume any regions we now overlap with. */
+ newadd = consume_regions_we_overlap_with(rg, head, f, &t, h_cg, h,
+ false);
+ /*
+ * If these aren't equal, then there is a bug with
+ * consume_regions_we_overlap_with, and we're charging the wrong amount
+ * of memory.
+ */
+ WARN_ON(add != newadd);

out_locked:
resv->adds_in_progress--;
spin_unlock(&resv->lock);
+ if (ret)
+ return ret;
VM_BUG_ON(add < 0);
return add;
}
@@ -487,6 +578,10 @@ static long region_del(struct resv_map *resv, long f, long t)
struct file_region *rg, *trg;
struct file_region *nrg = NULL;
long del = 0;
+#ifdef CONFIG_CGROUP_HUGETLB
+ struct page_counter *reservation_counter = NULL;
+ unsigned long pages_per_hpage = 0;
+#endif

retry:
spin_lock(&resv->lock);
@@ -514,6 +609,14 @@ static long region_del(struct resv_map *resv, long f, long t)
nrg = list_first_entry(&resv->region_cache,
struct file_region,
link);
+#ifdef CONFIG_CGROUP_HUGETLB
+ /*
+ * Save counter information from the deleted
+ * node, in case we need to do an uncharge.
+ */
+ reservation_counter = nrg->reservation_counter;
+ pages_per_hpage = nrg->pages_per_hpage;
+#endif
list_del(&nrg->link);
resv->region_cache_count--;
}
@@ -543,6 +646,14 @@ static long region_del(struct resv_map *resv, long f, long t)

if (f <= rg->from && t >= rg->to) { /* Remove entire region */
del += rg->to - rg->from;
+#ifdef CONFIG_CGROUP_HUGETLB
+ /*
+ * Save counter information from the deleted node,
+ * in case we need to do an uncharge.
+ */
+ reservation_counter = rg->reservation_counter;
+ pages_per_hpage = rg->pages_per_hpage;
+#endif
list_del(&rg->link);
kfree(rg);
continue;
@@ -559,6 +670,19 @@ static long region_del(struct resv_map *resv, long f, long t)

spin_unlock(&resv->lock);
kfree(nrg);
+#ifdef CONFIG_CGROUP_HUGETLB
+ /*
+ * If resv->reservation_counter is NULL, then this is shared
+ * reservation, and the reserved memory is tracked in the file_struct
+ * entries inside of resv_map. So we need to uncharge the memory here.
+ */
+ if (reservation_counter && pages_per_hpage && del > 0 &&
+ !resv->reservation_counter) {
+ hugetlb_cgroup_uncharge_counter(
+ reservation_counter,
+ del * pages_per_hpage);
+ }
+#endif
return del;
}

@@ -1930,7 +2054,7 @@ static long __vma_reservation_common(struct hstate *h,
ret = region_chg(resv, idx, idx + 1);
break;
case VMA_COMMIT_RESV:
- ret = region_add(resv, idx, idx + 1);
+ ret = region_add(h, resv, idx, idx + 1);
break;
case VMA_END_RESV:
region_abort(resv, idx, idx + 1);
@@ -1938,7 +2062,7 @@ static long __vma_reservation_common(struct hstate *h,
break;
case VMA_ADD_RESV:
if (vma->vm_flags & VM_MAYSHARE)
- ret = region_add(resv, idx, idx + 1);
+ ret = region_add(h, resv, idx, idx + 1);
else {
region_abort(resv, idx, idx + 1);
ret = region_del(resv, idx, idx + 1);
@@ -4536,7 +4660,7 @@ int hugetlb_reserve_pages(struct inode *inode,
struct vm_area_struct *vma,
vm_flags_t vm_flags)
{
- long ret, chg;
+ long ret, chg, add;
struct hstate *h = hstate_inode(inode);
struct hugepage_subpool *spool = subpool_inode(inode);
struct resv_map *resv_map;
@@ -4624,9 +4748,7 @@ int hugetlb_reserve_pages(struct inode *inode,
*/
ret = hugetlb_acct_memory(h, gbl_reserve);
if (ret < 0) {
- /* put back original number of pages, chg */
- (void)hugepage_subpool_put_pages(spool, chg);
- goto out_err;
+ goto out_put_pages;
}

/*
@@ -4641,7 +4763,12 @@ int hugetlb_reserve_pages(struct inode *inode,
* else has to be done for private mappings here
*/
if (!vma || vma->vm_flags & VM_MAYSHARE) {
- long add = region_add(resv_map, from, to);
+ add = region_add(h, resv_map, from, to);
+ if (add < 0) {
+ ret = -ENOMEM;
+ goto out_acct_memory;
+ }
+

if (unlikely(chg > add)) {
/*
@@ -4659,10 +4786,15 @@ int hugetlb_reserve_pages(struct inode *inode,
}
}
return 0;
+out_acct_memory:
+ hugetlb_acct_memory(h, -gbl_reserve);
+out_put_pages:
+ /* put back original number of pages, chg */
+ (void)hugepage_subpool_put_pages(spool, chg);
out_err:
if (!vma || vma->vm_flags & VM_MAYSHARE)
- /* Don't call region_abort if region_chg failed */
- if (chg >= 0)
+ /* Don't call region_abort if region_chg or region_add failed */
+ if (chg >= 0 && add >= 0)
region_abort(resv_map, from, to);
if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
kref_put(&resv_map->refs, resv_map_release);
--
2.23.0.rc1.153.gdeed80330f-goog