Re: [PATCH 1/2] [RFC] Range tree implementation
From: Aneesh Kumar K.V
Date:  Tue Mar 20 2012 - 12:45:11 EST
John Stultz <john.stultz@xxxxxxxxxx> writes:
....
> +/**
> + * range_tree_in_range - Returns the first node that overlaps with the
> + *                       given range
> + * @root: range_tree root
> + * @start: range start
> + * @end: range end
> + *
> + */
> +struct range_tree_node *range_tree_in_range(struct range_tree_root *root,
> +						u64 start, u64 end)
> +{
> +	struct rb_node **p = &root->head.rb_node;
> +	struct range_tree_node *candidate;
> +
> +	while (*p) {
> +		candidate = rb_entry(*p, struct range_tree_node, rb);
> +		if (end < candidate->start)
> +			p = &(*p)->rb_left;
> +		else if (start > candidate->end)
> +			p = &(*p)->rb_right;
> +		else
> +			return candidate;
> +	}
> +
> +	return 0;
return NULL ?
> +}
> +
> +
> +/**
> + * range_tree_in_range - Returns the first node that overlaps or is adjacent
> + *                       with the given range
> + * @root: range_tree root
> + * @start: range start
> + * @end: range end
> + *
> + */
The comment needs update 
> +struct range_tree_node *range_tree_in_range_adjacent(
> +					struct range_tree_root *root,
> +					u64 start, u64 end)
> +{
> +	struct rb_node **p = &root->head.rb_node;
> +	struct range_tree_node *candidate;
> +
> +	while (*p) {
> +		candidate = rb_entry(*p, struct range_tree_node, rb);
> +		if (end+1 < candidate->start)
> +			p = &(*p)->rb_left;
> +		else if (start > candidate->end + 1)
> +			p = &(*p)->rb_right;
> +		else
> +			return candidate;
> +	}
> +	return 0;
> +}
> +
Below is my hack to get hugetlbfs code converted. The patch compiles.
Will test and send a signed-off-by version later.
not-signed-off-by: Aneesh Kumar K.V <aneesh.kumar@xxxxxxxxxxxxxxxxxx>
 fs/hugetlbfs/inode.c    |    3 +-
 include/linux/hugetlb.h |    2 +
 mm/hugetlb.c            |  291 ++++++++++++++++++++++-------------------------
 3 files changed, 139 insertions(+), 157 deletions(-)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index ca4fa70..8309f5e 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -455,6 +455,7 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb,
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		info = HUGETLBFS_I(inode);
 		mpol_shared_policy_init(&info->policy, NULL);
+		range_tree_init(&info->rg_root);
 		inode->i_op = &hugetlbfs_dir_inode_operations;
 		inode->i_fop = &simple_dir_operations;
 		/* directory inodes start off with i_nlink == 2 (for "." entry) */
@@ -478,7 +479,6 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
 		inode->i_mapping->a_ops = &hugetlbfs_aops;
 		inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-		INIT_LIST_HEAD(&inode->i_mapping->private_list);
 		info = HUGETLBFS_I(inode);
 		/*
 		 * The policy is initialized here even if we are creating a
@@ -488,6 +488,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
 		 * the rb tree will still be empty.
 		 */
 		mpol_shared_policy_init(&info->policy, NULL);
+		range_tree_init(&info->rg_root);
 		switch (mode & S_IFMT) {
 		default:
 			init_special_inode(inode, mode, dev);
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 32e948c..b785541a 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -5,6 +5,7 @@
 #include <linux/fs.h>
 #include <linux/hugetlb_inline.h>
 #include <linux/cgroup.h>
+#include <linux/rangetree.h>
 
 struct ctl_table;
 struct user_struct;
@@ -150,6 +151,7 @@ struct hugetlbfs_sb_info {
 
 struct hugetlbfs_inode_info {
 	struct shared_policy policy;
+	struct range_tree_root rg_root;
 	struct inode vfs_inode;
 };
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4e1462d..a83727d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -69,148 +69,94 @@ static DEFINE_SPINLOCK(hugetlb_lock);
  *	down_read(&mm->mmap_sem);
  *	mutex_lock(&hugetlb_instantiation_mutex);
  */
-struct file_region {
-	struct list_head link;
-	long from;
-	long to;
-};
-
-static long region_add(struct list_head *head, long f, long t)
+static long region_chg(struct range_tree_root *rg_root, long start, long end,
+		       struct range_tree_node **rg_nodep)
 {
-	struct file_region *rg, *nrg, *trg;
+	long chg = 0;
+	struct range_tree_node *rg_node;
 
-	/* Locate the region we are either in or before. */
-	list_for_each_entry(rg, head, link)
-		if (f <= rg->to)
-			break;
+	rg_node = range_tree_in_range_adjacent(rg_root, start, end);
+	/*
+	 * If we need to allocate a new range_tree_node, we return a charge
+	 * with NULL *rg_node;
+	 */
+	if (rg_node == NULL)
+		return end - start;
 
-	/* Round our left edge to the current segment if it encloses us. */
-	if (f > rg->from)
-		f = rg->from;
+	if (start < rg_node->start)
+		chg +=  rg_node->start - start;
+	if (rg_node->end < end)
+		chg += end - rg_node->end;
 
-	/* Check for and consume any regions we now overlap with. */
-	nrg = rg;
-	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
-		if (&rg->link == head)
-			break;
-		if (rg->from > t)
-			break;
-
-		/* If this area reaches higher then extend our area to
-		 * include it completely.  If this is not the first area
-		 * which we intend to reuse, free it. */
-		if (rg->to > t)
-			t = rg->to;
-		if (rg != nrg) {
-			list_del(&rg->link);
-			kfree(rg);
-		}
-	}
-	nrg->from = f;
-	nrg->to = t;
-	return 0;
+	*rg_nodep = rg_node;
+	return chg;
 }
 
-static long region_chg(struct list_head *head, long f, long t)
+static void region_add(struct range_tree_root *rg_root, long start, long end,
+		       struct range_tree_node *rg_node)
 {
-	struct file_region *rg, *nrg;
-	long chg = 0;
-
-	/* Locate the region we are before or in. */
-	list_for_each_entry(rg, head, link)
-		if (f <= rg->to)
-			break;
+	if (rg_node == NULL)
+		return;
 
-	/* If we are below the current region then a new region is required.
-	 * Subtle, allocate a new region at the position but make it zero
-	 * size such that we can guarantee to record the reservation. */
-	if (&rg->link == head || t < rg->from) {
-		nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
-		if (!nrg)
-			return -ENOMEM;
-		nrg->from = f;
-		nrg->to   = f;
-		INIT_LIST_HEAD(&nrg->link);
-		list_add(&nrg->link, rg->link.prev);
+	if (start < rg_node->start)
+		rg_node->start = start;
 
-		return t - f;
-	}
+	if (end > rg_node->end)
+		rg_node->end = end;
 
-	/* Round our left edge to the current segment if it encloses us. */
-	if (f > rg->from)
-		f = rg->from;
-	chg = t - f;
-
-	/* Check for and consume any regions we now overlap with. */
-	list_for_each_entry(rg, rg->link.prev, link) {
-		if (&rg->link == head)
-			break;
-		if (rg->from > t)
-			return chg;
-
-		/* We overlap with this area, if it extends further than
-		 * us then we must extend ourselves.  Account for its
-		 * existing reservation. */
-		if (rg->to > t) {
-			chg += rg->to - t;
-			t = rg->to;
-		}
-		chg -= rg->to - rg->from;
-	}
-	return chg;
+	range_tree_add(rg_root, rg_node);
 }
 
-static long region_truncate(struct list_head *head, long end)
+static long region_truncate(struct range_tree_root *rg_root, long off)
 {
-	struct file_region *rg, *trg;
 	long chg = 0;
-
-	/* Locate the region we are either in or before. */
-	list_for_each_entry(rg, head, link)
-		if (end <= rg->to)
-			break;
-	if (&rg->link == head)
-		return 0;
-
-	/* If we are in the middle of a region then adjust it. */
-	if (end > rg->from) {
-		chg = rg->to - end;
-		rg->to = end;
-		rg = list_entry(rg->link.next, typeof(*rg), link);
-	}
-
-	/* Drop any remaining regions. */
-	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
-		if (&rg->link == head)
-			break;
-		chg += rg->to - rg->from;
-		list_del(&rg->link);
-		kfree(rg);
+	struct rb_node *rb_node;
+
+restart:
+	rb_node = rb_first(&rg_root->head);
+	while (rb_node) {
+		struct range_tree_node *rg_node;
+		rg_node = rb_entry(rb_node, struct range_tree_node, rb);
+		if (rg_node->end <= off) {
+			rb_node = rb_next(rb_node);
+			continue;
+		}
+		if (rg_node->start < off) {
+			chg += rg_node->end - off;
+			rg_node->end = off;
+			rb_node = rb_next(rb_node);
+			continue;
+		}
+		chg += rg_node->end - rg_node->start;
+		rb_erase(rb_node, &rg_root->head);
+		goto restart;
 	}
 	return chg;
 }
 
-static long region_count(struct list_head *head, long f, long t)
+static long region_count(struct range_tree_root *rg_root, long start, long end)
 {
-	struct file_region *rg;
 	long chg = 0;
+	struct rb_node *rb_node;
 
-	/* Locate each segment we overlap with, and count that overlap. */
-	list_for_each_entry(rg, head, link) {
-		int seg_from;
-		int seg_to;
+	rb_node = rb_first(&rg_root->head);
+	while (rb_node) {
+		int seg_from, seg_to;
+		struct range_tree_node *rg_node;
 
-		if (rg->to <= f)
+		rg_node = rb_entry(rb_node, struct range_tree_node, rb);
+		if (rg_node->end <= start) {
+			rb_node = rb_next(rb_node);
 			continue;
-		if (rg->from >= t)
+		}
+		if (rg_node->start >= end)
 			break;
-
-		seg_from = max(rg->from, f);
-		seg_to = min(rg->to, t);
+		seg_from = max(rg_node->start, (u64)start);
+		seg_to = min(rg_node->end, (u64)end);
 
 		chg += seg_to - seg_from;
+		rb_node = rb_next(rb_node);
 	}
-
 	return chg;
 }
 
@@ -302,7 +248,7 @@ static void set_vma_private_data(struct vm_area_struct *vma,
 
 struct resv_map {
 	struct kref refs;
-	struct list_head regions;
+	struct range_tree_root rg_root;
 };
 
 static struct resv_map *resv_map_alloc(void)
@@ -312,7 +258,7 @@ static struct resv_map *resv_map_alloc(void)
 		return NULL;
 
 	kref_init(&resv_map->refs);
-	INIT_LIST_HEAD(&resv_map->regions);
+	range_tree_init(&resv_map->rg_root);
 
 	return resv_map;
 }
@@ -322,7 +268,7 @@ static void resv_map_release(struct kref *ref)
 	struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
 
 	/* Clear out any active regions before we release the map. */
-	region_truncate(&resv_map->regions, 0);
+	region_truncate(&resv_map->rg_root, 0);
 	kfree(resv_map);
 }
 
@@ -980,16 +926,19 @@ static void return_unused_surplus_pages(struct hstate *h,
  * No action is required on failure.
  */
 static long vma_needs_reservation(struct hstate *h,
-			struct vm_area_struct *vma, unsigned long addr)
+				  struct vm_area_struct *vma,
+				  unsigned long addr,
+				  struct range_tree_node **rg_node)
 {
 	struct address_space *mapping = vma->vm_file->f_mapping;
 	struct inode *inode = mapping->host;
+	struct hugetlbfs_inode_info *hinfo = HUGETLBFS_I(inode);
 
+	*rg_node = NULL;
 	if (vma->vm_flags & VM_MAYSHARE) {
 		pgoff_t idx = vma_hugecache_offset(h, vma, addr);
-		return region_chg(&inode->i_mapping->private_list,
-							idx, idx + 1);
 
+		return region_chg(&hinfo->rg_root, idx, idx + 1, rg_node);
 	} else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
 		return 1;
 
@@ -998,28 +947,34 @@ static long vma_needs_reservation(struct hstate *h,
 		pgoff_t idx = vma_hugecache_offset(h, vma, addr);
 		struct resv_map *reservations = vma_resv_map(vma);
 
-		err = region_chg(&reservations->regions, idx, idx + 1);
+		err = region_chg(&reservations->rg_root, idx, idx + 1, rg_node);
 		if (err < 0)
 			return err;
 		return 0;
 	}
 }
 static void vma_commit_reservation(struct hstate *h,
-			struct vm_area_struct *vma, unsigned long addr)
+				   struct vm_area_struct *vma,
+				   unsigned long addr,
+				   struct range_tree_node *rg_node)
 {
 	struct address_space *mapping = vma->vm_file->f_mapping;
 	struct inode *inode = mapping->host;
+	struct hugetlbfs_inode_info *hinfo = HUGETLBFS_I(inode);
+
+	if (rg_node == NULL)
+		return;
 
 	if (vma->vm_flags & VM_MAYSHARE) {
 		pgoff_t idx = vma_hugecache_offset(h, vma, addr);
-		region_add(&inode->i_mapping->private_list, idx, idx + 1);
+		region_add(&hinfo->rg_root, idx, idx + 1, rg_node);
 
 	} else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
 		pgoff_t idx = vma_hugecache_offset(h, vma, addr);
 		struct resv_map *reservations = vma_resv_map(vma);
 
 		/* Mark this page used in the map. */
-		region_add(&reservations->regions, idx, idx + 1);
+		region_add(&reservations->rg_root, idx, idx + 1, rg_node);
 	}
 }
 
@@ -1027,6 +982,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 				    unsigned long addr, int avoid_reserve)
 {
 	int ret, idx;
+	struct range_tree_node *rg_node;
 	struct hstate *h = hstate_vma(vma);
 	struct page *page;
 	struct mem_cgroup *memcg;
@@ -1042,18 +998,24 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	 * MAP_NORESERVE mappings may also need pages and quota allocated
 	 * if no reserve mapping overlaps.
 	 */
-	chg = vma_needs_reservation(h, vma, addr);
-	if (chg < 0)
-		return ERR_PTR(-ENOMEM);
-	if (chg)
-		if (hugetlb_get_quota(inode->i_mapping, chg))
-			return ERR_PTR(-ENOSPC);
+	chg = vma_needs_reservation(h, vma, addr, &rg_node);
+	if (chg > 0 && rg_node == NULL) {
+		rg_node = kzalloc(sizeof(*rg_node), GFP_KERNEL);
+		if (rg_node == NULL)
+			return ERR_PTR(-ENOMEM);
+	}
 
+	if (chg) {
+		if (hugetlb_get_quota(inode->i_mapping, chg)) {
+			ret = -ENOSPC;
+			goto err_out;
+		}
+	}
 	ret = mem_cgroup_hugetlb_charge_page(idx, pages_per_huge_page(h),
 					     &memcg);
 	if (ret) {
-		hugetlb_put_quota(inode->i_mapping, chg);
-		return ERR_PTR(-ENOSPC);
+		ret = -ENOSPC;
+		goto err_out_quota;
 	}
 	spin_lock(&hugetlb_lock);
 	page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
@@ -1062,21 +1024,26 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	if (!page) {
 		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
 		if (!page) {
-			mem_cgroup_hugetlb_uncharge_memcg(idx,
-							 pages_per_huge_page(h),
-							 memcg);
-			hugetlb_put_quota(inode->i_mapping, chg);
-			return ERR_PTR(-ENOSPC);
+			ret = -ENOSPC;
+			goto err_out_uncharge;
 		}
 	}
 
 	set_page_private(page, (unsigned long) mapping);
 
-	vma_commit_reservation(h, vma, addr);
+	vma_commit_reservation(h, vma, addr, rg_node);
 	/* update page cgroup details */
 	mem_cgroup_hugetlb_commit_charge(idx, pages_per_huge_page(h),
 					 memcg, page);
 	return page;
+
+err_out_uncharge:
+	mem_cgroup_hugetlb_uncharge_memcg(idx, pages_per_huge_page(h), memcg);
+err_out_quota:
+	hugetlb_put_quota(inode->i_mapping, chg);
+err_out:
+	kfree(rg_node);
+	return ERR_PTR(ret);
 }
 
 int __weak alloc_bootmem_huge_page(struct hstate *h)
@@ -2170,7 +2137,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
 		end = vma_hugecache_offset(h, vma, vma->vm_end);
 
 		reserve = (end - start) -
-			region_count(&reservations->regions, start, end);
+			region_count(&reservations->rg_root, start, end);
 
 		kref_put(&reservations->refs, resv_map_release);
 
@@ -2697,11 +2664,13 @@ retry:
 	 * any allocations necessary to record that reservation occur outside
 	 * the spinlock.
 	 */
-	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
-		if (vma_needs_reservation(h, vma, address) < 0) {
+	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
+		struct range_tree_node *rg_node;
+		if (vma_needs_reservation(h, vma, address, &rg_node) < 0) {
 			ret = VM_FAULT_OOM;
 			goto backout_unlocked;
 		}
+	}
 
 	spin_lock(&mm->page_table_lock);
 	size = i_size_read(mapping->host) >> huge_page_shift(h);
@@ -2789,7 +2758,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * consumed.
 	 */
 	if ((flags & FAULT_FLAG_WRITE) && !pte_write(entry)) {
-		if (vma_needs_reservation(h, vma, address) < 0) {
+		struct range_tree_node *rg_node;
+		if (vma_needs_reservation(h, vma, address, &rg_node) < 0) {
 			ret = VM_FAULT_OOM;
 			goto out_mutex;
 		}
@@ -2975,7 +2945,9 @@ int hugetlb_reserve_pages(struct inode *inode,
 					vm_flags_t vm_flags)
 {
 	long ret, chg;
+	struct range_tree_node *rg_node;
 	struct hstate *h = hstate_inode(inode);
+	struct hugetlbfs_inode_info *hinfo = HUGETLBFS_I(inode);
 
 	/*
 	 * Only apply hugepage reservation if asked. At fault time, an
@@ -2992,25 +2964,27 @@ int hugetlb_reserve_pages(struct inode *inode,
 	 * called to make the mapping read-write. Assume !vma is a shm mapping
 	 */
 	if (!vma || vma->vm_flags & VM_MAYSHARE)
-		chg = region_chg(&inode->i_mapping->private_list, from, to);
+		chg = region_chg(&hinfo->rg_root, from, to, &rg_node);
 	else {
 		struct resv_map *resv_map = resv_map_alloc();
 		if (!resv_map)
 			return -ENOMEM;
 
-		chg = to - from;
-
+		chg = region_chg(&resv_map->rg_root, from, to, &rg_node);
 		set_vma_resv_map(vma, resv_map);
 		set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
 	}
-
-	if (chg < 0)
-		return chg;
-
+	if (chg > 0 && rg_node == NULL ) {
+		/* We need allocate a new node */
+		rg_node = kzalloc(sizeof(*rg_node), GFP_KERNEL);
+		if (rg_node == NULL)
+			return -ENOMEM;
+	}
 	/* There must be enough filesystem quota for the mapping */
-	if (hugetlb_get_quota(inode->i_mapping, chg))
-		return -ENOSPC;
-
+	if (hugetlb_get_quota(inode->i_mapping, chg)) {
+		ret = -ENOSPC;
+		goto err_out;
+	}
 	/*
 	 * Check enough hugepages are available for the reservation.
 	 * Hand back the quota if there are not
@@ -3018,7 +2992,7 @@ int hugetlb_reserve_pages(struct inode *inode,
 	ret = hugetlb_acct_memory(h, chg);
 	if (ret < 0) {
 		hugetlb_put_quota(inode->i_mapping, chg);
-		return ret;
+		goto err_out;
 	}
 
 	/*
@@ -3033,14 +3007,19 @@ int hugetlb_reserve_pages(struct inode *inode,
 	 * else has to be done for private mappings here
 	 */
 	if (!vma || vma->vm_flags & VM_MAYSHARE)
-		region_add(&inode->i_mapping->private_list, from, to);
+		region_add(&hinfo->rg_root, from, to, rg_node);
 	return 0;
+err_out:
+	kfree(rg_node);
+	return ret;
 }
 
 void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
 {
 	struct hstate *h = hstate_inode(inode);
-	long chg = region_truncate(&inode->i_mapping->private_list, offset);
+	struct hugetlbfs_inode_info *hinfo = HUGETLBFS_I(inode);
+
+	long chg = region_truncate(&hinfo->rg_root, offset);
 
 	spin_lock(&inode->i_lock);
 	inode->i_blocks -= (blocks_per_huge_page(h) * freed);
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/