Re: [RFC REPOST] cgroup: removing css reference drain wait duringcgroup removal

From: Tejun Heo
Date: Mon Mar 12 2012 - 19:23:54 EST


On Mon, Mar 12, 2012 at 02:33:43PM -0700, Tejun Heo wrote:
> I'm appending the patch for the cgroup part of the change. It removes
> good amount of complication. I think cgroup_has_css_refs() check
> should be dropped from check_for_release() but other than that it
> seems to work fine with blkcg in linux-next.

Here's a version w/ cgroup_has_css_refs() removed. It removes 160
lines. I'm liking it.

---
include/linux/cgroup.h | 41 +------
kernel/cgroup.c | 265 +++++++++++--------------------------------------
2 files changed, 71 insertions(+), 235 deletions(-)

Index: work/include/linux/cgroup.h
===================================================================
--- work.orig/include/linux/cgroup.h
+++ work/include/linux/cgroup.h
@@ -16,6 +16,7 @@
#include <linux/prio_heap.h>
#include <linux/rwsem.h>
#include <linux/idr.h>
+#include <linux/workqueue.h>

#ifdef CONFIG_CGROUPS

@@ -81,7 +82,6 @@ struct cgroup_subsys_state {
/* bits in struct cgroup_subsys_state flags field */
enum {
CSS_ROOT, /* This CSS is the root of the subsystem */
- CSS_REMOVED, /* This CSS is dead */
};

/* Caller must verify that the css is not for root cgroup */
@@ -104,27 +104,18 @@ static inline void css_get(struct cgroup
__css_get(css, 1);
}

-static inline bool css_is_removed(struct cgroup_subsys_state *css)
-{
- return test_bit(CSS_REMOVED, &css->flags);
-}
-
/*
* Call css_tryget() to take a reference on a css if your existing
* (known-valid) reference isn't already ref-counted. Returns false if
* the css has been destroyed.
*/

+extern bool __css_tryget(struct cgroup_subsys_state *css);
static inline bool css_tryget(struct cgroup_subsys_state *css)
{
if (test_bit(CSS_ROOT, &css->flags))
return true;
- while (!atomic_inc_not_zero(&css->refcnt)) {
- if (test_bit(CSS_REMOVED, &css->flags))
- return false;
- cpu_relax();
- }
- return true;
+ return __css_tryget(css);
}

/*
@@ -132,11 +123,11 @@ static inline bool css_tryget(struct cgr
* css_get() or css_tryget()
*/

-extern void __css_put(struct cgroup_subsys_state *css, int count);
+extern void __css_put(struct cgroup_subsys_state *css);
static inline void css_put(struct cgroup_subsys_state *css)
{
if (!test_bit(CSS_ROOT, &css->flags))
- __css_put(css, 1);
+ __css_put(css);
}

/* bits in struct cgroup flags field */
@@ -211,6 +202,9 @@ struct cgroup {
/* List of events which userspace want to receive */
struct list_head event_list;
spinlock_t event_list_lock;
+
+ /* Used to dput @cgroup->dentry from css_put() */
+ struct work_struct dput_work;
};

/*
@@ -408,23 +402,6 @@ int cgroup_task_count(const struct cgrou
int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task);

/*
- * When the subsys has to access css and may add permanent refcnt to css,
- * it should take care of racy conditions with rmdir(). Following set of
- * functions, is for stop/restart rmdir if necessary.
- * Because these will call css_get/put, "css" should be alive css.
- *
- * cgroup_exclude_rmdir();
- * ...do some jobs which may access arbitrary empty cgroup
- * cgroup_release_and_wakeup_rmdir();
- *
- * When someone removes a cgroup while cgroup_exclude_rmdir() holds it,
- * it sleeps and cgroup_release_and_wakeup_rmdir() will wake him up.
- */
-
-void cgroup_exclude_rmdir(struct cgroup_subsys_state *css);
-void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css);
-
-/*
* Control Group taskset, used to pass around set of tasks to cgroup_subsys
* methods.
*/
@@ -453,7 +430,7 @@ int cgroup_taskset_size(struct cgroup_ta

struct cgroup_subsys {
struct cgroup_subsys_state *(*create)(struct cgroup *cgrp);
- int (*pre_destroy)(struct cgroup *cgrp);
+ void (*pre_destroy)(struct cgroup *cgrp);
void (*destroy)(struct cgroup *cgrp);
int (*can_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset);
void (*cancel_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset);
Index: work/kernel/cgroup.c
===================================================================
--- work.orig/kernel/cgroup.c
+++ work/kernel/cgroup.c
@@ -63,6 +63,8 @@

#include <linux/atomic.h>

+#define CSS_DEAD_BIAS INT_MIN
+
/*
* cgroup_mutex is the master lock. Any modification to cgroup or its
* hierarchy must be performed while holding it.
@@ -154,8 +156,8 @@ struct css_id {
* The css to which this ID points. This pointer is set to valid value
* after cgroup is populated. If cgroup is removed, this will be NULL.
* This pointer is expected to be RCU-safe because destroy()
- * is called after synchronize_rcu(). But for safe use, css_is_removed()
- * css_tryget() should be used for avoiding race.
+ * is called after synchronize_rcu(). But for safe use, css_tryget()
+ * should be used for avoiding race.
*/
struct cgroup_subsys_state __rcu *css;
/*
@@ -807,39 +809,14 @@ static struct inode *cgroup_new_inode(um
return inode;
}

-/*
- * Call subsys's pre_destroy handler.
- * This is called before css refcnt check.
- */
-static int cgroup_call_pre_destroy(struct cgroup *cgrp)
-{
- struct cgroup_subsys *ss;
- int ret = 0;
-
- for_each_subsys(cgrp->root, ss)
- if (ss->pre_destroy) {
- ret = ss->pre_destroy(cgrp);
- if (ret)
- break;
- }
-
- return ret;
-}
-
static void cgroup_diput(struct dentry *dentry, struct inode *inode)
{
/* is dentry a directory ? if so, kfree() associated cgroup */
if (S_ISDIR(inode->i_mode)) {
struct cgroup *cgrp = dentry->d_fsdata;
struct cgroup_subsys *ss;
+
BUG_ON(!(cgroup_is_removed(cgrp)));
- /* It's possible for external users to be holding css
- * reference counts on a cgroup; css_put() needs to
- * be able to access the cgroup after decrementing
- * the reference count in order to know if it needs to
- * queue the cgroup to be handled by the release
- * agent */
- synchronize_rcu();

mutex_lock(&cgroup_mutex);
/*
@@ -931,33 +908,6 @@ static void cgroup_d_remove_dir(struct d
}

/*
- * A queue for waiters to do rmdir() cgroup. A tasks will sleep when
- * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
- * reference to css->refcnt. In general, this refcnt is expected to goes down
- * to zero, soon.
- *
- * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
- */
-static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
-
-static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
-{
- if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
- wake_up_all(&cgroup_rmdir_waitq);
-}
-
-void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
-{
- css_get(css);
-}
-
-void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
-{
- cgroup_wakeup_rmdir_waiter(css->cgroup);
- css_put(css);
-}
-
-/*
* Call with cgroup_mutex held. Drops reference counts on modules, including
* any duplicate ones that parse_cgroupfs_options took. If this function
* returns an error, no reference counts are touched.
@@ -1329,6 +1279,13 @@ static const struct super_operations cgr
.remount_fs = cgroup_remount,
};

+static void cgroup_dput_fn(struct work_struct *work)
+{
+ struct cgroup *cgrp = container_of(work, struct cgroup, dput_work);
+
+ dput(cgrp->dentry);
+}
+
static void init_cgroup_housekeeping(struct cgroup *cgrp)
{
INIT_LIST_HEAD(&cgrp->sibling);
@@ -1339,6 +1296,7 @@ static void init_cgroup_housekeeping(str
mutex_init(&cgrp->pidlist_mutex);
INIT_LIST_HEAD(&cgrp->event_list);
spin_lock_init(&cgrp->event_list_lock);
+ INIT_WORK(&cgrp->dput_work, cgroup_dput_fn);
}

static void init_cgroup_root(struct cgroupfs_root *root)
@@ -1936,12 +1894,6 @@ int cgroup_attach_task(struct cgroup *cg
}

synchronize_rcu();
-
- /*
- * wake up rmdir() waiter. the rmdir should fail since the cgroup
- * is no longer empty.
- */
- cgroup_wakeup_rmdir_waiter(cgrp);
out:
if (retval) {
for_each_subsys(root, ss) {
@@ -2111,7 +2063,6 @@ static int cgroup_attach_proc(struct cgr
* step 5: success! and cleanup
*/
synchronize_rcu();
- cgroup_wakeup_rmdir_waiter(cgrp);
retval = 0;
out_put_css_set_refs:
if (retval) {
@@ -3768,6 +3719,7 @@ static long cgroup_create(struct cgroup
err = PTR_ERR(css);
goto err_destroy;
}
+ dget(dentry); /* will be put on the last @css put */
init_cgroup_css(css, ss, cgrp);
if (ss->use_id) {
err = alloc_css_id(ss, parent, cgrp);
@@ -3809,8 +3761,10 @@ static long cgroup_create(struct cgroup
err_destroy:

for_each_subsys(root, ss) {
- if (cgrp->subsys[ss->subsys_id])
+ if (cgrp->subsys[ss->subsys_id]) {
+ dput(dentry);
ss->destroy(cgrp);
+ }
}

mutex_unlock(&cgroup_mutex);
@@ -3830,106 +3784,15 @@ static int cgroup_mkdir(struct inode *di
return cgroup_create(c_parent, dentry, mode | S_IFDIR);
}

-static int cgroup_has_css_refs(struct cgroup *cgrp)
-{
- /* Check the reference count on each subsystem. Since we
- * already established that there are no tasks in the
- * cgroup, if the css refcount is also 1, then there should
- * be no outstanding references, so the subsystem is safe to
- * destroy. We scan across all subsystems rather than using
- * the per-hierarchy linked list of mounted subsystems since
- * we can be called via check_for_release() with no
- * synchronization other than RCU, and the subsystem linked
- * list isn't RCU-safe */
- int i;
- /*
- * We won't need to lock the subsys array, because the subsystems
- * we're concerned about aren't going anywhere since our cgroup root
- * has a reference on them.
- */
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- struct cgroup_subsys_state *css;
- /* Skip subsystems not present or not in this hierarchy */
- if (ss == NULL || ss->root != cgrp->root)
- continue;
- css = cgrp->subsys[ss->subsys_id];
- /* When called from check_for_release() it's possible
- * that by this point the cgroup has been removed
- * and the css deleted. But a false-positive doesn't
- * matter, since it can only happen if the cgroup
- * has been deleted and hence no longer needs the
- * release agent to be called anyway. */
- if (css && (atomic_read(&css->refcnt) > 1))
- return 1;
- }
- return 0;
-}
-
-/*
- * Atomically mark all (or else none) of the cgroup's CSS objects as
- * CSS_REMOVED. Return true on success, or false if the cgroup has
- * busy subsystems. Call with cgroup_mutex held
- */
-
-static int cgroup_clear_css_refs(struct cgroup *cgrp)
-{
- struct cgroup_subsys *ss;
- unsigned long flags;
- bool failed = false;
- local_irq_save(flags);
- for_each_subsys(cgrp->root, ss) {
- struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
- int refcnt;
- while (1) {
- /* We can only remove a CSS with a refcnt==1 */
- refcnt = atomic_read(&css->refcnt);
- if (refcnt > 1) {
- failed = true;
- goto done;
- }
- BUG_ON(!refcnt);
- /*
- * Drop the refcnt to 0 while we check other
- * subsystems. This will cause any racing
- * css_tryget() to spin until we set the
- * CSS_REMOVED bits or abort
- */
- if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt)
- break;
- cpu_relax();
- }
- }
- done:
- for_each_subsys(cgrp->root, ss) {
- struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
- if (failed) {
- /*
- * Restore old refcnt if we previously managed
- * to clear it from 1 to 0
- */
- if (!atomic_read(&css->refcnt))
- atomic_set(&css->refcnt, 1);
- } else {
- /* Commit the fact that the CSS is removed */
- set_bit(CSS_REMOVED, &css->flags);
- }
- }
- local_irq_restore(flags);
- return !failed;
-}
-
static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
{
struct cgroup *cgrp = dentry->d_fsdata;
struct dentry *d;
struct cgroup *parent;
- DEFINE_WAIT(wait);
+ struct cgroup_subsys *ss;
struct cgroup_event *event, *tmp;
- int ret;

/* the vfs holds both inode->i_mutex already */
-again:
mutex_lock(&cgroup_mutex);
if (atomic_read(&cgrp->count) != 0) {
mutex_unlock(&cgroup_mutex);
@@ -3941,52 +3804,34 @@ again:
}
mutex_unlock(&cgroup_mutex);

- /*
- * In general, subsystem has no css->refcnt after pre_destroy(). But
- * in racy cases, subsystem may have to get css->refcnt after
- * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
- * make rmdir return -EBUSY too often. To avoid that, we use waitqueue
- * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
- * and subsystem's reference count handling. Please see css_get/put
- * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
- */
- set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
-
- /*
- * Call pre_destroy handlers of subsys. Notify subsystems
- * that rmdir() request comes.
- */
- ret = cgroup_call_pre_destroy(cgrp);
- if (ret) {
- clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
- return ret;
- }
-
mutex_lock(&cgroup_mutex);
parent = cgrp->parent;
if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
- clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
mutex_unlock(&cgroup_mutex);
return -EBUSY;
}
- prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
- if (!cgroup_clear_css_refs(cgrp)) {
- mutex_unlock(&cgroup_mutex);
- /*
- * Because someone may call cgroup_wakeup_rmdir_waiter() before
- * prepare_to_wait(), we need to check this flag.
- */
- if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
- schedule();
- finish_wait(&cgroup_rmdir_waitq, &wait);
- clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
- if (signal_pending(current))
- return -EINTR;
- goto again;
+
+ /* deny new css_tryget() attempts */
+ for_each_subsys(cgrp->root, ss) {
+ struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
+
+ WARN_ON(atomic_read(&css->refcnt) < 0);
+ atomic_add(CSS_DEAD_BIAS, &css->refcnt);
+ }
+
+ /*
+ * No new tryget reference will be handed out. Tell subsystems to
+ * drain the existing references.
+ */
+ for_each_subsys(cgrp->root, ss)
+ if (ss->pre_destroy)
+ ss->pre_destroy(cgrp);
+
+ for_each_subsys(cgrp->root, ss) {
+ struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
+
+ css_put(css);
}
- /* NO css_tryget() can success after here. */
- finish_wait(&cgroup_rmdir_waitq, &wait);
- clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);

raw_spin_lock(&release_list_lock);
set_bit(CGRP_REMOVED, &cgrp->flags);
@@ -4670,8 +4515,8 @@ static void check_for_release(struct cgr
{
/* All of these checks rely on RCU to keep the cgroup
* structure alive */
- if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count)
- && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) {
+ if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count) &&
+ list_empty(&cgrp->children)) {
/* Control Group is currently removeable. If it's not
* already queued for a userspace notification, queue
* it now */
@@ -4689,21 +4534,35 @@ static void check_for_release(struct cgr
}

/* Caller must verify that the css is not for root cgroup */
-void __css_put(struct cgroup_subsys_state *css, int count)
+bool __css_tryget(struct cgroup_subsys_state *css)
+{
+ while (1) {
+ int v, t;
+
+ v = atomic_read(&css->refcnt);
+ if (unlikely(v < 0))
+ return false;
+
+ t = atomic_cmpxchg(&css->refcnt, v, v + 1);
+ if (likely(t == v))
+ return true;
+ }
+}
+
+/* Caller must verify that the css is not for root cgroup */
+void __css_put(struct cgroup_subsys_state *css)
{
struct cgroup *cgrp = css->cgroup;
int val;
- rcu_read_lock();
- val = atomic_sub_return(count, &css->refcnt);
- if (val == 1) {
+
+ val = atomic_dec_return(&css->refcnt);
+ if (val == CSS_DEAD_BIAS) {
if (notify_on_release(cgrp)) {
set_bit(CGRP_RELEASABLE, &cgrp->flags);
check_for_release(cgrp);
}
- cgroup_wakeup_rmdir_waiter(cgrp);
+ schedule_work(&cgrp->dput_work);
}
- rcu_read_unlock();
- WARN_ON_ONCE(val < 1);
}
EXPORT_SYMBOL_GPL(__css_put);

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/