[PATCH] RCU: Move expedited grace period (GP) work to RT kthread_worker

From: Kalesh Singh
Date: Fri Apr 01 2022 - 18:58:15 EST


It was noticed that enabling CONFIG_RCU_BOOST did not help RCU
performance because the workqueues that runs expedited GP work are
subject to scheduling delays. This patch moves the expedited GP
work items to RT kthread_worker.

The results were evaluated on arm64 Android devices (6GB ram) running
5.10 kernel, and caturing trace data during critical user journeys.

The table below compares the time synchronize_rcu_expedited() is blocked:

----------------------------------------------------------------------
| | Using WQ | Using kthead_worker | Diff |
-----------------------------------------------------------------------
| Max duration (ns) | 372766967 | 2329671 | -99.38% |
-----------------------------------------------------------------------
| Avg duration (ns) | 2746353.16 | 151242.311 | -94.49% |
----------------------------------------------------------------------

Cc: "Paul E. McKenney" <paulmck@xxxxxxxxxx>
Cc: Tejun Heo <tj@xxxxxxxxxx>
Reported-by: Tim Murray <timmurray@xxxxxxxxxx>
Reported-by: Wei Wang <wvw@xxxxxxxxxx>
Tested-by: Kyle Lin <kylelin@xxxxxxxxxx>
Tested-by: Chunwei Lu <chunweilu@xxxxxxxxxx>
Tested-by: Lulu Wang <luluw@xxxxxxxxxx>
Signed-off-by: Kalesh Singh <kaleshsingh@xxxxxxxxxx>
---
kernel/rcu/rcu.h | 3 ++-
kernel/rcu/tree.c | 41 +++++++++++++++++++++++++++++++++++++----
kernel/rcu/tree.h | 3 ++-
kernel/rcu/tree_exp.h | 35 +++++++++++++++--------------------
4 files changed, 56 insertions(+), 26 deletions(-)

diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 24b5f2c2de87..13d2b74bf19f 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -534,7 +534,8 @@ int rcu_get_gp_kthreads_prio(void);
void rcu_fwd_progress_check(unsigned long j);
void rcu_force_quiescent_state(void);
extern struct workqueue_struct *rcu_gp_wq;
-extern struct workqueue_struct *rcu_par_gp_wq;
+extern struct kthread_worker *rcu_exp_gp_kworker;
+extern struct kthread_worker *rcu_exp_par_gp_kworker;
#endif /* #else #ifdef CONFIG_TINY_RCU */

#ifdef CONFIG_RCU_NOCB_CPU
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index a4b8189455d5..bd5e672ffa5a 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4471,6 +4471,33 @@ static int rcu_pm_notify(struct notifier_block *self,
return NOTIFY_OK;
}

+struct kthread_worker *rcu_exp_gp_kworker;
+struct kthread_worker *rcu_exp_par_gp_kworker;
+
+static void __init rcu_start_exp_gp_kworkers(void)
+{
+ const char *par_gp_kworker_name = "rcu_exp_par_gp_kthread_worker";
+ const char *gp_kworker_name = "rcu_exp_gp_kthread_worker";
+ struct sched_param param = { .sched_priority = kthread_prio };
+
+ rcu_exp_gp_kworker = kthread_create_worker(0, gp_kworker_name);
+ if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) {
+ pr_err("Failed to create %s!\n", gp_kworker_name);
+ return;
+ }
+
+ rcu_exp_par_gp_kworker = kthread_create_worker(0, par_gp_kworker_name);
+ if (IS_ERR_OR_NULL(rcu_exp_par_gp_kworker)) {
+ pr_err("Failed to create %s!\n", par_gp_kworker_name);
+ kthread_destroy_worker(rcu_exp_gp_kworker);
+ return;
+ }
+
+ sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, &param);
+ sched_setscheduler_nocheck(rcu_exp_par_gp_kworker->task, SCHED_FIFO,
+ &param);
+}
+
/*
* Spawn the kthreads that handle RCU's grace periods.
*/
@@ -4500,6 +4527,10 @@ static int __init rcu_spawn_gp_kthread(void)
rcu_spawn_nocb_kthreads();
rcu_spawn_boost_kthreads();
rcu_spawn_core_kthreads();
+
+ /* Create kthread worker for expedited GPs */
+ rcu_start_exp_gp_kworkers();
+
return 0;
}
early_initcall(rcu_spawn_gp_kthread);
@@ -4745,7 +4776,6 @@ static void __init rcu_dump_rcu_node_tree(void)
}

struct workqueue_struct *rcu_gp_wq;
-struct workqueue_struct *rcu_par_gp_wq;

static void __init kfree_rcu_batch_init(void)
{
@@ -4808,11 +4838,14 @@ void __init rcu_init(void)
rcutree_online_cpu(cpu);
}

- /* Create workqueue for Tree SRCU and for expedited GPs. */
+ /*
+ * Create workqueue for Tree SRCU.
+ *
+ * Expedited GPs use RT kthread_worker.
+ * See: rcu_start_exp_gp_kworkers()
+ */
rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
WARN_ON(!rcu_gp_wq);
- rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
- WARN_ON(!rcu_par_gp_wq);

/* Fill in default value for rcutree.qovld boot parameter. */
/* -After- the rcu_node ->lock fields are initialized! */
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 926673ebe355..0193d67a706a 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -10,6 +10,7 @@
*/

#include <linux/cache.h>
+#include <linux/kthread.h>
#include <linux/spinlock.h>
#include <linux/rtmutex.h>
#include <linux/threads.h>
@@ -23,7 +24,7 @@
/* Communicate arguments to a workqueue handler. */
struct rcu_exp_work {
unsigned long rew_s;
- struct work_struct rew_work;
+ struct kthread_work rew_work;
};

/* RCU's kthread states for tracing. */
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 60197ea24ceb..f5f3722c0a74 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -334,7 +334,7 @@ static bool exp_funnel_lock(unsigned long s)
* Select the CPUs within the specified rcu_node that the upcoming
* expedited grace period needs to wait for.
*/
-static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
+static void sync_rcu_exp_select_node_cpus(struct kthread_work *wp)
{
int cpu;
unsigned long flags;
@@ -423,7 +423,6 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
*/
static void sync_rcu_exp_select_cpus(void)
{
- int cpu;
struct rcu_node *rnp;

trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("reset"));
@@ -435,28 +434,27 @@ static void sync_rcu_exp_select_cpus(void)
rnp->exp_need_flush = false;
if (!READ_ONCE(rnp->expmask))
continue; /* Avoid early boot non-existent wq. */
- if (!READ_ONCE(rcu_par_gp_wq) ||
+ if (!READ_ONCE(rcu_exp_par_gp_kworker) ||
rcu_scheduler_active != RCU_SCHEDULER_RUNNING ||
rcu_is_last_leaf_node(rnp)) {
- /* No workqueues yet or last leaf, do direct call. */
+ /* kthread worker not started yet or last leaf, do direct call. */
sync_rcu_exp_select_node_cpus(&rnp->rew.rew_work);
continue;
}
- INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
- cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1);
- /* If all offline, queue the work on an unbound CPU. */
- if (unlikely(cpu > rnp->grphi - rnp->grplo))
- cpu = WORK_CPU_UNBOUND;
- else
- cpu += rnp->grplo;
- queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work);
+ kthread_init_work(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
+ /*
+ * Use rcu_exp_par_gp_kworker, because flushing a work item from
+ * another work item on the same kthread worker can result in
+ * deadlock.
+ */
+ kthread_queue_work(rcu_exp_par_gp_kworker, &rnp->rew.rew_work);
rnp->exp_need_flush = true;
}

/* Wait for workqueue jobs (if any) to complete. */
rcu_for_each_leaf_node(rnp)
if (rnp->exp_need_flush)
- flush_work(&rnp->rew.rew_work);
+ kthread_flush_work(&rnp->rew.rew_work);
}

/*
@@ -625,7 +623,7 @@ static void rcu_exp_sel_wait_wake(unsigned long s)
/*
* Work-queue handler to drive an expedited grace period forward.
*/
-static void wait_rcu_exp_gp(struct work_struct *wp)
+static void wait_rcu_exp_gp(struct kthread_work *wp)
{
struct rcu_exp_work *rewp;

@@ -848,20 +846,17 @@ void synchronize_rcu_expedited(void)
} else {
/* Marshall arguments & schedule the expedited grace period. */
rew.rew_s = s;
- INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp);
- queue_work(rcu_gp_wq, &rew.rew_work);
+ kthread_init_work(&rew.rew_work, wait_rcu_exp_gp);
+ kthread_queue_work(rcu_exp_gp_kworker, &rew.rew_work);
}

/* Wait for expedited grace period to complete. */
rnp = rcu_get_root();
wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
sync_exp_work_done(s));
- smp_mb(); /* Workqueue actions happen before return. */
+ smp_mb(); /* kthread actions happen before return. */

/* Let the next expedited grace period start. */
mutex_unlock(&rcu_state.exp_mutex);
-
- if (likely(!boottime))
- destroy_work_on_stack(&rew.rew_work);
}
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);

base-commit: 7a3ecddc571cc3294e5d6bb5948ff2b0cfa12735
--
2.35.1.1094.g7c7d902a7c-goog