Re: [PATCH 6/8] lazy tlb: shoot lazies, a non-refcounting lazy tlb option

From: Peter Zijlstra
Date: Wed Dec 02 2020 - 09:20:56 EST


On Sun, Nov 29, 2020 at 02:01:39AM +1000, Nicholas Piggin wrote:
> + * - A delayed freeing and RCU-like quiescing sequence based on
> + * mm switching to avoid IPIs completely.

That one's interesting too. so basically you want to count switch_mm()
invocations on each CPU. Then, periodically snapshot the counter on each
CPU, and when they've all changed, increment a global counter.

Then, you snapshot the global counter and wait for it to increment
(twice I think, the first increment might already be in progress).

The only question here is what should drive this machinery.. the tick
probably.

This shouldn't be too hard to do I think.

Something a little like so perhaps?


diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 41404afb7f4c..27b64a60a468 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4525,6 +4525,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
* finish_task_switch()'s mmdrop().
*/
switch_mm_irqs_off(prev->active_mm, next->mm, next);
+ rq->nr_mm_switches++;

if (!prev->mm) { // from kernel
/* will mmdrop() in finish_task_switch(). */
@@ -4739,6 +4740,80 @@ unsigned long long task_sched_runtime(struct task_struct *p)
return ns;
}

+static DEFINE_PER_CPU(unsigned long[2], mm_switches);
+
+static struct {
+ unsigned long __percpu *switches[2];
+ unsigned long generation;
+ atomic_t complete;
+ struct wait_queue_dead wait;
+} mm_foo = {
+ .switches = &mm_switches,
+ .generation = 0,
+ .complete = -1, // XXX bootstrap, hotplug
+ .wait = __WAIT_QUEUE_HEAD_INITIALIZER(mm_foo.wait),
+};
+
+static void mm_gen_tick(int cpu, struct rq *rq)
+{
+ unsigned long prev, curr, switches = rq->nr_mm_switches;
+ int idx = READ_ONCE(mm_foo.generation) & 1;
+
+ /* DATA-DEP on mm_foo.generation */
+
+ prev = __this_cpu_read(mm_foo.switches[idx^1]);
+ curr = __this_cpu_read(mm_foo.switches[idx]);
+
+ /* we haven't switched since the last generation */
+ if (prev == switches)
+ return false;
+
+ __this_cpu_write(mm_foo.switches[idx], switches);
+
+ /*
+ * If @curr is less than @prev, this is the first update of
+ * this generation, per the above, switches has also increased since,
+ * so mark out CPU complete.
+ */
+ if ((long)(curr - prev) < 0 && atomic_dec_and_test(&mm_foo.complete)) {
+ /*
+ * All CPUs are complete, IOW they all switched at least once
+ * since the last generation. Reset the completion counter and
+ * increment the generation.
+ */
+ atomic_set(&mm_foo.complete, nr_online_cpus());
+ /*
+ * Matches the address dependency above:
+ *
+ * idx = gen & 1 complete = nr_cpus
+ * <DATA-DEP> <WMB>
+ * curr = sw[idx] generation++;
+ * prev = sw[idx^1]
+ * if (curr < prev)
+ * complete--
+ *
+ * If we don't observe the new generation; we'll not decrement. If we
+ * do see the new generation, we must also see the new completion count.
+ */
+ smp_wmb();
+ mm_foo.generation++;
+ return true;
+ }
+
+ return false;
+}
+
+static void mm_gen_wake(void)
+{
+ wake_up_all(&mm_foo.wait);
+}
+
+static void mm_gen_wait(void)
+{
+ unsigned int gen = READ_ONCE(mm_foo.generation);
+ wait_event(&mm_foo.wait, READ_ONCE(mm_foo.generation) - gen > 1);
+}
+
/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
@@ -4750,6 +4825,7 @@ void scheduler_tick(void)
struct task_struct *curr = rq->curr;
struct rq_flags rf;
unsigned long thermal_pressure;
+ bool wake_mm_gen;

arch_scale_freq_tick();
sched_clock_tick();
@@ -4763,8 +4839,13 @@ void scheduler_tick(void)
calc_global_load_tick(rq);
psi_task_tick(rq);

+ wake_mm_gen = mm_gen_tick(cpu, rq);
+
rq_unlock(rq, &rf);

+ if (wake_mm_gen)
+ mm_gen_wake();
+
perf_event_task_tick();

#ifdef CONFIG_SMP
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index bf9d8da7d35e..62fb685db8d0 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -927,6 +927,7 @@ struct rq {
unsigned int ttwu_pending;
#endif
u64 nr_switches;
+ u64 nr_mm_switches;

#ifdef CONFIG_UCLAMP_TASK
/* Utilization clamp values based on CPU's RUNNABLE tasks */