[RFC PATCH v1 03/11] cpuidle: introduce cpuidle governor for idle prediction

From: Aubrey Li
Date: Sun Jul 09 2017 - 21:50:39 EST


From: Aubrey Li <aubrey.li@xxxxxxxxxxxxxxx>

Two factors are taken from the menu governor for idle prediction in the
cpuidle governor:
- Energy break even point
- Repeatable interval detector

Based on the actual known "next timer event" time, and in coordination
with the algorithms behind the above two factors, we make a prediction of
how long we'll be idle.
---
drivers/cpuidle/cpuidle.c | 165 ++++++++++++++++++++++++++++++++++++++++++++++
include/linux/cpuidle.h | 4 ++
2 files changed, 169 insertions(+)

diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 2706be7..0be7f75 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -13,6 +13,8 @@
#include <linux/mutex.h>
#include <linux/sched.h>
#include <linux/sched/clock.h>
+#include <linux/sched/loadavg.h>
+#include <linux/sched/stat.h>
#include <linux/notifier.h>
#include <linux/pm_qos.h>
#include <linux/cpu.h>
@@ -179,6 +181,169 @@ int cpuidle_enter_freeze(struct cpuidle_driver *drv, struct cpuidle_device *dev)
}
#endif /* CONFIG_SUSPEND */

+/*
+ * Map a sleep duration (in microseconds, per the next_timer_us caller
+ * context -- TODO confirm unit against gov_stat definition) onto one of
+ * six buckets on a roughly log10 scale: <10us, <100us, <1ms, <10ms,
+ * <100ms, and everything longer. Presumably the bucket index selects an
+ * entry of correction_factor[] -- verify against patch 04/11.
+ */
+static inline int which_bucket(unsigned int duration,
+ unsigned long nr_iowaiters)
+{
+ int bucket = 0;
+
+ /*
+ * We keep two groups of stats; one with IO pending,
+ * one without.
+ * This allows us to calculate
+ * E(duration)|iowait
+ */
+ if (nr_iowaiters)
+ bucket = BUCKETS/2;
+
+ if (duration < 10)
+ return bucket;
+ if (duration < 100)
+ return bucket + 1;
+ if (duration < 1000)
+ return bucket + 2;
+ if (duration < 10000)
+ return bucket + 3;
+ if (duration < 100000)
+ return bucket + 4;
+ return bucket + 5;
+}
+
+/*
+ * Try detecting repeating patterns by keeping track of the last 8
+ * intervals, and checking if the standard deviation of that set
+ * of points is below a threshold. If it is... then use the
+ * average of these 8 points as the estimated value.
+ *
+ * Returns UINT_MAX ("no typical interval found") when the samples are
+ * too scattered even after discarding up to a quarter of them as
+ * upside outliers.
+ */
+static unsigned int get_typical_interval(struct cpuidle_device *dev)
+{
+ struct cpuidle_governor_stat *gov_stat =
+ (struct cpuidle_governor_stat *)&(dev->gov_stat);
+ int i, divisor;
+ unsigned int max, thresh, avg;
+ uint64_t sum, variance;
+
+ thresh = UINT_MAX; /* Discard outliers above this value */
+
+again:
+
+ /*
+ * First calculate the average of past intervals.
+ *
+ * Note divisor can never be zero here: on the first pass every
+ * unsigned int sample is <= UINT_MAX so all INTERVALS samples
+ * count, and later passes only run while divisor*4 > INTERVALS*3
+ * (checked before the goto below).
+ */
+ max = 0;
+ sum = 0;
+ divisor = 0;
+ for (i = 0; i < INTERVALS; i++) {
+ unsigned int value = gov_stat->intervals[i];
+
+ if (value <= thresh) {
+ sum += value;
+ divisor++;
+ if (value > max)
+ max = value;
+ }
+ }
+ /*
+ * Fast path: when no sample was discarded, divide by shifting
+ * (assumes INTERVALS == 1 << INTERVAL_SHIFT -- confirm against
+ * the macro definitions).
+ */
+ if (divisor == INTERVALS)
+ avg = sum >> INTERVAL_SHIFT;
+ else
+ avg = div_u64(sum, divisor);
+
+ /* Then try to determine variance */
+ variance = 0;
+ for (i = 0; i < INTERVALS; i++) {
+ unsigned int value = gov_stat->intervals[i];
+
+ if (value <= thresh) {
+ int64_t diff = (int64_t)value - avg;
+
+ variance += diff * diff;
+ }
+ }
+ if (divisor == INTERVALS)
+ variance >>= INTERVAL_SHIFT;
+ else
+ do_div(variance, divisor);
+
+ /*
+ * The typical interval is obtained when standard deviation is
+ * small (stddev <= 20 us, variance <= 400 us^2) or standard
+ * deviation is small compared to the average interval (avg >
+ * 6*stddev, avg^2 > 36*variance). The average is smaller than
+ * UINT_MAX aka U32_MAX, so computing its square does not
+ * overflow a u64. We simply reject this candidate average if
+ * the standard deviation is greater than 715 s (which is
+ * rather unlikely).
+ *
+ * Use this result only if there is no timer to wake us up sooner.
+ */
+ if (likely(variance <= U64_MAX/36)) {
+ if ((((u64)avg*avg > variance*36) &&
+ (divisor * 4 >= INTERVALS * 3)) || variance <= 400) {
+ return avg;
+ }
+ }
+
+ /*
+ * If we have outliers to the upside in our distribution, discard
+ * those by setting the threshold to exclude these outliers, then
+ * calculate the average and standard deviation again. Once we get
+ * down to the bottom 3/4 of our samples, stop excluding samples.
+ *
+ * This can deal with workloads that have long pauses interspersed
+ * with sporadic activity with a bunch of short pauses.
+ */
+ if ((divisor * 4) <= INTERVALS * 3)
+ return UINT_MAX;
+
+ /* Exclude the current maximum and re-run the statistics. */
+ thresh = max - 1;
+ goto again;
+}
+
+/**
+ * cpuidle_predict - predict the next idle duration, in micro-seconds.
+ * There are two factors in the cpu idle governor, taken from menu:
+ * 1) Energy break even point
+ * 2) Repeatable interval detector
+ *
+ * Based on the actual known "next timer event" time, and coordinated
+ * with the algorithms behind the two factors above, make a prediction
+ * of how long we'll be idle.
+ *
+ * Return: the predicted idle duration in micro-seconds, or 0 when no
+ * idle time is expected (reschedule pending, or cpuidle unavailable).
+ */
+unsigned int cpuidle_predict(void)
+{
+ struct cpuidle_device *dev = cpuidle_get_device();
+ struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
+ struct cpuidle_governor_stat *gov_stat;
+ unsigned int expected_interval;
+ unsigned long nr_iowaiters, cpu_load;
+
+ if (need_resched())
+ return 0;
+
+ /*
+ * The return type is unsigned int, so a negative errno such as
+ * -ENODEV would be seen by callers as a huge positive prediction
+ * (~4e9 us). Return 0 ("no idle time predicted") instead, matching
+ * the need_resched() path above.
+ */
+ if (cpuidle_not_available(drv, dev))
+ return 0;
+
+ gov_stat = (struct cpuidle_governor_stat *)&(dev->gov_stat);
+
+ /* The hard upper bound we actually know: the next timer event. */
+ gov_stat->next_timer_us = ktime_to_us(tick_nohz_get_sleep_length());
+ get_iowait_load(&nr_iowaiters, &cpu_load);
+ gov_stat->bucket = which_bucket(gov_stat->next_timer_us, nr_iowaiters);
+
+ /*
+ * Force the result of multiplication to be 64 bits even if both
+ * operands are 32 bits.
+ * Make sure to round up for half microseconds.
+ */
+ gov_stat->predicted_us = DIV_ROUND_CLOSEST_ULL(
+ (uint64_t)gov_stat->next_timer_us *
+ gov_stat->correction_factor[gov_stat->bucket],
+ RESOLUTION * DECAY);
+
+ /* Clamp by the repeating-pattern estimate and the next timer. */
+ expected_interval = get_typical_interval(dev);
+ expected_interval = min(expected_interval, gov_stat->next_timer_us);
+
+ gov_stat->predicted_us = min(gov_stat->predicted_us, expected_interval);
+
+ return gov_stat->predicted_us;
+}
+
/**
* cpuidle_enter_state - enter the state and update stats
* @dev: cpuidle device for this cpu
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index f17a53b..b9964ec 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -164,6 +164,7 @@ extern int cpuidle_select(struct cpuidle_driver *drv,
extern int cpuidle_enter(struct cpuidle_driver *drv,
struct cpuidle_device *dev, int index);
extern void cpuidle_reflect(struct cpuidle_device *dev, int index);
+extern unsigned int cpuidle_predict(void);

extern int cpuidle_register_driver(struct cpuidle_driver *drv);
extern struct cpuidle_driver *cpuidle_get_driver(void);
@@ -198,6 +199,9 @@ static inline int cpuidle_enter(struct cpuidle_driver *drv,
struct cpuidle_device *dev, int index)
{return -ENODEV; }
static inline void cpuidle_reflect(struct cpuidle_device *dev, int index) { }
+/*
+ * !CONFIG_CPU_IDLE stub. Must match the real prototype,
+ * unsigned int cpuidle_predict(void), or every caller fails to build
+ * when cpuidle is disabled. Return 0 ("no idle time predicted") rather
+ * than a negative errno, which would wrap to a huge unsigned value.
+ */
+static inline unsigned int cpuidle_predict(void)
+{return 0; }
static inline int cpuidle_register_driver(struct cpuidle_driver *drv)
{return -ENODEV; }
static inline struct cpuidle_driver *cpuidle_get_driver(void) {return NULL; }
--
2.7.4