Re: [patch 3/3] x86/fpu: Make FPU protection more robust

From: Jason A. Donenfeld
Date: Thu May 05 2022 - 09:49:17 EST


Hey again Thomas,

On Thu, May 05, 2022 at 01:02:02PM +0200, Jason A. Donenfeld wrote:
> Interestingly, disabling the simd paths makes things around 283 cycles
> slower on my Tiger Lake laptop, just doing ordinary things. I'm actually
> slightly surprised, so I'll probably keep playing with this. My patch
> for this is attached. Let me know if you have a different methodology in
> mind...

Measuring with RDPMC via perf shows the gap is even smaller for real
world cases, with the simd code only ~80 cycles faster. Bench code
follows below. If the observation on this hardware holds for other
hardware, we can probably improve the performance of the generic code a
bit, and then the difference really won't matter. Any thoughts about
this approach and the test code?
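
In case anyone wants to poke at the same numbers from userspace, here's
a rough sketch of the same CPUID+RDPMC bracketing, done with
perf_event_open(2) and the mmap'd rdpmc index instead of a kernel
counter. Illustrative only -- the code under test is just a placeholder:

/* bench_rdpmc.c - userspace sketch of the same measurement approach.
 * rdpmc must be permitted from userspace (see /sys/devices/cpu/rdpmc). */
#include <linux/perf_event.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <cpuid.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t read_pmc(uint32_t idx)
{
	uint32_t lo, hi;
	asm volatile("rdpmc" : "=a"(lo), "=d"(hi) : "c"(idx));
	return ((uint64_t)hi << 32) | lo;
}

static void serialize(void)
{
	unsigned int a, b, c, d;
	/* CPUID is serializing, same trick as in the kernel patch. */
	__cpuid(0, a, b, c, d);
}

int main(void)
{
	struct perf_event_attr attr = {
		.type = PERF_TYPE_HARDWARE,
		.config = PERF_COUNT_HW_CPU_CYCLES,
		.size = sizeof(attr),
		.pinned = 1,
		.exclude_kernel = 1
	};
	struct perf_event_mmap_page *pc;
	uint64_t start, end;
	int fd;

	fd = syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0)
		return 1;
	pc = mmap(NULL, getpagesize(), PROT_READ, MAP_SHARED, fd, 0);
	if (pc == MAP_FAILED || !pc->cap_user_rdpmc || !pc->index)
		return 1;

	serialize();
	start = read_pmc(pc->index - 1); /* index is the rdpmc index + 1 */
	/* ... code under test goes here ... */
	end = read_pmc(pc->index - 1);
	serialize();

	printf("%llu cycles\n", (unsigned long long)(end - start));
	return 0;
}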

Jason

diff --git a/drivers/char/random.c b/drivers/char/random.c
index bd292927654c..6577e9f2f3b7 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -53,6 +53,8 @@
#include <linux/uuid.h>
#include <linux/uaccess.h>
#include <linux/suspend.h>
+#include <linux/sort.h>
+#include <linux/perf_event.h>
#include <crypto/chacha.h>
#include <crypto/blake2s.h>
#include <asm/processor.h>
@@ -755,9 +757,61 @@ static struct {
.lock = __SPIN_LOCK_UNLOCKED(input_pool.lock),
};

+static DEFINE_PER_CPU(int, pmc_index) = -1;
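+/* Ring buffer of measured hash durations, filled from hardirq. */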
+static struct {
+ u32 durations[1 << 20];
+ u32 pos, len;
+} irqbench;
+
static void _mix_pool_bytes(const void *in, size_t nbytes)
{
+ int idx = *this_cpu_ptr(&pmc_index);
+ u32 ctr = input_pool.hash.t[0], reg = 0;
+ cycles_t end, start;
+
+ /* CPUID serializes, so prior instructions retire before we read the PMC. */
+ native_cpuid(&reg, &reg, &reg, &reg);
+ start = idx == -1 ? 0 : native_read_pmc(idx);
blake2s_update(&input_pool.hash, in, nbytes);
+ end = idx == -1 ? 0 : native_read_pmc(idx);
+
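+ /* Only keep hardirq samples where a blake2s block was actually compressed. */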
+ if (ctr == input_pool.hash.t[0] || !in_hardirq() || idx == -1)
+ return;
+
+ irqbench.durations[irqbench.pos++ % ARRAY_SIZE(irqbench.durations)] = end - start;
+ irqbench.len = min_t(u32, irqbench.len + 1, ARRAY_SIZE(irqbench.durations));
+}
+
+static int cmp_u32(const void *a, const void *b)
+{
+ u32 x = *(const u32 *)a, y = *(const u32 *)b;
+
+ /* Plain u32 subtraction can wrap and yield the wrong sign. */
+ return x < y ? -1 : x > y;
+}
+
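+/* Sysctl handler that reports the median of the collected samples. */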
+static int proc_do_irqbench_median(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ u32 len = READ_ONCE(irqbench.len), median, *sorted;
+ struct ctl_table fake_table = {
+ .data = &median,
+ .maxlen = sizeof(median)
+ };
+
+ if (!len)
+ return -ENODATA;
+ sorted = kmalloc_array(len, sizeof(*sorted), GFP_KERNEL);
+ if (!sorted)
+ return -ENOMEM;
+ memcpy(sorted, irqbench.durations, len * sizeof(*sorted));
+ sort(sorted, len, sizeof(*sorted), cmp_u32, NULL);
+ median = sorted[len / 2];
+ kfree(sorted);
+ return write ? 0 : proc_douintvec(&fake_table, 0, buffer, lenp, ppos);
}

/*
@@ -1709,6 +1763,18 @@ static struct ctl_table random_table[] = {
.mode = 0444,
.proc_handler = proc_do_uuid,
},
+ {
+ .procname = "irqbench_median",
+ .mode = 0444,
+ .proc_handler = proc_do_irqbench_median,
+ },
+ {
+ .procname = "irqbench_count",
+ .data = &irqbench.len,
+ .maxlen = sizeof(irqbench.len),
+ .mode = 0444,
+ .proc_handler = proc_douintvec,
+ },
{ }
};

@@ -1718,6 +1784,23 @@ static struct ctl_table random_table[] = {
*/
static int __init random_sysctls_init(void)
{
+ int i;
+ struct perf_event *cycles_event;
+ struct perf_event_attr perf_cycles_attr = {
+ .type = PERF_TYPE_HARDWARE,
+ .config = PERF_COUNT_HW_CPU_CYCLES,
+ .size = sizeof(struct perf_event_attr),
+ .pinned = true
+ };
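+
+ /* Pin a cycles counter on each CPU and remember its rdpmc index. */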
+ for_each_possible_cpu(i) {
+ cycles_event = perf_event_create_kernel_counter(&perf_cycles_attr, i, NULL, NULL, NULL);
+ if (IS_ERR(cycles_event))
+ pr_err("unable to create perf counter on cpu %d: %ld\n", i, PTR_ERR(cycles_event));
+ else
+ *per_cpu_ptr(&pmc_index, i) = cycles_event->hw.event_base_rdpmc;
+ }
register_sysctl_init("kernel/random", random_table);
return 0;
}
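
Then, once it's been collecting samples for a while, the results can be
read back with:

$ cat /proc/sys/kernel/random/irqbench_count
$ cat /proc/sys/kernel/random/irqbench_median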