[RFC PATCH v5 09/16] watchdog/hardlockup/hpet: Group packages receiving IPIs when needed

From: Ricardo Neri
Date: Tue May 04 2021 - 15:07:57 EST


In order to keep the HPET interrupts of the hardlockup detector at a rate
of one per second or less frequent, the HPET timer only targets one of
the CPUs monitored by the detector. This is the handling CPU. The rest of
the CPUs are monitored via an IPI issued by the handling CPU.
Furthermore, the monitored CPUs are partitioned into groups. Groups are
targeted by the HPET timer in a round-robin manner. A group is composed of
all the CPUs in a physical package.

There may be situations in which it is not possible to keep the
aforementioned HPET interrupt rate. This may happen if, for instance,
watchdog_thresh is set to 1 second and there is more than one package in
the system. In such a case, the HPET timer should expire every
1/nr_packages seconds.

It is possible to keep the HPET timer expiration at one second or less
frequent if the packages receiving the IPI are grouped together. Hence,
in the example above, all packages would be grouped together.

This approach has the drawback of having to issue IPIs across packages.
However, these cases should be rare: only when there are more packages
than the value of watchdog_thresh in seconds.

Implement functionality to use the logic above: when the hardlockup
detector is enabled in a CPU, check if grouping is necessary based on the
value of watchdog_thresh. When updating target_cpumask, do it as many
times as packages in the group.

Cc: "H. Peter Anvin" <hpa@xxxxxxxxx>
Cc: Ashok Raj <ashok.raj@xxxxxxxxx>
Cc: Andi Kleen <andi.kleen@xxxxxxxxx>
Cc: Tony Luck <tony.luck@xxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Stephane Eranian <eranian@xxxxxxxxxx>
Cc: "Ravi V. Shankar" <ravi.v.shankar@xxxxxxxxx>
Cc: x86@xxxxxxxxxx
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@xxxxxxxxxxxxxxx>
---
Changes since v4:
* Introduced this patch.

Changes since v3:
* N/A

Changes since v2:
* N/A

Changes since v1:
* N/A
---
arch/x86/include/asm/hpet.h | 6 +++
arch/x86/kernel/watchdog_hld_hpet.c | 75 ++++++++++++++++++++++++-----
2 files changed, 68 insertions(+), 13 deletions(-)

diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h
index 8aea54f412e0..bb76f54effe4 100644
--- a/arch/x86/include/asm/hpet.h
+++ b/arch/x86/include/asm/hpet.h
@@ -104,6 +104,10 @@ extern void hpet_unregister_irq_handler(rtc_irq_handler handler);
* @ticks_per_second: Frequency of the HPET timer
* @irq: IRQ number assigned to the HPET channel
* @handling_cpu: CPU handling the HPET interrupt
+ * @pkgs_per_group: Number of physical packages in a group of CPUs
+ * receiving an IPI
+ * @nr_groups: Number of groups into which @monitored_cpumask
+ * is partitioned
* @msi_msg: MSI message to be written it the HPET registers
* @affinity_work: Used to update the affinity of the detector
* interrupts, both IPI and NMI.
@@ -121,6 +125,8 @@ struct hpet_hld_data {
u64 ticks_per_second;
int irq;
u32 handling_cpu;
+ u32 pkgs_per_group;
+ u32 nr_groups;
struct msi_msg msi_msg;
struct irq_work affinity_work;
cpumask_var_t monitored_cpumask;
diff --git a/arch/x86/kernel/watchdog_hld_hpet.c b/arch/x86/kernel/watchdog_hld_hpet.c
index a363f3cd45dd..04b354a35e68 100644
--- a/arch/x86/kernel/watchdog_hld_hpet.c
+++ b/arch/x86/kernel/watchdog_hld_hpet.c
@@ -235,26 +235,71 @@ static void update_ipi_target_cpumask(struct hpet_hld_data *hdata)
retry:
cpumask_clear(hdata->target_cpumask);

- next_cpu = get_first_cpu_in_next_pkg(next_cpu, hdata);
- if (next_cpu < 0 || next_cpu >= nr_cpu_ids) {
- /*
- * If a CPU in a next package was not identified,
- * fallback to the first monitored CPU instead of
- * bailing out.
- */
- next_cpu = cpumask_first(hdata->monitored_cpumask);
- goto retry;
+ for (i = 0 ; i < hdata->pkgs_per_group; i++) {
+ next_cpu = get_first_cpu_in_next_pkg(next_cpu, hdata);
+ if (next_cpu < 0 || next_cpu >= nr_cpu_ids) {
+ /*
+ * If a CPU in a next package was not identified,
+ * fallback to the first monitored CPU instead of
+ * bailing out.
+ */
+ next_cpu = cpumask_first(hdata->monitored_cpumask);
+ goto retry;
+ }
+
+ /* Select all the CPUs in the same package as @next_cpu */
+ cpumask_or(hdata->target_cpumask, hdata->target_cpumask,
+ topology_core_cpumask(next_cpu));
}

- /* Select all the CPUs in the same package as @next_cpu */
- cpumask_or(hdata->target_cpumask, hdata->target_cpumask,
- topology_core_cpumask(next_cpu));
-
/* Only select the CPUs that need to be monitored */
cpumask_and(hdata->target_cpumask, hdata->target_cpumask,
hdata->monitored_cpumask);
}

+/**
+ * count_monitored_packages() - Count the packages with monitored CPUs
+ * @hdata: A data structure with the monitored cpumask
+ *
+ * Return the number of packages with at least one CPU in the monitored_cpumask
+ * of @hdata
+ */
+static u32 count_monitored_packages(struct hpet_hld_data *hdata)
+{
+ int c = cpumask_first(hdata->monitored_cpumask);
+ u16 start_id, id;
+ u32 nr_pkgs = 0;
+
+ start_id = topology_physical_package_id(c);
+
+ do {
+ nr_pkgs++;
+ c = get_first_cpu_in_next_pkg(c, hdata);
+ id = topology_physical_package_id(c);
+ } while (start_id != id);
+
+ return nr_pkgs;
+}
+
+static void setup_cpu_groups(struct hpet_hld_data *hdata)
+{
+ u32 monitored_pkgs = count_monitored_packages(hdata);
+
+ hdata->pkgs_per_group = 0;
+ hdata->nr_groups = U32_MAX;
+
+ /*
+ * To keep the HPET timer to fire each 1 second or less frequently,
+ * the condition watchdog_thresh >= nr_groups must be met. Thus,
+ * group together one or more packages until such condition is reached.
+ */
+ while (watchdog_thresh < hdata->nr_groups) {
+ hdata->pkgs_per_group++;
+ hdata->nr_groups = DIV_ROUND_UP(monitored_pkgs,
+ hdata->pkgs_per_group);
+ }
+}
+
static void update_timer_irq_affinity(struct irq_work *work)
{
struct hpet_hld_data *hdata = container_of(work, struct hpet_hld_data,
@@ -378,6 +423,8 @@ void hardlockup_detector_hpet_enable(unsigned int cpu)
{
cpumask_set_cpu(cpu, hld_data->monitored_cpumask);

+ setup_cpu_groups(hld_data);
+
update_ipi_target_cpumask(hld_data);

/*
@@ -421,6 +468,8 @@ void hardlockup_detector_hpet_disable(unsigned int cpu)
hld_data->handling_cpu = cpumask_first(hld_data->monitored_cpumask);
update_msi_destid(hld_data);

+ setup_cpu_groups(hld_data);
+
update_ipi_target_cpumask(hld_data);

enable_timer(hld_data);
--
2.17.1