Re: ARM64 board Hikey960 boot failure due to f2545b2d4ce1 (jump_label: Reorder hotplug lock and jump_label_lock)

From: Marc Zyngier
Date: Wed Jul 26 2017 - 11:13:57 EST


[+Mark]

Hi Leo,

On 24/07/17 15:34, Leo Yan wrote:
> Hi all,
>
> We found the mainline arm64 kernel boot failure on Hikey960 board,
> this is caused by patch f2545b2d4ce1 (jump_label: Reorder hotplug lock
> and jump_label_lock), this patch adds locking cpus_read_lock() in
> function static_key_slow_inc() and introduce the dead lock issue by
> acquiring lock twice. Below are detailed flow:
>
> arch_timer_register()
> `> cpuhp_setup_state()
> `> __cpuhp_setup_state()
> cpus_read_lock()
> `> __cpuhp_setup_state_cpuslocked()
> `> cpuhp_issue_call()
> `> arch_timer_starting_cpu()
> `> __arch_timer_setup()
> `> arch_timer_check_ool_workaround()
> `> arch_timer_enable_workaround()
> `> static_branch_enable()
> `> static_key_enable()
> `> static_key_slow_inc()
> `> cpus_read_lock()
>
> So finally there have called cpus_read_lock() twice, and kernel report
> log as below. So I am not sure what's the best way to fix this issue,
> could you give some suggestion for this? Thanks.

[...]

Thanks for this. Unfortunately, there is no easy fix for this.
Can you give the patch below a go and let us know if that solves
the issue you observed? I only tested in on a model...

Should this be considered an acceptable solution, I'll split that
into individual patches and repost it as a proper series.

Thanks,

M.

diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c
index aae87c4c546e..44232f378543 100644
--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -455,7 +455,11 @@ void arch_timer_enable_workaround(const struct arch_timer_erratum_workaround *wa
per_cpu(timer_unstable_counter_workaround, i) = wa;
}

- static_branch_enable(&arch_timer_read_ool_enabled);
+ /*
+ * Use the _locked version, as we're called from the CPU
+ * hotplug framework. Otherwise, we end-up in deadlock-land.
+ */
+ static_branch_enable_locked(&arch_timer_read_ool_enabled);

/*
* Don't use the vdso fastpath if errata require using the
diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index 2afd74b9d844..5cfbf9ff3fe8 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -159,10 +159,14 @@ extern void arch_jump_label_transform_static(struct jump_entry *entry,
extern int jump_label_text_reserved(void *start, void *end);
extern void static_key_slow_inc(struct static_key *key);
extern void static_key_slow_dec(struct static_key *key);
+extern void static_key_slow_inc_locked(struct static_key *key);
+extern void static_key_slow_dec_locked(struct static_key *key);
extern void jump_label_apply_nops(struct module *mod);
extern int static_key_count(struct static_key *key);
extern void static_key_enable(struct static_key *key);
extern void static_key_disable(struct static_key *key);
+extern void static_key_enable_locked(struct static_key *key);
+extern void static_key_disable_locked(struct static_key *key);

/*
* We should be using ATOMIC_INIT() for initializing .enabled, but
@@ -213,12 +217,16 @@ static inline void static_key_slow_inc(struct static_key *key)
atomic_inc(&key->enabled);
}

+#define static_key_slow_inc_locked(k) static_key_slow_inc((k))
+
static inline void static_key_slow_dec(struct static_key *key)
{
STATIC_KEY_CHECK_USE();
atomic_dec(&key->enabled);
}

+#define static_key_slow_dec_locked(k) static_key_slow_inc((k))
+
static inline int jump_label_text_reserved(void *start, void *end)
{
return 0;
@@ -415,6 +423,8 @@ extern bool ____wrong_branch_error(void);

#define static_branch_enable(x) static_key_enable(&(x)->key)
#define static_branch_disable(x) static_key_disable(&(x)->key)
+#define static_branch_enable_locked(x) static_key_enable_locked(&(x)->key)
+#define static_branch_disable_locked(x) static_key_disable_locked(&(x)->key)

#endif /* __ASSEMBLY__ */

diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index d11c506a6ac3..f543f765a738 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -57,6 +57,11 @@ jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop)
}

static void jump_label_update(struct static_key *key);
+static void static_key_slow_inc_with_lock(struct static_key *key, bool lock);
+static void __static_key_slow_dec_with_lock(struct static_key *key,
+ bool lock,
+ unsigned long rate_limit,
+ struct delayed_work *work);

/*
* There are similar definitions for the !HAVE_JUMP_LABEL case in jump_label.h.
@@ -79,29 +84,53 @@ int static_key_count(struct static_key *key)
}
EXPORT_SYMBOL_GPL(static_key_count);

-void static_key_enable(struct static_key *key)
+static void static_key_enable_with_lock(struct static_key *key, bool lock)
{
int count = static_key_count(key);

WARN_ON_ONCE(count < 0 || count > 1);

if (!count)
- static_key_slow_inc(key);
+ static_key_slow_inc_with_lock(key, lock);
+}
+
+void static_key_enable(struct static_key *key)
+{
+ static_key_enable_with_lock(key, true);
}
EXPORT_SYMBOL_GPL(static_key_enable);

-void static_key_disable(struct static_key *key)
+void static_key_enable_locked(struct static_key *key)
+{
+ lockdep_assert_cpus_held();
+ static_key_enable_with_lock(key, false);
+}
+EXPORT_SYMBOL_GPL(static_key_enable_locked);
+
+static void static_key_disable_with_lock(struct static_key *key, bool lock)
{
int count = static_key_count(key);

WARN_ON_ONCE(count < 0 || count > 1);

if (count)
- static_key_slow_dec(key);
+ __static_key_slow_dec_with_lock(key, lock, 0, NULL);
+}
+
+void static_key_disable(struct static_key *key)
+{
+ static_key_disable_with_lock(key, true);
}
EXPORT_SYMBOL_GPL(static_key_disable);

-void static_key_slow_inc(struct static_key *key)
+void static_key_disable_locked(struct static_key *key)
+{
+ lockdep_assert_cpus_held();
+ static_key_disable_with_lock(key, false);
+}
+EXPORT_SYMBOL_GPL(static_key_disable_locked);
+
+static void static_key_slow_inc_with_lock(struct static_key *key, bool lock)
{
int v, v1;

@@ -125,7 +154,8 @@ void static_key_slow_inc(struct static_key *key)
return;
}

- cpus_read_lock();
+ if (lock)
+ cpus_read_lock();
jump_label_lock();
if (atomic_read(&key->enabled) == 0) {
atomic_set(&key->enabled, -1);
@@ -135,14 +165,30 @@ void static_key_slow_inc(struct static_key *key)
atomic_inc(&key->enabled);
}
jump_label_unlock();
- cpus_read_unlock();
+ if (lock)
+ cpus_read_unlock();
+}
+
+void static_key_slow_inc(struct static_key *key)
+{
+ static_key_slow_inc_with_lock(key, true);
}
EXPORT_SYMBOL_GPL(static_key_slow_inc);

-static void __static_key_slow_dec(struct static_key *key,
- unsigned long rate_limit, struct delayed_work *work)
+void static_key_slow_inc_locked(struct static_key *key)
{
- cpus_read_lock();
+ lockdep_assert_cpus_held();
+ static_key_slow_inc_with_lock(key, false);
+}
+EXPORT_SYMBOL_GPL(static_key_slow_inc_locked);
+
+static void __static_key_slow_dec_with_lock(struct static_key *key,
+ bool lock,
+ unsigned long rate_limit,
+ struct delayed_work *work)
+{
+ if (lock)
+ cpus_read_lock();
/*
* The negative count check is valid even when a negative
* key->enabled is in use by static_key_slow_inc(); a
@@ -153,7 +199,8 @@ static void __static_key_slow_dec(struct static_key *key,
if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) {
WARN(atomic_read(&key->enabled) < 0,
"jump label: negative count!\n");
- cpus_read_unlock();
+ if (lock)
+ cpus_read_unlock();
return;
}

@@ -164,27 +211,36 @@ static void __static_key_slow_dec(struct static_key *key,
jump_label_update(key);
}
jump_label_unlock();
- cpus_read_unlock();
+ if (lock)
+ cpus_read_unlock();
}

static void jump_label_update_timeout(struct work_struct *work)
{
struct static_key_deferred *key =
container_of(work, struct static_key_deferred, work.work);
- __static_key_slow_dec(&key->key, 0, NULL);
+ __static_key_slow_dec_with_lock(&key->key, true, 0, NULL);
}

void static_key_slow_dec(struct static_key *key)
{
STATIC_KEY_CHECK_USE();
- __static_key_slow_dec(key, 0, NULL);
+ __static_key_slow_dec_with_lock(key, true, 0, NULL);
}
EXPORT_SYMBOL_GPL(static_key_slow_dec);

+void static_key_slow_dec_locked(struct static_key *key)
+{
+ lockdep_assert_cpus_held();
+ STATIC_KEY_CHECK_USE();
+ __static_key_slow_dec_with_lock(key, false, 0, NULL);
+}
+EXPORT_SYMBOL_GPL(static_key_slow_dec_locked);
+
void static_key_slow_dec_deferred(struct static_key_deferred *key)
{
STATIC_KEY_CHECK_USE();
- __static_key_slow_dec(&key->key, key->timeout, &key->work);
+ __static_key_slow_dec_with_lock(&key->key, true, key->timeout, &key->work);
}
EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred);


--
Jazz is not dead. It just smells funny...