[PATCH] hung_task: recover hung task warnings in next check interval

From: Yafang Shao
Date: Thu Jun 20 2019 - 02:00:51 EST


When sys_hung_task_warnings reaches 0, the hang task messages will not
be reported any more.

If the user want to get more hung task messages, he must reset
kernel.hung_task_warnings to a postive integer or -1 with sysctl.
This is not a good way for the user.
We'd better reset hung task warnings in the kernel, and then the user
don't need to pay attention to this value.

With this patch, hung task warnings will be reset with
sys_hung_task_warnings setting in evenry check interval.

Another difference is if the user set kernel.hung_task_warnings with a
new value, the new value will take effect in next check interval.
For example, when the kernel is printing the hung task messages, the
user can't set it to 0 to stop the printing, but I don't think this will
happen in the real world. (If that happens, then sys_hung_task_warnings
must be protected by a lock)

Signed-off-by: Yafang Shao <laoar.shao@xxxxxxxxx>
---
Documentation/sysctl/kernel.txt | 5 ++++-
kernel/hung_task.c | 19 ++++++++++++-------
2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index f0c86fb..350df41 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -377,6 +377,8 @@ This file shows up if CONFIG_DETECT_HUNG_TASK is enabled.

0 (default): means use hung_task_timeout_secs as checking interval.
Possible values to set are in range {0..LONG_MAX/HZ}.
+hung_task_check_interval_secs must not be set greater than
+hung_task_timeout_secs.

==============================================================

@@ -384,7 +386,8 @@ hung_task_warnings:

The maximum number of warnings to report. During a check interval
if a hung task is detected, this value is decreased by 1.
-When this value reaches 0, no more warnings will be reported.
+When this value reaches 0, no more warnings will be reported until
+next check interval begins.
This file shows up if CONFIG_DETECT_HUNG_TASK is enabled.

-1: report an infinite number of warnings.
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 14a625c..01e6c94 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -85,7 +85,8 @@ static int __init hung_task_panic_setup(char *str)
.notifier_call = hung_task_panic,
};

-static void check_hung_task(struct task_struct *t, unsigned long timeout)
+static void check_hung_task(struct task_struct *t, unsigned long timeout,
+ int *warnings)
{
unsigned long switch_count = t->nvcsw + t->nivcsw;

@@ -124,9 +125,9 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
* Ok, the task did not get scheduled for more than 2 minutes,
* complain:
*/
- if (sysctl_hung_task_warnings) {
- if (sysctl_hung_task_warnings > 0)
- sysctl_hung_task_warnings--;
+ if (*warnings) {
+ if (*warnings > 0)
+ (*warnings)--;
pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n",
t->comm, t->pid, (jiffies - t->last_switch_time) / HZ);
pr_err(" %s %s %.*s\n",
@@ -170,7 +171,8 @@ static bool rcu_lock_break(struct task_struct *g, struct task_struct *t)
* a really long time (120 seconds). If that happens, print out
* a warning.
*/
-static void check_hung_uninterruptible_tasks(unsigned long timeout)
+static void check_hung_uninterruptible_tasks(unsigned long timeout,
+ int *warnings)
{
int max_count = sysctl_hung_task_check_count;
unsigned long last_break = jiffies;
@@ -195,7 +197,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
}
/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
if (t->state == TASK_UNINTERRUPTIBLE)
- check_hung_task(t, timeout);
+ check_hung_task(t, timeout, warnings);
}
unlock:
rcu_read_unlock();
@@ -271,6 +273,7 @@ static int hungtask_pm_notify(struct notifier_block *self,
static int watchdog(void *dummy)
{
unsigned long hung_last_checked = jiffies;
+ int warnings;

set_user_nice(current, 0);

@@ -284,9 +287,11 @@ static int watchdog(void *dummy)
interval = min_t(unsigned long, interval, timeout);
t = hung_timeout_jiffies(hung_last_checked, interval);
if (t <= 0) {
+ warnings = sysctl_hung_task_warnings;
if (!atomic_xchg(&reset_hung_task, 0) &&
!hung_detector_suspended)
- check_hung_uninterruptible_tasks(timeout);
+ check_hung_uninterruptible_tasks(timeout,
+ &warnings);
hung_last_checked = jiffies;
continue;
}
--
1.8.3.1