[tip: sched/core] sched/numa: Distinguish between the different task_numa_migrate() failure cases

From: tip-bot2 for Mel Gorman
Date: Mon Feb 24 2020 - 10:21:17 EST


The following commit has been merged into the sched/core branch of tip:

Commit-ID: b2b2042b204796190af7c20069ab790a614c36d0
Gitweb: https://git.kernel.org/tip/b2b2042b204796190af7c20069ab790a614c36d0
Author: Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx>
AuthorDate: Mon, 24 Feb 2020 09:52:13
Committer: Ingo Molnar <mingo@xxxxxxxxxx>
CommitterDate: Mon, 24 Feb 2020 11:36:33 +01:00

sched/numa: Distinguish between the different task_numa_migrate() failure cases

sched:sched_stick_numa is meant to fire when a task is unable to migrate
to the preferred node but from the trace, it's possibile to tell the
difference between "no CPU found", "migration to idle CPU failed" and
"tasks could not be swapped". Extend the tracepoint accordingly.

Signed-off-by: Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx>
[ Minor edits. ]
Signed-off-by: Ingo Molnar <mingo@xxxxxxxxxx>
Acked-by: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
Cc: Steven Rostedt <rostedt@xxxxxxxxxxx>
Cc: Vincent Guittot <vincent.guittot@xxxxxxxxxx>
Cc: Juri Lelli <juri.lelli@xxxxxxxxxx>
Cc: Dietmar Eggemann <dietmar.eggemann@xxxxxxx>
Cc: Valentin Schneider <valentin.schneider@xxxxxxx>
Cc: Phil Auld <pauld@xxxxxxxxxx>
Cc: Hillf Danton <hdanton@xxxxxxxx>
Link: https://lore.kernel.org/r/20200224095223.13361-4-mgorman@xxxxxxxxxxxxxxxxxxx
---
include/trace/events/sched.h | 49 +++++++++++++++++++----------------
kernel/sched/fair.c | 6 ++--
2 files changed, 30 insertions(+), 25 deletions(-)

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 420e80e..9c3ebb7 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -487,7 +487,11 @@ TRACE_EVENT(sched_process_hang,
);
#endif /* CONFIG_DETECT_HUNG_TASK */

-DECLARE_EVENT_CLASS(sched_move_task_template,
+/*
+ * Tracks migration of tasks from one runqueue to another. Can be used to
+ * detect if automatic NUMA balancing is bouncing between nodes.
+ */
+TRACE_EVENT(sched_move_numa,

TP_PROTO(struct task_struct *tsk, int src_cpu, int dst_cpu),

@@ -519,23 +523,7 @@ DECLARE_EVENT_CLASS(sched_move_task_template,
__entry->dst_cpu, __entry->dst_nid)
);

-/*
- * Tracks migration of tasks from one runqueue to another. Can be used to
- * detect if automatic NUMA balancing is bouncing between nodes
- */
-DEFINE_EVENT(sched_move_task_template, sched_move_numa,
- TP_PROTO(struct task_struct *tsk, int src_cpu, int dst_cpu),
-
- TP_ARGS(tsk, src_cpu, dst_cpu)
-);
-
-DEFINE_EVENT(sched_move_task_template, sched_stick_numa,
- TP_PROTO(struct task_struct *tsk, int src_cpu, int dst_cpu),
-
- TP_ARGS(tsk, src_cpu, dst_cpu)
-);
-
-TRACE_EVENT(sched_swap_numa,
+DECLARE_EVENT_CLASS(sched_numa_pair_template,

TP_PROTO(struct task_struct *src_tsk, int src_cpu,
struct task_struct *dst_tsk, int dst_cpu),
@@ -561,11 +549,11 @@ TRACE_EVENT(sched_swap_numa,
__entry->src_ngid = task_numa_group_id(src_tsk);
__entry->src_cpu = src_cpu;
__entry->src_nid = cpu_to_node(src_cpu);
- __entry->dst_pid = task_pid_nr(dst_tsk);
- __entry->dst_tgid = task_tgid_nr(dst_tsk);
- __entry->dst_ngid = task_numa_group_id(dst_tsk);
+ __entry->dst_pid = dst_tsk ? task_pid_nr(dst_tsk) : 0;
+ __entry->dst_tgid = dst_tsk ? task_tgid_nr(dst_tsk) : 0;
+ __entry->dst_ngid = dst_tsk ? task_numa_group_id(dst_tsk) : 0;
__entry->dst_cpu = dst_cpu;
- __entry->dst_nid = cpu_to_node(dst_cpu);
+ __entry->dst_nid = dst_cpu >= 0 ? cpu_to_node(dst_cpu) : -1;
),

TP_printk("src_pid=%d src_tgid=%d src_ngid=%d src_cpu=%d src_nid=%d dst_pid=%d dst_tgid=%d dst_ngid=%d dst_cpu=%d dst_nid=%d",
@@ -575,6 +563,23 @@ TRACE_EVENT(sched_swap_numa,
__entry->dst_cpu, __entry->dst_nid)
);

+DEFINE_EVENT(sched_numa_pair_template, sched_stick_numa,
+
+ TP_PROTO(struct task_struct *src_tsk, int src_cpu,
+ struct task_struct *dst_tsk, int dst_cpu),
+
+ TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu)
+);
+
+DEFINE_EVENT(sched_numa_pair_template, sched_swap_numa,
+
+ TP_PROTO(struct task_struct *src_tsk, int src_cpu,
+ struct task_struct *dst_tsk, int dst_cpu),
+
+ TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu)
+);
+
+
/*
* Tracepoint for waking a polling cpu without an IPI.
*/
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f524ce3..5d9c23c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1849,7 +1849,7 @@ static int task_numa_migrate(struct task_struct *p)

/* No better CPU than the current one was found. */
if (env.best_cpu == -1) {
- trace_sched_stick_numa(p, env.src_cpu, -1);
+ trace_sched_stick_numa(p, env.src_cpu, NULL, -1);
return -EAGAIN;
}

@@ -1858,7 +1858,7 @@ static int task_numa_migrate(struct task_struct *p)
ret = migrate_task_to(p, env.best_cpu);
WRITE_ONCE(best_rq->numa_migrate_on, 0);
if (ret != 0)
- trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
+ trace_sched_stick_numa(p, env.src_cpu, NULL, env.best_cpu);
return ret;
}

@@ -1866,7 +1866,7 @@ static int task_numa_migrate(struct task_struct *p)
WRITE_ONCE(best_rq->numa_migrate_on, 0);

if (ret != 0)
- trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
+ trace_sched_stick_numa(p, env.src_cpu, env.best_task, env.best_cpu);
put_task_struct(env.best_task);
return ret;
}