Here is one example of a trace of the problem I reported in this e-mail to the LKML.
Note: Patch accepted! This patch is the first change proposed as a result of the analysis of the task model I am developing in my Ph.D. thesis.
Trace setup
The trace setup I am using is the following:
# cd /sys/kernel/debug/tracing/
# echo preempt_schedule_irq schedule schedule_preempt_disabled schedule_idle schedule_user >> set_graph_function
# echo function_graph > current_tracer
# echo sched:sched_switch sched:sched_wakeup > set_event
# echo irq_vectors >> set_event
# echo $(( 1408 * 100 )) > buffer_size_kb
# echo > trace
In addition, since __schedule() is notrace (so the function graph tracer cannot show it), I added a trace_printk() at its entry and at its exit, and another at the point where no context switch takes place.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 70641e8f3c89..406a0b7ed6b3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3443,6 +3443,7 @@ static void __sched notrace __schedule(bool preempt)
struct rq *rq;
int cpu;
+ trace_printk("__schedule_entry\n");
cpu = smp_processor_id();
rq = cpu_rq(cpu);
prev = rq->curr;
@@ -3517,9 +3518,11 @@ static void __sched notrace __schedule(bool preempt)
} else {
rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
rq_unlock_irq(rq, &rf);
+ trace_printk("oh boy, no context switch\n");
}
balance_callback(rq);
+ trace_printk("__schedule_exit\n");
}
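For orientation, this is roughly where those printks sit in __schedule(), reduced to its skeleton (a simplified sketch of kernel/sched/core.c from around this version, with locking and most of the body elided). The else branch is taken when pick_next_task() picks the task that is already running, that is, when there is nothing to switch to:

static void __sched notrace __schedule(bool preempt)
{
        struct task_struct *prev, *next;
        struct rq_flags rf;
        struct rq *rq;
        int cpu;

        trace_printk("__schedule_entry\n");     /* added by the diff above */

        cpu = smp_processor_id();
        rq = cpu_rq(cpu);
        prev = rq->curr;

        /* ... lock the rq, possibly deactivate a blocking prev, etc. ... */

        next = pick_next_task(rq, prev, &rf);
        clear_tsk_need_resched(prev);

        if (likely(prev != next)) {
                rq->nr_switches++;
                rq->curr = next;
                trace_sched_switch(preempt, prev, next);
                /* also unlocks the rq: */
                rq = context_switch(rq, prev, next, &rf);
        } else {
                rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
                rq_unlock_irq(rq, &rf);
                trace_printk("oh boy, no context switch\n"); /* added above */
        }

        balance_callback(rq);
        trace_printk("__schedule_exit\n");      /* added by the diff above */
}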
This is the perf command line I am using to generate a load that makes the problem visible:
# perf record -o /dev/shm/lala
I ran perf for some seconds and then read the trace.
Trace analysis
The kworker-336 is running:
------------------------------------------
0) ktimers-8 => kworker-336
------------------------------------------
310.241851 | 0) kworker-336 | | finish_task_switch() {
310.241851 | 0) kworker-336 | 0.183 us | preempt_count_sub();
310.241853 | 0) kworker-336 | 1.467 us | }
310.241853 | 0) kworker-336 | | /* __schedule_exit */
310.241854 | 0) kworker-336 | 0.229 us | preempt_count_sub();
310.241855 | 0) kworker-336 | | wq_worker_running() {
310.241856 | 0) kworker-336 | 0.280 us | kthread_data();
310.241857 | 0) kworker-336 | 1.569 us | }
310.241858 | 0) kworker-336 | * 39582.48 us | }
At this point it calls the scheduler:
310.242147 | 0) kworker-336 | | schedule() {
310.242148 | 0) kworker-336 | | wq_worker_sleeping() {
310.242149 | 0) kworker-336 | 0.239 us | kthread_data();
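The wq_worker_sleeping() right at the entry of schedule() comes from sched_submit_work(), which schedule() runs before its preempt-disabled __schedule() loop. In other words, the worker wakeup runs with preemption still enabled. A simplified sketch of kernel/sched/core.c around this version (the block-plug flush and error handling elided):

static inline void sched_submit_work(struct task_struct *tsk)
{
        if (!tsk->state || tsk_is_pi_blocked(tsk))
                return;
        /*
         * If a worker went to sleep, ask the workqueue whether it wants
         * to wake up another task to maintain concurrency.
         */
        if (tsk->flags & PF_WQ_WORKER)
                wq_worker_sleeping(tsk);
}

asmlinkage __visible void __sched schedule(void)
{
        struct task_struct *tsk = current;

        sched_submit_work(tsk);         /* may wake a worker, preemptible! */
        do {
                preempt_disable();
                __schedule(false);
                sched_preempt_enable_no_resched();
        } while (need_resched());
        sched_update_worker(tsk);       /* calls wq_worker_running() */
}

The sched_update_worker() at the end is also why wq_worker_running() shows up right before schedule() returns, at the end of this trace.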
Still inside wq_worker_sleeping(), it reaches the wake_up_worker() call:
310.242151 | 0) kworker-336 | | wake_up_worker() {
310.242151 | 0) kworker-336 | 0.186 us | preempt_count_add();
310.242152 | 0) kworker-336 | | wake_up_process() {
310.242153 | 0) kworker-336 | | try_to_wake_up() {
And another kworker is awakened:
310.242153 | 0) kworker-336 | 0.200 us | preempt_count_add();
310.242155 | 0) kworker-336 | 0.186 us | preempt_count_add();
310.242156 | 0) kworker-336 | | update_rq_clock() {
310.242157 | 0) kworker-336 | 0.193 us | kvm_steal_clock();
310.242158 | 0) kworker-336 | 1.610 us | }
310.242159 | 0) kworker-336 | | activate_task() {
310.242160 | 0) kworker-336 | | enqueue_task_fair() {
310.242160 | 0) kworker-336 | | enqueue_entity() {
310.242161 | 0) kworker-336 | | update_curr() {
310.242162 | 0) kworker-336 | 0.249 us | update_min_vruntime();
310.242163 | 0) kworker-336 | | cpuacct_charge() {
310.242164 | 0) kworker-336 | 0.155 us | __rcu_read_lock();
310.242165 | 0) kworker-336 | 0.160 us | __rcu_read_unlock();
310.242166 | 0) kworker-336 | 2.495 us | }
310.242167 | 0) kworker-336 | 0.161 us | __rcu_read_lock();
310.242168 | 0) kworker-336 | 0.166 us | __rcu_read_unlock();
310.242169 | 0) kworker-336 | 7.787 us | }
310.242170 | 0) kworker-336 | 0.216 us | update_cfs_group();
310.242171 | 0) kworker-336 | 0.216 us | __enqueue_entity();
310.242172 | 0) kworker-336 | + 11.478 us | }
310.242173 | 0) kworker-336 | 0.158 us | hrtick_update();
310.242174 | 0) kworker-336 | + 13.759 us | }
310.242174 | 0) kworker-336 | + 15.052 us | }
310.242175 | 0) kworker-336 | | check_preempt_curr() {
310.242176 | 0) kworker-336 | | check_preempt_wakeup() {
310.242176 | 0) kworker-336 | 0.209 us | update_curr();
310.242178 | 0) kworker-336 | 0.158 us | wakeup_preempt_entity.isra.39();
310.242179 | 0) kworker-336 | 0.291 us | set_next_buddy();
310.242180 | 0) kworker-336 | 0.246 us | resched_curr_lazy();
310.242182 | 0) kworker-336 | 5.704 us | }
310.242183 | 0) kworker-336 | 7.150 us | }
310.242184 | 0) kworker-336 | | /* sched_wakeup: comm=kworker/0:2 pid=104 prio=120 target_cpu=000 */
310.242184 | 0) kworker-336 | 0.349 us | preempt_count_sub();
310.242186 | 0) kworker-336 | 0.214 us | preempt_count_sub();
310.242187 | 0) kworker-336 | + 34.075 us | }
310.242188 | 0) kworker-336 | + 35.213 us | }
310.242189 | 0) kworker-336 | | preempt_count_sub() {
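Note that nothing called the scheduler explicitly here. The resched_curr_lazy() above set the need-resched flag on the current task; now, as wake_up_worker() releases its lock, the preempt_count_sub() drops the preempt count to zero and the pending preemption fires. The traced kernel is an RT tree with lazy preemption, but the mechanism is essentially the mainline one; a simplified sketch:

/* include/linux/preempt.h, CONFIG_PREEMPT, simplified */
#define preempt_enable() \
do { \
        barrier(); \
        if (unlikely(preempt_count_dec_and_test())) \
                __preempt_schedule(); \
} while (0)

/* kernel/sched/core.c, simplified */
asmlinkage __visible void __sched notrace preempt_schedule(void)
{
        /* Do not preempt with a non-zero preempt count or IRQs disabled. */
        if (likely(!preemptible()))
                return;

        preempt_schedule_common();      /* ends up calling __schedule(true) */
}

So the __schedule() below runs as a preemption, right in the middle of wake_up_worker().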
And here __schedule() is called:
310.242190 | 0) kworker-336 | | /* __schedule_entry */
310.242190 | 0) kworker-336 | | rcu_note_context_switch() {
310.242191 | 0) kworker-336 | 0.215 us | rcu_sched_qs();
310.242192 | 0) kworker-336 | 0.214 us | rcu_preempt_qs();
310.242194 | 0) kworker-336 | 2.982 us | }
310.242194 | 0) kworker-336 | 0.237 us | preempt_count_add();
310.242196 | 0) kworker-336 | | update_rq_clock() {
310.242197 | 0) kworker-336 | 0.209 us | kvm_steal_clock();
310.242198 | 0) kworker-336 | 1.657 us | }
310.242199 | 0) kworker-336 | | pick_next_task_fair() {
310.242199 | 0) kworker-336 | | update_curr() {
310.242201 | 0) kworker-336 | 0.261 us | update_min_vruntime();
310.242202 | 0) kworker-336 | | cpuacct_charge() {
310.242203 | 0) kworker-336 | 0.203 us | __rcu_read_lock();
310.242204 | 0) kworker-336 | 0.199 us | __rcu_read_unlock();
310.242206 | 0) kworker-336 | 2.838 us | }
310.242206 | 0) kworker-336 | 0.201 us | __rcu_read_lock();
310.242207 | 0) kworker-336 | 0.275 us | __rcu_read_unlock();
310.242209 | 0) kworker-336 | 8.898 us | }
310.242209 | 0) kworker-336 | 0.205 us | check_cfs_rq_runtime();
310.242211 | 0) kworker-336 | | pick_next_entity() {
310.242212 | 0) kworker-336 | 0.229 us | wakeup_preempt_entity.isra.39();
310.242213 | 0) kworker-336 | 0.317 us | clear_buddies();
310.242214 | 0) kworker-336 | 2.959 us | }
310.242215 | 0) kworker-336 | | put_prev_entity() {
310.242216 | 0) kworker-336 | 0.246 us | update_curr();
310.242217 | 0) kworker-336 | 0.214 us | check_cfs_rq_runtime();
310.242218 | 0) kworker-336 | 0.382 us | __enqueue_entity();
310.242220 | 0) kworker-336 | 4.481 us | }
310.242220 | 0) kworker-336 | 0.461 us | set_next_entity();
310.242222 | 0) kworker-336 | + 22.840 us | }
resulting in a context switch:
310.242223 | 0) kworker-336 | | /* sched_switch: prev_comm=kworker/0:7 prev_pid=336 prev_prio=120 prev_state=R+ ==> next_comm=kworker/0:2 next_pid=104 next_prio=120 */
310.242224 | 0) kworker-336 | 0.244 us | enter_lazy_tlb();
------------------------------------------
0) kworker-336 => kworker-104
------------------------------------------
[ lines deleted ]
Until the point at which the kworker leaves the processor:
310.243004 | 0) kworker-104 | | /* __schedule_entry */
310.243004 | 0) kworker-104 | | rcu_note_context_switch() {
310.243005 | 0) kworker-104 | 0.171 us | rcu_sched_qs();
310.243006 | 0) kworker-104 | 0.167 us | rcu_preempt_qs();
310.243007 | 0) kworker-104 | 2.478 us | }
310.243008 | 0) kworker-104 | 0.184 us | preempt_count_add();
310.243009 | 0) kworker-104 | | update_rq_clock() {
310.243009 | 0) kworker-104 | 0.176 us | kvm_steal_clock();
310.243011 | 0) kworker-104 | 1.405 us | }
310.243011 | 0) kworker-104 | | deactivate_task() {
310.243012 | 0) kworker-104 | | dequeue_task_fair() {
310.243012 | 0) kworker-104 | | dequeue_entity() {
310.243013 | 0) kworker-104 | | update_curr() {
310.243014 | 0) kworker-104 | 0.190 us | update_min_vruntime();
310.243015 | 0) kworker-104 | | cpuacct_charge() {
310.243015 | 0) kworker-104 | 0.176 us | __rcu_read_lock();
310.243016 | 0) kworker-104 | 0.157 us | __rcu_read_unlock();
310.243018 | 0) kworker-104 | 2.370 us | }
310.243018 | 0) kworker-104 | 0.180 us | __rcu_read_lock();
310.243019 | 0) kworker-104 | 0.153 us | __rcu_read_unlock();
310.243020 | 0) kworker-104 | 6.974 us | }
310.243021 | 0) kworker-104 | 0.193 us | clear_buddies();
310.243022 | 0) kworker-104 | 0.164 us | update_cfs_group();
310.243023 | 0) kworker-104 | + 10.589 us | }
310.243024 | 0) kworker-104 | 0.162 us | hrtick_update();
310.243025 | 0) kworker-104 | + 12.879 us | }
310.243026 | 0) kworker-104 | + 14.158 us | }
310.243026 | 0) kworker-104 | | pick_next_task_fair() {
310.243027 | 0) kworker-104 | 0.187 us | check_cfs_rq_runtime();
310.243028 | 0) kworker-104 | | pick_next_entity() {
310.243029 | 0) kworker-104 | 0.190 us | clear_buddies();
310.243030 | 0) kworker-104 | 1.275 us | }
310.243030 | 0) kworker-104 | | put_prev_entity() {
310.243031 | 0) kworker-104 | 0.181 us | check_cfs_rq_runtime();
310.243032 | 0) kworker-104 | 1.379 us | }
310.243033 | 0) kworker-104 | 0.335 us | set_next_entity();
310.243034 | 0) kworker-104 | 7.416 us | }
310.243035 | 0) kworker-104 | | /* sched_switch: prev_comm=kworker/0:2 prev_pid=104 prev_prio=120 prev_state=R+ ==> next_comm=kworker/0:7 next_pid=336 next_prio=120 */
310.243036 | 0) kworker-104 | 0.185 us | enter_lazy_tlb();
And switches the context back to the first kworker:
------------------------------------------
0) kworker-104 => kworker-336
------------------------------------------
310.243037 | 0) kworker-336 | | finish_task_switch() {
310.243038 | 0) kworker-336 | 0.181 us | preempt_count_sub();
310.243039 | 0) kworker-336 | 1.332 us | }
310.243040 | 0) kworker-336 | | /* __schedule_exit */
310.243040 | 0) kworker-336 | 0.248 us | } /* preempt_count_sub */
310.243041 | 0) kworker-336 | ! 889.847 us | } /* wake_up_worker */
After returning from wake_up_worker(), preemption is disabled and __schedule() is called again, this time by schedule() itself:
310.243041 | 0) kworker-336 | ! 892.553 us | } /* wq_worker_sleeping */
310.243042 | 0) kworker-336 | 0.181 us | preempt_count_add();
310.243043 | 0) kworker-336 | | /* __schedule_entry */
310.243043 | 0) kworker-336 | | rcu_note_context_switch() {
310.243044 | 0) kworker-336 | 0.175 us | rcu_sched_qs();
310.243045 | 0) kworker-336 | 0.174 us | rcu_preempt_qs();
310.243046 | 0) kworker-336 | 2.456 us | }
310.243047 | 0) kworker-336 | 0.188 us | preempt_count_add();
310.243048 | 0) kworker-336 | | update_rq_clock() {
310.243049 | 0) kworker-336 | 0.191 us | kvm_steal_clock();
310.243050 | 0) kworker-336 | 1.429 us | }
310.243051 | 0) kworker-336 | | pick_next_task_fair() {
310.243051 | 0) kworker-336 | | update_curr() {
310.243052 | 0) kworker-336 | 0.221 us | update_min_vruntime();
310.243053 | 0) kworker-336 | | cpuacct_charge() {
310.243054 | 0) kworker-336 | 0.191 us | __rcu_read_lock();
310.243055 | 0) kworker-336 | 0.158 us | __rcu_read_unlock();
310.243056 | 0) kworker-336 | 2.388 us | }
310.243056 | 0) kworker-336 | 0.188 us | __rcu_read_lock();
310.243058 | 0) kworker-336 | 0.154 us | __rcu_read_unlock();
310.243059 | 0) kworker-336 | 7.124 us | }
310.243059 | 0) kworker-336 | 0.200 us | check_cfs_rq_runtime();
310.243060 | 0) kworker-336 | | pick_next_entity() {
310.243061 | 0) kworker-336 | 0.192 us | clear_buddies();
310.243062 | 0) kworker-336 | 1.437 us | }
310.243063 | 0) kworker-336 | + 11.870 us | }
310.243064 | 0) kworker-336 | 0.190 us | preempt_count_sub();
This is the trace_printk() that shows that no context switch happened:
310.243065 | 0) kworker-336 | | /* oh boy, no context switch */
310.243065 | 0) kworker-336 | | /* __schedule_exit */
310.243065 | 0) kworker-336 | 0.216 us | preempt_count_sub();
310.243067 | 0) kworker-336 | | wq_worker_running() {
310.243067 | 0) kworker-336 | 0.214 us | kthread_data();
310.243069 | 0) kworker-336 | 1.390 us | }
310.243069 | 0) kworker-336 | ! 921.052 us | }
In this case, the second __schedule() is always called in vain: the context switch it was supposed to perform already took place, via preemption, inside wake_up_worker().
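For the record, this appears to be the window that the accepted patch closes: with preemption disabled around the worker wakeup, the wakeup cannot end up calling __schedule() on its own, and the context switch is left to the __schedule() that schedule() was about to run anyway. A sketch of how sched_submit_work() looks in later mainline kernels, as I read them (simplified, details approximate):

static inline void sched_submit_work(struct task_struct *tsk)
{
        if (!tsk->state)
                return;

        /*
         * If a worker went to sleep, notify the workqueue. Preemption is
         * disabled here so that the possible wakeup of another kworker
         * does not call schedule() again from inside schedule().
         */
        if (tsk->flags & PF_WQ_WORKER) {
                preempt_disable();
                wq_worker_sleeping(tsk);
                preempt_enable_no_resched();
        }
}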