Here is one example of a trace of the problem I reported in this e-mail to the LKML.
Note: the patch was accepted! It is the first change proposed based on the analysis of the task model I am developing in my Ph.D. thesis.
Trace setup
The trace setup I am using is the following:
# cd /sys/kernel/debug/tracing/
# echo preempt_schedule_irq schedule schedule_preempt_disabled schedule_idle schedule_user >> set_graph_function
# echo function_graph > current_tracer
# echo sched:sched_switch sched:sched_wakeup > set_event
# echo irq_vectors >> set_event
# echo $(( 1408 * 100 )) > buffer_size_kb
# echo > trace
In addition, I added a trace_printk() at the entry and at the exit of __schedule(), and another in the branch where there is no context switch (I used trace_printk() because __schedule() is notrace; the messages appear as comments in the function_graph output).
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 70641e8f3c89..406a0b7ed6b3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3443,6 +3443,7 @@ static void __sched notrace __schedule(bool preempt)
 	struct rq *rq;
 	int cpu;
 
+	trace_printk("__schedule_entry\n");
 	cpu = smp_processor_id();
 	rq = cpu_rq(cpu);
 	prev = rq->curr;
@@ -3517,9 +3518,11 @@ static void __sched notrace __schedule(bool preempt)
 	} else {
 		rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
 		rq_unlock_irq(rq, &rf);
+		trace_printk("oh boy, no context switch\n");
 	}
 
 	balance_callback(rq);
+	trace_printk("__schedule_exit\n");
 }
This is the perf command line I am using to generate a load that makes the problem visible:
# perf record -o /dev/shm/lala
I ran perf for a few seconds and then read the trace.
Trace analysis
The kworker-336 is running:
 ------------------------------------------
 0)  ktimers-8   =>  kworker-336
 ------------------------------------------
 310.241851 |   0)  kworker-336 |                |    finish_task_switch() {
 310.241851 |   0)  kworker-336 |      0.183 us  |      preempt_count_sub();
 310.241853 |   0)  kworker-336 |      1.467 us  |    }
 310.241853 |   0)  kworker-336 |                |    /* __schedule_exit */
 310.241854 |   0)  kworker-336 |      0.229 us  |    preempt_count_sub();
 310.241855 |   0)  kworker-336 |                |    wq_worker_running() {
 310.241856 |   0)  kworker-336 |      0.280 us  |      kthread_data();
 310.241857 |   0)  kworker-336 |      1.569 us  |    }
 310.241858 |   0)  kworker-336 |  * 39582.48 us |  }
At this point it calls the scheduler:
 310.242147 |   0)  kworker-336 |                |  schedule() {
 310.242148 |   0)  kworker-336 |                |    wq_worker_sleeping() {
 310.242149 |   0)  kworker-336 |      0.239 us  |      kthread_data();
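To understand why wq_worker_sleeping() shows up inside schedule(), it helps to recall the shape of schedule() in this kernel. Here is a condensed sketch, based on the PREEMPT_RT sources of this era (simplified, not verbatim): the worker notification runs before preemption is disabled for __schedule().

/*
 * Condensed sketch of schedule() and sched_submit_work(), based on the
 * PREEMPT_RT kernel sources of this era (simplified, not verbatim).
 */
asmlinkage __visible void __sched schedule(void)
{
	struct task_struct *tsk = current;

	/* Runs with preemption still enabled. */
	sched_submit_work(tsk);
	do {
		preempt_disable();
		__schedule(false);
		sched_preempt_enable_no_resched();
	} while (need_resched());
}

static inline void sched_submit_work(struct task_struct *tsk)
{
	if (!tsk->state || tsk_is_pi_blocked(tsk))
		return;
	/*
	 * If a worker goes to sleep, notify the workqueue code so that it
	 * can wake up an idle worker to keep the pool busy: this is where
	 * the wq_worker_sleeping() -> wake_up_worker() chain comes from.
	 */
	if (tsk->flags & PF_WQ_WORKER)
		wq_worker_sleeping(tsk);
}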
while in the wake_up_worker() call:
 310.242151 |   0)  kworker-336 |                |      wake_up_worker() {
 310.242151 |   0)  kworker-336 |      0.186 us  |        preempt_count_add();
 310.242152 |   0)  kworker-336 |                |        wake_up_process() {
 310.242153 |   0)  kworker-336 |                |          try_to_wake_up() {
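For reference, wake_up_worker() is small: wq_worker_sleeping() calls it when the worker going to sleep is the pool's last running one and there is still pending work. A condensed version from kernel/workqueue.c of this era (details may differ slightly between versions):

/* Condensed from kernel/workqueue.c: wake the first idle worker, if any. */
static void wake_up_worker(struct worker_pool *pool)
{
	struct worker *worker = first_idle_worker(pool);

	if (likely(worker))
		wake_up_process(worker->task);
}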
And another kworker is awakened:
 310.242153 |   0)  kworker-336 |      0.200 us  |            preempt_count_add();
 310.242155 |   0)  kworker-336 |      0.186 us  |            preempt_count_add();
 310.242156 |   0)  kworker-336 |                |            update_rq_clock() {
 310.242157 |   0)  kworker-336 |      0.193 us  |              kvm_steal_clock();
 310.242158 |   0)  kworker-336 |      1.610 us  |            }
 310.242159 |   0)  kworker-336 |                |            activate_task() {
 310.242160 |   0)  kworker-336 |                |              enqueue_task_fair() {
 310.242160 |   0)  kworker-336 |                |                enqueue_entity() {
 310.242161 |   0)  kworker-336 |                |                  update_curr() {
 310.242162 |   0)  kworker-336 |      0.249 us  |                    update_min_vruntime();
 310.242163 |   0)  kworker-336 |                |                    cpuacct_charge() {
 310.242164 |   0)  kworker-336 |      0.155 us  |                      __rcu_read_lock();
 310.242165 |   0)  kworker-336 |      0.160 us  |                      __rcu_read_unlock();
 310.242166 |   0)  kworker-336 |      2.495 us  |                    }
 310.242167 |   0)  kworker-336 |      0.161 us  |                    __rcu_read_lock();
 310.242168 |   0)  kworker-336 |      0.166 us  |                    __rcu_read_unlock();
 310.242169 |   0)  kworker-336 |      7.787 us  |                  }
 310.242170 |   0)  kworker-336 |      0.216 us  |                  update_cfs_group();
 310.242171 |   0)  kworker-336 |      0.216 us  |                  __enqueue_entity();
 310.242172 |   0)  kworker-336 |   + 11.478 us  |                }
 310.242173 |   0)  kworker-336 |      0.158 us  |                hrtick_update();
 310.242174 |   0)  kworker-336 |   + 13.759 us  |              }
 310.242174 |   0)  kworker-336 |   + 15.052 us  |            }
 310.242175 |   0)  kworker-336 |                |            check_preempt_curr() {
 310.242176 |   0)  kworker-336 |                |              check_preempt_wakeup() {
 310.242176 |   0)  kworker-336 |      0.209 us  |                update_curr();
 310.242178 |   0)  kworker-336 |      0.158 us  |                wakeup_preempt_entity.isra.39();
 310.242179 |   0)  kworker-336 |      0.291 us  |                set_next_buddy();
 310.242180 |   0)  kworker-336 |      0.246 us  |                resched_curr_lazy();
 310.242182 |   0)  kworker-336 |      5.704 us  |              }
 310.242183 |   0)  kworker-336 |      7.150 us  |            }
 310.242184 |   0)  kworker-336 |                |            /* sched_wakeup: comm=kworker/0:2 pid=104 prio=120 target_cpu=000 */
 310.242184 |   0)  kworker-336 |      0.349 us  |            preempt_count_sub();
 310.242186 |   0)  kworker-336 |      0.214 us  |            preempt_count_sub();
 310.242187 |   0)  kworker-336 |   + 34.075 us  |          }
 310.242188 |   0)  kworker-336 |   + 35.213 us  |        }
 310.242189 |   0)  kworker-336 |                |        preempt_count_sub() {
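Note how the block above ends inside a preempt_count_sub() that does not return right away. The wakeup set a rescheduling request (via resched_curr_lazy() in this PREEMPT_RT kernel), so dropping the preempt count back to zero acts as a preemption point, and __schedule() ends up nested under preempt_count_sub() in the graph. For context, this is the mainline preempt_enable() from include/linux/preempt.h; the assumption here is that the PREEMPT_RT lazy-preemption variant extends the same test so that a lazy resched request can also trigger it:

/*
 * From include/linux/preempt.h (CONFIG_PREEMPT): if the decrement drops
 * the preempt count to zero and a reschedule is pending, preempt here.
 */
#define preempt_enable() \
do { \
	barrier(); \
	if (unlikely(preempt_count_dec_and_test())) \
		__preempt_schedule(); \
} while (0)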
And here __schedule() is called, right at this preemption point:
 310.242190 |   0)  kworker-336 |                |          /* __schedule_entry */
 310.242190 |   0)  kworker-336 |                |          rcu_note_context_switch() {
 310.242191 |   0)  kworker-336 |      0.215 us  |            rcu_sched_qs();
 310.242192 |   0)  kworker-336 |      0.214 us  |            rcu_preempt_qs();
 310.242194 |   0)  kworker-336 |      2.982 us  |          }
 310.242194 |   0)  kworker-336 |      0.237 us  |          preempt_count_add();
 310.242196 |   0)  kworker-336 |                |          update_rq_clock() {
 310.242197 |   0)  kworker-336 |      0.209 us  |            kvm_steal_clock();
 310.242198 |   0)  kworker-336 |      1.657 us  |          }
 310.242199 |   0)  kworker-336 |                |          pick_next_task_fair() {
 310.242199 |   0)  kworker-336 |                |            update_curr() {
 310.242201 |   0)  kworker-336 |      0.261 us  |              update_min_vruntime();
 310.242202 |   0)  kworker-336 |                |              cpuacct_charge() {
 310.242203 |   0)  kworker-336 |      0.203 us  |                __rcu_read_lock();
 310.242204 |   0)  kworker-336 |      0.199 us  |                __rcu_read_unlock();
 310.242206 |   0)  kworker-336 |      2.838 us  |              }
 310.242206 |   0)  kworker-336 |      0.201 us  |              __rcu_read_lock();
 310.242207 |   0)  kworker-336 |      0.275 us  |              __rcu_read_unlock();
 310.242209 |   0)  kworker-336 |      8.898 us  |            }
 310.242209 |   0)  kworker-336 |      0.205 us  |            check_cfs_rq_runtime();
 310.242211 |   0)  kworker-336 |                |            pick_next_entity() {
 310.242212 |   0)  kworker-336 |      0.229 us  |              wakeup_preempt_entity.isra.39();
 310.242213 |   0)  kworker-336 |      0.317 us  |              clear_buddies();
 310.242214 |   0)  kworker-336 |      2.959 us  |            }
 310.242215 |   0)  kworker-336 |                |            put_prev_entity() {
 310.242216 |   0)  kworker-336 |      0.246 us  |              update_curr();
 310.242217 |   0)  kworker-336 |      0.214 us  |              check_cfs_rq_runtime();
 310.242218 |   0)  kworker-336 |      0.382 us  |              __enqueue_entity();
 310.242220 |   0)  kworker-336 |      4.481 us  |            }
 310.242220 |   0)  kworker-336 |      0.461 us  |            set_next_entity();
 310.242222 |   0)  kworker-336 |   + 22.840 us  |          }
resulting in a context switch:
 310.242223 |   0)  kworker-336 |                |          /* sched_switch: prev_comm=kworker/0:7 prev_pid=336 prev_prio=120 prev_state=R+ ==> next_comm=kworker/0:2 next_pid=104 next_prio=120 */
 310.242224 |   0)  kworker-336 |      0.244 us  |          enter_lazy_tlb();
 ------------------------------------------
 0)  kworker-336  =>  kworker-104
 ------------------------------------------
 [ lines deleted ]
Until the point at which this kworker leaves the processor:
 310.243004 |   0)  kworker-104 |                |    /* __schedule_entry */
 310.243004 |   0)  kworker-104 |                |    rcu_note_context_switch() {
 310.243005 |   0)  kworker-104 |      0.171 us  |      rcu_sched_qs();
 310.243006 |   0)  kworker-104 |      0.167 us  |      rcu_preempt_qs();
 310.243007 |   0)  kworker-104 |      2.478 us  |    }
 310.243008 |   0)  kworker-104 |      0.184 us  |    preempt_count_add();
 310.243009 |   0)  kworker-104 |                |    update_rq_clock() {
 310.243009 |   0)  kworker-104 |      0.176 us  |      kvm_steal_clock();
 310.243011 |   0)  kworker-104 |      1.405 us  |    }
 310.243011 |   0)  kworker-104 |                |    deactivate_task() {
 310.243012 |   0)  kworker-104 |                |      dequeue_task_fair() {
 310.243012 |   0)  kworker-104 |                |        dequeue_entity() {
 310.243013 |   0)  kworker-104 |                |          update_curr() {
 310.243014 |   0)  kworker-104 |      0.190 us  |            update_min_vruntime();
 310.243015 |   0)  kworker-104 |                |            cpuacct_charge() {
 310.243015 |   0)  kworker-104 |      0.176 us  |              __rcu_read_lock();
 310.243016 |   0)  kworker-104 |      0.157 us  |              __rcu_read_unlock();
 310.243018 |   0)  kworker-104 |      2.370 us  |            }
 310.243018 |   0)  kworker-104 |      0.180 us  |            __rcu_read_lock();
 310.243019 |   0)  kworker-104 |      0.153 us  |            __rcu_read_unlock();
 310.243020 |   0)  kworker-104 |      6.974 us  |          }
 310.243021 |   0)  kworker-104 |      0.193 us  |          clear_buddies();
 310.243022 |   0)  kworker-104 |      0.164 us  |          update_cfs_group();
 310.243023 |   0)  kworker-104 |   + 10.589 us  |        }
 310.243024 |   0)  kworker-104 |      0.162 us  |        hrtick_update();
 310.243025 |   0)  kworker-104 |   + 12.879 us  |      }
 310.243026 |   0)  kworker-104 |   + 14.158 us  |    }
 310.243026 |   0)  kworker-104 |                |    pick_next_task_fair() {
 310.243027 |   0)  kworker-104 |      0.187 us  |      check_cfs_rq_runtime();
 310.243028 |   0)  kworker-104 |                |      pick_next_entity() {
 310.243029 |   0)  kworker-104 |      0.190 us  |        clear_buddies();
 310.243030 |   0)  kworker-104 |      1.275 us  |      }
 310.243030 |   0)  kworker-104 |                |      put_prev_entity() {
 310.243031 |   0)  kworker-104 |      0.181 us  |        check_cfs_rq_runtime();
 310.243032 |   0)  kworker-104 |      1.379 us  |      }
 310.243033 |   0)  kworker-104 |      0.335 us  |      set_next_entity();
 310.243034 |   0)  kworker-104 |      7.416 us  |    }
 310.243035 |   0)  kworker-104 |                |    /* sched_switch: prev_comm=kworker/0:2 prev_pid=104 prev_prio=120 prev_state=R+ ==> next_comm=kworker/0:7 next_pid=336 next_prio=120 */
 310.243036 |   0)  kworker-104 |      0.185 us  |    enter_lazy_tlb();
And the context switches back to the first kworker:
 ------------------------------------------
 0)  kworker-104  =>  kworker-336
 ------------------------------------------
 310.243037 |   0)  kworker-336 |                |          finish_task_switch() {
 310.243038 |   0)  kworker-336 |      0.181 us  |            preempt_count_sub();
 310.243039 |   0)  kworker-336 |      1.332 us  |          }
 310.243040 |   0)  kworker-336 |                |          /* __schedule_exit */
 310.243040 |   0)  kworker-336 |      0.248 us  |        } /* preempt_count_sub */
 310.243041 |   0)  kworker-336 |  ! 889.847 us  |      } /* wake_up_worker */
After returning from wake_up_worker() and wq_worker_sleeping(), preemption is disabled and __schedule() is called again:
 310.243041 |   0)  kworker-336 |  ! 892.553 us  |    } /* wq_worker_sleeping */
 310.243042 |   0)  kworker-336 |      0.181 us  |    preempt_count_add();
 310.243043 |   0)  kworker-336 |                |    /* __schedule_entry */
 310.243043 |   0)  kworker-336 |                |    rcu_note_context_switch() {
 310.243044 |   0)  kworker-336 |      0.175 us  |      rcu_sched_qs();
 310.243045 |   0)  kworker-336 |      0.174 us  |      rcu_preempt_qs();
 310.243046 |   0)  kworker-336 |      2.456 us  |    }
 310.243047 |   0)  kworker-336 |      0.188 us  |    preempt_count_add();
 310.243048 |   0)  kworker-336 |                |    update_rq_clock() {
 310.243049 |   0)  kworker-336 |      0.191 us  |      kvm_steal_clock();
 310.243050 |   0)  kworker-336 |      1.429 us  |    }
 310.243051 |   0)  kworker-336 |                |    pick_next_task_fair() {
 310.243051 |   0)  kworker-336 |                |      update_curr() {
 310.243052 |   0)  kworker-336 |      0.221 us  |        update_min_vruntime();
 310.243053 |   0)  kworker-336 |                |        cpuacct_charge() {
 310.243054 |   0)  kworker-336 |      0.191 us  |          __rcu_read_lock();
 310.243055 |   0)  kworker-336 |      0.158 us  |          __rcu_read_unlock();
 310.243056 |   0)  kworker-336 |      2.388 us  |        }
 310.243056 |   0)  kworker-336 |      0.188 us  |        __rcu_read_lock();
 310.243058 |   0)  kworker-336 |      0.154 us  |        __rcu_read_unlock();
 310.243059 |   0)  kworker-336 |      7.124 us  |      }
 310.243059 |   0)  kworker-336 |      0.200 us  |      check_cfs_rq_runtime();
 310.243060 |   0)  kworker-336 |                |      pick_next_entity() {
 310.243061 |   0)  kworker-336 |      0.192 us  |        clear_buddies();
 310.243062 |   0)  kworker-336 |      1.437 us  |      }
 310.243063 |   0)  kworker-336 |   + 11.870 us  |    }
 310.243064 |   0)  kworker-336 |      0.190 us  |    preempt_count_sub();
This is the trace_printk() that shows that no context switch took place:
 310.243065 |   0)  kworker-336 |                |    /* oh boy, no context switch */
 310.243065 |   0)  kworker-336 |                |    /* __schedule_exit */
 310.243065 |   0)  kworker-336 |      0.216 us  |    preempt_count_sub();
 310.243067 |   0)  kworker-336 |                |    wq_worker_running() {
 310.243067 |   0)  kworker-336 |      0.214 us  |      kthread_data();
 310.243069 |   0)  kworker-336 |      1.390 us  |    }
 310.243069 |   0)  kworker-336 |  ! 921.052 us  |  }
In this case, the second call to __schedule() is always made in vain.
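Putting the pieces together, this is the sequence the trace shows:

/*
 * Schematic summary of the trace above (kworker/0:7 = pid 336,
 * kworker/0:2 = pid 104; timestamps from the trace):
 *
 * 310.242147  schedule()                        kworker-336
 * 310.242148    wq_worker_sleeping()
 * 310.242151      wake_up_worker()
 * 310.242153        try_to_wake_up()            wakes kworker-104
 * 310.242189        preempt_count_sub()         preemption point:
 * 310.242190          __schedule() #1           switch 336 => 104
 * 310.243004          __schedule()              104 blocks: 104 => 336
 * 310.243042    preempt_disable()
 * 310.243043    __schedule() #2                 picks the same task:
 *                                               "oh boy, no context switch"
 */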