__schedule() being called twice, the second in vain

Here is one example of the trace of the problem I reported in this e-mail to the LKML

Note Patch accepted! This patch is the first change that was proposed by the analysis of the task model I am developing in my Ph.D. Thesis.

Trace setup

The trace setup I am using is the following:

# cd /sys/kernel/debug/tracing/
# echo preempt_schedule_irq schedule schedule_preempt_disabled schedule_idle schedule_user >> set_graph_function ; done 
# echo function_graph > current_tracer
# echo sched:sched_switch sched:sched_wakeup > set_event
# echo irq_vectors >> set_event
# echo $(( 1408 * 100 )) > buffer_size_kb
# echo > trace

In addition, I added a trace_prink() at the entry and exit of __schedule() and another when there is no context switch (because __schedule() is notrace).

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 70641e8f3c89..406a0b7ed6b3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3443,6 +3443,7 @@ static void __sched notrace __schedule(bool preempt)
        struct rq *rq;
        int cpu;
 
+       trace_printk("__schedule_entry\n");
        cpu = smp_processor_id();
        rq = cpu_rq(cpu);
        prev = rq->curr;
@@ -3517,9 +3518,11 @@ static void __sched notrace __schedule(bool preempt)
        } else {
                rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
                rq_unlock_irq(rq, &rf);
+               trace_printk("oh boy, no context switch\n");
        }
 
        balance_callback(rq);
+       trace_printk("__schedule_exit\n");
 }

This is the perf cmd-line I am using to generate a load that turns the problem visible:

# perf record -o /dev/shm/lala

I ran perf for some seconds and then read the trace.

Trace analysis

The kworker-336 is running:

 ------------------------------------------
 0)   ktimers-8    =>  kworker-336  
 ------------------------------------------

  310.241851 |   0)  kworker-336   |               |    finish_task_switch() {
  310.241851 |   0)  kworker-336   |   0.183 us    |      preempt_count_sub();
  310.241853 |   0)  kworker-336   |   1.467 us    |    }
  310.241853 |   0)  kworker-336   |               |  /* __schedule_exit */
  310.241854 |   0)  kworker-336   |   0.229 us    |    preempt_count_sub();
  310.241855 |   0)  kworker-336   |               |    wq_worker_running() {
  310.241856 |   0)  kworker-336   |   0.280 us    |      kthread_data();
  310.241857 |   0)  kworker-336   |   1.569 us    |    }
  310.241858 |   0)  kworker-336   | * 39582.48 us |  }

At this point it calls the scheduler:

  310.242147 |   0)  kworker-336   |               |  schedule() {
  310.242148 |   0)  kworker-336   |               |    wq_worker_sleeping() {
  310.242149 |   0)  kworker-336   |   0.239 us    |      kthread_data();

while in the wakeup_worker() call:

  310.242151 |   0)  kworker-336   |               |      wake_up_worker() {
  310.242151 |   0)  kworker-336   |   0.186 us    |        preempt_count_add();
  310.242152 |   0)  kworker-336   |               |        wake_up_process() {
  310.242153 |   0)  kworker-336   |               |          try_to_wake_up() {

And another kworker is awakened:

  310.242153 |   0)  kworker-336   |   0.200 us    |            preempt_count_add();
  310.242155 |   0)  kworker-336   |   0.186 us    |            preempt_count_add();
  310.242156 |   0)  kworker-336   |               |            update_rq_clock() {
  310.242157 |   0)  kworker-336   |   0.193 us    |              kvm_steal_clock();
  310.242158 |   0)  kworker-336   |   1.610 us    |            }
  310.242159 |   0)  kworker-336   |               |            activate_task() {
  310.242160 |   0)  kworker-336   |               |              enqueue_task_fair() {
  310.242160 |   0)  kworker-336   |               |                enqueue_entity() {
  310.242161 |   0)  kworker-336   |               |                  update_curr() {
  310.242162 |   0)  kworker-336   |   0.249 us    |                    update_min_vruntime();
  310.242163 |   0)  kworker-336   |               |                    cpuacct_charge() {
  310.242164 |   0)  kworker-336   |   0.155 us    |                      __rcu_read_lock();
  310.242165 |   0)  kworker-336   |   0.160 us    |                      __rcu_read_unlock();
  310.242166 |   0)  kworker-336   |   2.495 us    |                    }
  310.242167 |   0)  kworker-336   |   0.161 us    |                    __rcu_read_lock();
  310.242168 |   0)  kworker-336   |   0.166 us    |                    __rcu_read_unlock();
  310.242169 |   0)  kworker-336   |   7.787 us    |                  }
  310.242170 |   0)  kworker-336   |   0.216 us    |                  update_cfs_group();
  310.242171 |   0)  kworker-336   |   0.216 us    |                  __enqueue_entity();
  310.242172 |   0)  kworker-336   | + 11.478 us   |                }
  310.242173 |   0)  kworker-336   |   0.158 us    |                hrtick_update();
  310.242174 |   0)  kworker-336   | + 13.759 us   |              }
  310.242174 |   0)  kworker-336   | + 15.052 us   |            }
  310.242175 |   0)  kworker-336   |               |            check_preempt_curr() {
  310.242176 |   0)  kworker-336   |               |              check_preempt_wakeup() {
  310.242176 |   0)  kworker-336   |   0.209 us    |                update_curr();
  310.242178 |   0)  kworker-336   |   0.158 us    |                wakeup_preempt_entity.isra.39();
  310.242179 |   0)  kworker-336   |   0.291 us    |                set_next_buddy();
  310.242180 |   0)  kworker-336   |   0.246 us    |                resched_curr_lazy();
  310.242182 |   0)  kworker-336   |   5.704 us    |              }
  310.242183 |   0)  kworker-336   |   7.150 us    |            }
  310.242184 |   0)  kworker-336   |               |            /* sched_wakeup: comm=kworker/0:2 pid=104 prio=120 target_cpu=000 */
  310.242184 |   0)  kworker-336   |   0.349 us    |            preempt_count_sub();
  310.242186 |   0)  kworker-336   |   0.214 us    |            preempt_count_sub();
  310.242187 |   0)  kworker-336   | + 34.075 us   |          }
  310.242188 |   0)  kworker-336   | + 35.213 us   |        }
  310.242189 |   0)  kworker-336   |               |        preempt_count_sub() {

And here __schedule is called:

  310.242190 |   0)  kworker-336   |               |          /* __schedule_entry */
  310.242190 |   0)  kworker-336   |               |          rcu_note_context_switch() {
  310.242191 |   0)  kworker-336   |   0.215 us    |            rcu_sched_qs();
  310.242192 |   0)  kworker-336   |   0.214 us    |            rcu_preempt_qs();
  310.242194 |   0)  kworker-336   |   2.982 us    |          }
  310.242194 |   0)  kworker-336   |   0.237 us    |          preempt_count_add();
  310.242196 |   0)  kworker-336   |               |          update_rq_clock() {
  310.242197 |   0)  kworker-336   |   0.209 us    |            kvm_steal_clock();
  310.242198 |   0)  kworker-336   |   1.657 us    |          }
  310.242199 |   0)  kworker-336   |               |          pick_next_task_fair() {
  310.242199 |   0)  kworker-336   |               |            update_curr() {
  310.242201 |   0)  kworker-336   |   0.261 us    |              update_min_vruntime();
  310.242202 |   0)  kworker-336   |               |              cpuacct_charge() {
  310.242203 |   0)  kworker-336   |   0.203 us    |                __rcu_read_lock();
  310.242204 |   0)  kworker-336   |   0.199 us    |                __rcu_read_unlock();
  310.242206 |   0)  kworker-336   |   2.838 us    |              }
  310.242206 |   0)  kworker-336   |   0.201 us    |              __rcu_read_lock();
  310.242207 |   0)  kworker-336   |   0.275 us    |              __rcu_read_unlock();
  310.242209 |   0)  kworker-336   |   8.898 us    |            }
  310.242209 |   0)  kworker-336   |   0.205 us    |            check_cfs_rq_runtime();
  310.242211 |   0)  kworker-336   |               |            pick_next_entity() {
  310.242212 |   0)  kworker-336   |   0.229 us    |              wakeup_preempt_entity.isra.39();
  310.242213 |   0)  kworker-336   |   0.317 us    |              clear_buddies();
  310.242214 |   0)  kworker-336   |   2.959 us    |            }
  310.242215 |   0)  kworker-336   |               |            put_prev_entity() {
  310.242216 |   0)  kworker-336   |   0.246 us    |              update_curr();
  310.242217 |   0)  kworker-336   |   0.214 us    |              check_cfs_rq_runtime();
  310.242218 |   0)  kworker-336   |   0.382 us    |              __enqueue_entity();
  310.242220 |   0)  kworker-336   |   4.481 us    |            }
  310.242220 |   0)  kworker-336   |   0.461 us    |            set_next_entity();
  310.242222 |   0)  kworker-336   | + 22.840 us   |          }

resulting in a context switch:

  310.242223 |   0)  kworker-336   |               |          /* sched_switch: prev_comm=kworker/0:7 prev_pid=336 prev_prio=120 prev_state=R+ ==> next_comm=kworker/0:2 next_pid=104 next_prio=120 */
  310.242224 |   0)  kworker-336   |   0.244 us    |          enter_lazy_tlb();
 ------------------------------------------
 0)  kworker-336   =>  kworker-104  
 ------------------------------------------
[ lines deleted ]

Until the point in which the kworker leaves the processor:

  310.243004 |   0)  kworker-104   |               |  /* __schedule_entry */
  310.243004 |   0)  kworker-104   |               |    rcu_note_context_switch() {
  310.243005 |   0)  kworker-104   |   0.171 us    |      rcu_sched_qs();
  310.243006 |   0)  kworker-104   |   0.167 us    |      rcu_preempt_qs();
  310.243007 |   0)  kworker-104   |   2.478 us    |    }
  310.243008 |   0)  kworker-104   |   0.184 us    |    preempt_count_add();
  310.243009 |   0)  kworker-104   |               |    update_rq_clock() {
  310.243009 |   0)  kworker-104   |   0.176 us    |      kvm_steal_clock();
  310.243011 |   0)  kworker-104   |   1.405 us    |    }
  310.243011 |   0)  kworker-104   |               |    deactivate_task() {
  310.243012 |   0)  kworker-104   |               |      dequeue_task_fair() {
  310.243012 |   0)  kworker-104   |               |        dequeue_entity() {
  310.243013 |   0)  kworker-104   |               |          update_curr() {
  310.243014 |   0)  kworker-104   |   0.190 us    |            update_min_vruntime();
  310.243015 |   0)  kworker-104   |               |            cpuacct_charge() {
  310.243015 |   0)  kworker-104   |   0.176 us    |              __rcu_read_lock();
  310.243016 |   0)  kworker-104   |   0.157 us    |              __rcu_read_unlock();
  310.243018 |   0)  kworker-104   |   2.370 us    |            }
  310.243018 |   0)  kworker-104   |   0.180 us    |            __rcu_read_lock();
  310.243019 |   0)  kworker-104   |   0.153 us    |            __rcu_read_unlock();
  310.243020 |   0)  kworker-104   |   6.974 us    |          }
  310.243021 |   0)  kworker-104   |   0.193 us    |          clear_buddies();
  310.243022 |   0)  kworker-104   |   0.164 us    |          update_cfs_group();
  310.243023 |   0)  kworker-104   | + 10.589 us   |        }
  310.243024 |   0)  kworker-104   |   0.162 us    |        hrtick_update();
  310.243025 |   0)  kworker-104   | + 12.879 us   |      }
  310.243026 |   0)  kworker-104   | + 14.158 us   |    }
  310.243026 |   0)  kworker-104   |               |    pick_next_task_fair() {
  310.243027 |   0)  kworker-104   |   0.187 us    |      check_cfs_rq_runtime();
  310.243028 |   0)  kworker-104   |               |      pick_next_entity() {
  310.243029 |   0)  kworker-104   |   0.190 us    |        clear_buddies();
  310.243030 |   0)  kworker-104   |   1.275 us    |      }
  310.243030 |   0)  kworker-104   |               |      put_prev_entity() {
  310.243031 |   0)  kworker-104   |   0.181 us    |        check_cfs_rq_runtime();
  310.243032 |   0)  kworker-104   |   1.379 us    |      }
  310.243033 |   0)  kworker-104   |   0.335 us    |      set_next_entity();
  310.243034 |   0)  kworker-104   |   7.416 us    |    }
  310.243035 |   0)  kworker-104   |               |  /* sched_switch: prev_comm=kworker/0:2 prev_pid=104 prev_prio=120 prev_state=R+ ==> next_comm=kworker/0:7 next_pid=336 next_prio=120 */
  310.243036 |   0)  kworker-104   |   0.185 us    |    enter_lazy_tlb();

And switch the context back to the first kworker:

 ------------------------------------------
 0)  kworker-104   =>  kworker-336  
 ------------------------------------------

  310.243037 |   0)  kworker-336   |               |          finish_task_switch() {
  310.243038 |   0)  kworker-336   |   0.181 us    |            preempt_count_sub();
  310.243039 |   0)  kworker-336   |   1.332 us    |          }
  310.243040 |   0)  kworker-336   |               |          /* __schedule_exit */
  310.243040 |   0)  kworker-336   |   0.248 us    |        } /* preempt_count_sub */
  310.243041 |   0)  kworker-336   | ! 889.847 us  |      } /* wake_up_worker */

After returning from wake_up_worker(); the preemption is disabled and __schedule() called again:

  310.243041 |   0)  kworker-336   | ! 892.553 us  |    } /* wq_worker_sleeping */
  310.243042 |   0)  kworker-336   |   0.181 us    |    preempt_count_add();
  310.243043 |   0)  kworker-336   |               |  /* __schedule_entry */
  310.243043 |   0)  kworker-336   |               |    rcu_note_context_switch() {
  310.243044 |   0)  kworker-336   |   0.175 us    |      rcu_sched_qs();
  310.243045 |   0)  kworker-336   |   0.174 us    |      rcu_preempt_qs();
  310.243046 |   0)  kworker-336   |   2.456 us    |    }
  310.243047 |   0)  kworker-336   |   0.188 us    |    preempt_count_add();
  310.243048 |   0)  kworker-336   |               |    update_rq_clock() {
  310.243049 |   0)  kworker-336   |   0.191 us    |      kvm_steal_clock();
  310.243050 |   0)  kworker-336   |   1.429 us    |    }
  310.243051 |   0)  kworker-336   |               |    pick_next_task_fair() {
  310.243051 |   0)  kworker-336   |               |      update_curr() {
  310.243052 |   0)  kworker-336   |   0.221 us    |        update_min_vruntime();
  310.243053 |   0)  kworker-336   |               |        cpuacct_charge() {
  310.243054 |   0)  kworker-336   |   0.191 us    |          __rcu_read_lock();
  310.243055 |   0)  kworker-336   |   0.158 us    |          __rcu_read_unlock();
  310.243056 |   0)  kworker-336   |   2.388 us    |        }
  310.243056 |   0)  kworker-336   |   0.188 us    |        __rcu_read_lock();
  310.243058 |   0)  kworker-336   |   0.154 us    |        __rcu_read_unlock();
  310.243059 |   0)  kworker-336   |   7.124 us    |      }
  310.243059 |   0)  kworker-336   |   0.200 us    |      check_cfs_rq_runtime();
  310.243060 |   0)  kworker-336   |               |      pick_next_entity() {
  310.243061 |   0)  kworker-336   |   0.192 us    |        clear_buddies();
  310.243062 |   0)  kworker-336   |   1.437 us    |      }
  310.243063 |   0)  kworker-336   | + 11.870 us   |    }
  310.243064 |   0)  kworker-336   |   0.190 us    |    preempt_count_sub();

This is the trace_prink() that shows that no context switch:

  310.243065 |   0)  kworker-336   |               |  /* oh boy, no context switch */
  310.243065 |   0)  kworker-336   |               |  /* __schedule_exit */
  310.243065 |   0)  kworker-336   |   0.216 us    |    preempt_count_sub();
  310.243067 |   0)  kworker-336   |               |    wq_worker_running() {
  310.243067 |   0)  kworker-336   |   0.214 us    |      kthread_data();
  310.243069 |   0)  kworker-336   |   1.390 us    |    }
  310.243069 |   0)  kworker-336   | ! 921.052 us  |  }

In this case, the second __schedule() is aways called in vain.