scx_simple.bpf.c 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159
  1. /* SPDX-License-Identifier: GPL-2.0 */
  2. /*
  3. * A simple scheduler.
  4. *
  5. * By default, it operates as a simple global weighted vtime scheduler and can
  6. * be switched to FIFO scheduling. It also demonstrates the following niceties.
  7. *
  8. * - Statistics tracking how many tasks are queued to local and global dsq's.
  9. * - Termination notification for userspace.
  10. *
  11. * While very simple, this scheduler should work reasonably well on CPUs with a
  12. * uniform L3 cache topology. While preemption is not implemented, the fact that
  13. * the scheduling queue is shared across all CPUs means that whatever is at the
  14. * front of the queue is likely to be executed fairly quickly given enough
  15. * number of CPUs. The FIFO scheduling mode may be beneficial to some workloads
  16. * but comes with the usual problems with FIFO scheduling where saturating
  17. * threads can easily drown out interactive ones.
  18. *
  19. * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
  20. * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
  21. * Copyright (c) 2022 David Vernet <dvernet@meta.com>
  22. */
  23. #include <scx/common.bpf.h>
  24. char _license[] SEC("license") = "GPL";
  25. const volatile bool fifo_sched;
  26. static u64 vtime_now;
  27. UEI_DEFINE(uei);
  28. /*
  29. * Built-in DSQs such as SCX_DSQ_GLOBAL cannot be used as priority queues
  30. * (meaning, cannot be dispatched to with scx_bpf_dsq_insert_vtime()). We
  31. * therefore create a separate DSQ with ID 0 that we dispatch to and consume
  32. * from. If scx_simple only supported global FIFO scheduling, then we could just
  33. * use SCX_DSQ_GLOBAL.
  34. */
  35. #define SHARED_DSQ 0
  36. struct {
  37. __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
  38. __uint(key_size, sizeof(u32));
  39. __uint(value_size, sizeof(u64));
  40. __uint(max_entries, 2); /* [local, global] */
  41. } stats SEC(".maps");
  42. static void stat_inc(u32 idx)
  43. {
  44. u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx);
  45. if (cnt_p)
  46. (*cnt_p)++;
  47. }
  48. s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
  49. {
  50. bool is_idle = false;
  51. s32 cpu;
  52. cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
  53. if (is_idle) {
  54. stat_inc(0); /* count local queueing */
  55. scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
  56. }
  57. return cpu;
  58. }
  59. void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags)
  60. {
  61. stat_inc(1); /* count global queueing */
  62. if (fifo_sched) {
  63. scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
  64. } else {
  65. u64 vtime = p->scx.dsq_vtime;
  66. /*
  67. * Limit the amount of budget that an idling task can accumulate
  68. * to one slice.
  69. */
  70. if (time_before(vtime, vtime_now - SCX_SLICE_DFL))
  71. vtime = vtime_now - SCX_SLICE_DFL;
  72. scx_bpf_dsq_insert_vtime(p, SHARED_DSQ, SCX_SLICE_DFL, vtime,
  73. enq_flags);
  74. }
  75. }
  76. void BPF_STRUCT_OPS(simple_dispatch, s32 cpu, struct task_struct *prev)
  77. {
  78. scx_bpf_dsq_move_to_local(SHARED_DSQ);
  79. }
  80. void BPF_STRUCT_OPS(simple_running, struct task_struct *p)
  81. {
  82. if (fifo_sched)
  83. return;
  84. /*
  85. * Global vtime always progresses forward as tasks start executing. The
  86. * test and update can be performed concurrently from multiple CPUs and
  87. * thus racy. Any error should be contained and temporary. Let's just
  88. * live with it.
  89. */
  90. if (time_before(vtime_now, p->scx.dsq_vtime))
  91. vtime_now = p->scx.dsq_vtime;
  92. }
  93. void BPF_STRUCT_OPS(simple_stopping, struct task_struct *p, bool runnable)
  94. {
  95. if (fifo_sched)
  96. return;
  97. /*
  98. * Scale the execution time by the inverse of the weight and charge.
  99. *
  100. * Note that the default yield implementation yields by setting
  101. * @p->scx.slice to zero and the following would treat the yielding task
  102. * as if it has consumed all its slice. If this penalizes yielding tasks
  103. * too much, determine the execution time by taking explicit timestamps
  104. * instead of depending on @p->scx.slice.
  105. */
  106. p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight;
  107. }
  108. void BPF_STRUCT_OPS(simple_enable, struct task_struct *p)
  109. {
  110. p->scx.dsq_vtime = vtime_now;
  111. }
  112. s32 BPF_STRUCT_OPS_SLEEPABLE(simple_init)
  113. {
  114. int ret;
  115. ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
  116. if (ret) {
  117. scx_bpf_error("failed to create DSQ %d (%d)", SHARED_DSQ, ret);
  118. return ret;
  119. }
  120. return 0;
  121. }
  122. void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei)
  123. {
  124. UEI_RECORD(uei, ei);
  125. }
  126. SCX_OPS_DEFINE(simple_ops,
  127. .select_cpu = (void *)simple_select_cpu,
  128. .enqueue = (void *)simple_enqueue,
  129. .dispatch = (void *)simple_dispatch,
  130. .running = (void *)simple_running,
  131. .stopping = (void *)simple_stopping,
  132. .enable = (void *)simple_enable,
  133. .init = (void *)simple_init,
  134. .exit = (void *)simple_exit,
  135. .name = "simple");