scx_qmap.bpf.c 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915
  1. /* SPDX-License-Identifier: GPL-2.0 */
  2. /*
  3. * A simple five-level FIFO queue scheduler.
  4. *
  5. * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets
  6. * assigned to one depending on its compound weight. Each CPU round robins
  7. * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from
  8. * queue0, 2 from queue1, 4 from queue2 and so on.
  9. *
  10. * This scheduler demonstrates:
  11. *
  12. * - BPF-side queueing using PIDs.
  13. * - Sleepable per-task storage allocation using ops.prep_enable().
  14. * - Using ops.cpu_release() to handle a higher priority scheduling class taking
  15. * the CPU away.
  16. * - Core-sched support.
  17. *
  18. * This scheduler is primarily for demonstration and testing of sched_ext
  19. * features and unlikely to be useful for actual workloads.
  20. *
  21. * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
  22. * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
  23. * Copyright (c) 2022 David Vernet <dvernet@meta.com>
  24. */
  25. #include <scx/common.bpf.h>
  26. enum consts {
  27. ONE_SEC_IN_NS = 1000000000,
  28. SHARED_DSQ = 0,
  29. HIGHPRI_DSQ = 1,
  30. HIGHPRI_WEIGHT = 8668, /* this is what -20 maps to */
  31. };
  32. char _license[] SEC("license") = "GPL";
  33. const volatile u64 slice_ns;
  34. const volatile u32 stall_user_nth;
  35. const volatile u32 stall_kernel_nth;
  36. const volatile u32 dsp_inf_loop_after;
  37. const volatile u32 dsp_batch;
  38. const volatile bool highpri_boosting;
  39. const volatile bool print_dsqs_and_events;
  40. const volatile bool print_msgs;
  41. const volatile s32 disallow_tgid;
  42. const volatile bool suppress_dump;
  43. u64 nr_highpri_queued;
  44. u32 test_error_cnt;
  45. UEI_DEFINE(uei);
  46. struct qmap {
  47. __uint(type, BPF_MAP_TYPE_QUEUE);
  48. __uint(max_entries, 4096);
  49. __type(value, u32);
  50. } queue0 SEC(".maps"),
  51. queue1 SEC(".maps"),
  52. queue2 SEC(".maps"),
  53. queue3 SEC(".maps"),
  54. queue4 SEC(".maps"),
  55. dump_store SEC(".maps");
  56. struct {
  57. __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
  58. __uint(max_entries, 5);
  59. __type(key, int);
  60. __array(values, struct qmap);
  61. } queue_arr SEC(".maps") = {
  62. .values = {
  63. [0] = &queue0,
  64. [1] = &queue1,
  65. [2] = &queue2,
  66. [3] = &queue3,
  67. [4] = &queue4,
  68. },
  69. };
  70. /*
  71. * If enabled, CPU performance target is set according to the queue index
  72. * according to the following table.
  73. */
  74. static const u32 qidx_to_cpuperf_target[] = {
  75. [0] = SCX_CPUPERF_ONE * 0 / 4,
  76. [1] = SCX_CPUPERF_ONE * 1 / 4,
  77. [2] = SCX_CPUPERF_ONE * 2 / 4,
  78. [3] = SCX_CPUPERF_ONE * 3 / 4,
  79. [4] = SCX_CPUPERF_ONE * 4 / 4,
  80. };
  81. /*
  82. * Per-queue sequence numbers to implement core-sched ordering.
  83. *
  84. * Tail seq is assigned to each queued task and incremented. Head seq tracks the
  85. * sequence number of the latest dispatched task. The distance between the a
  86. * task's seq and the associated queue's head seq is called the queue distance
  87. * and used when comparing two tasks for ordering. See qmap_core_sched_before().
  88. */
  89. static u64 core_sched_head_seqs[5];
  90. static u64 core_sched_tail_seqs[5];
  91. /* Per-task scheduling context */
  92. struct task_ctx {
  93. bool force_local; /* Dispatch directly to local_dsq */
  94. bool highpri;
  95. u64 core_sched_seq;
  96. };
  97. struct {
  98. __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
  99. __uint(map_flags, BPF_F_NO_PREALLOC);
  100. __type(key, int);
  101. __type(value, struct task_ctx);
  102. } task_ctx_stor SEC(".maps");
  103. struct cpu_ctx {
  104. u64 dsp_idx; /* dispatch index */
  105. u64 dsp_cnt; /* remaining count */
  106. u32 avg_weight;
  107. u32 cpuperf_target;
  108. };
  109. struct {
  110. __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
  111. __uint(max_entries, 1);
  112. __type(key, u32);
  113. __type(value, struct cpu_ctx);
  114. } cpu_ctx_stor SEC(".maps");
  115. /* Statistics */
  116. u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued, nr_ddsp_from_enq;
  117. u64 nr_core_sched_execed;
  118. u64 nr_expedited_local, nr_expedited_remote, nr_expedited_lost, nr_expedited_from_timer;
  119. u32 cpuperf_min, cpuperf_avg, cpuperf_max;
  120. u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;
  121. static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
  122. {
  123. s32 cpu;
  124. if (p->nr_cpus_allowed == 1 ||
  125. scx_bpf_test_and_clear_cpu_idle(prev_cpu))
  126. return prev_cpu;
  127. cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
  128. if (cpu >= 0)
  129. return cpu;
  130. return -1;
  131. }
  132. static struct task_ctx *lookup_task_ctx(struct task_struct *p)
  133. {
  134. struct task_ctx *tctx;
  135. if (!(tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) {
  136. scx_bpf_error("task_ctx lookup failed");
  137. return NULL;
  138. }
  139. return tctx;
  140. }
  141. s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
  142. s32 prev_cpu, u64 wake_flags)
  143. {
  144. struct task_ctx *tctx;
  145. s32 cpu;
  146. if (!(tctx = lookup_task_ctx(p)))
  147. return -ESRCH;
  148. cpu = pick_direct_dispatch_cpu(p, prev_cpu);
  149. if (cpu >= 0) {
  150. tctx->force_local = true;
  151. return cpu;
  152. } else {
  153. return prev_cpu;
  154. }
  155. }
  156. static int weight_to_idx(u32 weight)
  157. {
  158. /* Coarsely map the compound weight to a FIFO. */
  159. if (weight <= 25)
  160. return 0;
  161. else if (weight <= 50)
  162. return 1;
  163. else if (weight < 200)
  164. return 2;
  165. else if (weight < 400)
  166. return 3;
  167. else
  168. return 4;
  169. }
  170. void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
  171. {
  172. static u32 user_cnt, kernel_cnt;
  173. struct task_ctx *tctx;
  174. u32 pid = p->pid;
  175. int idx = weight_to_idx(p->scx.weight);
  176. void *ring;
  177. s32 cpu;
  178. if (enq_flags & SCX_ENQ_REENQ)
  179. __sync_fetch_and_add(&nr_reenqueued, 1);
  180. if (p->flags & PF_KTHREAD) {
  181. if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth))
  182. return;
  183. } else {
  184. if (stall_user_nth && !(++user_cnt % stall_user_nth))
  185. return;
  186. }
  187. if (test_error_cnt && !--test_error_cnt)
  188. scx_bpf_error("test triggering error");
  189. if (!(tctx = lookup_task_ctx(p)))
  190. return;
  191. /*
  192. * All enqueued tasks must have their core_sched_seq updated for correct
  193. * core-sched ordering. Also, take a look at the end of qmap_dispatch().
  194. */
  195. tctx->core_sched_seq = core_sched_tail_seqs[idx]++;
  196. /*
  197. * If qmap_select_cpu() is telling us to or this is the last runnable
  198. * task on the CPU, enqueue locally.
  199. */
  200. if (tctx->force_local) {
  201. tctx->force_local = false;
  202. scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
  203. return;
  204. }
  205. /* if select_cpu() wasn't called, try direct dispatch */
  206. if (!__COMPAT_is_enq_cpu_selected(enq_flags) &&
  207. (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) {
  208. __sync_fetch_and_add(&nr_ddsp_from_enq, 1);
  209. scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags);
  210. return;
  211. }
  212. /*
  213. * If the task was re-enqueued due to the CPU being preempted by a
  214. * higher priority scheduling class, just re-enqueue the task directly
  215. * on the global DSQ. As we want another CPU to pick it up, find and
  216. * kick an idle CPU.
  217. */
  218. if (enq_flags & SCX_ENQ_REENQ) {
  219. s32 cpu;
  220. scx_bpf_dsq_insert(p, SHARED_DSQ, 0, enq_flags);
  221. cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
  222. if (cpu >= 0)
  223. scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
  224. return;
  225. }
  226. ring = bpf_map_lookup_elem(&queue_arr, &idx);
  227. if (!ring) {
  228. scx_bpf_error("failed to find ring %d", idx);
  229. return;
  230. }
  231. /* Queue on the selected FIFO. If the FIFO overflows, punt to global. */
  232. if (bpf_map_push_elem(ring, &pid, 0)) {
  233. scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, enq_flags);
  234. return;
  235. }
  236. if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
  237. tctx->highpri = true;
  238. __sync_fetch_and_add(&nr_highpri_queued, 1);
  239. }
  240. __sync_fetch_and_add(&nr_enqueued, 1);
  241. }
  242. /*
  243. * The BPF queue map doesn't support removal and sched_ext can handle spurious
  244. * dispatches. qmap_dequeue() is only used to collect statistics.
  245. */
  246. void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
  247. {
  248. __sync_fetch_and_add(&nr_dequeued, 1);
  249. if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC)
  250. __sync_fetch_and_add(&nr_core_sched_execed, 1);
  251. }
  252. static void update_core_sched_head_seq(struct task_struct *p)
  253. {
  254. int idx = weight_to_idx(p->scx.weight);
  255. struct task_ctx *tctx;
  256. if ((tctx = lookup_task_ctx(p)))
  257. core_sched_head_seqs[idx] = tctx->core_sched_seq;
  258. }
  259. /*
  260. * To demonstrate the use of scx_bpf_dsq_move(), implement silly selective
  261. * priority boosting mechanism by scanning SHARED_DSQ looking for highpri tasks,
  262. * moving them to HIGHPRI_DSQ and then consuming them first. This makes minor
  263. * difference only when dsp_batch is larger than 1.
  264. *
  265. * scx_bpf_dispatch[_vtime]_from_dsq() are allowed both from ops.dispatch() and
  266. * non-rq-lock holding BPF programs. As demonstration, this function is called
  267. * from qmap_dispatch() and monitor_timerfn().
  268. */
  269. static bool dispatch_highpri(bool from_timer)
  270. {
  271. struct task_struct *p;
  272. s32 this_cpu = bpf_get_smp_processor_id();
  273. /* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */
  274. bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
  275. static u64 highpri_seq;
  276. struct task_ctx *tctx;
  277. if (!(tctx = lookup_task_ctx(p)))
  278. return false;
  279. if (tctx->highpri) {
  280. /* exercise the set_*() and vtime interface too */
  281. scx_bpf_dsq_move_set_slice(BPF_FOR_EACH_ITER, slice_ns * 2);
  282. scx_bpf_dsq_move_set_vtime(BPF_FOR_EACH_ITER, highpri_seq++);
  283. scx_bpf_dsq_move_vtime(BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0);
  284. }
  285. }
  286. /*
  287. * Scan HIGHPRI_DSQ and dispatch until a task that can run on this CPU
  288. * is found.
  289. */
  290. bpf_for_each(scx_dsq, p, HIGHPRI_DSQ, 0) {
  291. bool dispatched = false;
  292. s32 cpu;
  293. if (bpf_cpumask_test_cpu(this_cpu, p->cpus_ptr))
  294. cpu = this_cpu;
  295. else
  296. cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);
  297. if (scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, SCX_DSQ_LOCAL_ON | cpu,
  298. SCX_ENQ_PREEMPT)) {
  299. if (cpu == this_cpu) {
  300. dispatched = true;
  301. __sync_fetch_and_add(&nr_expedited_local, 1);
  302. } else {
  303. __sync_fetch_and_add(&nr_expedited_remote, 1);
  304. }
  305. if (from_timer)
  306. __sync_fetch_and_add(&nr_expedited_from_timer, 1);
  307. } else {
  308. __sync_fetch_and_add(&nr_expedited_lost, 1);
  309. }
  310. if (dispatched)
  311. return true;
  312. }
  313. return false;
  314. }
  315. void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
  316. {
  317. struct task_struct *p;
  318. struct cpu_ctx *cpuc;
  319. struct task_ctx *tctx;
  320. u32 zero = 0, batch = dsp_batch ?: 1;
  321. void *fifo;
  322. s32 i, pid;
  323. if (dispatch_highpri(false))
  324. return;
  325. if (!nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ))
  326. return;
  327. if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
  328. /*
  329. * PID 2 should be kthreadd which should mostly be idle and off
  330. * the scheduler. Let's keep dispatching it to force the kernel
  331. * to call this function over and over again.
  332. */
  333. p = bpf_task_from_pid(2);
  334. if (p) {
  335. scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, 0);
  336. bpf_task_release(p);
  337. return;
  338. }
  339. }
  340. if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
  341. scx_bpf_error("failed to look up cpu_ctx");
  342. return;
  343. }
  344. for (i = 0; i < 5; i++) {
  345. /* Advance the dispatch cursor and pick the fifo. */
  346. if (!cpuc->dsp_cnt) {
  347. cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5;
  348. cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
  349. }
  350. fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx);
  351. if (!fifo) {
  352. scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx);
  353. return;
  354. }
  355. /* Dispatch or advance. */
  356. bpf_repeat(BPF_MAX_LOOPS) {
  357. struct task_ctx *tctx;
  358. if (bpf_map_pop_elem(fifo, &pid))
  359. break;
  360. p = bpf_task_from_pid(pid);
  361. if (!p)
  362. continue;
  363. if (!(tctx = lookup_task_ctx(p))) {
  364. bpf_task_release(p);
  365. return;
  366. }
  367. if (tctx->highpri)
  368. __sync_fetch_and_sub(&nr_highpri_queued, 1);
  369. update_core_sched_head_seq(p);
  370. __sync_fetch_and_add(&nr_dispatched, 1);
  371. scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, 0);
  372. bpf_task_release(p);
  373. batch--;
  374. cpuc->dsp_cnt--;
  375. if (!batch || !scx_bpf_dispatch_nr_slots()) {
  376. if (dispatch_highpri(false))
  377. return;
  378. scx_bpf_dsq_move_to_local(SHARED_DSQ);
  379. return;
  380. }
  381. if (!cpuc->dsp_cnt)
  382. break;
  383. }
  384. cpuc->dsp_cnt = 0;
  385. }
  386. /*
  387. * No other tasks. @prev will keep running. Update its core_sched_seq as
  388. * if the task were enqueued and dispatched immediately.
  389. */
  390. if (prev) {
  391. tctx = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
  392. if (!tctx) {
  393. scx_bpf_error("task_ctx lookup failed");
  394. return;
  395. }
  396. tctx->core_sched_seq =
  397. core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
  398. }
  399. }
  400. void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
  401. {
  402. struct cpu_ctx *cpuc;
  403. u32 zero = 0;
  404. int idx;
  405. if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
  406. scx_bpf_error("failed to look up cpu_ctx");
  407. return;
  408. }
  409. /*
  410. * Use the running avg of weights to select the target cpuperf level.
  411. * This is a demonstration of the cpuperf feature rather than a
  412. * practical strategy to regulate CPU frequency.
  413. */
  414. cpuc->avg_weight = cpuc->avg_weight * 3 / 4 + p->scx.weight / 4;
  415. idx = weight_to_idx(cpuc->avg_weight);
  416. cpuc->cpuperf_target = qidx_to_cpuperf_target[idx];
  417. scx_bpf_cpuperf_set(scx_bpf_task_cpu(p), cpuc->cpuperf_target);
  418. }
  419. /*
  420. * The distance from the head of the queue scaled by the weight of the queue.
  421. * The lower the number, the older the task and the higher the priority.
  422. */
  423. static s64 task_qdist(struct task_struct *p)
  424. {
  425. int idx = weight_to_idx(p->scx.weight);
  426. struct task_ctx *tctx;
  427. s64 qdist;
  428. tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
  429. if (!tctx) {
  430. scx_bpf_error("task_ctx lookup failed");
  431. return 0;
  432. }
  433. qdist = tctx->core_sched_seq - core_sched_head_seqs[idx];
  434. /*
  435. * As queue index increments, the priority doubles. The queue w/ index 3
  436. * is dispatched twice more frequently than 2. Reflect the difference by
  437. * scaling qdists accordingly. Note that the shift amount needs to be
  438. * flipped depending on the sign to avoid flipping priority direction.
  439. */
  440. if (qdist >= 0)
  441. return qdist << (4 - idx);
  442. else
  443. return qdist << idx;
  444. }
  445. /*
  446. * This is called to determine the task ordering when core-sched is picking
  447. * tasks to execute on SMT siblings and should encode about the same ordering as
  448. * the regular scheduling path. Use the priority-scaled distances from the head
  449. * of the queues to compare the two tasks which should be consistent with the
  450. * dispatch path behavior.
  451. */
  452. bool BPF_STRUCT_OPS(qmap_core_sched_before,
  453. struct task_struct *a, struct task_struct *b)
  454. {
  455. return task_qdist(a) > task_qdist(b);
  456. }
  457. SEC("tp_btf/sched_switch")
  458. int BPF_PROG(qmap_sched_switch, bool preempt, struct task_struct *prev,
  459. struct task_struct *next, unsigned long prev_state)
  460. {
  461. if (!__COMPAT_scx_bpf_reenqueue_local_from_anywhere())
  462. return 0;
  463. /*
  464. * If @cpu is taken by a higher priority scheduling class, it is no
  465. * longer available for executing sched_ext tasks. As we don't want the
  466. * tasks in @cpu's local dsq to sit there until @cpu becomes available
  467. * again, re-enqueue them into the global dsq. See %SCX_ENQ_REENQ
  468. * handling in qmap_enqueue().
  469. */
  470. switch (next->policy) {
  471. case 1: /* SCHED_FIFO */
  472. case 2: /* SCHED_RR */
  473. case 6: /* SCHED_DEADLINE */
  474. scx_bpf_reenqueue_local();
  475. }
  476. return 0;
  477. }
  478. void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
  479. {
  480. /* see qmap_sched_switch() to learn how to do this on newer kernels */
  481. if (!__COMPAT_scx_bpf_reenqueue_local_from_anywhere())
  482. scx_bpf_reenqueue_local();
  483. }
  484. s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
  485. struct scx_init_task_args *args)
  486. {
  487. if (p->tgid == disallow_tgid)
  488. p->scx.disallow = true;
  489. /*
  490. * @p is new. Let's ensure that its task_ctx is available. We can sleep
  491. * in this function and the following will automatically use GFP_KERNEL.
  492. */
  493. if (bpf_task_storage_get(&task_ctx_stor, p, 0,
  494. BPF_LOCAL_STORAGE_GET_F_CREATE))
  495. return 0;
  496. else
  497. return -ENOMEM;
  498. }
  499. void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
  500. {
  501. s32 i, pid;
  502. if (suppress_dump)
  503. return;
  504. bpf_for(i, 0, 5) {
  505. void *fifo;
  506. if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i)))
  507. return;
  508. scx_bpf_dump("QMAP FIFO[%d]:", i);
  509. /*
  510. * Dump can be invoked anytime and there is no way to iterate in
  511. * a non-destructive way. Pop and store in dump_store and then
  512. * restore afterwards. If racing against new enqueues, ordering
  513. * can get mixed up.
  514. */
  515. bpf_repeat(4096) {
  516. if (bpf_map_pop_elem(fifo, &pid))
  517. break;
  518. bpf_map_push_elem(&dump_store, &pid, 0);
  519. scx_bpf_dump(" %d", pid);
  520. }
  521. bpf_repeat(4096) {
  522. if (bpf_map_pop_elem(&dump_store, &pid))
  523. break;
  524. bpf_map_push_elem(fifo, &pid, 0);
  525. }
  526. scx_bpf_dump("\n");
  527. }
  528. }
  529. void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle)
  530. {
  531. u32 zero = 0;
  532. struct cpu_ctx *cpuc;
  533. if (suppress_dump || idle)
  534. return;
  535. if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu)))
  536. return;
  537. scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu avg_weight=%u cpuperf_target=%u",
  538. cpuc->dsp_idx, cpuc->dsp_cnt, cpuc->avg_weight,
  539. cpuc->cpuperf_target);
  540. }
  541. void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p)
  542. {
  543. struct task_ctx *taskc;
  544. if (suppress_dump)
  545. return;
  546. if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0)))
  547. return;
  548. scx_bpf_dump("QMAP: force_local=%d core_sched_seq=%llu",
  549. taskc->force_local, taskc->core_sched_seq);
  550. }
  551. s32 BPF_STRUCT_OPS(qmap_cgroup_init, struct cgroup *cgrp, struct scx_cgroup_init_args *args)
  552. {
  553. if (print_msgs)
  554. bpf_printk("CGRP INIT %llu weight=%u period=%lu quota=%ld burst=%lu",
  555. cgrp->kn->id, args->weight, args->bw_period_us,
  556. args->bw_quota_us, args->bw_burst_us);
  557. return 0;
  558. }
  559. void BPF_STRUCT_OPS(qmap_cgroup_set_weight, struct cgroup *cgrp, u32 weight)
  560. {
  561. if (print_msgs)
  562. bpf_printk("CGRP SET %llu weight=%u", cgrp->kn->id, weight);
  563. }
  564. void BPF_STRUCT_OPS(qmap_cgroup_set_bandwidth, struct cgroup *cgrp,
  565. u64 period_us, u64 quota_us, u64 burst_us)
  566. {
  567. if (print_msgs)
  568. bpf_printk("CGRP SET %llu period=%lu quota=%ld burst=%lu",
  569. cgrp->kn->id, period_us, quota_us, burst_us);
  570. }
  571. /*
  572. * Print out the online and possible CPU map using bpf_printk() as a
  573. * demonstration of using the cpumask kfuncs and ops.cpu_on/offline().
  574. */
  575. static void print_cpus(void)
  576. {
  577. const struct cpumask *possible, *online;
  578. s32 cpu;
  579. char buf[128] = "", *p;
  580. int idx;
  581. possible = scx_bpf_get_possible_cpumask();
  582. online = scx_bpf_get_online_cpumask();
  583. idx = 0;
  584. bpf_for(cpu, 0, scx_bpf_nr_cpu_ids()) {
  585. if (!(p = MEMBER_VPTR(buf, [idx++])))
  586. break;
  587. if (bpf_cpumask_test_cpu(cpu, online))
  588. *p++ = 'O';
  589. else if (bpf_cpumask_test_cpu(cpu, possible))
  590. *p++ = 'X';
  591. else
  592. *p++ = ' ';
  593. if ((cpu & 7) == 7) {
  594. if (!(p = MEMBER_VPTR(buf, [idx++])))
  595. break;
  596. *p++ = '|';
  597. }
  598. }
  599. buf[sizeof(buf) - 1] = '\0';
  600. scx_bpf_put_cpumask(online);
  601. scx_bpf_put_cpumask(possible);
  602. bpf_printk("CPUS: |%s", buf);
  603. }
  604. void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu)
  605. {
  606. if (print_msgs) {
  607. bpf_printk("CPU %d coming online", cpu);
  608. /* @cpu is already online at this point */
  609. print_cpus();
  610. }
  611. }
  612. void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu)
  613. {
  614. if (print_msgs) {
  615. bpf_printk("CPU %d going offline", cpu);
  616. /* @cpu is still online at this point */
  617. print_cpus();
  618. }
  619. }
  620. struct monitor_timer {
  621. struct bpf_timer timer;
  622. };
  623. struct {
  624. __uint(type, BPF_MAP_TYPE_ARRAY);
  625. __uint(max_entries, 1);
  626. __type(key, u32);
  627. __type(value, struct monitor_timer);
  628. } monitor_timer SEC(".maps");
  629. /*
  630. * Print out the min, avg and max performance levels of CPUs every second to
  631. * demonstrate the cpuperf interface.
  632. */
  633. static void monitor_cpuperf(void)
  634. {
  635. u32 zero = 0, nr_cpu_ids;
  636. u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0;
  637. u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0;
  638. const struct cpumask *online;
  639. int i, nr_online_cpus = 0;
  640. nr_cpu_ids = scx_bpf_nr_cpu_ids();
  641. online = scx_bpf_get_online_cpumask();
  642. bpf_for(i, 0, nr_cpu_ids) {
  643. struct cpu_ctx *cpuc;
  644. u32 cap, cur;
  645. if (!bpf_cpumask_test_cpu(i, online))
  646. continue;
  647. nr_online_cpus++;
  648. /* collect the capacity and current cpuperf */
  649. cap = scx_bpf_cpuperf_cap(i);
  650. cur = scx_bpf_cpuperf_cur(i);
  651. cur_min = cur < cur_min ? cur : cur_min;
  652. cur_max = cur > cur_max ? cur : cur_max;
  653. /*
  654. * $cur is relative to $cap. Scale it down accordingly so that
  655. * it's in the same scale as other CPUs and $cur_sum/$cap_sum
  656. * makes sense.
  657. */
  658. cur_sum += cur * cap / SCX_CPUPERF_ONE;
  659. cap_sum += cap;
  660. if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) {
  661. scx_bpf_error("failed to look up cpu_ctx");
  662. goto out;
  663. }
  664. /* collect target */
  665. cur = cpuc->cpuperf_target;
  666. target_sum += cur;
  667. target_min = cur < target_min ? cur : target_min;
  668. target_max = cur > target_max ? cur : target_max;
  669. }
  670. cpuperf_min = cur_min;
  671. cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum;
  672. cpuperf_max = cur_max;
  673. cpuperf_target_min = target_min;
  674. cpuperf_target_avg = target_sum / nr_online_cpus;
  675. cpuperf_target_max = target_max;
  676. out:
  677. scx_bpf_put_cpumask(online);
  678. }
  679. /*
  680. * Dump the currently queued tasks in the shared DSQ to demonstrate the usage of
  681. * scx_bpf_dsq_nr_queued() and DSQ iterator. Raise the dispatch batch count to
  682. * see meaningful dumps in the trace pipe.
  683. */
  684. static void dump_shared_dsq(void)
  685. {
  686. struct task_struct *p;
  687. s32 nr;
  688. if (!(nr = scx_bpf_dsq_nr_queued(SHARED_DSQ)))
  689. return;
  690. bpf_printk("Dumping %d tasks in SHARED_DSQ in reverse order", nr);
  691. bpf_rcu_read_lock();
  692. bpf_for_each(scx_dsq, p, SHARED_DSQ, SCX_DSQ_ITER_REV)
  693. bpf_printk("%s[%d]", p->comm, p->pid);
  694. bpf_rcu_read_unlock();
  695. }
  696. static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer)
  697. {
  698. bpf_rcu_read_lock();
  699. dispatch_highpri(true);
  700. bpf_rcu_read_unlock();
  701. monitor_cpuperf();
  702. if (print_dsqs_and_events) {
  703. struct scx_event_stats events;
  704. dump_shared_dsq();
  705. __COMPAT_scx_bpf_events(&events, sizeof(events));
  706. bpf_printk("%35s: %lld", "SCX_EV_SELECT_CPU_FALLBACK",
  707. scx_read_event(&events, SCX_EV_SELECT_CPU_FALLBACK));
  708. bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE",
  709. scx_read_event(&events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE));
  710. bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_KEEP_LAST",
  711. scx_read_event(&events, SCX_EV_DISPATCH_KEEP_LAST));
  712. bpf_printk("%35s: %lld", "SCX_EV_ENQ_SKIP_EXITING",
  713. scx_read_event(&events, SCX_EV_ENQ_SKIP_EXITING));
  714. bpf_printk("%35s: %lld", "SCX_EV_REFILL_SLICE_DFL",
  715. scx_read_event(&events, SCX_EV_REFILL_SLICE_DFL));
  716. bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DURATION",
  717. scx_read_event(&events, SCX_EV_BYPASS_DURATION));
  718. bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DISPATCH",
  719. scx_read_event(&events, SCX_EV_BYPASS_DISPATCH));
  720. bpf_printk("%35s: %lld", "SCX_EV_BYPASS_ACTIVATE",
  721. scx_read_event(&events, SCX_EV_BYPASS_ACTIVATE));
  722. }
  723. bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
  724. return 0;
  725. }
  726. s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
  727. {
  728. u32 key = 0;
  729. struct bpf_timer *timer;
  730. s32 ret;
  731. if (print_msgs)
  732. print_cpus();
  733. ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
  734. if (ret) {
  735. scx_bpf_error("failed to create DSQ %d (%d)", SHARED_DSQ, ret);
  736. return ret;
  737. }
  738. ret = scx_bpf_create_dsq(HIGHPRI_DSQ, -1);
  739. if (ret) {
  740. scx_bpf_error("failed to create DSQ %d (%d)", HIGHPRI_DSQ, ret);
  741. return ret;
  742. }
  743. timer = bpf_map_lookup_elem(&monitor_timer, &key);
  744. if (!timer)
  745. return -ESRCH;
  746. bpf_timer_init(timer, &monitor_timer, CLOCK_MONOTONIC);
  747. bpf_timer_set_callback(timer, monitor_timerfn);
  748. return bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
  749. }
  750. void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
  751. {
  752. UEI_RECORD(uei, ei);
  753. }
  754. SCX_OPS_DEFINE(qmap_ops,
  755. .select_cpu = (void *)qmap_select_cpu,
  756. .enqueue = (void *)qmap_enqueue,
  757. .dequeue = (void *)qmap_dequeue,
  758. .dispatch = (void *)qmap_dispatch,
  759. .tick = (void *)qmap_tick,
  760. .core_sched_before = (void *)qmap_core_sched_before,
  761. .cpu_release = (void *)qmap_cpu_release,
  762. .init_task = (void *)qmap_init_task,
  763. .dump = (void *)qmap_dump,
  764. .dump_cpu = (void *)qmap_dump_cpu,
  765. .dump_task = (void *)qmap_dump_task,
  766. .cgroup_init = (void *)qmap_cgroup_init,
  767. .cgroup_set_weight = (void *)qmap_cgroup_set_weight,
  768. .cgroup_set_bandwidth = (void *)qmap_cgroup_set_bandwidth,
  769. .cpu_online = (void *)qmap_cpu_online,
  770. .cpu_offline = (void *)qmap_cpu_offline,
  771. .init = (void *)qmap_init,
  772. .exit = (void *)qmap_exit,
  773. .timeout_ms = 5000U,
  774. .name = "qmap");