| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716 |
- /* SPDX-License-Identifier: GPL-2.0 */
- /*
- * Arena-based task data scheduler. This is a variation of scx_simple
- * that uses a combined allocator and indexing structure to organize
- * task data. Task context allocation is done when a task enters the
- * scheduler, while freeing is done when it exits. Task contexts are
- * retrieved from task-local storage, pointing to the allocated memory.
- *
- * The main purpose of this scheduler is to demostrate arena memory
- * management.
- *
- * Copyright (c) 2024-2025 Meta Platforms, Inc. and affiliates.
- * Copyright (c) 2024-2025 Emil Tsalapatis <etsal@meta.com>
- * Copyright (c) 2024-2025 Tejun Heo <tj@kernel.org>
- *
- */
- #include <scx/common.bpf.h>
- #include <scx/bpf_arena_common.bpf.h>
- #include "scx_sdt.h"
- char _license[] SEC("license") = "GPL";
- UEI_DEFINE(uei);
- struct {
- __uint(type, BPF_MAP_TYPE_ARENA);
- __uint(map_flags, BPF_F_MMAPABLE);
- #if defined(__TARGET_ARCH_arm64) || defined(__aarch64__)
- __uint(max_entries, 1 << 16); /* number of pages */
- __ulong(map_extra, (1ull << 32)); /* start of mmap() region */
- #else
- __uint(max_entries, 1 << 20); /* number of pages */
- __ulong(map_extra, (1ull << 44)); /* start of mmap() region */
- #endif
- } arena __weak SEC(".maps");
- #define SHARED_DSQ 0
- #define DEFINE_SDT_STAT(metric) \
- static inline void \
- stat_inc_##metric(struct scx_stats __arena *stats) \
- { \
- cast_kern(stats); \
- stats->metric += 1; \
- } \
- __u64 stat_##metric; \
- DEFINE_SDT_STAT(enqueue);
- DEFINE_SDT_STAT(init);
- DEFINE_SDT_STAT(exit);
- DEFINE_SDT_STAT(select_idle_cpu);
- DEFINE_SDT_STAT(select_busy_cpu);
- /*
- * Necessary for cond_break/can_loop's semantics. According to kernel commit
- * 011832b, the loop counter variable must be seen as imprecise and bounded
- * by the verifier. Initializing it from a constant (e.g., i = 0;), then,
- * makes it precise and prevents may_goto from helping with converging the
- * loop. For these loops we must initialize the loop counter from a variable
- * whose value the verifier cannot reason about when checking the program, so
- * that the loop counter's value is imprecise.
- */
- static __u64 zero = 0;
- /*
- * XXX Hack to get the verifier to find the arena for sdt_exit_task.
- * As of 6.12-rc5, The verifier associates arenas with programs by
- * checking LD.IMM instruction operands for an arena and populating
- * the program state with the first instance it finds. This requires
- * accessing our global arena variable, but scx methods do not necessarily
- * do so while still using pointers from that arena. Insert a bpf_printk
- * statement that triggers at most once to generate an LD.IMM instruction
- * to access the arena and help the verifier.
- */
- static volatile bool scx_arena_verify_once;
- __hidden void scx_arena_subprog_init(void)
- {
- if (scx_arena_verify_once)
- return;
- bpf_printk("%s: arena pointer %p", __func__, &arena);
- scx_arena_verify_once = true;
- }
- private(LOCK) struct bpf_spin_lock alloc_lock;
- private(POOL_LOCK) struct bpf_spin_lock alloc_pool_lock;
- /* allocation pools */
- struct sdt_pool desc_pool;
- struct sdt_pool chunk_pool;
- /* Protected by alloc_lock. */
- struct scx_alloc_stats alloc_stats;
- /* Allocate element from the pool. Must be called with a then pool lock held. */
- static
- void __arena *scx_alloc_from_pool(struct sdt_pool *pool)
- {
- __u64 elem_size, max_elems;
- void __arena *slab;
- void __arena *ptr;
- elem_size = pool->elem_size;
- max_elems = pool->max_elems;
- /* If the chunk is spent, get a new one. */
- if (pool->idx >= max_elems) {
- slab = bpf_arena_alloc_pages(&arena, NULL,
- div_round_up(max_elems * elem_size, PAGE_SIZE), NUMA_NO_NODE, 0);
- if (!slab)
- return NULL;
- pool->slab = slab;
- pool->idx = 0;
- }
- ptr = (void __arena *)((__u64) pool->slab + elem_size * pool->idx);
- pool->idx += 1;
- return ptr;
- }
- /* Alloc desc and associated chunk. Called with the allocator spinlock held. */
- static sdt_desc_t *scx_alloc_chunk(void)
- {
- struct sdt_chunk __arena *chunk;
- sdt_desc_t *desc;
- sdt_desc_t *out;
- chunk = scx_alloc_from_pool(&chunk_pool);
- if (!chunk)
- return NULL;
- desc = scx_alloc_from_pool(&desc_pool);
- if (!desc) {
- /*
- * Effectively frees the previous chunk allocation.
- * Index cannot be 0, so decrementing is always
- * valid.
- */
- chunk_pool.idx -= 1;
- return NULL;
- }
- out = desc;
- desc->nr_free = SDT_TASK_ENTS_PER_CHUNK;
- desc->chunk = chunk;
- alloc_stats.chunk_allocs += 1;
- return out;
- }
- static int pool_set_size(struct sdt_pool *pool, __u64 data_size, __u64 nr_pages)
- {
- if (unlikely(data_size % 8))
- return -EINVAL;
- if (unlikely(nr_pages == 0))
- return -EINVAL;
- pool->elem_size = data_size;
- pool->max_elems = (PAGE_SIZE * nr_pages) / pool->elem_size;
- /* Populate the pool slab on the first allocation. */
- pool->idx = pool->max_elems;
- return 0;
- }
- /* Initialize both the base pool allocators and the root chunk of the index. */
- __hidden int
- scx_alloc_init(struct scx_allocator *alloc, __u64 data_size)
- {
- size_t min_chunk_size;
- int ret;
- _Static_assert(sizeof(struct sdt_chunk) <= PAGE_SIZE,
- "chunk size must fit into a page");
- ret = pool_set_size(&chunk_pool, sizeof(struct sdt_chunk), 1);
- if (ret != 0)
- return ret;
- ret = pool_set_size(&desc_pool, sizeof(struct sdt_desc), 1);
- if (ret != 0)
- return ret;
- /* Wrap data into a descriptor and word align. */
- data_size += sizeof(struct sdt_data);
- data_size = round_up(data_size, 8);
- /*
- * Ensure we allocate large enough chunks from the arena to avoid excessive
- * internal fragmentation when turning chunks it into structs.
- */
- min_chunk_size = div_round_up(SDT_TASK_MIN_ELEM_PER_ALLOC * data_size, PAGE_SIZE);
- ret = pool_set_size(&alloc->pool, data_size, min_chunk_size);
- if (ret != 0)
- return ret;
- bpf_spin_lock(&alloc_lock);
- alloc->root = scx_alloc_chunk();
- bpf_spin_unlock(&alloc_lock);
- if (!alloc->root)
- return -ENOMEM;
- return 0;
- }
- static
- int set_idx_state(sdt_desc_t *desc, __u64 pos, bool state)
- {
- __u64 __arena *allocated = desc->allocated;
- __u64 bit;
- if (unlikely(pos >= SDT_TASK_ENTS_PER_CHUNK))
- return -EINVAL;
- bit = (__u64)1 << (pos % 64);
- if (state)
- allocated[pos / 64] |= bit;
- else
- allocated[pos / 64] &= ~bit;
- return 0;
- }
- static __noinline
- int mark_nodes_avail(sdt_desc_t *lv_desc[SDT_TASK_LEVELS], __u64 lv_pos[SDT_TASK_LEVELS])
- {
- sdt_desc_t *desc;
- __u64 u, level;
- int ret;
- for (u = zero; u < SDT_TASK_LEVELS && can_loop; u++) {
- level = SDT_TASK_LEVELS - 1 - u;
- /* Only propagate upwards if we are the parent's only free chunk. */
- desc = lv_desc[level];
- ret = set_idx_state(desc, lv_pos[level], false);
- if (unlikely(ret != 0))
- return ret;
- desc->nr_free += 1;
- if (desc->nr_free > 1)
- return 0;
- }
- return 0;
- }
- /*
- * Free the allocated struct with the given index. Called with the
- * allocator lock taken.
- */
- __hidden
- int scx_alloc_free_idx(struct scx_allocator *alloc, __u64 idx)
- {
- const __u64 mask = (1 << SDT_TASK_ENTS_PER_PAGE_SHIFT) - 1;
- sdt_desc_t *lv_desc[SDT_TASK_LEVELS];
- sdt_desc_t * __arena *desc_children;
- struct sdt_chunk __arena *chunk;
- sdt_desc_t *desc;
- struct sdt_data __arena *data;
- __u64 level, shift, pos;
- __u64 lv_pos[SDT_TASK_LEVELS];
- int ret;
- int i;
- if (!alloc)
- return 0;
- desc = alloc->root;
- if (unlikely(!desc))
- return -EINVAL;
- /* To appease the verifier. */
- for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) {
- lv_desc[level] = NULL;
- lv_pos[level] = 0;
- }
- /* Find the leaf node containing the index. */
- for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) {
- shift = (SDT_TASK_LEVELS - 1 - level) * SDT_TASK_ENTS_PER_PAGE_SHIFT;
- pos = (idx >> shift) & mask;
- lv_desc[level] = desc;
- lv_pos[level] = pos;
- if (level == SDT_TASK_LEVELS - 1)
- break;
- chunk = desc->chunk;
- desc_children = (sdt_desc_t * __arena *)chunk->descs;
- desc = desc_children[pos];
- if (unlikely(!desc))
- return -EINVAL;
- }
- chunk = desc->chunk;
- pos = idx & mask;
- data = chunk->data[pos];
- if (likely(data)) {
- *data = (struct sdt_data) {
- .tid.genn = data->tid.genn + 1,
- };
- /* Zero out one word at a time. */
- for (i = zero; i < alloc->pool.elem_size / 8 && can_loop; i++) {
- data->payload[i] = 0;
- }
- }
- ret = mark_nodes_avail(lv_desc, lv_pos);
- if (unlikely(ret != 0))
- return ret;
- alloc_stats.active_allocs -= 1;
- alloc_stats.free_ops += 1;
- return 0;
- }
- static inline
- int ffs(__u64 word)
- {
- unsigned int num = 0;
- if ((word & 0xffffffff) == 0) {
- num += 32;
- word >>= 32;
- }
- if ((word & 0xffff) == 0) {
- num += 16;
- word >>= 16;
- }
- if ((word & 0xff) == 0) {
- num += 8;
- word >>= 8;
- }
- if ((word & 0xf) == 0) {
- num += 4;
- word >>= 4;
- }
- if ((word & 0x3) == 0) {
- num += 2;
- word >>= 2;
- }
- if ((word & 0x1) == 0) {
- num += 1;
- word >>= 1;
- }
- return num;
- }
- /* find the first empty slot */
- __hidden
- __u64 chunk_find_empty(sdt_desc_t __arg_arena *desc)
- {
- __u64 freeslots;
- __u64 i;
- for (i = 0; i < SDT_TASK_CHUNK_BITMAP_U64S; i++) {
- freeslots = ~desc->allocated[i];
- if (freeslots == (__u64)0)
- continue;
- return (i * 64) + ffs(freeslots);
- }
- return SDT_TASK_ENTS_PER_CHUNK;
- }
- /*
- * Find and return an available idx on the allocator.
- * Called with the task spinlock held.
- */
- static sdt_desc_t * desc_find_empty(sdt_desc_t *desc, __u64 *idxp)
- {
- sdt_desc_t *lv_desc[SDT_TASK_LEVELS];
- sdt_desc_t * __arena *desc_children;
- struct sdt_chunk __arena *chunk;
- sdt_desc_t *tmp;
- __u64 lv_pos[SDT_TASK_LEVELS];
- __u64 u, pos, level;
- __u64 idx = 0;
- int ret;
- for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) {
- pos = chunk_find_empty(desc);
- /* If we error out, something has gone very wrong. */
- if (unlikely(pos > SDT_TASK_ENTS_PER_CHUNK))
- return NULL;
- if (pos == SDT_TASK_ENTS_PER_CHUNK)
- return NULL;
- idx <<= SDT_TASK_ENTS_PER_PAGE_SHIFT;
- idx |= pos;
- /* Log the levels to complete allocation. */
- lv_desc[level] = desc;
- lv_pos[level] = pos;
- /* The rest of the loop is for internal node traversal. */
- if (level == SDT_TASK_LEVELS - 1)
- break;
- /* Allocate an internal node if necessary. */
- chunk = desc->chunk;
- desc_children = (sdt_desc_t * __arena *)chunk->descs;
- desc = desc_children[pos];
- if (!desc) {
- desc = scx_alloc_chunk();
- if (!desc)
- return NULL;
- desc_children[pos] = desc;
- }
- }
- /*
- * Finding the descriptor along with any internal node
- * allocations was successful. Update all levels with
- * the new allocation.
- */
- bpf_for(u, 0, SDT_TASK_LEVELS) {
- level = SDT_TASK_LEVELS - 1 - u;
- tmp = lv_desc[level];
- ret = set_idx_state(tmp, lv_pos[level], true);
- if (ret != 0)
- break;
- tmp->nr_free -= 1;
- if (tmp->nr_free > 0)
- break;
- }
- *idxp = idx;
- return desc;
- }
- __hidden
- void __arena *scx_alloc(struct scx_allocator *alloc)
- {
- struct sdt_data __arena *data = NULL;
- struct sdt_chunk __arena *chunk;
- sdt_desc_t *desc;
- __u64 idx, pos;
- if (!alloc)
- return NULL;
- bpf_spin_lock(&alloc_lock);
- /* We unlock if we encounter an error in the function. */
- desc = desc_find_empty(alloc->root, &idx);
- if (unlikely(desc == NULL)) {
- bpf_spin_unlock(&alloc_lock);
- return NULL;
- }
- chunk = desc->chunk;
- /* Populate the leaf node if necessary. */
- pos = idx & (SDT_TASK_ENTS_PER_CHUNK - 1);
- data = chunk->data[pos];
- if (!data) {
- data = scx_alloc_from_pool(&alloc->pool);
- if (!data) {
- scx_alloc_free_idx(alloc, idx);
- bpf_spin_unlock(&alloc_lock);
- return NULL;
- }
- }
- chunk->data[pos] = data;
- /* The data counts as a chunk */
- alloc_stats.data_allocs += 1;
- alloc_stats.alloc_ops += 1;
- alloc_stats.active_allocs += 1;
- data->tid.idx = idx;
- bpf_spin_unlock(&alloc_lock);
- return data;
- }
- /*
- * Task BPF map entry recording the task's assigned ID and pointing to the data
- * area allocated in arena.
- */
- struct scx_task_map_val {
- union sdt_id tid;
- __u64 tptr;
- struct sdt_data __arena *data;
- };
- struct {
- __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
- __uint(map_flags, BPF_F_NO_PREALLOC);
- __type(key, int);
- __type(value, struct scx_task_map_val);
- } scx_task_map SEC(".maps");
- static struct scx_allocator scx_task_allocator;
- __hidden
- void __arena *scx_task_alloc(struct task_struct *p)
- {
- struct sdt_data __arena *data = NULL;
- struct scx_task_map_val *mval;
- mval = bpf_task_storage_get(&scx_task_map, p, 0,
- BPF_LOCAL_STORAGE_GET_F_CREATE);
- if (!mval)
- return NULL;
- data = scx_alloc(&scx_task_allocator);
- if (unlikely(!data))
- return NULL;
- mval->tid = data->tid;
- mval->tptr = (__u64) p;
- mval->data = data;
- return (void __arena *)data->payload;
- }
- __hidden
- int scx_task_init(__u64 data_size)
- {
- return scx_alloc_init(&scx_task_allocator, data_size);
- }
- __hidden
- void __arena *scx_task_data(struct task_struct *p)
- {
- struct sdt_data __arena *data;
- struct scx_task_map_val *mval;
- scx_arena_subprog_init();
- mval = bpf_task_storage_get(&scx_task_map, p, 0, 0);
- if (!mval)
- return NULL;
- data = mval->data;
- return (void __arena *)data->payload;
- }
- __hidden
- void scx_task_free(struct task_struct *p)
- {
- struct scx_task_map_val *mval;
- scx_arena_subprog_init();
- mval = bpf_task_storage_get(&scx_task_map, p, 0, 0);
- if (!mval)
- return;
- bpf_spin_lock(&alloc_lock);
- scx_alloc_free_idx(&scx_task_allocator, mval->tid.idx);
- bpf_spin_unlock(&alloc_lock);
- bpf_task_storage_delete(&scx_task_map, p);
- }
- static inline void
- scx_stat_global_update(struct scx_stats __arena *stats)
- {
- cast_kern(stats);
- __sync_fetch_and_add(&stat_enqueue, stats->enqueue);
- __sync_fetch_and_add(&stat_init, stats->init);
- __sync_fetch_and_add(&stat_exit, stats->exit);
- __sync_fetch_and_add(&stat_select_idle_cpu, stats->select_idle_cpu);
- __sync_fetch_and_add(&stat_select_busy_cpu, stats->select_busy_cpu);
- }
- s32 BPF_STRUCT_OPS(sdt_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
- {
- struct scx_stats __arena *stats;
- bool is_idle = false;
- s32 cpu;
- stats = scx_task_data(p);
- if (!stats) {
- scx_bpf_error("%s: no stats for pid %d", __func__, p->pid);
- return 0;
- }
- cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
- if (is_idle) {
- stat_inc_select_idle_cpu(stats);
- scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
- } else {
- stat_inc_select_busy_cpu(stats);
- }
- return cpu;
- }
- void BPF_STRUCT_OPS(sdt_enqueue, struct task_struct *p, u64 enq_flags)
- {
- struct scx_stats __arena *stats;
- stats = scx_task_data(p);
- if (!stats) {
- scx_bpf_error("%s: no stats for pid %d", __func__, p->pid);
- return;
- }
- stat_inc_enqueue(stats);
- scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
- }
- void BPF_STRUCT_OPS(sdt_dispatch, s32 cpu, struct task_struct *prev)
- {
- scx_bpf_dsq_move_to_local(SHARED_DSQ);
- }
- s32 BPF_STRUCT_OPS_SLEEPABLE(sdt_init_task, struct task_struct *p,
- struct scx_init_task_args *args)
- {
- struct scx_stats __arena *stats;
- stats = scx_task_alloc(p);
- if (!stats) {
- scx_bpf_error("arena allocator out of memory");
- return -ENOMEM;
- }
- stats->pid = p->pid;
- stat_inc_init(stats);
- return 0;
- }
- void BPF_STRUCT_OPS(sdt_exit_task, struct task_struct *p,
- struct scx_exit_task_args *args)
- {
- struct scx_stats __arena *stats;
- stats = scx_task_data(p);
- if (!stats) {
- scx_bpf_error("%s: no stats for pid %d", __func__, p->pid);
- return;
- }
- stat_inc_exit(stats);
- scx_stat_global_update(stats);
- scx_task_free(p);
- }
- s32 BPF_STRUCT_OPS_SLEEPABLE(sdt_init)
- {
- int ret;
- ret = scx_task_init(sizeof(struct scx_stats));
- if (ret < 0) {
- scx_bpf_error("%s: failed with %d", __func__, ret);
- return ret;
- }
- ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
- if (ret) {
- scx_bpf_error("failed to create DSQ %d (%d)", SHARED_DSQ, ret);
- return ret;
- }
- return 0;
- }
- void BPF_STRUCT_OPS(sdt_exit, struct scx_exit_info *ei)
- {
- UEI_RECORD(uei, ei);
- }
- SCX_OPS_DEFINE(sdt_ops,
- .select_cpu = (void *)sdt_select_cpu,
- .enqueue = (void *)sdt_enqueue,
- .dispatch = (void *)sdt_dispatch,
- .init_task = (void *)sdt_init_task,
- .exit_task = (void *)sdt_exit_task,
- .init = (void *)sdt_init,
- .exit = (void *)sdt_exit,
- .name = "sdt");
|