// SPDX-License-Identifier: GPL-2.0-only
/*
 * Landlock - Cross-thread ruleset enforcement
 *
 * Copyright © 2025 Google LLC
 */
- #include <linux/atomic.h>
- #include <linux/cleanup.h>
- #include <linux/completion.h>
- #include <linux/cred.h>
- #include <linux/errno.h>
- #include <linux/overflow.h>
- #include <linux/rcupdate.h>
- #include <linux/sched.h>
- #include <linux/sched/signal.h>
- #include <linux/sched/task.h>
- #include <linux/slab.h>
- #include <linux/task_work.h>
- #include "cred.h"
- #include "tsync.h"
- /*
- * Shared state between multiple threads which are enforcing Landlock rulesets
- * in lockstep with each other.
- */
- struct tsync_shared_context {
- /* The old and tentative new creds of the calling thread. */
- const struct cred *old_cred;
- const struct cred *new_cred;
- /* True if sibling tasks need to set the no_new_privs flag. */
- bool set_no_new_privs;
- /* An error encountered in preparation step, or 0. */
- atomic_t preparation_error;
- /*
- * Barrier after preparation step in restrict_one_thread.
- * The calling thread waits for completion.
- *
- * Re-initialized on every round of looking for newly spawned threads.
- */
- atomic_t num_preparing;
- struct completion all_prepared;
- /* Sibling threads wait for completion. */
- struct completion ready_to_commit;
- /*
- * Barrier after commit step (used by syscall impl to wait for
- * completion).
- */
- atomic_t num_unfinished;
- struct completion all_finished;
- };
- struct tsync_work {
- struct callback_head work;
- struct task_struct *task;
- struct tsync_shared_context *shared_ctx;
- };
- /*
- * restrict_one_thread - update a thread's Landlock domain in lockstep with the
- * other threads in the same process
- *
- * When this is run, the same function gets run in all other threads in the same
- * process (except for the calling thread which called landlock_restrict_self).
- * The concurrently running invocations of restrict_one_thread coordinate
- * through the shared ctx object to do their work in lockstep to implement
- * all-or-nothing semantics for enforcing the new Landlock domain.
- *
- * Afterwards, depending on the presence of an error, all threads either commit
- * or abort the prepared credentials. The commit operation can not fail any
- * more.
- */
- static void restrict_one_thread(struct tsync_shared_context *ctx)
- {
- int err;
- struct cred *cred = NULL;
- if (current_cred() == ctx->old_cred) {
- /*
- * Switch out old_cred with new_cred, if possible.
- *
- * In the common case, where all threads initially point to the same
- * struct cred, this optimization avoids creating separate redundant
- * credentials objects for each, which would all have the same contents.
- *
- * Note: We are intentionally dropping the const qualifier here, because
- * it is required by commit_creds() and abort_creds().
- */
- cred = (struct cred *)get_cred(ctx->new_cred);
- } else {
- /* Else, prepare new creds and populate them. */
- cred = prepare_creds();
- if (!cred) {
- atomic_set(&ctx->preparation_error, -ENOMEM);
- /*
- * Even on error, we need to adhere to the protocol and coordinate
- * with concurrently running invocations.
- */
- if (atomic_dec_return(&ctx->num_preparing) == 0)
- complete_all(&ctx->all_prepared);
- goto out;
- }
- landlock_cred_copy(landlock_cred(cred),
- landlock_cred(ctx->new_cred));
- }
- /*
- * Barrier: Wait until all threads are done preparing.
- * After this point, we can have no more failures.
- */
- if (atomic_dec_return(&ctx->num_preparing) == 0)
- complete_all(&ctx->all_prepared);
- /*
- * Wait for signal from calling thread that it's safe to read the
- * preparation error now and we are ready to commit (or abort).
- */
- wait_for_completion(&ctx->ready_to_commit);
- /* Abort the commit if any of the other threads had an error. */
- err = atomic_read(&ctx->preparation_error);
- if (err) {
- abort_creds(cred);
- goto out;
- }
- /*
- * Make sure that all sibling tasks fulfill the no_new_privs prerequisite.
- * (This is in line with Seccomp's SECCOMP_FILTER_FLAG_TSYNC logic in
- * kernel/seccomp.c)
- */
- if (ctx->set_no_new_privs)
- task_set_no_new_privs(current);
- commit_creds(cred);
- out:
- /* Notify the calling thread once all threads are done */
- if (atomic_dec_return(&ctx->num_unfinished) == 0)
- complete_all(&ctx->all_finished);
- }
- /*
- * restrict_one_thread_callback - task_work callback for restricting a thread
- *
- * Calls restrict_one_thread with the struct landlock_shared_tsync_context.
- */
- static void restrict_one_thread_callback(struct callback_head *work)
- {
- struct tsync_work *ctx = container_of(work, struct tsync_work, work);
- restrict_one_thread(ctx->shared_ctx);
- }
/*
 * struct tsync_works - growable array of preallocated per-task work items
 *
 * An all-zero struct is a valid, empty array.
 */
struct tsync_works {
	/* Array of pointers to individually allocated work items. */
	struct tsync_work **works;
	/* Number of leading items which are currently handed out to a task. */
	size_t size;
	/* Number of items which have been allocated. */
	size_t capacity;
};
- /*
- * tsync_works_provide - provides a preallocated tsync_work for the given task
- *
- * This also stores a task pointer in the context and increments the reference
- * count of the task.
- *
- * This function may fail in the case where we did not preallocate sufficient
- * capacity. This can legitimately happen if new threads get started after we
- * grew the capacity.
- *
- * Returns:
- * A pointer to the preallocated context struct, with task filled in.
- *
- * NULL, if we ran out of preallocated context structs.
- */
- static struct tsync_work *tsync_works_provide(struct tsync_works *s,
- struct task_struct *task)
- {
- struct tsync_work *ctx;
- if (s->size >= s->capacity)
- return NULL;
- ctx = s->works[s->size];
- s->size++;
- ctx->task = get_task_struct(task);
- return ctx;
- }
- /**
- * tsync_works_trim - Put the last tsync_work element
- *
- * @s: TSYNC works to trim.
- *
- * Put the last task and decrement the size of @s.
- *
- * This helper does not cancel a running task, but just reset the last element
- * to zero.
- */
- static void tsync_works_trim(struct tsync_works *s)
- {
- struct tsync_work *ctx;
- if (WARN_ON_ONCE(s->size <= 0))
- return;
- ctx = s->works[s->size - 1];
- /*
- * For consistency, remove the task from ctx so that it does not look like
- * we handed it a task_work.
- */
- put_task_struct(ctx->task);
- *ctx = (typeof(*ctx)){};
- /*
- * Cancel the tsync_works_provide() change to recycle the reserved memory
- * for the next thread, if any. This also ensures that cancel_tsync_works()
- * and tsync_works_release() do not see any NULL task pointers.
- */
- s->size--;
- }
- /*
- * tsync_works_grow_by - preallocates space for n more contexts in s
- *
- * On a successful return, the subsequent n calls to tsync_works_provide() are
- * guaranteed to succeed. (size + n <= capacity)
- *
- * Returns:
- * -ENOMEM if the (re)allocation fails
- * 0 if the allocation succeeds, partially succeeds, or no reallocation
- * was needed
- */
- static int tsync_works_grow_by(struct tsync_works *s, size_t n, gfp_t flags)
- {
- size_t i;
- size_t new_capacity;
- struct tsync_work **works;
- struct tsync_work *work;
- if (check_add_overflow(s->size, n, &new_capacity))
- return -EOVERFLOW;
- /* No need to reallocate if s already has sufficient capacity. */
- if (new_capacity <= s->capacity)
- return 0;
- works = krealloc_array(s->works, new_capacity, sizeof(s->works[0]),
- flags);
- if (!works)
- return -ENOMEM;
- s->works = works;
- for (i = s->capacity; i < new_capacity; i++) {
- work = kzalloc_obj(*work, flags);
- if (!work) {
- /*
- * Leave the object in a consistent state,
- * but return an error.
- */
- s->capacity = i;
- return -ENOMEM;
- }
- s->works[i] = work;
- }
- s->capacity = new_capacity;
- return 0;
- }
- /*
- * tsync_works_contains - checks for presence of task in s
- */
- static bool tsync_works_contains_task(const struct tsync_works *s,
- const struct task_struct *task)
- {
- size_t i;
- for (i = 0; i < s->size; i++)
- if (s->works[i]->task == task)
- return true;
- return false;
- }
- /*
- * tsync_works_release - frees memory held by s and drops all task references
- *
- * This does not free s itself, only the data structures held by it.
- */
- static void tsync_works_release(struct tsync_works *s)
- {
- size_t i;
- for (i = 0; i < s->size; i++) {
- if (WARN_ON_ONCE(!s->works[i]->task))
- continue;
- put_task_struct(s->works[i]->task);
- }
- for (i = 0; i < s->capacity; i++)
- kfree(s->works[i]);
- kfree(s->works);
- s->works = NULL;
- s->size = 0;
- s->capacity = 0;
- }
- /*
- * count_additional_threads - counts the sibling threads that are not in works
- */
- static size_t count_additional_threads(const struct tsync_works *works)
- {
- const struct task_struct *caller, *thread;
- size_t n = 0;
- caller = current;
- guard(rcu)();
- for_each_thread(caller, thread) {
- /* Skip current, since it is initiating the sync. */
- if (thread == caller)
- continue;
- /* Skip exited threads. */
- if (thread->flags & PF_EXITING)
- continue;
- /* Skip threads that we have already seen. */
- if (tsync_works_contains_task(works, thread))
- continue;
- n++;
- }
- return n;
- }
- /*
- * schedule_task_work - adds task_work for all eligible sibling threads
- * which have not been scheduled yet
- *
- * For each added task_work, atomically increments shared_ctx->num_preparing and
- * shared_ctx->num_unfinished.
- *
- * Returns:
- * true, if at least one eligible sibling thread was found
- */
- static bool schedule_task_work(struct tsync_works *works,
- struct tsync_shared_context *shared_ctx)
- {
- int err;
- const struct task_struct *caller;
- struct task_struct *thread;
- struct tsync_work *ctx;
- bool found_more_threads = false;
- caller = current;
- guard(rcu)();
- for_each_thread(caller, thread) {
- /* Skip current, since it is initiating the sync. */
- if (thread == caller)
- continue;
- /* Skip exited threads. */
- if (thread->flags & PF_EXITING)
- continue;
- /* Skip threads that we already looked at. */
- if (tsync_works_contains_task(works, thread))
- continue;
- /*
- * We found a sibling thread that is not doing its task_work yet, and
- * which might spawn new threads before our task work runs, so we need
- * at least one more round in the outer loop.
- */
- found_more_threads = true;
- ctx = tsync_works_provide(works, thread);
- if (!ctx) {
- /*
- * We ran out of preallocated contexts -- we need to try again with
- * this thread at a later time!
- * found_more_threads is already true at this point.
- */
- break;
- }
- ctx->shared_ctx = shared_ctx;
- atomic_inc(&shared_ctx->num_preparing);
- atomic_inc(&shared_ctx->num_unfinished);
- init_task_work(&ctx->work, restrict_one_thread_callback);
- err = task_work_add(thread, &ctx->work, TWA_SIGNAL);
- if (unlikely(err)) {
- /*
- * task_work_add() only fails if the task is about to exit. We
- * checked that earlier, but it can happen as a race. Resume
- * without setting an error, as the task is probably gone in the
- * next loop iteration.
- */
- tsync_works_trim(works);
- atomic_dec(&shared_ctx->num_preparing);
- atomic_dec(&shared_ctx->num_unfinished);
- }
- }
- return found_more_threads;
- }
- /*
- * cancel_tsync_works - cancel all task works where it is possible
- *
- * Task works can be canceled as long as they are still queued and have not
- * started running. If they get canceled, we decrement
- * shared_ctx->num_preparing and shared_ctx->num_unfished and mark the two
- * completions if needed, as if the task was never scheduled.
- */
- static void cancel_tsync_works(const struct tsync_works *works,
- struct tsync_shared_context *shared_ctx)
- {
- size_t i;
- for (i = 0; i < works->size; i++) {
- if (WARN_ON_ONCE(!works->works[i]->task))
- continue;
- if (!task_work_cancel(works->works[i]->task,
- &works->works[i]->work))
- continue;
- /* After dequeueing, act as if the task work had executed. */
- if (atomic_dec_return(&shared_ctx->num_preparing) == 0)
- complete_all(&shared_ctx->all_prepared);
- if (atomic_dec_return(&shared_ctx->num_unfinished) == 0)
- complete_all(&shared_ctx->all_finished);
- }
- }
- /*
- * restrict_sibling_threads - enables a Landlock policy for all sibling threads
- */
- int landlock_restrict_sibling_threads(const struct cred *old_cred,
- const struct cred *new_cred)
- {
- int err;
- struct tsync_shared_context shared_ctx;
- struct tsync_works works = {};
- size_t newly_discovered_threads;
- bool found_more_threads;
- atomic_set(&shared_ctx.preparation_error, 0);
- init_completion(&shared_ctx.all_prepared);
- init_completion(&shared_ctx.ready_to_commit);
- atomic_set(&shared_ctx.num_unfinished, 1);
- init_completion(&shared_ctx.all_finished);
- shared_ctx.old_cred = old_cred;
- shared_ctx.new_cred = new_cred;
- shared_ctx.set_no_new_privs = task_no_new_privs(current);
- /*
- * Serialize concurrent TSYNC operations to prevent deadlocks when
- * multiple threads call landlock_restrict_self() simultaneously.
- * If the lock is already held, we gracefully yield by restarting the
- * syscall. This allows the current thread to process pending
- * task_works before retrying.
- */
- if (!down_write_trylock(¤t->signal->exec_update_lock))
- return restart_syscall();
- /*
- * We schedule a pseudo-signal task_work for each of the calling task's
- * sibling threads. In the task work, each thread:
- *
- * 1) runs prepare_creds() and writes back the error to
- * shared_ctx.preparation_error, if needed.
- *
- * 2) signals that it's done with prepare_creds() to the calling task.
- * (completion "all_prepared").
- *
- * 3) waits for the completion "ready_to_commit". This is sent by the
- * calling task after ensuring that all sibling threads have done
- * with the "preparation" stage.
- *
- * After this barrier is reached, it's safe to read
- * shared_ctx.preparation_error.
- *
- * 4) reads shared_ctx.preparation_error and then either does commit_creds()
- * or abort_creds().
- *
- * 5) signals that it's done altogether (barrier synchronization
- * "all_finished")
- *
- * Unlike seccomp, which modifies sibling tasks directly, we do not need to
- * acquire the cred_guard_mutex and sighand->siglock:
- *
- * - As in our case, all threads are themselves exchanging their own struct
- * cred through the credentials API, no locks are needed for that.
- * - Our for_each_thread() loops are protected by RCU.
- * - We do not acquire a lock to keep the list of sibling threads stable
- * between our for_each_thread loops. If the list of available sibling
- * threads changes between these for_each_thread loops, we make up for
- * that by continuing to look for threads until they are all discovered
- * and have entered their task_work, where they are unable to spawn new
- * threads.
- */
- do {
- /* In RCU read-lock, count the threads we need. */
- newly_discovered_threads = count_additional_threads(&works);
- if (newly_discovered_threads == 0)
- break; /* done */
- err = tsync_works_grow_by(&works, newly_discovered_threads,
- GFP_KERNEL_ACCOUNT);
- if (err) {
- atomic_set(&shared_ctx.preparation_error, err);
- break;
- }
- /*
- * The "all_prepared" barrier is used locally to the loop body, this use
- * of for_each_thread(). We can reset it on each loop iteration because
- * all previous loop iterations are done with it already.
- *
- * num_preparing is initialized to 1 so that the counter can not go to 0
- * and mark the completion as done before all task works are registered.
- * We decrement it at the end of the loop body.
- */
- atomic_set(&shared_ctx.num_preparing, 1);
- reinit_completion(&shared_ctx.all_prepared);
- /*
- * In RCU read-lock, schedule task work on newly discovered sibling
- * tasks.
- */
- found_more_threads = schedule_task_work(&works, &shared_ctx);
- /*
- * Decrement num_preparing for current, to undo that we initialized it
- * to 1 a few lines above.
- */
- if (atomic_dec_return(&shared_ctx.num_preparing) > 0) {
- if (wait_for_completion_interruptible(
- &shared_ctx.all_prepared)) {
- /* In case of interruption, we need to retry the system call. */
- atomic_set(&shared_ctx.preparation_error,
- -ERESTARTNOINTR);
- /*
- * Opportunistic improvement: try to cancel task
- * works for tasks that did not start running
- * yet. We do not have a guarantee that it
- * cancels any of the enqueued task works
- * because task_work_run() might already have
- * dequeued them.
- */
- cancel_tsync_works(&works, &shared_ctx);
- /*
- * Break the loop with error. The cleanup code
- * after the loop unblocks the remaining
- * task_works.
- */
- break;
- }
- }
- } while (found_more_threads &&
- !atomic_read(&shared_ctx.preparation_error));
- /*
- * We now have either (a) all sibling threads blocking and in "prepared"
- * state in the task work, or (b) the preparation error is set. Ask all
- * threads to commit (or abort).
- */
- complete_all(&shared_ctx.ready_to_commit);
- /*
- * Decrement num_unfinished for current, to undo that we initialized it to 1
- * at the beginning.
- */
- if (atomic_dec_return(&shared_ctx.num_unfinished) > 0)
- wait_for_completion(&shared_ctx.all_finished);
- tsync_works_release(&works);
- up_write(¤t->signal->exec_update_lock);
- return atomic_read(&shared_ctx.preparation_error);
- }
|