tsync.c 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * Landlock - Cross-thread ruleset enforcement
  4. *
  5. * Copyright © 2025 Google LLC
  6. */
  7. #include <linux/atomic.h>
  8. #include <linux/cleanup.h>
  9. #include <linux/completion.h>
  10. #include <linux/cred.h>
  11. #include <linux/errno.h>
  12. #include <linux/overflow.h>
  13. #include <linux/rcupdate.h>
  14. #include <linux/sched.h>
  15. #include <linux/sched/signal.h>
  16. #include <linux/sched/task.h>
  17. #include <linux/slab.h>
  18. #include <linux/task_work.h>
  19. #include "cred.h"
  20. #include "tsync.h"
  21. /*
  22. * Shared state between multiple threads which are enforcing Landlock rulesets
  23. * in lockstep with each other.
  24. */
  25. struct tsync_shared_context {
  26. /* The old and tentative new creds of the calling thread. */
  27. const struct cred *old_cred;
  28. const struct cred *new_cred;
  29. /* True if sibling tasks need to set the no_new_privs flag. */
  30. bool set_no_new_privs;
  31. /* An error encountered in preparation step, or 0. */
  32. atomic_t preparation_error;
  33. /*
  34. * Barrier after preparation step in restrict_one_thread.
  35. * The calling thread waits for completion.
  36. *
  37. * Re-initialized on every round of looking for newly spawned threads.
  38. */
  39. atomic_t num_preparing;
  40. struct completion all_prepared;
  41. /* Sibling threads wait for completion. */
  42. struct completion ready_to_commit;
  43. /*
  44. * Barrier after commit step (used by syscall impl to wait for
  45. * completion).
  46. */
  47. atomic_t num_unfinished;
  48. struct completion all_finished;
  49. };
  50. struct tsync_work {
  51. struct callback_head work;
  52. struct task_struct *task;
  53. struct tsync_shared_context *shared_ctx;
  54. };
  55. /*
  56. * restrict_one_thread - update a thread's Landlock domain in lockstep with the
  57. * other threads in the same process
  58. *
  59. * When this is run, the same function gets run in all other threads in the same
  60. * process (except for the calling thread which called landlock_restrict_self).
  61. * The concurrently running invocations of restrict_one_thread coordinate
  62. * through the shared ctx object to do their work in lockstep to implement
  63. * all-or-nothing semantics for enforcing the new Landlock domain.
  64. *
  65. * Afterwards, depending on the presence of an error, all threads either commit
  66. * or abort the prepared credentials. The commit operation can not fail any
  67. * more.
  68. */
  69. static void restrict_one_thread(struct tsync_shared_context *ctx)
  70. {
  71. int err;
  72. struct cred *cred = NULL;
  73. if (current_cred() == ctx->old_cred) {
  74. /*
  75. * Switch out old_cred with new_cred, if possible.
  76. *
  77. * In the common case, where all threads initially point to the same
  78. * struct cred, this optimization avoids creating separate redundant
  79. * credentials objects for each, which would all have the same contents.
  80. *
  81. * Note: We are intentionally dropping the const qualifier here, because
  82. * it is required by commit_creds() and abort_creds().
  83. */
  84. cred = (struct cred *)get_cred(ctx->new_cred);
  85. } else {
  86. /* Else, prepare new creds and populate them. */
  87. cred = prepare_creds();
  88. if (!cred) {
  89. atomic_set(&ctx->preparation_error, -ENOMEM);
  90. /*
  91. * Even on error, we need to adhere to the protocol and coordinate
  92. * with concurrently running invocations.
  93. */
  94. if (atomic_dec_return(&ctx->num_preparing) == 0)
  95. complete_all(&ctx->all_prepared);
  96. goto out;
  97. }
  98. landlock_cred_copy(landlock_cred(cred),
  99. landlock_cred(ctx->new_cred));
  100. }
  101. /*
  102. * Barrier: Wait until all threads are done preparing.
  103. * After this point, we can have no more failures.
  104. */
  105. if (atomic_dec_return(&ctx->num_preparing) == 0)
  106. complete_all(&ctx->all_prepared);
  107. /*
  108. * Wait for signal from calling thread that it's safe to read the
  109. * preparation error now and we are ready to commit (or abort).
  110. */
  111. wait_for_completion(&ctx->ready_to_commit);
  112. /* Abort the commit if any of the other threads had an error. */
  113. err = atomic_read(&ctx->preparation_error);
  114. if (err) {
  115. abort_creds(cred);
  116. goto out;
  117. }
  118. /*
  119. * Make sure that all sibling tasks fulfill the no_new_privs prerequisite.
  120. * (This is in line with Seccomp's SECCOMP_FILTER_FLAG_TSYNC logic in
  121. * kernel/seccomp.c)
  122. */
  123. if (ctx->set_no_new_privs)
  124. task_set_no_new_privs(current);
  125. commit_creds(cred);
  126. out:
  127. /* Notify the calling thread once all threads are done */
  128. if (atomic_dec_return(&ctx->num_unfinished) == 0)
  129. complete_all(&ctx->all_finished);
  130. }
  131. /*
  132. * restrict_one_thread_callback - task_work callback for restricting a thread
  133. *
  134. * Calls restrict_one_thread with the struct landlock_shared_tsync_context.
  135. */
  136. static void restrict_one_thread_callback(struct callback_head *work)
  137. {
  138. struct tsync_work *ctx = container_of(work, struct tsync_work, work);
  139. restrict_one_thread(ctx->shared_ctx);
  140. }
  141. /*
  142. * struct tsync_works - a growable array of per-task contexts
  143. *
  144. * The zero-initialized struct represents the empty array.
  145. */
  146. struct tsync_works {
  147. struct tsync_work **works;
  148. size_t size;
  149. size_t capacity;
  150. };
  151. /*
  152. * tsync_works_provide - provides a preallocated tsync_work for the given task
  153. *
  154. * This also stores a task pointer in the context and increments the reference
  155. * count of the task.
  156. *
  157. * This function may fail in the case where we did not preallocate sufficient
  158. * capacity. This can legitimately happen if new threads get started after we
  159. * grew the capacity.
  160. *
  161. * Returns:
  162. * A pointer to the preallocated context struct, with task filled in.
  163. *
  164. * NULL, if we ran out of preallocated context structs.
  165. */
  166. static struct tsync_work *tsync_works_provide(struct tsync_works *s,
  167. struct task_struct *task)
  168. {
  169. struct tsync_work *ctx;
  170. if (s->size >= s->capacity)
  171. return NULL;
  172. ctx = s->works[s->size];
  173. s->size++;
  174. ctx->task = get_task_struct(task);
  175. return ctx;
  176. }
  177. /**
  178. * tsync_works_trim - Put the last tsync_work element
  179. *
  180. * @s: TSYNC works to trim.
  181. *
  182. * Put the last task and decrement the size of @s.
  183. *
  184. * This helper does not cancel a running task, but just reset the last element
  185. * to zero.
  186. */
  187. static void tsync_works_trim(struct tsync_works *s)
  188. {
  189. struct tsync_work *ctx;
  190. if (WARN_ON_ONCE(s->size <= 0))
  191. return;
  192. ctx = s->works[s->size - 1];
  193. /*
  194. * For consistency, remove the task from ctx so that it does not look like
  195. * we handed it a task_work.
  196. */
  197. put_task_struct(ctx->task);
  198. *ctx = (typeof(*ctx)){};
  199. /*
  200. * Cancel the tsync_works_provide() change to recycle the reserved memory
  201. * for the next thread, if any. This also ensures that cancel_tsync_works()
  202. * and tsync_works_release() do not see any NULL task pointers.
  203. */
  204. s->size--;
  205. }
  206. /*
  207. * tsync_works_grow_by - preallocates space for n more contexts in s
  208. *
  209. * On a successful return, the subsequent n calls to tsync_works_provide() are
  210. * guaranteed to succeed. (size + n <= capacity)
  211. *
  212. * Returns:
  213. * -ENOMEM if the (re)allocation fails
  214. * 0 if the allocation succeeds, partially succeeds, or no reallocation
  215. * was needed
  216. */
  217. static int tsync_works_grow_by(struct tsync_works *s, size_t n, gfp_t flags)
  218. {
  219. size_t i;
  220. size_t new_capacity;
  221. struct tsync_work **works;
  222. struct tsync_work *work;
  223. if (check_add_overflow(s->size, n, &new_capacity))
  224. return -EOVERFLOW;
  225. /* No need to reallocate if s already has sufficient capacity. */
  226. if (new_capacity <= s->capacity)
  227. return 0;
  228. works = krealloc_array(s->works, new_capacity, sizeof(s->works[0]),
  229. flags);
  230. if (!works)
  231. return -ENOMEM;
  232. s->works = works;
  233. for (i = s->capacity; i < new_capacity; i++) {
  234. work = kzalloc_obj(*work, flags);
  235. if (!work) {
  236. /*
  237. * Leave the object in a consistent state,
  238. * but return an error.
  239. */
  240. s->capacity = i;
  241. return -ENOMEM;
  242. }
  243. s->works[i] = work;
  244. }
  245. s->capacity = new_capacity;
  246. return 0;
  247. }
  248. /*
  249. * tsync_works_contains - checks for presence of task in s
  250. */
  251. static bool tsync_works_contains_task(const struct tsync_works *s,
  252. const struct task_struct *task)
  253. {
  254. size_t i;
  255. for (i = 0; i < s->size; i++)
  256. if (s->works[i]->task == task)
  257. return true;
  258. return false;
  259. }
  260. /*
  261. * tsync_works_release - frees memory held by s and drops all task references
  262. *
  263. * This does not free s itself, only the data structures held by it.
  264. */
  265. static void tsync_works_release(struct tsync_works *s)
  266. {
  267. size_t i;
  268. for (i = 0; i < s->size; i++) {
  269. if (WARN_ON_ONCE(!s->works[i]->task))
  270. continue;
  271. put_task_struct(s->works[i]->task);
  272. }
  273. for (i = 0; i < s->capacity; i++)
  274. kfree(s->works[i]);
  275. kfree(s->works);
  276. s->works = NULL;
  277. s->size = 0;
  278. s->capacity = 0;
  279. }
  280. /*
  281. * count_additional_threads - counts the sibling threads that are not in works
  282. */
  283. static size_t count_additional_threads(const struct tsync_works *works)
  284. {
  285. const struct task_struct *caller, *thread;
  286. size_t n = 0;
  287. caller = current;
  288. guard(rcu)();
  289. for_each_thread(caller, thread) {
  290. /* Skip current, since it is initiating the sync. */
  291. if (thread == caller)
  292. continue;
  293. /* Skip exited threads. */
  294. if (thread->flags & PF_EXITING)
  295. continue;
  296. /* Skip threads that we have already seen. */
  297. if (tsync_works_contains_task(works, thread))
  298. continue;
  299. n++;
  300. }
  301. return n;
  302. }
  303. /*
  304. * schedule_task_work - adds task_work for all eligible sibling threads
  305. * which have not been scheduled yet
  306. *
  307. * For each added task_work, atomically increments shared_ctx->num_preparing and
  308. * shared_ctx->num_unfinished.
  309. *
  310. * Returns:
  311. * true, if at least one eligible sibling thread was found
  312. */
  313. static bool schedule_task_work(struct tsync_works *works,
  314. struct tsync_shared_context *shared_ctx)
  315. {
  316. int err;
  317. const struct task_struct *caller;
  318. struct task_struct *thread;
  319. struct tsync_work *ctx;
  320. bool found_more_threads = false;
  321. caller = current;
  322. guard(rcu)();
  323. for_each_thread(caller, thread) {
  324. /* Skip current, since it is initiating the sync. */
  325. if (thread == caller)
  326. continue;
  327. /* Skip exited threads. */
  328. if (thread->flags & PF_EXITING)
  329. continue;
  330. /* Skip threads that we already looked at. */
  331. if (tsync_works_contains_task(works, thread))
  332. continue;
  333. /*
  334. * We found a sibling thread that is not doing its task_work yet, and
  335. * which might spawn new threads before our task work runs, so we need
  336. * at least one more round in the outer loop.
  337. */
  338. found_more_threads = true;
  339. ctx = tsync_works_provide(works, thread);
  340. if (!ctx) {
  341. /*
  342. * We ran out of preallocated contexts -- we need to try again with
  343. * this thread at a later time!
  344. * found_more_threads is already true at this point.
  345. */
  346. break;
  347. }
  348. ctx->shared_ctx = shared_ctx;
  349. atomic_inc(&shared_ctx->num_preparing);
  350. atomic_inc(&shared_ctx->num_unfinished);
  351. init_task_work(&ctx->work, restrict_one_thread_callback);
  352. err = task_work_add(thread, &ctx->work, TWA_SIGNAL);
  353. if (unlikely(err)) {
  354. /*
  355. * task_work_add() only fails if the task is about to exit. We
  356. * checked that earlier, but it can happen as a race. Resume
  357. * without setting an error, as the task is probably gone in the
  358. * next loop iteration.
  359. */
  360. tsync_works_trim(works);
  361. atomic_dec(&shared_ctx->num_preparing);
  362. atomic_dec(&shared_ctx->num_unfinished);
  363. }
  364. }
  365. return found_more_threads;
  366. }
  367. /*
  368. * cancel_tsync_works - cancel all task works where it is possible
  369. *
  370. * Task works can be canceled as long as they are still queued and have not
  371. * started running. If they get canceled, we decrement
  372. * shared_ctx->num_preparing and shared_ctx->num_unfished and mark the two
  373. * completions if needed, as if the task was never scheduled.
  374. */
  375. static void cancel_tsync_works(const struct tsync_works *works,
  376. struct tsync_shared_context *shared_ctx)
  377. {
  378. size_t i;
  379. for (i = 0; i < works->size; i++) {
  380. if (WARN_ON_ONCE(!works->works[i]->task))
  381. continue;
  382. if (!task_work_cancel(works->works[i]->task,
  383. &works->works[i]->work))
  384. continue;
  385. /* After dequeueing, act as if the task work had executed. */
  386. if (atomic_dec_return(&shared_ctx->num_preparing) == 0)
  387. complete_all(&shared_ctx->all_prepared);
  388. if (atomic_dec_return(&shared_ctx->num_unfinished) == 0)
  389. complete_all(&shared_ctx->all_finished);
  390. }
  391. }
  392. /*
  393. * restrict_sibling_threads - enables a Landlock policy for all sibling threads
  394. */
  395. int landlock_restrict_sibling_threads(const struct cred *old_cred,
  396. const struct cred *new_cred)
  397. {
  398. int err;
  399. struct tsync_shared_context shared_ctx;
  400. struct tsync_works works = {};
  401. size_t newly_discovered_threads;
  402. bool found_more_threads;
  403. atomic_set(&shared_ctx.preparation_error, 0);
  404. init_completion(&shared_ctx.all_prepared);
  405. init_completion(&shared_ctx.ready_to_commit);
  406. atomic_set(&shared_ctx.num_unfinished, 1);
  407. init_completion(&shared_ctx.all_finished);
  408. shared_ctx.old_cred = old_cred;
  409. shared_ctx.new_cred = new_cred;
  410. shared_ctx.set_no_new_privs = task_no_new_privs(current);
  411. /*
  412. * Serialize concurrent TSYNC operations to prevent deadlocks when
  413. * multiple threads call landlock_restrict_self() simultaneously.
  414. * If the lock is already held, we gracefully yield by restarting the
  415. * syscall. This allows the current thread to process pending
  416. * task_works before retrying.
  417. */
  418. if (!down_write_trylock(&current->signal->exec_update_lock))
  419. return restart_syscall();
  420. /*
  421. * We schedule a pseudo-signal task_work for each of the calling task's
  422. * sibling threads. In the task work, each thread:
  423. *
  424. * 1) runs prepare_creds() and writes back the error to
  425. * shared_ctx.preparation_error, if needed.
  426. *
  427. * 2) signals that it's done with prepare_creds() to the calling task.
  428. * (completion "all_prepared").
  429. *
  430. * 3) waits for the completion "ready_to_commit". This is sent by the
  431. * calling task after ensuring that all sibling threads have done
  432. * with the "preparation" stage.
  433. *
  434. * After this barrier is reached, it's safe to read
  435. * shared_ctx.preparation_error.
  436. *
  437. * 4) reads shared_ctx.preparation_error and then either does commit_creds()
  438. * or abort_creds().
  439. *
  440. * 5) signals that it's done altogether (barrier synchronization
  441. * "all_finished")
  442. *
  443. * Unlike seccomp, which modifies sibling tasks directly, we do not need to
  444. * acquire the cred_guard_mutex and sighand->siglock:
  445. *
  446. * - As in our case, all threads are themselves exchanging their own struct
  447. * cred through the credentials API, no locks are needed for that.
  448. * - Our for_each_thread() loops are protected by RCU.
  449. * - We do not acquire a lock to keep the list of sibling threads stable
  450. * between our for_each_thread loops. If the list of available sibling
  451. * threads changes between these for_each_thread loops, we make up for
  452. * that by continuing to look for threads until they are all discovered
  453. * and have entered their task_work, where they are unable to spawn new
  454. * threads.
  455. */
  456. do {
  457. /* In RCU read-lock, count the threads we need. */
  458. newly_discovered_threads = count_additional_threads(&works);
  459. if (newly_discovered_threads == 0)
  460. break; /* done */
  461. err = tsync_works_grow_by(&works, newly_discovered_threads,
  462. GFP_KERNEL_ACCOUNT);
  463. if (err) {
  464. atomic_set(&shared_ctx.preparation_error, err);
  465. break;
  466. }
  467. /*
  468. * The "all_prepared" barrier is used locally to the loop body, this use
  469. * of for_each_thread(). We can reset it on each loop iteration because
  470. * all previous loop iterations are done with it already.
  471. *
  472. * num_preparing is initialized to 1 so that the counter can not go to 0
  473. * and mark the completion as done before all task works are registered.
  474. * We decrement it at the end of the loop body.
  475. */
  476. atomic_set(&shared_ctx.num_preparing, 1);
  477. reinit_completion(&shared_ctx.all_prepared);
  478. /*
  479. * In RCU read-lock, schedule task work on newly discovered sibling
  480. * tasks.
  481. */
  482. found_more_threads = schedule_task_work(&works, &shared_ctx);
  483. /*
  484. * Decrement num_preparing for current, to undo that we initialized it
  485. * to 1 a few lines above.
  486. */
  487. if (atomic_dec_return(&shared_ctx.num_preparing) > 0) {
  488. if (wait_for_completion_interruptible(
  489. &shared_ctx.all_prepared)) {
  490. /* In case of interruption, we need to retry the system call. */
  491. atomic_set(&shared_ctx.preparation_error,
  492. -ERESTARTNOINTR);
  493. /*
  494. * Opportunistic improvement: try to cancel task
  495. * works for tasks that did not start running
  496. * yet. We do not have a guarantee that it
  497. * cancels any of the enqueued task works
  498. * because task_work_run() might already have
  499. * dequeued them.
  500. */
  501. cancel_tsync_works(&works, &shared_ctx);
  502. /*
  503. * Break the loop with error. The cleanup code
  504. * after the loop unblocks the remaining
  505. * task_works.
  506. */
  507. break;
  508. }
  509. }
  510. } while (found_more_threads &&
  511. !atomic_read(&shared_ctx.preparation_error));
  512. /*
  513. * We now have either (a) all sibling threads blocking and in "prepared"
  514. * state in the task work, or (b) the preparation error is set. Ask all
  515. * threads to commit (or abort).
  516. */
  517. complete_all(&shared_ctx.ready_to_commit);
  518. /*
  519. * Decrement num_unfinished for current, to undo that we initialized it to 1
  520. * at the beginning.
  521. */
  522. if (atomic_dec_return(&shared_ctx.num_unfinished) > 0)
  523. wait_for_completion(&shared_ctx.all_finished);
  524. tsync_works_release(&works);
  525. up_write(&current->signal->exec_update_lock);
  526. return atomic_read(&shared_ctx.preparation_error);
  527. }