tctx.c 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388
  1. // SPDX-License-Identifier: GPL-2.0
  2. #include <linux/kernel.h>
  3. #include <linux/errno.h>
  4. #include <linux/file.h>
  5. #include <linux/mm.h>
  6. #include <linux/slab.h>
  7. #include <linux/nospec.h>
  8. #include <linux/io_uring.h>
  9. #include <uapi/linux/io_uring.h>
  10. #include "io_uring.h"
  11. #include "tctx.h"
  12. #include "bpf_filter.h"
  13. static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
  14. struct task_struct *task)
  15. {
  16. struct io_wq_hash *hash;
  17. struct io_wq_data data;
  18. unsigned int concurrency;
  19. mutex_lock(&ctx->uring_lock);
  20. hash = ctx->hash_map;
  21. if (!hash) {
  22. hash = kzalloc_obj(*hash);
  23. if (!hash) {
  24. mutex_unlock(&ctx->uring_lock);
  25. return ERR_PTR(-ENOMEM);
  26. }
  27. refcount_set(&hash->refs, 1);
  28. init_waitqueue_head(&hash->wait);
  29. ctx->hash_map = hash;
  30. }
  31. mutex_unlock(&ctx->uring_lock);
  32. data.hash = hash;
  33. data.task = task;
  34. /* Do QD, or 4 * CPUS, whatever is smallest */
  35. concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
  36. return io_wq_create(concurrency, &data);
  37. }
  38. void __io_uring_free(struct task_struct *tsk)
  39. {
  40. struct io_uring_task *tctx = tsk->io_uring;
  41. struct io_tctx_node *node;
  42. unsigned long index;
  43. /*
  44. * Fault injection forcing allocation errors in the xa_store() path
  45. * can lead to xa_empty() returning false, even though no actual
  46. * node is stored in the xarray. Until that gets sorted out, attempt
  47. * an iteration here and warn if any entries are found.
  48. */
  49. if (tctx) {
  50. xa_for_each(&tctx->xa, index, node) {
  51. WARN_ON_ONCE(1);
  52. break;
  53. }
  54. WARN_ON_ONCE(tctx->io_wq);
  55. WARN_ON_ONCE(tctx->cached_refs);
  56. percpu_counter_destroy(&tctx->inflight);
  57. kfree(tctx);
  58. tsk->io_uring = NULL;
  59. }
  60. if (tsk->io_uring_restrict) {
  61. io_put_bpf_filters(tsk->io_uring_restrict);
  62. kfree(tsk->io_uring_restrict);
  63. tsk->io_uring_restrict = NULL;
  64. }
  65. }
  66. __cold int io_uring_alloc_task_context(struct task_struct *task,
  67. struct io_ring_ctx *ctx)
  68. {
  69. struct io_uring_task *tctx;
  70. int ret;
  71. tctx = kzalloc_obj(*tctx);
  72. if (unlikely(!tctx))
  73. return -ENOMEM;
  74. ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
  75. if (unlikely(ret)) {
  76. kfree(tctx);
  77. return ret;
  78. }
  79. tctx->io_wq = io_init_wq_offload(ctx, task);
  80. if (IS_ERR(tctx->io_wq)) {
  81. ret = PTR_ERR(tctx->io_wq);
  82. percpu_counter_destroy(&tctx->inflight);
  83. kfree(tctx);
  84. return ret;
  85. }
  86. tctx->task = task;
  87. xa_init(&tctx->xa);
  88. init_waitqueue_head(&tctx->wait);
  89. atomic_set(&tctx->in_cancel, 0);
  90. atomic_set(&tctx->inflight_tracked, 0);
  91. task->io_uring = tctx;
  92. init_llist_head(&tctx->task_list);
  93. init_task_work(&tctx->task_work, tctx_task_work);
  94. return 0;
  95. }
  96. int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
  97. {
  98. struct io_uring_task *tctx = current->io_uring;
  99. struct io_tctx_node *node;
  100. int ret;
  101. if (unlikely(!tctx)) {
  102. ret = io_uring_alloc_task_context(current, ctx);
  103. if (unlikely(ret))
  104. return ret;
  105. tctx = current->io_uring;
  106. if (ctx->iowq_limits_set) {
  107. unsigned int limits[2] = { ctx->iowq_limits[0],
  108. ctx->iowq_limits[1], };
  109. ret = io_wq_max_workers(tctx->io_wq, limits);
  110. if (ret)
  111. return ret;
  112. }
  113. }
  114. /*
  115. * Re-activate io-wq keepalive on any new io_uring usage. The wq may have
  116. * been marked for idle-exit when the task temporarily had no active
  117. * io_uring instances.
  118. */
  119. if (tctx->io_wq)
  120. io_wq_set_exit_on_idle(tctx->io_wq, false);
  121. if (!xa_load(&tctx->xa, (unsigned long)ctx)) {
  122. node = kmalloc_obj(*node);
  123. if (!node)
  124. return -ENOMEM;
  125. node->ctx = ctx;
  126. node->task = current;
  127. ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx,
  128. node, GFP_KERNEL));
  129. if (ret) {
  130. kfree(node);
  131. return ret;
  132. }
  133. mutex_lock(&ctx->tctx_lock);
  134. list_add(&node->ctx_node, &ctx->tctx_list);
  135. mutex_unlock(&ctx->tctx_lock);
  136. }
  137. return 0;
  138. }
  139. int __io_uring_add_tctx_node_from_submit(struct io_ring_ctx *ctx)
  140. {
  141. int ret;
  142. if (ctx->flags & IORING_SETUP_SINGLE_ISSUER
  143. && ctx->submitter_task != current)
  144. return -EEXIST;
  145. ret = __io_uring_add_tctx_node(ctx);
  146. if (ret)
  147. return ret;
  148. current->io_uring->last = ctx;
  149. return 0;
  150. }
  151. /*
  152. * Remove this io_uring_file -> task mapping.
  153. */
  154. __cold void io_uring_del_tctx_node(unsigned long index)
  155. {
  156. struct io_uring_task *tctx = current->io_uring;
  157. struct io_tctx_node *node;
  158. if (!tctx)
  159. return;
  160. node = xa_erase(&tctx->xa, index);
  161. if (!node)
  162. return;
  163. WARN_ON_ONCE(current != node->task);
  164. WARN_ON_ONCE(list_empty(&node->ctx_node));
  165. mutex_lock(&node->ctx->tctx_lock);
  166. list_del(&node->ctx_node);
  167. mutex_unlock(&node->ctx->tctx_lock);
  168. if (tctx->last == node->ctx)
  169. tctx->last = NULL;
  170. kfree(node);
  171. if (xa_empty(&tctx->xa) && tctx->io_wq)
  172. io_wq_set_exit_on_idle(tctx->io_wq, true);
  173. }
  174. __cold void io_uring_clean_tctx(struct io_uring_task *tctx)
  175. {
  176. struct io_wq *wq = tctx->io_wq;
  177. struct io_tctx_node *node;
  178. unsigned long index;
  179. xa_for_each(&tctx->xa, index, node) {
  180. io_uring_del_tctx_node(index);
  181. cond_resched();
  182. }
  183. if (wq) {
  184. /*
  185. * Must be after io_uring_del_tctx_node() (removes nodes under
  186. * uring_lock) to avoid race with io_uring_try_cancel_iowq().
  187. */
  188. io_wq_put_and_exit(wq);
  189. tctx->io_wq = NULL;
  190. }
  191. }
  192. void io_uring_unreg_ringfd(void)
  193. {
  194. struct io_uring_task *tctx = current->io_uring;
  195. int i;
  196. for (i = 0; i < IO_RINGFD_REG_MAX; i++) {
  197. if (tctx->registered_rings[i]) {
  198. fput(tctx->registered_rings[i]);
  199. tctx->registered_rings[i] = NULL;
  200. }
  201. }
  202. }
  203. int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file,
  204. int start, int end)
  205. {
  206. int offset, idx;
  207. for (offset = start; offset < end; offset++) {
  208. idx = array_index_nospec(offset, IO_RINGFD_REG_MAX);
  209. if (tctx->registered_rings[idx])
  210. continue;
  211. tctx->registered_rings[idx] = file;
  212. return idx;
  213. }
  214. return -EBUSY;
  215. }
  216. static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd,
  217. int start, int end)
  218. {
  219. struct file *file;
  220. int offset;
  221. file = fget(fd);
  222. if (!file) {
  223. return -EBADF;
  224. } else if (!io_is_uring_fops(file)) {
  225. fput(file);
  226. return -EOPNOTSUPP;
  227. }
  228. offset = io_ring_add_registered_file(tctx, file, start, end);
  229. if (offset < 0)
  230. fput(file);
  231. return offset;
  232. }
  233. /*
  234. * Register a ring fd to avoid fdget/fdput for each io_uring_enter()
  235. * invocation. User passes in an array of struct io_uring_rsrc_update
  236. * with ->data set to the ring_fd, and ->offset given for the desired
  237. * index. If no index is desired, application may set ->offset == -1U
  238. * and we'll find an available index. Returns number of entries
  239. * successfully processed, or < 0 on error if none were processed.
  240. */
  241. int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg,
  242. unsigned nr_args)
  243. {
  244. struct io_uring_rsrc_update __user *arg = __arg;
  245. struct io_uring_rsrc_update reg;
  246. struct io_uring_task *tctx;
  247. int ret, i;
  248. if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
  249. return -EINVAL;
  250. mutex_unlock(&ctx->uring_lock);
  251. ret = __io_uring_add_tctx_node(ctx);
  252. mutex_lock(&ctx->uring_lock);
  253. if (ret)
  254. return ret;
  255. tctx = current->io_uring;
  256. for (i = 0; i < nr_args; i++) {
  257. int start, end;
  258. if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
  259. ret = -EFAULT;
  260. break;
  261. }
  262. if (reg.resv) {
  263. ret = -EINVAL;
  264. break;
  265. }
  266. if (reg.offset == -1U) {
  267. start = 0;
  268. end = IO_RINGFD_REG_MAX;
  269. } else {
  270. if (reg.offset >= IO_RINGFD_REG_MAX) {
  271. ret = -EINVAL;
  272. break;
  273. }
  274. start = reg.offset;
  275. end = start + 1;
  276. }
  277. ret = io_ring_add_registered_fd(tctx, reg.data, start, end);
  278. if (ret < 0)
  279. break;
  280. reg.offset = ret;
  281. if (copy_to_user(&arg[i], &reg, sizeof(reg))) {
  282. fput(tctx->registered_rings[reg.offset]);
  283. tctx->registered_rings[reg.offset] = NULL;
  284. ret = -EFAULT;
  285. break;
  286. }
  287. }
  288. return i ? i : ret;
  289. }
  290. int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg,
  291. unsigned nr_args)
  292. {
  293. struct io_uring_rsrc_update __user *arg = __arg;
  294. struct io_uring_task *tctx = current->io_uring;
  295. struct io_uring_rsrc_update reg;
  296. int ret = 0, i;
  297. if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
  298. return -EINVAL;
  299. if (!tctx)
  300. return 0;
  301. for (i = 0; i < nr_args; i++) {
  302. if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
  303. ret = -EFAULT;
  304. break;
  305. }
  306. if (reg.resv || reg.data || reg.offset >= IO_RINGFD_REG_MAX) {
  307. ret = -EINVAL;
  308. break;
  309. }
  310. reg.offset = array_index_nospec(reg.offset, IO_RINGFD_REG_MAX);
  311. if (tctx->registered_rings[reg.offset]) {
  312. fput(tctx->registered_rings[reg.offset]);
  313. tctx->registered_rings[reg.offset] = NULL;
  314. }
  315. }
  316. return i ? i : ret;
  317. }
  318. int __io_uring_fork(struct task_struct *tsk)
  319. {
  320. struct io_restriction *res, *src = tsk->io_uring_restrict;
  321. /* Don't leave it dangling on error */
  322. tsk->io_uring_restrict = NULL;
  323. res = kzalloc_obj(*res, GFP_KERNEL_ACCOUNT);
  324. if (!res)
  325. return -ENOMEM;
  326. tsk->io_uring_restrict = res;
  327. io_restriction_clone(res, src);
  328. return 0;
  329. }