register.c 26 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Code related to the io_uring_register() syscall
  4. *
  5. * Copyright (C) 2023 Jens Axboe
  6. */
  7. #include <linux/kernel.h>
  8. #include <linux/errno.h>
  9. #include <linux/syscalls.h>
  10. #include <linux/refcount.h>
  11. #include <linux/bits.h>
  12. #include <linux/fs.h>
  13. #include <linux/file.h>
  14. #include <linux/slab.h>
  15. #include <linux/uaccess.h>
  16. #include <linux/nospec.h>
  17. #include <linux/compat.h>
  18. #include <linux/io_uring.h>
  19. #include <linux/io_uring_types.h>
  20. #include "filetable.h"
  21. #include "io_uring.h"
  22. #include "opdef.h"
  23. #include "tctx.h"
  24. #include "rsrc.h"
  25. #include "sqpoll.h"
  26. #include "register.h"
  27. #include "cancel.h"
  28. #include "kbuf.h"
  29. #include "napi.h"
  30. #include "eventfd.h"
  31. #include "msg_ring.h"
  32. #include "memmap.h"
  33. #include "zcrx.h"
  34. #include "query.h"
  35. #include "bpf_filter.h"
/*
 * Upper bound on the number of entries accepted by a single restriction
 * registration; io_parse_restrictions() rejects anything larger.
 */
#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)
  38. static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
  39. unsigned nr_args)
  40. {
  41. struct io_uring_probe *p;
  42. size_t size;
  43. int i, ret;
  44. if (nr_args > IORING_OP_LAST)
  45. nr_args = IORING_OP_LAST;
  46. size = struct_size(p, ops, nr_args);
  47. p = memdup_user(arg, size);
  48. if (IS_ERR(p))
  49. return PTR_ERR(p);
  50. ret = -EINVAL;
  51. if (memchr_inv(p, 0, size))
  52. goto out;
  53. p->last_op = IORING_OP_LAST - 1;
  54. for (i = 0; i < nr_args; i++) {
  55. p->ops[i].op = i;
  56. if (io_uring_op_supported(i))
  57. p->ops[i].flags = IO_URING_OP_SUPPORTED;
  58. }
  59. p->ops_len = i;
  60. ret = 0;
  61. if (copy_to_user(arg, p, size))
  62. ret = -EFAULT;
  63. out:
  64. kfree(p);
  65. return ret;
  66. }
  67. int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
  68. {
  69. const struct cred *creds;
  70. creds = xa_erase(&ctx->personalities, id);
  71. if (creds) {
  72. put_cred(creds);
  73. return 0;
  74. }
  75. return -EINVAL;
  76. }
  77. static int io_register_personality(struct io_ring_ctx *ctx)
  78. {
  79. const struct cred *creds;
  80. u32 id;
  81. int ret;
  82. creds = get_current_cred();
  83. ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
  84. XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
  85. if (ret < 0) {
  86. put_cred(creds);
  87. return ret;
  88. }
  89. return id;
  90. }
  91. /*
  92. * Returns number of restrictions parsed and added on success, or < 0 for
  93. * an error.
  94. */
  95. static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args,
  96. struct io_restriction *restrictions)
  97. {
  98. struct io_uring_restriction *res;
  99. size_t size;
  100. int i, ret;
  101. if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
  102. return -EINVAL;
  103. size = array_size(nr_args, sizeof(*res));
  104. if (size == SIZE_MAX)
  105. return -EOVERFLOW;
  106. res = memdup_user(arg, size);
  107. if (IS_ERR(res))
  108. return PTR_ERR(res);
  109. ret = -EINVAL;
  110. for (i = 0; i < nr_args; i++) {
  111. switch (res[i].opcode) {
  112. case IORING_RESTRICTION_REGISTER_OP:
  113. if (res[i].register_op >= IORING_REGISTER_LAST)
  114. goto err;
  115. __set_bit(res[i].register_op, restrictions->register_op);
  116. restrictions->reg_registered = true;
  117. break;
  118. case IORING_RESTRICTION_SQE_OP:
  119. if (res[i].sqe_op >= IORING_OP_LAST)
  120. goto err;
  121. __set_bit(res[i].sqe_op, restrictions->sqe_op);
  122. restrictions->op_registered = true;
  123. break;
  124. case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
  125. restrictions->sqe_flags_allowed = res[i].sqe_flags;
  126. restrictions->op_registered = true;
  127. break;
  128. case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
  129. restrictions->sqe_flags_required = res[i].sqe_flags;
  130. restrictions->op_registered = true;
  131. break;
  132. default:
  133. goto err;
  134. }
  135. }
  136. ret = nr_args;
  137. if (!nr_args) {
  138. restrictions->op_registered = true;
  139. restrictions->reg_registered = true;
  140. }
  141. err:
  142. kfree(res);
  143. return ret;
  144. }
  145. static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
  146. void __user *arg, unsigned int nr_args)
  147. {
  148. int ret;
  149. /* Restrictions allowed only if rings started disabled */
  150. if (!(ctx->flags & IORING_SETUP_R_DISABLED))
  151. return -EBADFD;
  152. /* We allow only a single restrictions registration */
  153. if (ctx->restrictions.op_registered || ctx->restrictions.reg_registered)
  154. return -EBUSY;
  155. ret = io_parse_restrictions(arg, nr_args, &ctx->restrictions);
  156. /*
  157. * Reset all restrictions if an error happened, but retain any COW'ed
  158. * settings.
  159. */
  160. if (ret < 0) {
  161. struct io_bpf_filters *bpf = ctx->restrictions.bpf_filters;
  162. bool cowed = ctx->restrictions.bpf_filters_cow;
  163. memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
  164. ctx->restrictions.bpf_filters = bpf;
  165. ctx->restrictions.bpf_filters_cow = cowed;
  166. return ret;
  167. }
  168. if (ctx->restrictions.op_registered)
  169. ctx->op_restricted = 1;
  170. if (ctx->restrictions.reg_registered)
  171. ctx->reg_restricted = 1;
  172. return 0;
  173. }
  174. static int io_register_restrictions_task(void __user *arg, unsigned int nr_args)
  175. {
  176. struct io_uring_task_restriction __user *ures = arg;
  177. struct io_uring_task_restriction tres;
  178. struct io_restriction *res;
  179. int ret;
  180. /* Disallow if task already has registered restrictions */
  181. if (current->io_uring_restrict)
  182. return -EPERM;
  183. /*
  184. * Similar to seccomp, disallow setting a filter if task_no_new_privs
  185. * is false and we're not CAP_SYS_ADMIN.
  186. */
  187. if (!task_no_new_privs(current) &&
  188. !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))
  189. return -EACCES;
  190. if (nr_args != 1)
  191. return -EINVAL;
  192. if (copy_from_user(&tres, arg, sizeof(tres)))
  193. return -EFAULT;
  194. if (tres.flags)
  195. return -EINVAL;
  196. if (!mem_is_zero(tres.resv, sizeof(tres.resv)))
  197. return -EINVAL;
  198. res = kzalloc_obj(*res, GFP_KERNEL_ACCOUNT);
  199. if (!res)
  200. return -ENOMEM;
  201. ret = io_parse_restrictions(ures->restrictions, tres.nr_res, res);
  202. if (ret < 0) {
  203. kfree(res);
  204. return ret;
  205. }
  206. current->io_uring_restrict = res;
  207. return 0;
  208. }
  209. static int io_register_bpf_filter_task(void __user *arg, unsigned int nr_args)
  210. {
  211. struct io_restriction *res;
  212. int ret;
  213. /*
  214. * Similar to seccomp, disallow setting a filter if task_no_new_privs
  215. * is false and we're not CAP_SYS_ADMIN.
  216. */
  217. if (!task_no_new_privs(current) &&
  218. !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))
  219. return -EACCES;
  220. if (nr_args != 1)
  221. return -EINVAL;
  222. /* If no task restrictions exist, setup a new set */
  223. res = current->io_uring_restrict;
  224. if (!res) {
  225. res = kzalloc_obj(*res, GFP_KERNEL_ACCOUNT);
  226. if (!res)
  227. return -ENOMEM;
  228. }
  229. ret = io_register_bpf_filter(res, arg);
  230. if (ret) {
  231. if (res != current->io_uring_restrict)
  232. kfree(res);
  233. return ret;
  234. }
  235. if (!current->io_uring_restrict)
  236. current->io_uring_restrict = res;
  237. return 0;
  238. }
/*
 * Enable a ring that was created with IORING_SETUP_R_DISABLED.
 *
 * Returns -EBADFD if the ring is not currently disabled, 0 otherwise.
 */
static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* for single-issuer rings, the enabling task becomes the submitter */
	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER) {
		ctx->submitter_task = get_task_struct(current);
		/*
		 * Lazy activation attempts would fail if it was polled before
		 * submitter_task is set.
		 */
		if (wq_has_sleeper(&ctx->poll_wq))
			io_activate_pollwq(ctx);
	}

	/* Keep submitter_task store before clearing IORING_SETUP_R_DISABLED */
	smp_store_release(&ctx->flags, ctx->flags & ~IORING_SETUP_R_DISABLED);
	/* wake anyone sleeping on the sq_data wait queue, if there is one */
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}
  258. static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
  259. cpumask_var_t new_mask)
  260. {
  261. int ret;
  262. if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
  263. ret = io_wq_cpu_affinity(current->io_uring, new_mask);
  264. } else {
  265. mutex_unlock(&ctx->uring_lock);
  266. ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
  267. mutex_lock(&ctx->uring_lock);
  268. }
  269. return ret;
  270. }
  271. static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
  272. void __user *arg, unsigned len)
  273. {
  274. cpumask_var_t new_mask;
  275. int ret;
  276. if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
  277. return -ENOMEM;
  278. cpumask_clear(new_mask);
  279. if (len > cpumask_size())
  280. len = cpumask_size();
  281. #ifdef CONFIG_COMPAT
  282. if (in_compat_syscall())
  283. ret = compat_get_bitmap(cpumask_bits(new_mask),
  284. (const compat_ulong_t __user *)arg,
  285. len * 8 /* CHAR_BIT */);
  286. else
  287. #endif
  288. ret = copy_from_user(new_mask, arg, len);
  289. if (ret) {
  290. free_cpumask_var(new_mask);
  291. return -EFAULT;
  292. }
  293. ret = __io_register_iowq_aff(ctx, new_mask);
  294. free_cpumask_var(new_mask);
  295. return ret;
  296. }
  297. static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
  298. {
  299. return __io_register_iowq_aff(ctx, NULL);
  300. }
/*
 * Set and report the io-wq worker limits for the ring.
 *
 * @arg points to two __u32 values. A zero entry leaves that limit unchanged.
 * After io_wq_max_workers() has run, the array is copied back to userspace;
 * if there is no io-wq to apply the limits to, zeroes are copied back.
 *
 * Called with ctx->uring_lock held. For SQPOLL rings the lock is temporarily
 * dropped so that sqd->lock can be taken first.
 *
 * Returns 0 on success, -EFAULT on a failed user copy, -EINVAL if a value
 * exceeds INT_MAX, or the error from io_wq_max_workers().
 */
static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	/* pick the task context whose io-wq gets the new limits */
	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			struct task_struct *tsk;

			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			tsk = sqpoll_task_locked(sqd);
			if (tsk)
				tctx = tsk->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	/* remember the non-zero limits on the ctx */
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		/* no io-wq to query, report zeroes back to userspace */
		memset(new_count, 0, sizeof(new_count));
	}

	/* release sqd->lock and the sqd reference taken above */
	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	mutex_lock(&ctx->tctx_lock);
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		tctx = node->task->io_uring;
		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		/* reload the limits each round; new_count is an in/out array */
		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	mutex_unlock(&ctx->tctx_lock);
	return 0;
err:
	/* same unlock sequence as the success path */
	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}
	return ret;
}
  380. static int io_register_clock(struct io_ring_ctx *ctx,
  381. struct io_uring_clock_register __user *arg)
  382. {
  383. struct io_uring_clock_register reg;
  384. if (copy_from_user(&reg, arg, sizeof(reg)))
  385. return -EFAULT;
  386. if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
  387. return -EINVAL;
  388. switch (reg.clockid) {
  389. case CLOCK_MONOTONIC:
  390. ctx->clock_offset = 0;
  391. break;
  392. case CLOCK_BOOTTIME:
  393. ctx->clock_offset = TK_OFFS_BOOT;
  394. break;
  395. default:
  396. return -EINVAL;
  397. }
  398. ctx->clockid = reg.clockid;
  399. return 0;
  400. }
/*
 * State to maintain until we can swap. Both new and old state, used for
 * either mapping or freeing.
 */
struct io_ring_ctx_rings {
	/* kernel pointer to the ring header, taken from ring_region */
	struct io_rings *rings;
	/* kernel pointer to the SQE array, taken from sq_region */
	struct io_uring_sqe *sq_sqes;

	/* region backing sq_sqes */
	struct io_mapped_region sq_region;
	/* region backing rings */
	struct io_mapped_region ring_region;
};
  411. static void io_register_free_rings(struct io_ring_ctx *ctx,
  412. struct io_ring_ctx_rings *r)
  413. {
  414. io_free_region(ctx->user, &r->sq_region);
  415. io_free_region(ctx->user, &r->ring_region);
  416. }
/*
 * Save the ctx's current value of @field into @o, then install the value
 * from @n into the ctx.
 */
#define swap_old(ctx, o, n, field)		\
	do {					\
		(o).field = (ctx)->field;	\
		(ctx)->field = (n).field;	\
	} while (0)

/* setup flags a resize request is allowed to pass in */
#define RESIZE_FLAGS	(IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
/* setup flags a resize always inherits from the existing ring */
#define COPY_FLAGS	(IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
			 IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP | \
			 IORING_SETUP_CQE_MIXED | IORING_SETUP_SQE_MIXED)
  426. static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
  427. {
  428. struct io_ctx_config config;
  429. struct io_uring_region_desc rd;
  430. struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
  431. unsigned i, tail, old_head;
  432. struct io_uring_params *p = &config.p;
  433. struct io_rings_layout *rl = &config.layout;
  434. int ret;
  435. memset(&config, 0, sizeof(config));
  436. /* limited to DEFER_TASKRUN for now */
  437. if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
  438. return -EINVAL;
  439. if (copy_from_user(p, arg, sizeof(*p)))
  440. return -EFAULT;
  441. if (p->flags & ~RESIZE_FLAGS)
  442. return -EINVAL;
  443. /* properties that are always inherited */
  444. p->flags |= (ctx->flags & COPY_FLAGS);
  445. ret = io_prepare_config(&config);
  446. if (unlikely(ret))
  447. return ret;
  448. memset(&rd, 0, sizeof(rd));
  449. rd.size = PAGE_ALIGN(rl->rings_size);
  450. if (p->flags & IORING_SETUP_NO_MMAP) {
  451. rd.user_addr = p->cq_off.user_addr;
  452. rd.flags |= IORING_MEM_REGION_TYPE_USER;
  453. }
  454. ret = io_create_region(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING);
  455. if (ret)
  456. return ret;
  457. n.rings = io_region_get_ptr(&n.ring_region);
  458. /*
  459. * At this point n.rings is shared with userspace, just like o.rings
  460. * is as well. While we don't expect userspace to modify it while
  461. * a resize is in progress, and it's most likely that userspace will
  462. * shoot itself in the foot if it does, we can't always assume good
  463. * intent... Use read/write once helpers from here on to indicate the
  464. * shared nature of it.
  465. */
  466. WRITE_ONCE(n.rings->sq_ring_mask, p->sq_entries - 1);
  467. WRITE_ONCE(n.rings->cq_ring_mask, p->cq_entries - 1);
  468. WRITE_ONCE(n.rings->sq_ring_entries, p->sq_entries);
  469. WRITE_ONCE(n.rings->cq_ring_entries, p->cq_entries);
  470. if (copy_to_user(arg, p, sizeof(*p))) {
  471. io_register_free_rings(ctx, &n);
  472. return -EFAULT;
  473. }
  474. memset(&rd, 0, sizeof(rd));
  475. rd.size = PAGE_ALIGN(rl->sq_size);
  476. if (p->flags & IORING_SETUP_NO_MMAP) {
  477. rd.user_addr = p->sq_off.user_addr;
  478. rd.flags |= IORING_MEM_REGION_TYPE_USER;
  479. }
  480. ret = io_create_region(ctx, &n.sq_region, &rd, IORING_OFF_SQES);
  481. if (ret) {
  482. io_register_free_rings(ctx, &n);
  483. return ret;
  484. }
  485. n.sq_sqes = io_region_get_ptr(&n.sq_region);
  486. /*
  487. * If using SQPOLL, park the thread
  488. */
  489. if (ctx->sq_data) {
  490. mutex_unlock(&ctx->uring_lock);
  491. io_sq_thread_park(ctx->sq_data);
  492. mutex_lock(&ctx->uring_lock);
  493. }
  494. /*
  495. * We'll do the swap. Grab the ctx->mmap_lock, which will exclude
  496. * any new mmap's on the ring fd. Clear out existing mappings to prevent
  497. * mmap from seeing them, as we'll unmap them. Any attempt to mmap
  498. * existing rings beyond this point will fail. Not that it could proceed
  499. * at this point anyway, as the io_uring mmap side needs go grab the
  500. * ctx->mmap_lock as well. Likewise, hold the completion lock over the
  501. * duration of the actual swap.
  502. */
  503. mutex_lock(&ctx->mmap_lock);
  504. spin_lock(&ctx->completion_lock);
  505. o.rings = ctx->rings;
  506. ctx->rings = NULL;
  507. o.sq_sqes = ctx->sq_sqes;
  508. ctx->sq_sqes = NULL;
  509. /*
  510. * Now copy SQ and CQ entries, if any. If either of the destination
  511. * rings can't hold what is already there, then fail the operation.
  512. */
  513. tail = READ_ONCE(o.rings->sq.tail);
  514. old_head = READ_ONCE(o.rings->sq.head);
  515. if (tail - old_head > p->sq_entries)
  516. goto overflow;
  517. for (i = old_head; i < tail; i++) {
  518. unsigned src_head = i & (ctx->sq_entries - 1);
  519. unsigned dst_head = i & (p->sq_entries - 1);
  520. n.sq_sqes[dst_head] = o.sq_sqes[src_head];
  521. }
  522. WRITE_ONCE(n.rings->sq.head, old_head);
  523. WRITE_ONCE(n.rings->sq.tail, tail);
  524. tail = READ_ONCE(o.rings->cq.tail);
  525. old_head = READ_ONCE(o.rings->cq.head);
  526. if (tail - old_head > p->cq_entries) {
  527. overflow:
  528. /* restore old rings, and return -EOVERFLOW via cleanup path */
  529. ctx->rings = o.rings;
  530. ctx->sq_sqes = o.sq_sqes;
  531. to_free = &n;
  532. ret = -EOVERFLOW;
  533. goto out;
  534. }
  535. for (i = old_head; i < tail; i++) {
  536. unsigned src_head = i & (ctx->cq_entries - 1);
  537. unsigned dst_head = i & (p->cq_entries - 1);
  538. n.rings->cqes[dst_head] = o.rings->cqes[src_head];
  539. }
  540. WRITE_ONCE(n.rings->cq.head, old_head);
  541. WRITE_ONCE(n.rings->cq.tail, tail);
  542. /* invalidate cached cqe refill */
  543. ctx->cqe_cached = ctx->cqe_sentinel = NULL;
  544. WRITE_ONCE(n.rings->sq_dropped, READ_ONCE(o.rings->sq_dropped));
  545. atomic_set(&n.rings->sq_flags, atomic_read(&o.rings->sq_flags));
  546. WRITE_ONCE(n.rings->cq_flags, READ_ONCE(o.rings->cq_flags));
  547. WRITE_ONCE(n.rings->cq_overflow, READ_ONCE(o.rings->cq_overflow));
  548. /* all done, store old pointers and assign new ones */
  549. if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
  550. ctx->sq_array = (u32 *)((char *)n.rings + rl->sq_array_offset);
  551. ctx->sq_entries = p->sq_entries;
  552. ctx->cq_entries = p->cq_entries;
  553. /*
  554. * Just mark any flag we may have missed and that the application
  555. * should act on unconditionally. Worst case it'll be an extra
  556. * syscall.
  557. */
  558. atomic_or(IORING_SQ_TASKRUN | IORING_SQ_NEED_WAKEUP, &n.rings->sq_flags);
  559. ctx->rings = n.rings;
  560. rcu_assign_pointer(ctx->rings_rcu, n.rings);
  561. ctx->sq_sqes = n.sq_sqes;
  562. swap_old(ctx, o, n, ring_region);
  563. swap_old(ctx, o, n, sq_region);
  564. to_free = &o;
  565. ret = 0;
  566. out:
  567. spin_unlock(&ctx->completion_lock);
  568. mutex_unlock(&ctx->mmap_lock);
  569. /* Wait for concurrent io_ctx_mark_taskrun() */
  570. if (to_free == &o)
  571. synchronize_rcu_expedited();
  572. io_register_free_rings(ctx, to_free);
  573. if (ctx->sq_data)
  574. io_sq_thread_unpark(ctx->sq_data);
  575. return ret;
  576. }
  577. static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
  578. {
  579. struct io_uring_mem_region_reg __user *reg_uptr = uarg;
  580. struct io_uring_mem_region_reg reg;
  581. struct io_uring_region_desc __user *rd_uptr;
  582. struct io_uring_region_desc rd;
  583. struct io_mapped_region region = {};
  584. int ret;
  585. if (io_region_is_set(&ctx->param_region))
  586. return -EBUSY;
  587. if (copy_from_user(&reg, reg_uptr, sizeof(reg)))
  588. return -EFAULT;
  589. rd_uptr = u64_to_user_ptr(reg.region_uptr);
  590. if (copy_from_user(&rd, rd_uptr, sizeof(rd)))
  591. return -EFAULT;
  592. if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
  593. return -EINVAL;
  594. if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG)
  595. return -EINVAL;
  596. /*
  597. * This ensures there are no waiters. Waiters are unlocked and it's
  598. * hard to synchronise with them, especially if we need to initialise
  599. * the region.
  600. */
  601. if ((reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) &&
  602. !(ctx->flags & IORING_SETUP_R_DISABLED))
  603. return -EINVAL;
  604. ret = io_create_region(ctx, &region, &rd, IORING_MAP_OFF_PARAM_REGION);
  605. if (ret)
  606. return ret;
  607. if (copy_to_user(rd_uptr, &rd, sizeof(rd))) {
  608. io_free_region(ctx->user, &region);
  609. return -EFAULT;
  610. }
  611. if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) {
  612. ctx->cq_wait_arg = io_region_get_ptr(&region);
  613. ctx->cq_wait_size = rd.size;
  614. }
  615. io_region_publish(ctx, &region, &ctx->param_region);
  616. return 0;
  617. }
/*
 * Dispatch one io_uring_register() opcode against @ctx.
 *
 * The syscall entry point calls this with ctx->uring_lock held; per the
 * annotations below, handlers may drop and re-take that lock.
 *
 * Returns the handler's result, or:
 *   -ENXIO   if the ctx refs are already dying,
 *   -EEXIST  if a submitter task is set and it is not the caller,
 *   -EACCES  if register restrictions are active and @opcode is not allowed,
 *   -EINVAL  for an unknown opcode or an arg/nr_args combination the opcode
 *            does not accept (-EFAULT for a NULL arg on the legacy
 *            buffer/file registration opcodes).
 */
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	/* register restrictions are only enforced once the ring is enabled */
	if (ctx->reg_restricted && !(ctx->flags & IORING_SETUP_R_DISABLED)) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		/* nr_args carries the personality id here */
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		/* nr_args is the length of the user mask in bytes */
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	case IORING_REGISTER_PBUF_STATUS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_status(ctx, arg);
		break;
	case IORING_REGISTER_NAPI:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_napi(ctx, arg);
		break;
	case IORING_UNREGISTER_NAPI:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_unregister_napi(ctx, arg);
		break;
	case IORING_REGISTER_CLOCK:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_clock(ctx, arg);
		break;
	case IORING_REGISTER_CLONE_BUFFERS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_clone_buffers(ctx, arg);
		break;
	case IORING_REGISTER_ZCRX_IFQ:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_zcrx_ifq(ctx, arg);
		break;
	case IORING_REGISTER_RESIZE_RINGS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_resize_rings(ctx, arg);
		break;
	case IORING_REGISTER_MEM_REGION:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_mem_region(ctx, arg);
		break;
	case IORING_REGISTER_QUERY:
		ret = io_query(arg, nr_args);
		break;
	case IORING_REGISTER_ZCRX_CTRL:
		ret = io_zcrx_ctrl(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_BPF_FILTER:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_register_bpf_filter(&ctx->restrictions, arg);
		/* on success, publish the filter table on the ctx itself */
		if (!ret)
			WRITE_ONCE(ctx->bpf_filters,
				   ctx->restrictions.bpf_filters->filters);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
  841. /*
  842. * Given an 'fd' value, return the ctx associated with if. If 'registered' is
  843. * true, then the registered index is used. Otherwise, the normal fd table.
  844. * Caller must call fput() on the returned file, unless it's an ERR_PTR.
  845. */
  846. struct file *io_uring_register_get_file(unsigned int fd, bool registered)
  847. {
  848. struct file *file;
  849. if (registered) {
  850. /*
  851. * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
  852. * need only dereference our task private array to find it.
  853. */
  854. struct io_uring_task *tctx = current->io_uring;
  855. if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
  856. return ERR_PTR(-EINVAL);
  857. fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
  858. file = tctx->registered_rings[fd];
  859. if (file)
  860. get_file(file);
  861. } else {
  862. file = fget(fd);
  863. }
  864. if (unlikely(!file))
  865. return ERR_PTR(-EBADF);
  866. if (io_is_uring_fops(file))
  867. return file;
  868. fput(file);
  869. return ERR_PTR(-EOPNOTSUPP);
  870. }
  871. static int io_uring_register_send_msg_ring(void __user *arg, unsigned int nr_args)
  872. {
  873. struct io_uring_sqe sqe;
  874. if (!arg || nr_args != 1)
  875. return -EINVAL;
  876. if (copy_from_user(&sqe, arg, sizeof(sqe)))
  877. return -EFAULT;
  878. /* no flags supported */
  879. if (sqe.flags)
  880. return -EINVAL;
  881. if (sqe.opcode != IORING_OP_MSG_RING)
  882. return -EINVAL;
  883. return io_uring_sync_msg_ring(&sqe);
  884. }
  885. /*
  886. * "blind" registration opcodes are ones where there's no ring given, and
  887. * hence the source fd must be -1.
  888. */
  889. static int io_uring_register_blind(unsigned int opcode, void __user *arg,
  890. unsigned int nr_args)
  891. {
  892. switch (opcode) {
  893. case IORING_REGISTER_SEND_MSG_RING:
  894. return io_uring_register_send_msg_ring(arg, nr_args);
  895. case IORING_REGISTER_QUERY:
  896. return io_query(arg, nr_args);
  897. case IORING_REGISTER_RESTRICTIONS:
  898. return io_register_restrictions_task(arg, nr_args);
  899. case IORING_REGISTER_BPF_FILTER:
  900. return io_register_bpf_filter_task(arg, nr_args);
  901. }
  902. return -EINVAL;
  903. }
  904. SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
  905. void __user *, arg, unsigned int, nr_args)
  906. {
  907. struct io_ring_ctx *ctx;
  908. long ret = -EBADF;
  909. struct file *file;
  910. bool use_registered_ring;
  911. use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
  912. opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;
  913. if (opcode >= IORING_REGISTER_LAST)
  914. return -EINVAL;
  915. if (fd == -1)
  916. return io_uring_register_blind(opcode, arg, nr_args);
  917. file = io_uring_register_get_file(fd, use_registered_ring);
  918. if (IS_ERR(file))
  919. return PTR_ERR(file);
  920. ctx = file->private_data;
  921. mutex_lock(&ctx->uring_lock);
  922. ret = __io_uring_register(ctx, opcode, arg, nr_args);
  923. trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr,
  924. ctx->buf_table.nr, ret);
  925. mutex_unlock(&ctx->uring_lock);
  926. fput(file);
  927. return ret;
  928. }