rsrc.c 36 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559
  1. // SPDX-License-Identifier: GPL-2.0
  2. #include <linux/kernel.h>
  3. #include <linux/errno.h>
  4. #include <linux/fs.h>
  5. #include <linux/file.h>
  6. #include <linux/mm.h>
  7. #include <linux/slab.h>
  8. #include <linux/nospec.h>
  9. #include <linux/hugetlb.h>
  10. #include <linux/compat.h>
  11. #include <linux/io_uring.h>
  12. #include <linux/io_uring/cmd.h>
  13. #include <uapi/linux/io_uring.h>
  14. #include "filetable.h"
  15. #include "io_uring.h"
  16. #include "openclose.h"
  17. #include "rsrc.h"
  18. #include "memmap.h"
  19. #include "register.h"
  20. struct io_rsrc_update {
  21. struct file *file;
  22. u64 arg;
  23. u32 nr_args;
  24. u32 offset;
  25. };
  /* Forward declaration; the definition follows the update helpers below. */
  static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
  						   struct iovec *iov,
  						   struct page **last_hpage);
  /* Only upper bounds are defined here. */
  /* largest nr_args accepted by io_sqe_files_register() */
  #define IORING_MAX_FIXED_FILES	(1U << 20)
  /* largest nr_args accepted by io_sqe_buffers_register() */
  #define IORING_MAX_REG_BUFFERS	(1U << 14)
  /* an imu with at most this many bvecs is served from ctx->imu_cache */
  #define IO_CACHED_BVECS_SEGS	32
  32. int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
  33. {
  34. unsigned long page_limit, cur_pages, new_pages;
  35. if (!nr_pages)
  36. return 0;
  37. /* Don't allow more pages than we can safely lock */
  38. page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
  39. cur_pages = atomic_long_read(&user->locked_vm);
  40. do {
  41. new_pages = cur_pages + nr_pages;
  42. if (new_pages > page_limit)
  43. return -ENOMEM;
  44. } while (!atomic_long_try_cmpxchg(&user->locked_vm,
  45. &cur_pages, new_pages));
  46. return 0;
  47. }
  48. void io_unaccount_mem(struct user_struct *user, struct mm_struct *mm_account,
  49. unsigned long nr_pages)
  50. {
  51. if (user)
  52. __io_unaccount_mem(user, nr_pages);
  53. if (mm_account)
  54. atomic64_sub(nr_pages, &mm_account->pinned_vm);
  55. }
  56. int io_account_mem(struct user_struct *user, struct mm_struct *mm_account,
  57. unsigned long nr_pages)
  58. {
  59. int ret;
  60. if (user) {
  61. ret = __io_account_mem(user, nr_pages);
  62. if (ret)
  63. return ret;
  64. }
  65. if (mm_account)
  66. atomic64_add(nr_pages, &mm_account->pinned_vm);
  67. return 0;
  68. }
  69. int io_validate_user_buf_range(u64 uaddr, u64 ulen)
  70. {
  71. unsigned long tmp, base = (unsigned long)uaddr;
  72. unsigned long acct_len = (unsigned long)PAGE_ALIGN(ulen);
  73. /* arbitrary limit, but we need something */
  74. if (ulen > SZ_1G || !ulen)
  75. return -EFAULT;
  76. if (check_add_overflow(base, acct_len, &tmp))
  77. return -EOVERFLOW;
  78. return 0;
  79. }
  80. static void io_release_ubuf(void *priv)
  81. {
  82. struct io_mapped_ubuf *imu = priv;
  83. unsigned int i;
  84. for (i = 0; i < imu->nr_bvecs; i++) {
  85. struct folio *folio = page_folio(imu->bvec[i].bv_page);
  86. unpin_user_folio(folio, 1);
  87. }
  88. }
  89. static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx,
  90. int nr_bvecs)
  91. {
  92. if (nr_bvecs <= IO_CACHED_BVECS_SEGS)
  93. return io_cache_alloc(&ctx->imu_cache, GFP_KERNEL);
  94. return kvmalloc_flex(struct io_mapped_ubuf, bvec, nr_bvecs);
  95. }
  96. static void io_free_imu(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
  97. {
  98. if (imu->nr_bvecs <= IO_CACHED_BVECS_SEGS)
  99. io_cache_free(&ctx->imu_cache, imu);
  100. else
  101. kvfree(imu);
  102. }
  103. static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
  104. {
  105. if (unlikely(refcount_read(&imu->refs) > 1)) {
  106. if (!refcount_dec_and_test(&imu->refs))
  107. return;
  108. }
  109. if (imu->acct_pages)
  110. io_unaccount_mem(ctx->user, ctx->mm_account, imu->acct_pages);
  111. imu->release(imu->priv);
  112. io_free_imu(ctx, imu);
  113. }
  114. struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
  115. {
  116. struct io_rsrc_node *node;
  117. node = io_cache_alloc(&ctx->node_cache, GFP_KERNEL);
  118. if (node) {
  119. node->type = type;
  120. node->refs = 1;
  121. node->tag = 0;
  122. node->file_ptr = 0;
  123. }
  124. return node;
  125. }
  126. bool io_rsrc_cache_init(struct io_ring_ctx *ctx)
  127. {
  128. const int imu_cache_size = struct_size_t(struct io_mapped_ubuf, bvec,
  129. IO_CACHED_BVECS_SEGS);
  130. const int node_size = sizeof(struct io_rsrc_node);
  131. bool ret;
  132. ret = io_alloc_cache_init(&ctx->node_cache, IO_ALLOC_CACHE_MAX,
  133. node_size, 0);
  134. ret |= io_alloc_cache_init(&ctx->imu_cache, IO_ALLOC_CACHE_MAX,
  135. imu_cache_size, 0);
  136. return ret;
  137. }
  138. void io_rsrc_cache_free(struct io_ring_ctx *ctx)
  139. {
  140. io_alloc_cache_free(&ctx->node_cache, kfree);
  141. io_alloc_cache_free(&ctx->imu_cache, kfree);
  142. }
  143. static void io_clear_table_tags(struct io_rsrc_data *data)
  144. {
  145. int i;
  146. for (i = 0; i < data->nr; i++) {
  147. struct io_rsrc_node *node = data->nodes[i];
  148. if (node)
  149. node->tag = 0;
  150. }
  151. }
  152. __cold void io_rsrc_data_free(struct io_ring_ctx *ctx,
  153. struct io_rsrc_data *data)
  154. {
  155. if (!data->nr)
  156. return;
  157. while (data->nr--) {
  158. if (data->nodes[data->nr])
  159. io_put_rsrc_node(ctx, data->nodes[data->nr]);
  160. }
  161. kvfree(data->nodes);
  162. data->nodes = NULL;
  163. data->nr = 0;
  164. }
  165. __cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
  166. {
  167. data->nodes = kvmalloc_objs(struct io_rsrc_node *, nr,
  168. GFP_KERNEL_ACCOUNT | __GFP_ZERO);
  169. if (data->nodes) {
  170. data->nr = nr;
  171. return 0;
  172. }
  173. return -ENOMEM;
  174. }
  175. static int __io_sqe_files_update(struct io_ring_ctx *ctx,
  176. struct io_uring_rsrc_update2 *up,
  177. unsigned nr_args)
  178. {
  179. u64 __user *tags = u64_to_user_ptr(up->tags);
  180. __s32 __user *fds = u64_to_user_ptr(up->data);
  181. int fd, i, err = 0;
  182. unsigned int done;
  183. if (!ctx->file_table.data.nr)
  184. return -ENXIO;
  185. if (up->offset + nr_args > ctx->file_table.data.nr)
  186. return -EINVAL;
  187. for (done = 0; done < nr_args; done++) {
  188. u64 tag = 0;
  189. if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
  190. copy_from_user(&fd, &fds[done], sizeof(fd))) {
  191. err = -EFAULT;
  192. break;
  193. }
  194. if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
  195. err = -EINVAL;
  196. break;
  197. }
  198. if (fd == IORING_REGISTER_FILES_SKIP)
  199. continue;
  200. i = up->offset + done;
  201. if (io_reset_rsrc_node(ctx, &ctx->file_table.data, i))
  202. io_file_bitmap_clear(&ctx->file_table, i);
  203. if (fd != -1) {
  204. struct file *file = fget(fd);
  205. struct io_rsrc_node *node;
  206. if (!file) {
  207. err = -EBADF;
  208. break;
  209. }
  210. /*
  211. * Don't allow io_uring instances to be registered.
  212. */
  213. if (io_is_uring_fops(file)) {
  214. fput(file);
  215. err = -EBADF;
  216. break;
  217. }
  218. node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
  219. if (!node) {
  220. err = -ENOMEM;
  221. fput(file);
  222. break;
  223. }
  224. ctx->file_table.data.nodes[i] = node;
  225. if (tag)
  226. node->tag = tag;
  227. io_fixed_file_set(node, file);
  228. io_file_bitmap_set(&ctx->file_table, i);
  229. }
  230. }
  231. return done ? done : err;
  232. }
  233. static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
  234. struct io_uring_rsrc_update2 *up,
  235. unsigned int nr_args)
  236. {
  237. u64 __user *tags = u64_to_user_ptr(up->tags);
  238. struct iovec fast_iov, *iov;
  239. struct page *last_hpage = NULL;
  240. struct iovec __user *uvec;
  241. u64 user_data = up->data;
  242. __u32 done;
  243. int i, err;
  244. if (!ctx->buf_table.nr)
  245. return -ENXIO;
  246. if (up->offset + nr_args > ctx->buf_table.nr)
  247. return -EINVAL;
  248. for (done = 0; done < nr_args; done++) {
  249. struct io_rsrc_node *node;
  250. u64 tag = 0;
  251. uvec = u64_to_user_ptr(user_data);
  252. iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
  253. if (IS_ERR(iov)) {
  254. err = PTR_ERR(iov);
  255. break;
  256. }
  257. if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
  258. err = -EFAULT;
  259. break;
  260. }
  261. node = io_sqe_buffer_register(ctx, iov, &last_hpage);
  262. if (IS_ERR(node)) {
  263. err = PTR_ERR(node);
  264. break;
  265. }
  266. if (tag) {
  267. if (!node) {
  268. err = -EINVAL;
  269. break;
  270. }
  271. node->tag = tag;
  272. }
  273. i = array_index_nospec(up->offset + done, ctx->buf_table.nr);
  274. io_reset_rsrc_node(ctx, &ctx->buf_table, i);
  275. ctx->buf_table.nodes[i] = node;
  276. if (ctx->compat)
  277. user_data += sizeof(struct compat_iovec);
  278. else
  279. user_data += sizeof(struct iovec);
  280. }
  281. return done ? done : err;
  282. }
  283. static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
  284. struct io_uring_rsrc_update2 *up,
  285. unsigned nr_args)
  286. {
  287. __u32 tmp;
  288. lockdep_assert_held(&ctx->uring_lock);
  289. if (check_add_overflow(up->offset, nr_args, &tmp))
  290. return -EOVERFLOW;
  291. switch (type) {
  292. case IORING_RSRC_FILE:
  293. return __io_sqe_files_update(ctx, up, nr_args);
  294. case IORING_RSRC_BUFFER:
  295. return __io_sqe_buffers_update(ctx, up, nr_args);
  296. }
  297. return -EINVAL;
  298. }
  299. int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
  300. unsigned nr_args)
  301. {
  302. struct io_uring_rsrc_update2 up;
  303. if (!nr_args)
  304. return -EINVAL;
  305. memset(&up, 0, sizeof(up));
  306. if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
  307. return -EFAULT;
  308. if (up.resv || up.resv2)
  309. return -EINVAL;
  310. return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
  311. }
  312. int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
  313. unsigned size, unsigned type)
  314. {
  315. struct io_uring_rsrc_update2 up;
  316. if (size != sizeof(up))
  317. return -EINVAL;
  318. if (copy_from_user(&up, arg, sizeof(up)))
  319. return -EFAULT;
  320. if (!up.nr || up.resv || up.resv2)
  321. return -EINVAL;
  322. return __io_register_rsrc_update(ctx, type, &up, up.nr);
  323. }
  324. __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
  325. unsigned int size, unsigned int type)
  326. {
  327. struct io_uring_rsrc_register rr;
  328. /* keep it extendible */
  329. if (size != sizeof(rr))
  330. return -EINVAL;
  331. memset(&rr, 0, sizeof(rr));
  332. if (copy_from_user(&rr, arg, size))
  333. return -EFAULT;
  334. if (!rr.nr || rr.resv2)
  335. return -EINVAL;
  336. if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
  337. return -EINVAL;
  338. switch (type) {
  339. case IORING_RSRC_FILE:
  340. if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
  341. break;
  342. return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
  343. rr.nr, u64_to_user_ptr(rr.tags));
  344. case IORING_RSRC_BUFFER:
  345. if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
  346. break;
  347. return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
  348. rr.nr, u64_to_user_ptr(rr.tags));
  349. }
  350. return -EINVAL;
  351. }
  352. int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
  353. {
  354. struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
  355. if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
  356. return -EINVAL;
  357. if (sqe->rw_flags || sqe->splice_fd_in)
  358. return -EINVAL;
  359. up->offset = READ_ONCE(sqe->off);
  360. up->nr_args = READ_ONCE(sqe->len);
  361. if (!up->nr_args)
  362. return -EINVAL;
  363. up->arg = READ_ONCE(sqe->addr);
  364. return 0;
  365. }
  366. static int io_files_update_with_index_alloc(struct io_kiocb *req,
  367. unsigned int issue_flags)
  368. {
  369. struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
  370. __s32 __user *fds = u64_to_user_ptr(up->arg);
  371. unsigned int done;
  372. struct file *file;
  373. int ret, fd;
  374. if (!req->ctx->file_table.data.nr)
  375. return -ENXIO;
  376. for (done = 0; done < up->nr_args; done++) {
  377. if (get_user(fd, &fds[done])) {
  378. ret = -EFAULT;
  379. break;
  380. }
  381. file = fget(fd);
  382. if (!file) {
  383. ret = -EBADF;
  384. break;
  385. }
  386. ret = io_fixed_fd_install(req, issue_flags, file,
  387. IORING_FILE_INDEX_ALLOC);
  388. if (ret < 0)
  389. break;
  390. if (put_user(ret, &fds[done])) {
  391. __io_close_fixed(req->ctx, issue_flags, ret);
  392. ret = -EFAULT;
  393. break;
  394. }
  395. }
  396. if (done)
  397. return done;
  398. return ret;
  399. }
  400. int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
  401. {
  402. struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
  403. struct io_ring_ctx *ctx = req->ctx;
  404. struct io_uring_rsrc_update2 up2;
  405. int ret;
  406. up2.offset = up->offset;
  407. up2.data = up->arg;
  408. up2.nr = 0;
  409. up2.tags = 0;
  410. up2.resv = 0;
  411. up2.resv2 = 0;
  412. if (up->offset == IORING_FILE_INDEX_ALLOC) {
  413. ret = io_files_update_with_index_alloc(req, issue_flags);
  414. } else {
  415. io_ring_submit_lock(ctx, issue_flags);
  416. ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
  417. &up2, up->nr_args);
  418. io_ring_submit_unlock(ctx, issue_flags);
  419. }
  420. if (ret < 0)
  421. req_set_fail(req);
  422. io_req_set_res(req, ret, 0);
  423. return IOU_COMPLETE;
  424. }
  425. void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
  426. {
  427. if (node->tag)
  428. io_post_aux_cqe(ctx, node->tag, 0, 0);
  429. switch (node->type) {
  430. case IORING_RSRC_FILE:
  431. fput(io_slot_file(node));
  432. break;
  433. case IORING_RSRC_BUFFER:
  434. io_buffer_unmap(ctx, node->buf);
  435. break;
  436. default:
  437. WARN_ON_ONCE(1);
  438. break;
  439. }
  440. io_cache_free(&ctx->node_cache, node);
  441. }
  442. int io_sqe_files_unregister(struct io_ring_ctx *ctx)
  443. {
  444. if (!ctx->file_table.data.nr)
  445. return -ENXIO;
  446. io_free_file_tables(ctx, &ctx->file_table);
  447. io_file_table_set_alloc_range(ctx, 0, 0);
  448. return 0;
  449. }
  450. int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
  451. unsigned nr_args, u64 __user *tags)
  452. {
  453. __s32 __user *fds = (__s32 __user *) arg;
  454. struct file *file;
  455. int fd, ret;
  456. unsigned i;
  457. if (ctx->file_table.data.nr)
  458. return -EBUSY;
  459. if (!nr_args)
  460. return -EINVAL;
  461. if (nr_args > IORING_MAX_FIXED_FILES)
  462. return -EMFILE;
  463. if (nr_args > rlimit(RLIMIT_NOFILE))
  464. return -EMFILE;
  465. if (!io_alloc_file_tables(ctx, &ctx->file_table, nr_args))
  466. return -ENOMEM;
  467. for (i = 0; i < nr_args; i++) {
  468. struct io_rsrc_node *node;
  469. u64 tag = 0;
  470. ret = -EFAULT;
  471. if (tags && copy_from_user(&tag, &tags[i], sizeof(tag)))
  472. goto fail;
  473. if (fds && copy_from_user(&fd, &fds[i], sizeof(fd)))
  474. goto fail;
  475. /* allow sparse sets */
  476. if (!fds || fd == -1) {
  477. ret = -EINVAL;
  478. if (tag)
  479. goto fail;
  480. continue;
  481. }
  482. file = fget(fd);
  483. ret = -EBADF;
  484. if (unlikely(!file))
  485. goto fail;
  486. /*
  487. * Don't allow io_uring instances to be registered.
  488. */
  489. if (io_is_uring_fops(file)) {
  490. fput(file);
  491. goto fail;
  492. }
  493. ret = -ENOMEM;
  494. node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
  495. if (!node) {
  496. fput(file);
  497. goto fail;
  498. }
  499. if (tag)
  500. node->tag = tag;
  501. ctx->file_table.data.nodes[i] = node;
  502. io_fixed_file_set(node, file);
  503. io_file_bitmap_set(&ctx->file_table, i);
  504. }
  505. /* default it to the whole table */
  506. io_file_table_set_alloc_range(ctx, 0, ctx->file_table.data.nr);
  507. return 0;
  508. fail:
  509. io_clear_table_tags(&ctx->file_table.data);
  510. io_sqe_files_unregister(ctx);
  511. return ret;
  512. }
  513. int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
  514. {
  515. if (!ctx->buf_table.nr)
  516. return -ENXIO;
  517. io_rsrc_data_free(ctx, &ctx->buf_table);
  518. return 0;
  519. }
  520. /*
  521. * Not super efficient, but this is just a registration time. And we do cache
  522. * the last compound head, so generally we'll only do a full search if we don't
  523. * match that one.
  524. *
  525. * We check if the given compound head page has already been accounted, to
  526. * avoid double accounting it. This allows us to account the full size of the
  527. * page, not just the constituent pages of a huge page.
  528. */
  529. static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
  530. int nr_pages, struct page *hpage)
  531. {
  532. int i, j;
  533. /* check current page array */
  534. for (i = 0; i < nr_pages; i++) {
  535. if (!PageCompound(pages[i]))
  536. continue;
  537. if (compound_head(pages[i]) == hpage)
  538. return true;
  539. }
  540. /* check previously registered pages */
  541. for (i = 0; i < ctx->buf_table.nr; i++) {
  542. struct io_rsrc_node *node = ctx->buf_table.nodes[i];
  543. struct io_mapped_ubuf *imu;
  544. if (!node)
  545. continue;
  546. imu = node->buf;
  547. for (j = 0; j < imu->nr_bvecs; j++) {
  548. if (!PageCompound(imu->bvec[j].bv_page))
  549. continue;
  550. if (compound_head(imu->bvec[j].bv_page) == hpage)
  551. return true;
  552. }
  553. }
  554. return false;
  555. }
  556. static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
  557. int nr_pages, struct io_mapped_ubuf *imu,
  558. struct page **last_hpage)
  559. {
  560. int i, ret;
  561. imu->acct_pages = 0;
  562. for (i = 0; i < nr_pages; i++) {
  563. if (!PageCompound(pages[i])) {
  564. imu->acct_pages++;
  565. } else {
  566. struct page *hpage;
  567. hpage = compound_head(pages[i]);
  568. if (hpage == *last_hpage)
  569. continue;
  570. *last_hpage = hpage;
  571. if (headpage_already_acct(ctx, pages, i, hpage))
  572. continue;
  573. imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
  574. }
  575. }
  576. if (!imu->acct_pages)
  577. return 0;
  578. ret = io_account_mem(ctx->user, ctx->mm_account, imu->acct_pages);
  579. if (ret)
  580. imu->acct_pages = 0;
  581. return ret;
  582. }
  583. static bool io_coalesce_buffer(struct page ***pages, int *nr_pages,
  584. struct io_imu_folio_data *data)
  585. {
  586. struct page **page_array = *pages, **new_array = NULL;
  587. unsigned nr_pages_left = *nr_pages;
  588. unsigned nr_folios = data->nr_folios;
  589. unsigned i, j;
  590. /* Store head pages only*/
  591. new_array = kvmalloc_objs(struct page *, nr_folios);
  592. if (!new_array)
  593. return false;
  594. for (i = 0, j = 0; i < nr_folios; i++) {
  595. struct page *p = compound_head(page_array[j]);
  596. struct folio *folio = page_folio(p);
  597. unsigned int nr;
  598. WARN_ON_ONCE(i > 0 && p != page_array[j]);
  599. nr = i ? data->nr_pages_mid : data->nr_pages_head;
  600. nr = min(nr, nr_pages_left);
  601. /* Drop all but one ref, the entire folio will remain pinned. */
  602. if (nr > 1)
  603. unpin_user_folio(folio, nr - 1);
  604. j += nr;
  605. nr_pages_left -= nr;
  606. new_array[i] = p;
  607. }
  608. WARN_ON_ONCE(j != *nr_pages);
  609. kvfree(page_array);
  610. *pages = new_array;
  611. *nr_pages = nr_folios;
  612. return true;
  613. }
  614. bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
  615. struct io_imu_folio_data *data)
  616. {
  617. struct folio *folio = page_folio(page_array[0]);
  618. unsigned int count = 1, nr_folios = 1;
  619. int i;
  620. data->nr_pages_mid = folio_nr_pages(folio);
  621. data->folio_shift = folio_shift(folio);
  622. data->first_folio_page_idx = folio_page_idx(folio, page_array[0]);
  623. /*
  624. * Check if pages are contiguous inside a folio, and all folios have
  625. * the same page count except for the head and tail.
  626. */
  627. for (i = 1; i < nr_pages; i++) {
  628. if (page_folio(page_array[i]) == folio &&
  629. page_array[i] == page_array[i-1] + 1) {
  630. count++;
  631. continue;
  632. }
  633. if (nr_folios == 1) {
  634. if (folio_page_idx(folio, page_array[i-1]) !=
  635. data->nr_pages_mid - 1)
  636. return false;
  637. data->nr_pages_head = count;
  638. } else if (count != data->nr_pages_mid) {
  639. return false;
  640. }
  641. folio = page_folio(page_array[i]);
  642. if (folio_size(folio) != (1UL << data->folio_shift) ||
  643. folio_page_idx(folio, page_array[i]) != 0)
  644. return false;
  645. count = 1;
  646. nr_folios++;
  647. }
  648. if (nr_folios == 1)
  649. data->nr_pages_head = count;
  650. data->nr_folios = nr_folios;
  651. return true;
  652. }
  653. static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
  654. struct iovec *iov,
  655. struct page **last_hpage)
  656. {
  657. struct io_mapped_ubuf *imu = NULL;
  658. struct page **pages = NULL;
  659. struct io_rsrc_node *node;
  660. unsigned long off;
  661. size_t size;
  662. int ret, nr_pages, i;
  663. struct io_imu_folio_data data;
  664. bool coalesced = false;
  665. if (!iov->iov_base) {
  666. if (iov->iov_len)
  667. return ERR_PTR(-EFAULT);
  668. /* remove the buffer without installing a new one */
  669. return NULL;
  670. }
  671. ret = io_validate_user_buf_range((unsigned long)iov->iov_base,
  672. iov->iov_len);
  673. if (ret)
  674. return ERR_PTR(ret);
  675. node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
  676. if (!node)
  677. return ERR_PTR(-ENOMEM);
  678. ret = -ENOMEM;
  679. pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
  680. &nr_pages);
  681. if (IS_ERR(pages)) {
  682. ret = PTR_ERR(pages);
  683. pages = NULL;
  684. goto done;
  685. }
  686. /* If it's huge page(s), try to coalesce them into fewer bvec entries */
  687. if (nr_pages > 1 && io_check_coalesce_buffer(pages, nr_pages, &data)) {
  688. if (data.nr_pages_mid != 1)
  689. coalesced = io_coalesce_buffer(&pages, &nr_pages, &data);
  690. }
  691. imu = io_alloc_imu(ctx, nr_pages);
  692. if (!imu)
  693. goto done;
  694. imu->nr_bvecs = nr_pages;
  695. ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
  696. if (ret)
  697. goto done;
  698. size = iov->iov_len;
  699. /* store original address for later verification */
  700. imu->ubuf = (unsigned long) iov->iov_base;
  701. imu->len = iov->iov_len;
  702. imu->folio_shift = PAGE_SHIFT;
  703. imu->release = io_release_ubuf;
  704. imu->priv = imu;
  705. imu->flags = 0;
  706. imu->dir = IO_IMU_DEST | IO_IMU_SOURCE;
  707. if (coalesced)
  708. imu->folio_shift = data.folio_shift;
  709. refcount_set(&imu->refs, 1);
  710. off = (unsigned long)iov->iov_base & ~PAGE_MASK;
  711. if (coalesced)
  712. off += data.first_folio_page_idx << PAGE_SHIFT;
  713. node->buf = imu;
  714. ret = 0;
  715. for (i = 0; i < nr_pages; i++) {
  716. size_t vec_len;
  717. vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off);
  718. bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
  719. off = 0;
  720. size -= vec_len;
  721. }
  722. done:
  723. if (ret) {
  724. if (imu)
  725. io_free_imu(ctx, imu);
  726. if (pages) {
  727. for (i = 0; i < nr_pages; i++)
  728. unpin_user_folio(page_folio(pages[i]), 1);
  729. }
  730. io_cache_free(&ctx->node_cache, node);
  731. node = ERR_PTR(ret);
  732. }
  733. kvfree(pages);
  734. return node;
  735. }
  736. int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
  737. unsigned int nr_args, u64 __user *tags)
  738. {
  739. struct page *last_hpage = NULL;
  740. struct io_rsrc_data data;
  741. struct iovec fast_iov, *iov = &fast_iov;
  742. const struct iovec __user *uvec;
  743. int i, ret;
  744. BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
  745. if (ctx->buf_table.nr)
  746. return -EBUSY;
  747. if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
  748. return -EINVAL;
  749. ret = io_rsrc_data_alloc(&data, nr_args);
  750. if (ret)
  751. return ret;
  752. if (!arg)
  753. memset(iov, 0, sizeof(*iov));
  754. for (i = 0; i < nr_args; i++) {
  755. struct io_rsrc_node *node;
  756. u64 tag = 0;
  757. if (arg) {
  758. uvec = (struct iovec __user *) arg;
  759. iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
  760. if (IS_ERR(iov)) {
  761. ret = PTR_ERR(iov);
  762. break;
  763. }
  764. if (ctx->compat)
  765. arg += sizeof(struct compat_iovec);
  766. else
  767. arg += sizeof(struct iovec);
  768. }
  769. if (tags) {
  770. if (copy_from_user(&tag, &tags[i], sizeof(tag))) {
  771. ret = -EFAULT;
  772. break;
  773. }
  774. }
  775. node = io_sqe_buffer_register(ctx, iov, &last_hpage);
  776. if (IS_ERR(node)) {
  777. ret = PTR_ERR(node);
  778. break;
  779. }
  780. if (tag) {
  781. if (!node) {
  782. ret = -EINVAL;
  783. break;
  784. }
  785. node->tag = tag;
  786. }
  787. data.nodes[i] = node;
  788. }
  789. ctx->buf_table = data;
  790. if (ret) {
  791. io_clear_table_tags(&ctx->buf_table);
  792. io_sqe_buffers_unregister(ctx);
  793. }
  794. return ret;
  795. }
  796. int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
  797. void (*release)(void *), unsigned int index,
  798. unsigned int issue_flags)
  799. {
  800. struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
  801. struct io_rsrc_data *data = &ctx->buf_table;
  802. struct req_iterator rq_iter;
  803. struct io_mapped_ubuf *imu;
  804. struct io_rsrc_node *node;
  805. struct bio_vec bv;
  806. unsigned int nr_bvecs = 0;
  807. int ret = 0;
  808. io_ring_submit_lock(ctx, issue_flags);
  809. if (index >= data->nr) {
  810. ret = -EINVAL;
  811. goto unlock;
  812. }
  813. index = array_index_nospec(index, data->nr);
  814. if (data->nodes[index]) {
  815. ret = -EBUSY;
  816. goto unlock;
  817. }
  818. node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
  819. if (!node) {
  820. ret = -ENOMEM;
  821. goto unlock;
  822. }
  823. /*
  824. * blk_rq_nr_phys_segments() may overestimate the number of bvecs
  825. * but avoids needing to iterate over the bvecs
  826. */
  827. imu = io_alloc_imu(ctx, blk_rq_nr_phys_segments(rq));
  828. if (!imu) {
  829. kfree(node);
  830. ret = -ENOMEM;
  831. goto unlock;
  832. }
  833. imu->ubuf = 0;
  834. imu->len = blk_rq_bytes(rq);
  835. imu->acct_pages = 0;
  836. imu->folio_shift = PAGE_SHIFT;
  837. refcount_set(&imu->refs, 1);
  838. imu->release = release;
  839. imu->priv = rq;
  840. imu->flags = IO_REGBUF_F_KBUF;
  841. imu->dir = 1 << rq_data_dir(rq);
  842. rq_for_each_bvec(bv, rq, rq_iter)
  843. imu->bvec[nr_bvecs++] = bv;
  844. imu->nr_bvecs = nr_bvecs;
  845. node->buf = imu;
  846. data->nodes[index] = node;
  847. unlock:
  848. io_ring_submit_unlock(ctx, issue_flags);
  849. return ret;
  850. }
  851. EXPORT_SYMBOL_GPL(io_buffer_register_bvec);
  852. int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index,
  853. unsigned int issue_flags)
  854. {
  855. struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
  856. struct io_rsrc_data *data = &ctx->buf_table;
  857. struct io_rsrc_node *node;
  858. int ret = 0;
  859. io_ring_submit_lock(ctx, issue_flags);
  860. if (index >= data->nr) {
  861. ret = -EINVAL;
  862. goto unlock;
  863. }
  864. index = array_index_nospec(index, data->nr);
  865. node = data->nodes[index];
  866. if (!node) {
  867. ret = -EINVAL;
  868. goto unlock;
  869. }
  870. if (!(node->buf->flags & IO_REGBUF_F_KBUF)) {
  871. ret = -EBUSY;
  872. goto unlock;
  873. }
  874. io_put_rsrc_node(ctx, node);
  875. data->nodes[index] = NULL;
  876. unlock:
  877. io_ring_submit_unlock(ctx, issue_flags);
  878. return ret;
  879. }
  880. EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec);
  881. static int validate_fixed_range(u64 buf_addr, size_t len,
  882. const struct io_mapped_ubuf *imu)
  883. {
  884. u64 buf_end;
  885. if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
  886. return -EFAULT;
  887. /* not inside the mapped region */
  888. if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len)))
  889. return -EFAULT;
  890. if (unlikely(len > MAX_RW_COUNT))
  891. return -EFAULT;
  892. return 0;
  893. }
  894. static int io_import_kbuf(int ddir, struct iov_iter *iter,
  895. struct io_mapped_ubuf *imu, size_t len, size_t offset)
  896. {
  897. size_t count = len + offset;
  898. iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, count);
  899. iov_iter_advance(iter, offset);
  900. return 0;
  901. }
  902. static int io_import_fixed(int ddir, struct iov_iter *iter,
  903. struct io_mapped_ubuf *imu,
  904. u64 buf_addr, size_t len)
  905. {
  906. const struct bio_vec *bvec;
  907. size_t folio_mask;
  908. unsigned nr_segs;
  909. size_t offset;
  910. int ret;
  911. ret = validate_fixed_range(buf_addr, len, imu);
  912. if (unlikely(ret))
  913. return ret;
  914. if (!(imu->dir & (1 << ddir)))
  915. return -EFAULT;
  916. if (unlikely(!len)) {
  917. iov_iter_bvec(iter, ddir, NULL, 0, 0);
  918. return 0;
  919. }
  920. offset = buf_addr - imu->ubuf;
  921. if (imu->flags & IO_REGBUF_F_KBUF)
  922. return io_import_kbuf(ddir, iter, imu, len, offset);
  923. /*
  924. * Don't use iov_iter_advance() here, as it's really slow for
  925. * using the latter parts of a big fixed buffer - it iterates
  926. * over each segment manually. We can cheat a bit here for user
  927. * registered nodes, because we know that:
  928. *
  929. * 1) it's a BVEC iter, we set it up
  930. * 2) all bvecs are the same in size, except potentially the
  931. * first and last bvec
  932. */
  933. folio_mask = (1UL << imu->folio_shift) - 1;
  934. bvec = imu->bvec;
  935. if (offset >= bvec->bv_len) {
  936. unsigned long seg_skip;
  937. /* skip first vec */
  938. offset -= bvec->bv_len;
  939. seg_skip = 1 + (offset >> imu->folio_shift);
  940. bvec += seg_skip;
  941. offset &= folio_mask;
  942. }
  943. nr_segs = (offset + len + bvec->bv_offset + folio_mask) >> imu->folio_shift;
  944. iov_iter_bvec(iter, ddir, bvec, nr_segs, len);
  945. iter->iov_offset = offset;
  946. return 0;
  947. }
  948. inline struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req,
  949. unsigned issue_flags)
  950. {
  951. struct io_ring_ctx *ctx = req->ctx;
  952. struct io_rsrc_node *node;
  953. if (req->flags & REQ_F_BUF_NODE)
  954. return req->buf_node;
  955. req->flags |= REQ_F_BUF_NODE;
  956. io_ring_submit_lock(ctx, issue_flags);
  957. node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index);
  958. if (node) {
  959. node->refs++;
  960. req->buf_node = node;
  961. io_ring_submit_unlock(ctx, issue_flags);
  962. return node;
  963. }
  964. req->flags &= ~REQ_F_BUF_NODE;
  965. io_ring_submit_unlock(ctx, issue_flags);
  966. return NULL;
  967. }
  968. int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter,
  969. u64 buf_addr, size_t len, int ddir,
  970. unsigned issue_flags)
  971. {
  972. struct io_rsrc_node *node;
  973. node = io_find_buf_node(req, issue_flags);
  974. if (!node)
  975. return -EFAULT;
  976. return io_import_fixed(ddir, iter, node->buf, buf_addr, len);
  977. }
  978. /* Lock two rings at once. The rings must be different! */
  979. static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2)
  980. {
  981. if (ctx1 > ctx2)
  982. swap(ctx1, ctx2);
  983. mutex_lock(&ctx1->uring_lock);
  984. mutex_lock_nested(&ctx2->uring_lock, SINGLE_DEPTH_NESTING);
  985. }
  986. /* Both rings are locked by the caller. */
  987. static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx,
  988. struct io_uring_clone_buffers *arg)
  989. {
  990. struct io_rsrc_data data;
  991. int i, ret, off, nr;
  992. unsigned int nbufs;
  993. lockdep_assert_held(&ctx->uring_lock);
  994. lockdep_assert_held(&src_ctx->uring_lock);
  995. /*
  996. * Accounting state is shared between the two rings; that only works if
  997. * both rings are accounted towards the same counters.
  998. */
  999. if (ctx->user != src_ctx->user || ctx->mm_account != src_ctx->mm_account)
  1000. return -EINVAL;
  1001. /* if offsets are given, must have nr specified too */
  1002. if (!arg->nr && (arg->dst_off || arg->src_off))
  1003. return -EINVAL;
  1004. /* not allowed unless REPLACE is set */
  1005. if (ctx->buf_table.nr && !(arg->flags & IORING_REGISTER_DST_REPLACE))
  1006. return -EBUSY;
  1007. nbufs = src_ctx->buf_table.nr;
  1008. if (!nbufs)
  1009. return -ENXIO;
  1010. if (!arg->nr)
  1011. arg->nr = nbufs;
  1012. else if (arg->nr > nbufs)
  1013. return -EINVAL;
  1014. else if (arg->nr > IORING_MAX_REG_BUFFERS)
  1015. return -EINVAL;
  1016. if (check_add_overflow(arg->nr, arg->src_off, &off) || off > nbufs)
  1017. return -EOVERFLOW;
  1018. if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
  1019. return -EOVERFLOW;
  1020. if (nbufs > IORING_MAX_REG_BUFFERS)
  1021. return -EINVAL;
  1022. ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.nr));
  1023. if (ret)
  1024. return ret;
  1025. /* Copy original dst nodes from before the cloned range */
  1026. for (i = 0; i < min(arg->dst_off, ctx->buf_table.nr); i++) {
  1027. struct io_rsrc_node *node = ctx->buf_table.nodes[i];
  1028. if (node) {
  1029. data.nodes[i] = node;
  1030. node->refs++;
  1031. }
  1032. }
  1033. off = arg->dst_off;
  1034. i = arg->src_off;
  1035. nr = arg->nr;
  1036. while (nr--) {
  1037. struct io_rsrc_node *dst_node, *src_node;
  1038. src_node = io_rsrc_node_lookup(&src_ctx->buf_table, i);
  1039. if (!src_node) {
  1040. dst_node = NULL;
  1041. } else {
  1042. dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
  1043. if (!dst_node) {
  1044. io_rsrc_data_free(ctx, &data);
  1045. return -ENOMEM;
  1046. }
  1047. refcount_inc(&src_node->buf->refs);
  1048. dst_node->buf = src_node->buf;
  1049. }
  1050. data.nodes[off++] = dst_node;
  1051. i++;
  1052. }
  1053. /* Copy original dst nodes from after the cloned range */
  1054. for (i = nbufs; i < ctx->buf_table.nr; i++) {
  1055. struct io_rsrc_node *node = ctx->buf_table.nodes[i];
  1056. if (node) {
  1057. data.nodes[i] = node;
  1058. node->refs++;
  1059. }
  1060. }
  1061. /*
  1062. * If asked for replace, put the old table. data->nodes[] holds both
  1063. * old and new nodes at this point.
  1064. */
  1065. if (arg->flags & IORING_REGISTER_DST_REPLACE)
  1066. io_rsrc_data_free(ctx, &ctx->buf_table);
  1067. /*
  1068. * ctx->buf_table must be empty now - either the contents are being
  1069. * replaced and we just freed the table, or the contents are being
  1070. * copied to a ring that does not have buffers yet (checked at function
  1071. * entry).
  1072. */
  1073. WARN_ON_ONCE(ctx->buf_table.nr);
  1074. ctx->buf_table = data;
  1075. return 0;
  1076. }
  1077. /*
  1078. * Copy the registered buffers from the source ring whose file descriptor
  1079. * is given in the src_fd to the current ring. This is identical to registering
  1080. * the buffers with ctx, except faster as mappings already exist.
  1081. *
  1082. * Since the memory is already accounted once, don't account it again.
  1083. */
  1084. int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
  1085. {
  1086. struct io_uring_clone_buffers buf;
  1087. struct io_ring_ctx *src_ctx;
  1088. bool registered_src;
  1089. struct file *file;
  1090. int ret;
  1091. if (copy_from_user(&buf, arg, sizeof(buf)))
  1092. return -EFAULT;
  1093. if (buf.flags & ~(IORING_REGISTER_SRC_REGISTERED|IORING_REGISTER_DST_REPLACE))
  1094. return -EINVAL;
  1095. if (!(buf.flags & IORING_REGISTER_DST_REPLACE) && ctx->buf_table.nr)
  1096. return -EBUSY;
  1097. if (memchr_inv(buf.pad, 0, sizeof(buf.pad)))
  1098. return -EINVAL;
  1099. registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0;
  1100. file = io_uring_register_get_file(buf.src_fd, registered_src);
  1101. if (IS_ERR(file))
  1102. return PTR_ERR(file);
  1103. src_ctx = file->private_data;
  1104. if (src_ctx != ctx) {
  1105. mutex_unlock(&ctx->uring_lock);
  1106. lock_two_rings(ctx, src_ctx);
  1107. if (src_ctx->submitter_task &&
  1108. src_ctx->submitter_task != current) {
  1109. ret = -EEXIST;
  1110. goto out;
  1111. }
  1112. }
  1113. ret = io_clone_buffers(ctx, src_ctx, &buf);
  1114. out:
  1115. if (src_ctx != ctx)
  1116. mutex_unlock(&src_ctx->uring_lock);
  1117. fput(file);
  1118. return ret;
  1119. }
  1120. void io_vec_free(struct iou_vec *iv)
  1121. {
  1122. if (!iv->iovec)
  1123. return;
  1124. kfree(iv->iovec);
  1125. iv->iovec = NULL;
  1126. iv->nr = 0;
  1127. }
  1128. int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries)
  1129. {
  1130. gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_NOWARN;
  1131. struct iovec *iov;
  1132. iov = kmalloc_objs(iov[0], nr_entries, gfp);
  1133. if (!iov)
  1134. return -ENOMEM;
  1135. io_vec_free(iv);
  1136. iv->iovec = iov;
  1137. iv->nr = nr_entries;
  1138. return 0;
  1139. }
  1140. static int io_vec_fill_bvec(int ddir, struct iov_iter *iter,
  1141. struct io_mapped_ubuf *imu,
  1142. struct iovec *iovec, unsigned nr_iovs,
  1143. struct iou_vec *vec)
  1144. {
  1145. unsigned long folio_size = 1 << imu->folio_shift;
  1146. unsigned long folio_mask = folio_size - 1;
  1147. struct bio_vec *res_bvec = vec->bvec;
  1148. size_t total_len = 0;
  1149. unsigned bvec_idx = 0;
  1150. unsigned iov_idx;
  1151. for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) {
  1152. size_t iov_len = iovec[iov_idx].iov_len;
  1153. u64 buf_addr = (u64)(uintptr_t)iovec[iov_idx].iov_base;
  1154. struct bio_vec *src_bvec;
  1155. size_t offset;
  1156. int ret;
  1157. ret = validate_fixed_range(buf_addr, iov_len, imu);
  1158. if (unlikely(ret))
  1159. return ret;
  1160. if (unlikely(!iov_len))
  1161. return -EFAULT;
  1162. if (unlikely(check_add_overflow(total_len, iov_len, &total_len)))
  1163. return -EOVERFLOW;
  1164. offset = buf_addr - imu->ubuf;
  1165. /*
  1166. * Only the first bvec can have non zero bv_offset, account it
  1167. * here and work with full folios below.
  1168. */
  1169. offset += imu->bvec[0].bv_offset;
  1170. src_bvec = imu->bvec + (offset >> imu->folio_shift);
  1171. offset &= folio_mask;
  1172. for (; iov_len; offset = 0, bvec_idx++, src_bvec++) {
  1173. size_t seg_size = min_t(size_t, iov_len,
  1174. folio_size - offset);
  1175. bvec_set_page(&res_bvec[bvec_idx],
  1176. src_bvec->bv_page, seg_size, offset);
  1177. iov_len -= seg_size;
  1178. }
  1179. }
  1180. if (total_len > MAX_RW_COUNT)
  1181. return -EINVAL;
  1182. iov_iter_bvec(iter, ddir, res_bvec, bvec_idx, total_len);
  1183. return 0;
  1184. }
  1185. static int io_estimate_bvec_size(struct iovec *iov, unsigned nr_iovs,
  1186. struct io_mapped_ubuf *imu)
  1187. {
  1188. unsigned shift = imu->folio_shift;
  1189. size_t max_segs = 0;
  1190. unsigned i;
  1191. for (i = 0; i < nr_iovs; i++) {
  1192. max_segs += (iov[i].iov_len >> shift) + 2;
  1193. if (max_segs > INT_MAX)
  1194. return -EOVERFLOW;
  1195. }
  1196. return max_segs;
  1197. }
  1198. static int io_vec_fill_kern_bvec(int ddir, struct iov_iter *iter,
  1199. struct io_mapped_ubuf *imu,
  1200. struct iovec *iovec, unsigned nr_iovs,
  1201. struct iou_vec *vec)
  1202. {
  1203. const struct bio_vec *src_bvec = imu->bvec;
  1204. struct bio_vec *res_bvec = vec->bvec;
  1205. unsigned res_idx = 0;
  1206. size_t total_len = 0;
  1207. unsigned iov_idx;
  1208. for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) {
  1209. size_t offset = (size_t)(uintptr_t)iovec[iov_idx].iov_base;
  1210. size_t iov_len = iovec[iov_idx].iov_len;
  1211. struct bvec_iter bi = {
  1212. .bi_size = offset + iov_len,
  1213. };
  1214. struct bio_vec bv;
  1215. bvec_iter_advance(src_bvec, &bi, offset);
  1216. for_each_mp_bvec(bv, src_bvec, bi, bi)
  1217. res_bvec[res_idx++] = bv;
  1218. total_len += iov_len;
  1219. }
  1220. iov_iter_bvec(iter, ddir, res_bvec, res_idx, total_len);
  1221. return 0;
  1222. }
  1223. static int iov_kern_bvec_size(const struct iovec *iov,
  1224. const struct io_mapped_ubuf *imu,
  1225. unsigned int *nr_seg)
  1226. {
  1227. size_t offset = (size_t)(uintptr_t)iov->iov_base;
  1228. const struct bio_vec *bvec = imu->bvec;
  1229. int start = 0, i = 0;
  1230. size_t off = 0;
  1231. int ret;
  1232. ret = validate_fixed_range(offset, iov->iov_len, imu);
  1233. if (unlikely(ret))
  1234. return ret;
  1235. for (i = 0; off < offset + iov->iov_len && i < imu->nr_bvecs;
  1236. off += bvec[i].bv_len, i++) {
  1237. if (offset >= off && offset < off + bvec[i].bv_len)
  1238. start = i;
  1239. }
  1240. *nr_seg = i - start;
  1241. return 0;
  1242. }
  1243. static int io_kern_bvec_size(struct iovec *iov, unsigned nr_iovs,
  1244. struct io_mapped_ubuf *imu, unsigned *nr_segs)
  1245. {
  1246. unsigned max_segs = 0;
  1247. size_t total_len = 0;
  1248. unsigned i;
  1249. int ret;
  1250. *nr_segs = 0;
  1251. for (i = 0; i < nr_iovs; i++) {
  1252. if (unlikely(!iov[i].iov_len))
  1253. return -EFAULT;
  1254. if (unlikely(check_add_overflow(total_len, iov[i].iov_len,
  1255. &total_len)))
  1256. return -EOVERFLOW;
  1257. ret = iov_kern_bvec_size(&iov[i], imu, &max_segs);
  1258. if (unlikely(ret))
  1259. return ret;
  1260. *nr_segs += max_segs;
  1261. }
  1262. if (total_len > MAX_RW_COUNT)
  1263. return -EINVAL;
  1264. return 0;
  1265. }
  1266. int io_import_reg_vec(int ddir, struct iov_iter *iter,
  1267. struct io_kiocb *req, struct iou_vec *vec,
  1268. unsigned nr_iovs, unsigned issue_flags)
  1269. {
  1270. struct io_rsrc_node *node;
  1271. struct io_mapped_ubuf *imu;
  1272. unsigned iovec_off;
  1273. struct iovec *iov;
  1274. unsigned nr_segs;
  1275. node = io_find_buf_node(req, issue_flags);
  1276. if (!node)
  1277. return -EFAULT;
  1278. imu = node->buf;
  1279. if (!(imu->dir & (1 << ddir)))
  1280. return -EFAULT;
  1281. iovec_off = vec->nr - nr_iovs;
  1282. iov = vec->iovec + iovec_off;
  1283. if (imu->flags & IO_REGBUF_F_KBUF) {
  1284. int ret = io_kern_bvec_size(iov, nr_iovs, imu, &nr_segs);
  1285. if (unlikely(ret))
  1286. return ret;
  1287. } else {
  1288. int ret = io_estimate_bvec_size(iov, nr_iovs, imu);
  1289. if (ret < 0)
  1290. return ret;
  1291. nr_segs = ret;
  1292. }
  1293. if (sizeof(struct bio_vec) > sizeof(struct iovec)) {
  1294. size_t bvec_bytes;
  1295. bvec_bytes = nr_segs * sizeof(struct bio_vec);
  1296. nr_segs = (bvec_bytes + sizeof(*iov) - 1) / sizeof(*iov);
  1297. nr_segs += nr_iovs;
  1298. }
  1299. if (nr_segs > vec->nr) {
  1300. struct iou_vec tmp_vec = {};
  1301. int ret;
  1302. ret = io_vec_realloc(&tmp_vec, nr_segs);
  1303. if (ret)
  1304. return ret;
  1305. iovec_off = tmp_vec.nr - nr_iovs;
  1306. memcpy(tmp_vec.iovec + iovec_off, iov, sizeof(*iov) * nr_iovs);
  1307. io_vec_free(vec);
  1308. *vec = tmp_vec;
  1309. iov = vec->iovec + iovec_off;
  1310. req->flags |= REQ_F_NEED_CLEANUP;
  1311. }
  1312. if (imu->flags & IO_REGBUF_F_KBUF)
  1313. return io_vec_fill_kern_bvec(ddir, iter, imu, iov, nr_iovs, vec);
  1314. return io_vec_fill_bvec(ddir, iter, imu, iov, nr_iovs, vec);
  1315. }
  1316. int io_prep_reg_iovec(struct io_kiocb *req, struct iou_vec *iv,
  1317. const struct iovec __user *uvec, size_t uvec_segs)
  1318. {
  1319. struct iovec *iov;
  1320. int iovec_off, ret;
  1321. void *res;
  1322. if (uvec_segs > iv->nr) {
  1323. ret = io_vec_realloc(iv, uvec_segs);
  1324. if (ret)
  1325. return ret;
  1326. req->flags |= REQ_F_NEED_CLEANUP;
  1327. }
  1328. /* pad iovec to the right */
  1329. iovec_off = iv->nr - uvec_segs;
  1330. iov = iv->iovec + iovec_off;
  1331. res = iovec_from_user(uvec, uvec_segs, uvec_segs, iov,
  1332. io_is_compat(req->ctx));
  1333. if (IS_ERR(res))
  1334. return PTR_ERR(res);
  1335. req->flags |= REQ_F_IMPORT_BUFFER;
  1336. return 0;
  1337. }