zcrx.c 34 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522
  1. // SPDX-License-Identifier: GPL-2.0
  2. #include <linux/kernel.h>
  3. #include <linux/errno.h>
  4. #include <linux/dma-map-ops.h>
  5. #include <linux/mm.h>
  6. #include <linux/nospec.h>
  7. #include <linux/io_uring.h>
  8. #include <linux/netdevice.h>
  9. #include <linux/rtnetlink.h>
  10. #include <linux/skbuff_ref.h>
  11. #include <linux/anon_inodes.h>
  12. #include <net/page_pool/helpers.h>
  13. #include <net/page_pool/memory_provider.h>
  14. #include <net/netlink.h>
  15. #include <net/netdev_queues.h>
  16. #include <net/netdev_rx_queue.h>
  17. #include <net/tcp.h>
  18. #include <net/rps.h>
  19. #include <trace/events/page_pool.h>
  20. #include <uapi/linux/io_uring.h>
  21. #include "io_uring.h"
  22. #include "kbuf.h"
  23. #include "memmap.h"
  24. #include "zcrx.h"
  25. #include "rsrc.h"
  /* Area registration flags accepted by io_import_area(); any other bit is rejected with -EINVAL. */
  #define IO_ZCRX_AREA_SUPPORTED_FLAGS	(IORING_ZCRX_AREA_DMABUF)
  /* Attributes passed when DMA-mapping or unmapping the sg table of a non-dmabuf area. */
  #define IO_DMA_ATTR	(DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)
  28. static inline struct io_zcrx_ifq *io_pp_to_ifq(struct page_pool *pp)
  29. {
  30. return pp->mp_priv;
  31. }
  32. static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov)
  33. {
  34. struct net_iov_area *owner = net_iov_owner(niov);
  35. return container_of(owner, struct io_zcrx_area, nia);
  36. }
  37. static inline struct page *io_zcrx_iov_page(const struct net_iov *niov)
  38. {
  39. struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
  40. unsigned niov_pages_shift;
  41. lockdep_assert(!area->mem.is_dmabuf);
  42. niov_pages_shift = area->ifq->niov_shift - PAGE_SHIFT;
  43. return area->mem.pages[net_iov_idx(niov) << niov_pages_shift];
  44. }
  45. static int io_area_max_shift(struct io_zcrx_mem *mem)
  46. {
  47. struct sg_table *sgt = mem->sgt;
  48. struct scatterlist *sg;
  49. unsigned shift = -1U;
  50. unsigned i;
  51. for_each_sgtable_dma_sg(sgt, sg, i)
  52. shift = min(shift, __ffs(sg->length));
  53. return shift;
  54. }
  55. static int io_populate_area_dma(struct io_zcrx_ifq *ifq,
  56. struct io_zcrx_area *area)
  57. {
  58. unsigned niov_size = 1U << ifq->niov_shift;
  59. struct sg_table *sgt = area->mem.sgt;
  60. struct scatterlist *sg;
  61. unsigned i, niov_idx = 0;
  62. for_each_sgtable_dma_sg(sgt, sg, i) {
  63. dma_addr_t dma = sg_dma_address(sg);
  64. unsigned long sg_len = sg_dma_len(sg);
  65. if (WARN_ON_ONCE(sg_len % niov_size))
  66. return -EINVAL;
  67. while (sg_len && niov_idx < area->nia.num_niovs) {
  68. struct net_iov *niov = &area->nia.niovs[niov_idx];
  69. if (net_mp_niov_set_dma_addr(niov, dma))
  70. return -EFAULT;
  71. sg_len -= niov_size;
  72. dma += niov_size;
  73. niov_idx++;
  74. }
  75. }
  76. if (WARN_ON_ONCE(niov_idx != area->nia.num_niovs))
  77. return -EFAULT;
  78. return 0;
  79. }
  80. static void io_release_dmabuf(struct io_zcrx_mem *mem)
  81. {
  82. if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
  83. return;
  84. if (mem->sgt)
  85. dma_buf_unmap_attachment_unlocked(mem->attach, mem->sgt,
  86. DMA_FROM_DEVICE);
  87. if (mem->attach)
  88. dma_buf_detach(mem->dmabuf, mem->attach);
  89. if (mem->dmabuf)
  90. dma_buf_put(mem->dmabuf);
  91. mem->sgt = NULL;
  92. mem->attach = NULL;
  93. mem->dmabuf = NULL;
  94. }
  95. static int io_import_dmabuf(struct io_zcrx_ifq *ifq,
  96. struct io_zcrx_mem *mem,
  97. struct io_uring_zcrx_area_reg *area_reg)
  98. {
  99. unsigned long off = (unsigned long)area_reg->addr;
  100. unsigned long len = (unsigned long)area_reg->len;
  101. unsigned long total_size = 0;
  102. struct scatterlist *sg;
  103. int dmabuf_fd = area_reg->dmabuf_fd;
  104. int i, ret;
  105. if (off)
  106. return -EINVAL;
  107. if (WARN_ON_ONCE(!ifq->dev))
  108. return -EFAULT;
  109. if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
  110. return -EINVAL;
  111. mem->is_dmabuf = true;
  112. mem->dmabuf = dma_buf_get(dmabuf_fd);
  113. if (IS_ERR(mem->dmabuf)) {
  114. ret = PTR_ERR(mem->dmabuf);
  115. mem->dmabuf = NULL;
  116. goto err;
  117. }
  118. mem->attach = dma_buf_attach(mem->dmabuf, ifq->dev);
  119. if (IS_ERR(mem->attach)) {
  120. ret = PTR_ERR(mem->attach);
  121. mem->attach = NULL;
  122. goto err;
  123. }
  124. mem->sgt = dma_buf_map_attachment_unlocked(mem->attach, DMA_FROM_DEVICE);
  125. if (IS_ERR(mem->sgt)) {
  126. ret = PTR_ERR(mem->sgt);
  127. mem->sgt = NULL;
  128. goto err;
  129. }
  130. for_each_sgtable_dma_sg(mem->sgt, sg, i)
  131. total_size += sg_dma_len(sg);
  132. if (total_size != len) {
  133. ret = -EINVAL;
  134. goto err;
  135. }
  136. mem->size = len;
  137. return 0;
  138. err:
  139. io_release_dmabuf(mem);
  140. return ret;
  141. }
  142. static unsigned long io_count_account_pages(struct page **pages, unsigned nr_pages)
  143. {
  144. struct folio *last_folio = NULL;
  145. unsigned long res = 0;
  146. int i;
  147. for (i = 0; i < nr_pages; i++) {
  148. struct folio *folio = page_folio(pages[i]);
  149. if (folio == last_folio)
  150. continue;
  151. last_folio = folio;
  152. res += folio_nr_pages(folio);
  153. }
  154. return res;
  155. }
  156. static int io_import_umem(struct io_zcrx_ifq *ifq,
  157. struct io_zcrx_mem *mem,
  158. struct io_uring_zcrx_area_reg *area_reg)
  159. {
  160. struct page **pages;
  161. int nr_pages, ret;
  162. if (area_reg->dmabuf_fd)
  163. return -EINVAL;
  164. if (!area_reg->addr)
  165. return -EFAULT;
  166. pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len,
  167. &nr_pages);
  168. if (IS_ERR(pages))
  169. return PTR_ERR(pages);
  170. ret = sg_alloc_table_from_pages(&mem->page_sg_table, pages, nr_pages,
  171. 0, (unsigned long)nr_pages << PAGE_SHIFT,
  172. GFP_KERNEL_ACCOUNT);
  173. if (ret) {
  174. unpin_user_pages(pages, nr_pages);
  175. kvfree(pages);
  176. return ret;
  177. }
  178. mem->account_pages = io_count_account_pages(pages, nr_pages);
  179. ret = io_account_mem(ifq->user, ifq->mm_account, mem->account_pages);
  180. if (ret < 0)
  181. mem->account_pages = 0;
  182. mem->sgt = &mem->page_sg_table;
  183. mem->pages = pages;
  184. mem->nr_folios = nr_pages;
  185. mem->size = area_reg->len;
  186. return ret;
  187. }
  188. static void io_release_area_mem(struct io_zcrx_mem *mem)
  189. {
  190. if (mem->is_dmabuf) {
  191. io_release_dmabuf(mem);
  192. return;
  193. }
  194. if (mem->pages) {
  195. unpin_user_pages(mem->pages, mem->nr_folios);
  196. sg_free_table(mem->sgt);
  197. mem->sgt = NULL;
  198. kvfree(mem->pages);
  199. }
  200. }
  201. static int io_import_area(struct io_zcrx_ifq *ifq,
  202. struct io_zcrx_mem *mem,
  203. struct io_uring_zcrx_area_reg *area_reg)
  204. {
  205. int ret;
  206. if (area_reg->flags & ~IO_ZCRX_AREA_SUPPORTED_FLAGS)
  207. return -EINVAL;
  208. if (area_reg->rq_area_token)
  209. return -EINVAL;
  210. if (area_reg->__resv2[0] || area_reg->__resv2[1])
  211. return -EINVAL;
  212. ret = io_validate_user_buf_range(area_reg->addr, area_reg->len);
  213. if (ret)
  214. return ret;
  215. if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK)
  216. return -EINVAL;
  217. if (area_reg->flags & IORING_ZCRX_AREA_DMABUF)
  218. return io_import_dmabuf(ifq, mem, area_reg);
  219. return io_import_umem(ifq, mem, area_reg);
  220. }
  221. static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq,
  222. struct io_zcrx_area *area)
  223. {
  224. int i;
  225. guard(mutex)(&ifq->pp_lock);
  226. if (!area->is_mapped)
  227. return;
  228. area->is_mapped = false;
  229. for (i = 0; i < area->nia.num_niovs; i++)
  230. net_mp_niov_set_dma_addr(&area->nia.niovs[i], 0);
  231. if (area->mem.is_dmabuf) {
  232. io_release_dmabuf(&area->mem);
  233. } else {
  234. dma_unmap_sgtable(ifq->dev, &area->mem.page_sg_table,
  235. DMA_FROM_DEVICE, IO_DMA_ATTR);
  236. }
  237. }
  238. static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
  239. {
  240. int ret;
  241. guard(mutex)(&ifq->pp_lock);
  242. if (area->is_mapped)
  243. return 0;
  244. if (!area->mem.is_dmabuf) {
  245. ret = dma_map_sgtable(ifq->dev, &area->mem.page_sg_table,
  246. DMA_FROM_DEVICE, IO_DMA_ATTR);
  247. if (ret < 0)
  248. return ret;
  249. }
  250. ret = io_populate_area_dma(ifq, area);
  251. if (ret && !area->mem.is_dmabuf)
  252. dma_unmap_sgtable(ifq->dev, &area->mem.page_sg_table,
  253. DMA_FROM_DEVICE, IO_DMA_ATTR);
  254. if (ret == 0)
  255. area->is_mapped = true;
  256. return ret;
  257. }
  /*
   * Sync one niov-sized buffer for device access if the pool's device needs
   * DMA syncing. The synced range starts at the niov's DMA address plus the
   * pool's configured offset.
   *
   * Compiles to an empty function unless both CONFIG_HAS_DMA and
   * CONFIG_DMA_NEED_SYNC are enabled.
   */
  static void io_zcrx_sync_for_device(struct page_pool *pool,
  				     struct net_iov *niov)
  {
  #if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC)
  	struct device *dev = pool->p.dev;
  	dma_addr_t addr;
  	unsigned len;

  	if (!dma_dev_need_sync(dev))
  		return;

  	len = 1U << io_pp_to_ifq(pool)->niov_shift;
  	addr = page_pool_get_dma_addr_netmem(net_iov_to_netmem(niov));
  	__dma_sync_single_for_device(dev, addr + pool->p.offset,
  				     len, pool->p.dma_dir);
  #endif
  }
  272. #define IO_RQ_MAX_ENTRIES 32768
  273. #define IO_SKBS_PER_CALL_LIMIT 20
  274. struct io_zcrx_args {
  275. struct io_kiocb *req;
  276. struct io_zcrx_ifq *ifq;
  277. struct socket *sock;
  278. unsigned nr_skbs;
  279. };
  280. static const struct memory_provider_ops io_uring_pp_zc_ops;
  281. static inline atomic_t *io_get_user_counter(struct net_iov *niov)
  282. {
  283. struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
  284. return &area->user_refs[net_iov_idx(niov)];
  285. }
  286. static bool io_zcrx_put_niov_uref(struct net_iov *niov)
  287. {
  288. atomic_t *uref = io_get_user_counter(niov);
  289. int old;
  290. old = atomic_read(uref);
  291. do {
  292. if (unlikely(old == 0))
  293. return false;
  294. } while (!atomic_try_cmpxchg(uref, &old, old - 1));
  295. return true;
  296. }
  297. static void io_zcrx_get_niov_uref(struct net_iov *niov)
  298. {
  299. atomic_inc(io_get_user_counter(niov));
  300. }
  301. static void io_fill_zcrx_offsets(struct io_uring_zcrx_offsets *offsets)
  302. {
  303. offsets->head = offsetof(struct io_uring, head);
  304. offsets->tail = offsetof(struct io_uring, tail);
  305. offsets->rqes = ALIGN(sizeof(struct io_uring), L1_CACHE_BYTES);
  306. }
  307. static int io_allocate_rbuf_ring(struct io_ring_ctx *ctx,
  308. struct io_zcrx_ifq *ifq,
  309. struct io_uring_zcrx_ifq_reg *reg,
  310. struct io_uring_region_desc *rd,
  311. u32 id)
  312. {
  313. u64 mmap_offset;
  314. size_t off, size;
  315. void *ptr;
  316. int ret;
  317. io_fill_zcrx_offsets(&reg->offsets);
  318. off = reg->offsets.rqes;
  319. size = off + sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries;
  320. if (size > rd->size)
  321. return -EINVAL;
  322. mmap_offset = IORING_MAP_OFF_ZCRX_REGION;
  323. mmap_offset += id << IORING_OFF_PBUF_SHIFT;
  324. ret = io_create_region(ctx, &ifq->region, rd, mmap_offset);
  325. if (ret < 0)
  326. return ret;
  327. ptr = io_region_get_ptr(&ifq->region);
  328. ifq->rq_ring = (struct io_uring *)ptr;
  329. ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off);
  330. return 0;
  331. }
  332. static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
  333. {
  334. io_free_region(ifq->user, &ifq->region);
  335. ifq->rq_ring = NULL;
  336. ifq->rqes = NULL;
  337. }
  338. static void io_zcrx_free_area(struct io_zcrx_ifq *ifq,
  339. struct io_zcrx_area *area)
  340. {
  341. io_zcrx_unmap_area(ifq, area);
  342. io_release_area_mem(&area->mem);
  343. if (area->mem.account_pages)
  344. io_unaccount_mem(ifq->user, ifq->mm_account,
  345. area->mem.account_pages);
  346. kvfree(area->freelist);
  347. kvfree(area->nia.niovs);
  348. kvfree(area->user_refs);
  349. kfree(area);
  350. }
  351. static int io_zcrx_append_area(struct io_zcrx_ifq *ifq,
  352. struct io_zcrx_area *area)
  353. {
  354. if (ifq->area)
  355. return -EINVAL;
  356. ifq->area = area;
  357. return 0;
  358. }
  359. static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
  360. struct io_uring_zcrx_area_reg *area_reg,
  361. struct io_uring_zcrx_ifq_reg *reg)
  362. {
  363. int buf_size_shift = PAGE_SHIFT;
  364. struct io_zcrx_area *area;
  365. unsigned nr_iovs;
  366. int i, ret;
  367. if (reg->rx_buf_len) {
  368. if (!is_power_of_2(reg->rx_buf_len) ||
  369. reg->rx_buf_len < PAGE_SIZE)
  370. return -EINVAL;
  371. buf_size_shift = ilog2(reg->rx_buf_len);
  372. }
  373. ret = -ENOMEM;
  374. area = kzalloc_obj(*area);
  375. if (!area)
  376. goto err;
  377. area->ifq = ifq;
  378. ret = io_import_area(ifq, &area->mem, area_reg);
  379. if (ret)
  380. goto err;
  381. if (buf_size_shift > io_area_max_shift(&area->mem)) {
  382. ret = -ERANGE;
  383. goto err;
  384. }
  385. ifq->niov_shift = buf_size_shift;
  386. nr_iovs = area->mem.size >> ifq->niov_shift;
  387. area->nia.num_niovs = nr_iovs;
  388. ret = -ENOMEM;
  389. area->nia.niovs = kvmalloc_objs(area->nia.niovs[0], nr_iovs,
  390. GFP_KERNEL_ACCOUNT | __GFP_ZERO);
  391. if (!area->nia.niovs)
  392. goto err;
  393. area->freelist = kvmalloc_array(nr_iovs, sizeof(area->freelist[0]),
  394. GFP_KERNEL_ACCOUNT | __GFP_ZERO);
  395. if (!area->freelist)
  396. goto err;
  397. area->user_refs = kvmalloc_objs(area->user_refs[0], nr_iovs,
  398. GFP_KERNEL_ACCOUNT | __GFP_ZERO);
  399. if (!area->user_refs)
  400. goto err;
  401. for (i = 0; i < nr_iovs; i++) {
  402. struct net_iov *niov = &area->nia.niovs[i];
  403. niov->owner = &area->nia;
  404. area->freelist[i] = i;
  405. atomic_set(&area->user_refs[i], 0);
  406. niov->type = NET_IOV_IOURING;
  407. }
  408. area->free_count = nr_iovs;
  409. /* we're only supporting one area per ifq for now */
  410. area->area_id = 0;
  411. area_reg->rq_area_token = (u64)area->area_id << IORING_ZCRX_AREA_SHIFT;
  412. spin_lock_init(&area->freelist_lock);
  413. ret = io_zcrx_append_area(ifq, area);
  414. if (!ret)
  415. return 0;
  416. err:
  417. if (area)
  418. io_zcrx_free_area(ifq, area);
  419. return ret;
  420. }
  421. static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
  422. {
  423. struct io_zcrx_ifq *ifq;
  424. ifq = kzalloc_obj(*ifq);
  425. if (!ifq)
  426. return NULL;
  427. ifq->if_rxq = -1;
  428. spin_lock_init(&ifq->rq_lock);
  429. mutex_init(&ifq->pp_lock);
  430. refcount_set(&ifq->refs, 1);
  431. refcount_set(&ifq->user_refs, 1);
  432. return ifq;
  433. }
  434. static void io_zcrx_drop_netdev(struct io_zcrx_ifq *ifq)
  435. {
  436. guard(mutex)(&ifq->pp_lock);
  437. if (!ifq->netdev)
  438. return;
  439. netdev_put(ifq->netdev, &ifq->netdev_tracker);
  440. ifq->netdev = NULL;
  441. }
  442. static void io_close_queue(struct io_zcrx_ifq *ifq)
  443. {
  444. struct net_device *netdev;
  445. netdevice_tracker netdev_tracker;
  446. struct pp_memory_provider_params p = {
  447. .mp_ops = &io_uring_pp_zc_ops,
  448. .mp_priv = ifq,
  449. };
  450. scoped_guard(mutex, &ifq->pp_lock) {
  451. netdev = ifq->netdev;
  452. netdev_tracker = ifq->netdev_tracker;
  453. ifq->netdev = NULL;
  454. }
  455. if (netdev) {
  456. if (ifq->if_rxq != -1)
  457. net_mp_close_rxq(netdev, ifq->if_rxq, &p);
  458. netdev_put(netdev, &netdev_tracker);
  459. }
  460. ifq->if_rxq = -1;
  461. }
  462. static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
  463. {
  464. io_close_queue(ifq);
  465. if (ifq->area)
  466. io_zcrx_free_area(ifq, ifq->area);
  467. free_uid(ifq->user);
  468. if (ifq->mm_account)
  469. mmdrop(ifq->mm_account);
  470. if (ifq->dev)
  471. put_device(ifq->dev);
  472. io_free_rbuf_ring(ifq);
  473. mutex_destroy(&ifq->pp_lock);
  474. kfree(ifq);
  475. }
  476. static void io_put_zcrx_ifq(struct io_zcrx_ifq *ifq)
  477. {
  478. if (refcount_dec_and_test(&ifq->refs))
  479. io_zcrx_ifq_free(ifq);
  480. }
  481. static void io_zcrx_return_niov_freelist(struct net_iov *niov)
  482. {
  483. struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
  484. spin_lock_bh(&area->freelist_lock);
  485. area->freelist[area->free_count++] = net_iov_idx(niov);
  486. spin_unlock_bh(&area->freelist_lock);
  487. }
  488. static void io_zcrx_return_niov(struct net_iov *niov)
  489. {
  490. netmem_ref netmem = net_iov_to_netmem(niov);
  491. if (!niov->desc.pp) {
  492. /* copy fallback allocated niovs */
  493. io_zcrx_return_niov_freelist(niov);
  494. return;
  495. }
  496. page_pool_put_unrefed_netmem(niov->desc.pp, netmem, -1, false);
  497. }
  498. static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
  499. {
  500. struct io_zcrx_area *area = ifq->area;
  501. int i;
  502. if (!area)
  503. return;
  504. /* Reclaim back all buffers given to the user space. */
  505. for (i = 0; i < area->nia.num_niovs; i++) {
  506. struct net_iov *niov = &area->nia.niovs[i];
  507. int nr;
  508. if (!atomic_read(io_get_user_counter(niov)))
  509. continue;
  510. nr = atomic_xchg(io_get_user_counter(niov), 0);
  511. if (nr && !page_pool_unref_netmem(net_iov_to_netmem(niov), nr))
  512. io_zcrx_return_niov(niov);
  513. }
  514. }
  515. static void zcrx_unregister(struct io_zcrx_ifq *ifq)
  516. {
  517. if (refcount_dec_and_test(&ifq->user_refs)) {
  518. io_close_queue(ifq);
  519. io_zcrx_scrub(ifq);
  520. }
  521. io_put_zcrx_ifq(ifq);
  522. }
  523. struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
  524. unsigned int id)
  525. {
  526. struct io_zcrx_ifq *ifq = xa_load(&ctx->zcrx_ctxs, id);
  527. lockdep_assert_held(&ctx->mmap_lock);
  528. return ifq ? &ifq->region : NULL;
  529. }
  530. static int zcrx_box_release(struct inode *inode, struct file *file)
  531. {
  532. struct io_zcrx_ifq *ifq = file->private_data;
  533. if (WARN_ON_ONCE(!ifq))
  534. return -EFAULT;
  535. zcrx_unregister(ifq);
  536. return 0;
  537. }
  538. static const struct file_operations zcrx_box_fops = {
  539. .owner = THIS_MODULE,
  540. .release = zcrx_box_release,
  541. };
  542. static int zcrx_export(struct io_ring_ctx *ctx, struct io_zcrx_ifq *ifq,
  543. struct zcrx_ctrl *ctrl, void __user *arg)
  544. {
  545. struct zcrx_ctrl_export *ce = &ctrl->zc_export;
  546. struct file *file;
  547. int fd = -1;
  548. if (!mem_is_zero(ce, sizeof(*ce)))
  549. return -EINVAL;
  550. fd = get_unused_fd_flags(O_CLOEXEC);
  551. if (fd < 0)
  552. return fd;
  553. ce->zcrx_fd = fd;
  554. if (copy_to_user(arg, ctrl, sizeof(*ctrl))) {
  555. put_unused_fd(fd);
  556. return -EFAULT;
  557. }
  558. refcount_inc(&ifq->refs);
  559. refcount_inc(&ifq->user_refs);
  560. file = anon_inode_create_getfile("[zcrx]", &zcrx_box_fops,
  561. ifq, O_CLOEXEC, NULL);
  562. if (IS_ERR(file)) {
  563. put_unused_fd(fd);
  564. zcrx_unregister(ifq);
  565. return PTR_ERR(file);
  566. }
  567. fd_install(fd, file);
  568. return 0;
  569. }
  570. static int import_zcrx(struct io_ring_ctx *ctx,
  571. struct io_uring_zcrx_ifq_reg __user *arg,
  572. struct io_uring_zcrx_ifq_reg *reg)
  573. {
  574. struct io_zcrx_ifq *ifq;
  575. struct file *file;
  576. int fd, ret;
  577. u32 id;
  578. if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
  579. return -EINVAL;
  580. if (!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)))
  581. return -EINVAL;
  582. if (reg->if_rxq || reg->rq_entries || reg->area_ptr || reg->region_ptr)
  583. return -EINVAL;
  584. if (reg->flags & ~ZCRX_REG_IMPORT)
  585. return -EINVAL;
  586. fd = reg->if_idx;
  587. CLASS(fd, f)(fd);
  588. if (fd_empty(f))
  589. return -EBADF;
  590. file = fd_file(f);
  591. if (file->f_op != &zcrx_box_fops || !file->private_data)
  592. return -EBADF;
  593. ifq = file->private_data;
  594. refcount_inc(&ifq->refs);
  595. refcount_inc(&ifq->user_refs);
  596. scoped_guard(mutex, &ctx->mmap_lock) {
  597. ret = xa_alloc(&ctx->zcrx_ctxs, &id, NULL, xa_limit_31b, GFP_KERNEL);
  598. if (ret)
  599. goto err;
  600. }
  601. reg->zcrx_id = id;
  602. io_fill_zcrx_offsets(&reg->offsets);
  603. if (copy_to_user(arg, reg, sizeof(*reg))) {
  604. ret = -EFAULT;
  605. goto err_xa_erase;
  606. }
  607. scoped_guard(mutex, &ctx->mmap_lock) {
  608. ret = -ENOMEM;
  609. if (xa_store(&ctx->zcrx_ctxs, id, ifq, GFP_KERNEL))
  610. goto err_xa_erase;
  611. }
  612. return 0;
  613. err_xa_erase:
  614. scoped_guard(mutex, &ctx->mmap_lock)
  615. xa_erase(&ctx->zcrx_ctxs, id);
  616. err:
  617. zcrx_unregister(ifq);
  618. return ret;
  619. }
  620. int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
  621. struct io_uring_zcrx_ifq_reg __user *arg)
  622. {
  623. struct pp_memory_provider_params mp_param = {};
  624. struct io_uring_zcrx_area_reg area;
  625. struct io_uring_zcrx_ifq_reg reg;
  626. struct io_uring_region_desc rd;
  627. struct io_zcrx_ifq *ifq;
  628. int ret;
  629. u32 id;
  630. /*
  631. * 1. Interface queue allocation.
  632. * 2. It can observe data destined for sockets of other tasks.
  633. */
  634. if (!capable(CAP_NET_ADMIN))
  635. return -EPERM;
  636. /* mandatory io_uring features for zc rx */
  637. if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
  638. return -EINVAL;
  639. if (!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)))
  640. return -EINVAL;
  641. if (copy_from_user(&reg, arg, sizeof(reg)))
  642. return -EFAULT;
  643. if (!mem_is_zero(&reg.__resv, sizeof(reg.__resv)) || reg.zcrx_id)
  644. return -EINVAL;
  645. if (reg.flags & ZCRX_REG_IMPORT)
  646. return import_zcrx(ctx, arg, &reg);
  647. if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
  648. return -EFAULT;
  649. if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags)
  650. return -EINVAL;
  651. if (reg.rq_entries > IO_RQ_MAX_ENTRIES) {
  652. if (!(ctx->flags & IORING_SETUP_CLAMP))
  653. return -EINVAL;
  654. reg.rq_entries = IO_RQ_MAX_ENTRIES;
  655. }
  656. reg.rq_entries = roundup_pow_of_two(reg.rq_entries);
  657. if (copy_from_user(&area, u64_to_user_ptr(reg.area_ptr), sizeof(area)))
  658. return -EFAULT;
  659. ifq = io_zcrx_ifq_alloc(ctx);
  660. if (!ifq)
  661. return -ENOMEM;
  662. if (ctx->user) {
  663. get_uid(ctx->user);
  664. ifq->user = ctx->user;
  665. }
  666. if (ctx->mm_account) {
  667. mmgrab(ctx->mm_account);
  668. ifq->mm_account = ctx->mm_account;
  669. }
  670. ifq->rq_entries = reg.rq_entries;
  671. scoped_guard(mutex, &ctx->mmap_lock) {
  672. /* preallocate id */
  673. ret = xa_alloc(&ctx->zcrx_ctxs, &id, NULL, xa_limit_31b, GFP_KERNEL);
  674. if (ret)
  675. goto ifq_free;
  676. }
  677. ret = io_allocate_rbuf_ring(ctx, ifq, &reg, &rd, id);
  678. if (ret)
  679. goto err;
  680. ifq->netdev = netdev_get_by_index_lock(current->nsproxy->net_ns, reg.if_idx);
  681. if (!ifq->netdev) {
  682. ret = -ENODEV;
  683. goto err;
  684. }
  685. netdev_hold(ifq->netdev, &ifq->netdev_tracker, GFP_KERNEL);
  686. ifq->dev = netdev_queue_get_dma_dev(ifq->netdev, reg.if_rxq);
  687. if (!ifq->dev) {
  688. ret = -EOPNOTSUPP;
  689. goto netdev_put_unlock;
  690. }
  691. get_device(ifq->dev);
  692. ret = io_zcrx_create_area(ifq, &area, &reg);
  693. if (ret)
  694. goto netdev_put_unlock;
  695. if (reg.rx_buf_len)
  696. mp_param.rx_page_size = 1U << ifq->niov_shift;
  697. mp_param.mp_ops = &io_uring_pp_zc_ops;
  698. mp_param.mp_priv = ifq;
  699. ret = __net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param, NULL);
  700. if (ret)
  701. goto netdev_put_unlock;
  702. netdev_unlock(ifq->netdev);
  703. ifq->if_rxq = reg.if_rxq;
  704. reg.zcrx_id = id;
  705. scoped_guard(mutex, &ctx->mmap_lock) {
  706. /* publish ifq */
  707. ret = -ENOMEM;
  708. if (xa_store(&ctx->zcrx_ctxs, id, ifq, GFP_KERNEL))
  709. goto err;
  710. }
  711. reg.rx_buf_len = 1U << ifq->niov_shift;
  712. if (copy_to_user(arg, &reg, sizeof(reg)) ||
  713. copy_to_user(u64_to_user_ptr(reg.region_ptr), &rd, sizeof(rd)) ||
  714. copy_to_user(u64_to_user_ptr(reg.area_ptr), &area, sizeof(area))) {
  715. ret = -EFAULT;
  716. goto err;
  717. }
  718. return 0;
  719. netdev_put_unlock:
  720. netdev_unlock(ifq->netdev);
  721. err:
  722. scoped_guard(mutex, &ctx->mmap_lock)
  723. xa_erase(&ctx->zcrx_ctxs, id);
  724. ifq_free:
  725. zcrx_unregister(ifq);
  726. return ret;
  727. }
  728. static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area)
  729. {
  730. unsigned niov_idx;
  731. lockdep_assert_held(&area->freelist_lock);
  732. niov_idx = area->freelist[--area->free_count];
  733. return &area->nia.niovs[niov_idx];
  734. }
  735. void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
  736. {
  737. struct io_zcrx_ifq *ifq;
  738. lockdep_assert_held(&ctx->uring_lock);
  739. while (1) {
  740. scoped_guard(mutex, &ctx->mmap_lock) {
  741. unsigned long id = 0;
  742. ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT);
  743. if (ifq)
  744. xa_erase(&ctx->zcrx_ctxs, id);
  745. }
  746. if (!ifq)
  747. break;
  748. zcrx_unregister(ifq);
  749. }
  750. xa_destroy(&ctx->zcrx_ctxs);
  751. }
  752. static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq)
  753. {
  754. u32 entries;
  755. entries = smp_load_acquire(&ifq->rq_ring->tail) - ifq->cached_rq_head;
  756. return min(entries, ifq->rq_entries);
  757. }
  758. static struct io_uring_zcrx_rqe *io_zcrx_get_rqe(struct io_zcrx_ifq *ifq,
  759. unsigned mask)
  760. {
  761. unsigned int idx = ifq->cached_rq_head++ & mask;
  762. return &ifq->rqes[idx];
  763. }
  764. static inline bool io_parse_rqe(struct io_uring_zcrx_rqe *rqe,
  765. struct io_zcrx_ifq *ifq,
  766. struct net_iov **ret_niov)
  767. {
  768. __u64 off = READ_ONCE(rqe->off);
  769. unsigned niov_idx, area_idx;
  770. struct io_zcrx_area *area;
  771. area_idx = off >> IORING_ZCRX_AREA_SHIFT;
  772. niov_idx = (off & ~IORING_ZCRX_AREA_MASK) >> ifq->niov_shift;
  773. if (unlikely(rqe->__pad || area_idx))
  774. return false;
  775. area = ifq->area;
  776. if (unlikely(niov_idx >= area->nia.num_niovs))
  777. return false;
  778. niov_idx = array_index_nospec(niov_idx, area->nia.num_niovs);
  779. *ret_niov = &area->nia.niovs[niov_idx];
  780. return true;
  781. }
  782. static void io_zcrx_ring_refill(struct page_pool *pp,
  783. struct io_zcrx_ifq *ifq)
  784. {
  785. unsigned int mask = ifq->rq_entries - 1;
  786. unsigned int entries;
  787. guard(spinlock_bh)(&ifq->rq_lock);
  788. entries = io_zcrx_rqring_entries(ifq);
  789. entries = min_t(unsigned, entries, PP_ALLOC_CACHE_REFILL);
  790. if (unlikely(!entries))
  791. return;
  792. do {
  793. struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(ifq, mask);
  794. struct net_iov *niov;
  795. netmem_ref netmem;
  796. if (!io_parse_rqe(rqe, ifq, &niov))
  797. continue;
  798. if (!io_zcrx_put_niov_uref(niov))
  799. continue;
  800. netmem = net_iov_to_netmem(niov);
  801. if (!page_pool_unref_and_test(netmem))
  802. continue;
  803. if (unlikely(niov->desc.pp != pp)) {
  804. io_zcrx_return_niov(niov);
  805. continue;
  806. }
  807. io_zcrx_sync_for_device(pp, niov);
  808. net_mp_netmem_place_in_cache(pp, netmem);
  809. } while (--entries);
  810. smp_store_release(&ifq->rq_ring->head, ifq->cached_rq_head);
  811. }
  812. static void io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq)
  813. {
  814. struct io_zcrx_area *area = ifq->area;
  815. spin_lock_bh(&area->freelist_lock);
  816. while (area->free_count && pp->alloc.count < PP_ALLOC_CACHE_REFILL) {
  817. struct net_iov *niov = __io_zcrx_get_free_niov(area);
  818. netmem_ref netmem = net_iov_to_netmem(niov);
  819. net_mp_niov_set_page_pool(pp, niov);
  820. io_zcrx_sync_for_device(pp, niov);
  821. net_mp_netmem_place_in_cache(pp, netmem);
  822. }
  823. spin_unlock_bh(&area->freelist_lock);
  824. }
  825. static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp)
  826. {
  827. struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
  828. /* pp should already be ensuring that */
  829. if (unlikely(pp->alloc.count))
  830. goto out_return;
  831. io_zcrx_ring_refill(pp, ifq);
  832. if (likely(pp->alloc.count))
  833. goto out_return;
  834. io_zcrx_refill_slow(pp, ifq);
  835. if (!pp->alloc.count)
  836. return 0;
  837. out_return:
  838. return pp->alloc.cache[--pp->alloc.count];
  839. }
  840. static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem)
  841. {
  842. struct net_iov *niov;
  843. if (WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
  844. return false;
  845. niov = netmem_to_net_iov(netmem);
  846. net_mp_niov_clear_page_pool(niov);
  847. io_zcrx_return_niov_freelist(niov);
  848. return false;
  849. }
  850. static int io_pp_zc_init(struct page_pool *pp)
  851. {
  852. struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
  853. int ret;
  854. if (WARN_ON_ONCE(!ifq))
  855. return -EINVAL;
  856. if (WARN_ON_ONCE(ifq->dev != pp->p.dev))
  857. return -EINVAL;
  858. if (WARN_ON_ONCE(!pp->dma_map))
  859. return -EOPNOTSUPP;
  860. if (pp->p.order + PAGE_SHIFT != ifq->niov_shift)
  861. return -EINVAL;
  862. if (pp->p.dma_dir != DMA_FROM_DEVICE)
  863. return -EOPNOTSUPP;
  864. ret = io_zcrx_map_area(ifq, ifq->area);
  865. if (ret)
  866. return ret;
  867. refcount_inc(&ifq->refs);
  868. return 0;
  869. }
  870. static void io_pp_zc_destroy(struct page_pool *pp)
  871. {
  872. io_put_zcrx_ifq(io_pp_to_ifq(pp));
  873. }
  874. static int io_pp_nl_fill(void *mp_priv, struct sk_buff *rsp,
  875. struct netdev_rx_queue *rxq)
  876. {
  877. struct nlattr *nest;
  878. int type;
  879. type = rxq ? NETDEV_A_QUEUE_IO_URING : NETDEV_A_PAGE_POOL_IO_URING;
  880. nest = nla_nest_start(rsp, type);
  881. if (!nest)
  882. return -EMSGSIZE;
  883. nla_nest_end(rsp, nest);
  884. return 0;
  885. }
  886. static void io_pp_uninstall(void *mp_priv, struct netdev_rx_queue *rxq)
  887. {
  888. struct pp_memory_provider_params *p = &rxq->mp_params;
  889. struct io_zcrx_ifq *ifq = mp_priv;
  890. io_zcrx_drop_netdev(ifq);
  891. if (ifq->area)
  892. io_zcrx_unmap_area(ifq, ifq->area);
  893. p->mp_ops = NULL;
  894. p->mp_priv = NULL;
  895. }
/*
 * Page pool memory provider callbacks implemented by this file.
 *
 * Besides being the callback table, the address of this structure is used
 * as an identity check: io_zcrx_recv_frag() compares a pool's mp_ops against
 * it before treating a niov as one of ours.
 */
static const struct memory_provider_ops io_uring_pp_zc_ops = {
	.alloc_netmems = io_pp_zc_alloc_netmems,
	.release_netmem = io_pp_zc_release_netmem,
	.init = io_pp_zc_init,
	.destroy = io_pp_zc_destroy,
	.nl_fill = io_pp_nl_fill,
	.uninstall = io_pp_uninstall,
};
  904. static unsigned zcrx_parse_rq(netmem_ref *netmem_array, unsigned nr,
  905. struct io_zcrx_ifq *zcrx)
  906. {
  907. unsigned int mask = zcrx->rq_entries - 1;
  908. unsigned int i;
  909. nr = min(nr, io_zcrx_rqring_entries(zcrx));
  910. for (i = 0; i < nr; i++) {
  911. struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(zcrx, mask);
  912. struct net_iov *niov;
  913. if (!io_parse_rqe(rqe, zcrx, &niov))
  914. break;
  915. netmem_array[i] = net_iov_to_netmem(niov);
  916. }
  917. smp_store_release(&zcrx->rq_ring->head, zcrx->cached_rq_head);
  918. return i;
  919. }
  920. #define ZCRX_FLUSH_BATCH 32
  921. static void zcrx_return_buffers(netmem_ref *netmems, unsigned nr)
  922. {
  923. unsigned i;
  924. for (i = 0; i < nr; i++) {
  925. netmem_ref netmem = netmems[i];
  926. struct net_iov *niov = netmem_to_net_iov(netmem);
  927. if (!io_zcrx_put_niov_uref(niov))
  928. continue;
  929. if (!page_pool_unref_and_test(netmem))
  930. continue;
  931. io_zcrx_return_niov(niov);
  932. }
  933. }
  934. static int zcrx_flush_rq(struct io_ring_ctx *ctx, struct io_zcrx_ifq *zcrx,
  935. struct zcrx_ctrl *ctrl)
  936. {
  937. struct zcrx_ctrl_flush_rq *frq = &ctrl->zc_flush;
  938. netmem_ref netmems[ZCRX_FLUSH_BATCH];
  939. unsigned total = 0;
  940. unsigned nr;
  941. if (!mem_is_zero(&frq->__resv, sizeof(frq->__resv)))
  942. return -EINVAL;
  943. do {
  944. scoped_guard(spinlock_bh, &zcrx->rq_lock) {
  945. nr = zcrx_parse_rq(netmems, ZCRX_FLUSH_BATCH, zcrx);
  946. zcrx_return_buffers(netmems, nr);
  947. }
  948. total += nr;
  949. if (fatal_signal_pending(current))
  950. break;
  951. cond_resched();
  952. } while (nr == ZCRX_FLUSH_BATCH && total < zcrx->rq_entries);
  953. return 0;
  954. }
  955. int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
  956. {
  957. struct zcrx_ctrl ctrl;
  958. struct io_zcrx_ifq *zcrx;
  959. if (nr_args)
  960. return -EINVAL;
  961. if (copy_from_user(&ctrl, arg, sizeof(ctrl)))
  962. return -EFAULT;
  963. if (!mem_is_zero(&ctrl.__resv, sizeof(ctrl.__resv)))
  964. return -EFAULT;
  965. zcrx = xa_load(&ctx->zcrx_ctxs, ctrl.zcrx_id);
  966. if (!zcrx)
  967. return -ENXIO;
  968. switch (ctrl.op) {
  969. case ZCRX_CTRL_FLUSH_RQ:
  970. return zcrx_flush_rq(ctx, zcrx, &ctrl);
  971. case ZCRX_CTRL_EXPORT:
  972. return zcrx_export(ctx, zcrx, &ctrl, arg);
  973. }
  974. return -EOPNOTSUPP;
  975. }
  976. static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov,
  977. struct io_zcrx_ifq *ifq, int off, int len)
  978. {
  979. struct io_ring_ctx *ctx = req->ctx;
  980. struct io_uring_zcrx_cqe *rcqe;
  981. struct io_zcrx_area *area;
  982. struct io_uring_cqe *cqe;
  983. u64 offset;
  984. if (!io_defer_get_uncommited_cqe(ctx, &cqe))
  985. return false;
  986. cqe->user_data = req->cqe.user_data;
  987. cqe->res = len;
  988. cqe->flags = IORING_CQE_F_MORE;
  989. if (ctx->flags & IORING_SETUP_CQE_MIXED)
  990. cqe->flags |= IORING_CQE_F_32;
  991. area = io_zcrx_iov_to_area(niov);
  992. offset = off + (net_iov_idx(niov) << ifq->niov_shift);
  993. rcqe = (struct io_uring_zcrx_cqe *)(cqe + 1);
  994. rcqe->off = offset + ((u64)area->area_id << IORING_ZCRX_AREA_SHIFT);
  995. rcqe->__pad = 0;
  996. return true;
  997. }
  998. static struct net_iov *io_alloc_fallback_niov(struct io_zcrx_ifq *ifq)
  999. {
  1000. struct io_zcrx_area *area = ifq->area;
  1001. struct net_iov *niov = NULL;
  1002. if (area->mem.is_dmabuf)
  1003. return NULL;
  1004. spin_lock_bh(&area->freelist_lock);
  1005. if (area->free_count)
  1006. niov = __io_zcrx_get_free_niov(area);
  1007. spin_unlock_bh(&area->freelist_lock);
  1008. if (niov)
  1009. page_pool_fragment_netmem(net_iov_to_netmem(niov), 1);
  1010. return niov;
  1011. }
/*
 * Destination cursor for io_copy_page(): where the next byte goes and how
 * much room is left. io_copy_page() advances @offset and shrinks @size as it
 * copies.
 */
struct io_copy_cache {
	/* destination page (first page, when the destination spans several) */
	struct page *page;
	/* byte offset into @page at which the next copy starts */
	unsigned long offset;
	/* bytes of destination space still available */
	size_t size;
};
/*
 * Copy data from a source page into the destination described by @cc.
 *
 * @cc:         destination cursor; its offset and size are updated in place
 * @src_page:   source page
 * @src_offset: byte offset of the data within @src_page
 * @len:        number of bytes requested
 *
 * Copies min(@len, cc->size) bytes. Returns the number of bytes copied.
 */
static ssize_t io_copy_page(struct io_copy_cache *cc, struct page *src_page,
			    unsigned int src_offset, size_t len)
{
	size_t copied = 0;

	/* Never write past the space left in the destination. */
	len = min(len, cc->size);

	while (len) {
		void *src_addr, *dst_addr;
		struct page *dst_page = cc->page;
		unsigned dst_offset = cc->offset;
		size_t n = len;

		/*
		 * If either folio can only be mapped one page at a time, step
		 * to the exact source and destination pages and limit this
		 * round so the copy stays within a single page on both sides.
		 * Otherwise everything is copied in one go.
		 */
		if (folio_test_partial_kmap(page_folio(dst_page)) ||
		    folio_test_partial_kmap(page_folio(src_page))) {
			dst_page += dst_offset / PAGE_SIZE;
			dst_offset = offset_in_page(dst_offset);
			src_page += src_offset / PAGE_SIZE;
			src_offset = offset_in_page(src_offset);
			n = min(PAGE_SIZE - src_offset, PAGE_SIZE - dst_offset);
			n = min(n, len);
		}

		dst_addr = kmap_local_page(dst_page) + dst_offset;
		src_addr = kmap_local_page(src_page) + src_offset;

		memcpy(dst_addr, src_addr, n);

		/* Unmapped in the reverse order of mapping. */
		kunmap_local(src_addr);
		kunmap_local(dst_addr);

		cc->size -= n;
		cc->offset += n;
		src_offset += n;
		len -= n;
		copied += n;
	}
	return copied;
}
  1049. static ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
  1050. struct page *src_page, unsigned int src_offset,
  1051. size_t len)
  1052. {
  1053. size_t copied = 0;
  1054. int ret = 0;
  1055. while (len) {
  1056. struct io_copy_cache cc;
  1057. struct net_iov *niov;
  1058. size_t n;
  1059. niov = io_alloc_fallback_niov(ifq);
  1060. if (!niov) {
  1061. ret = -ENOMEM;
  1062. break;
  1063. }
  1064. cc.page = io_zcrx_iov_page(niov);
  1065. cc.offset = 0;
  1066. cc.size = PAGE_SIZE;
  1067. n = io_copy_page(&cc, src_page, src_offset, len);
  1068. if (!io_zcrx_queue_cqe(req, niov, ifq, 0, n)) {
  1069. io_zcrx_return_niov(niov);
  1070. ret = -ENOSPC;
  1071. break;
  1072. }
  1073. io_zcrx_get_niov_uref(niov);
  1074. src_offset += n;
  1075. len -= n;
  1076. copied += n;
  1077. }
  1078. return copied ? copied : ret;
  1079. }
  1080. static int io_zcrx_copy_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
  1081. const skb_frag_t *frag, int off, int len)
  1082. {
  1083. struct page *page = skb_frag_page(frag);
  1084. return io_zcrx_copy_chunk(req, ifq, page, off + skb_frag_off(frag), len);
  1085. }
  1086. static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
  1087. const skb_frag_t *frag, int off, int len)
  1088. {
  1089. struct net_iov *niov;
  1090. struct page_pool *pp;
  1091. if (unlikely(!skb_frag_is_net_iov(frag)))
  1092. return io_zcrx_copy_frag(req, ifq, frag, off, len);
  1093. niov = netmem_to_net_iov(frag->netmem);
  1094. pp = niov->desc.pp;
  1095. if (!pp || pp->mp_ops != &io_uring_pp_zc_ops || io_pp_to_ifq(pp) != ifq)
  1096. return -EFAULT;
  1097. if (!io_zcrx_queue_cqe(req, niov, ifq, off + skb_frag_off(frag), len))
  1098. return -ENOSPC;
  1099. /*
  1100. * Prevent it from being recycled while user is accessing it.
  1101. * It has to be done before grabbing a user reference.
  1102. */
  1103. page_pool_ref_netmem(net_iov_to_netmem(niov));
  1104. io_zcrx_get_niov_uref(niov);
  1105. return len;
  1106. }
/*
 * tcp_read_sock() actor: hand the data of @skb from @offset onwards, up to
 * @len bytes and capped at desc->count, to the user.
 *
 * The skb is walked in three stages: the linear head (always copied), the
 * page fragments (zero-copy where possible, see io_zcrx_recv_frag()), and
 * finally the skbs on the frag list, which are handled by recursion.
 *
 * Returns the number of bytes consumed and subtracts that amount from
 * desc->count. If nothing was consumed, returns the last status instead
 * (0 or a negative error). The WARN_ON() sanity checks return -EFAULT
 * directly, without going through the accounting at "out".
 */
static int
io_zcrx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
		 unsigned int offset, size_t len)
{
	struct io_zcrx_args *args = desc->arg.data;
	struct io_zcrx_ifq *ifq = args->ifq;
	struct io_kiocb *req = args->req;
	struct sk_buff *frag_iter;
	unsigned start, start_off = offset;
	int i, copy, end, off;
	int ret = 0;

	len = min_t(size_t, len, desc->count);
	/*
	 * __tcp_read_sock() always calls io_zcrx_recv_skb one last time, even
	 * if desc->count is already 0. This is caused by the if (offset + 1 !=
	 * skb->len) check. Return early in this case to break out of
	 * __tcp_read_sock().
	 */
	if (!len)
		return 0;
	/*
	 * Bound the number of skbs handled per call. The counter is shared
	 * through @args, so skbs visited by the recursion below count too.
	 */
	if (unlikely(args->nr_skbs++ > IO_SKBS_PER_CALL_LIMIT))
		return -EAGAIN;

	/* Stage 1: data in the linear head has to be copied. */
	if (unlikely(offset < skb_headlen(skb))) {
		ssize_t copied;
		size_t to_copy;

		to_copy = min_t(size_t, skb_headlen(skb) - offset, len);
		copied = io_zcrx_copy_chunk(req, ifq, virt_to_page(skb->data),
					    offset_in_page(skb->data) + offset,
					    to_copy);
		if (copied < 0) {
			ret = copied;
			goto out;
		}
		offset += copied;
		len -= copied;
		if (!len)
			goto out;
		/* Short copy: stop here rather than skipping head data. */
		if (offset != skb_headlen(skb))
			goto out;
	}

	/* Stage 2: page fragments; [start, end) is the current frag's range. */
	start = skb_headlen(skb);
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		const skb_frag_t *frag;

		if (WARN_ON(start > offset + len))
			return -EFAULT;

		frag = &skb_shinfo(skb)->frags[i];
		end = start + skb_frag_size(frag);

		if (offset < end) {
			copy = end - offset;
			if (copy > len)
				copy = len;

			off = offset - start;
			ret = io_zcrx_recv_frag(req, ifq, frag, off, copy);
			if (ret < 0)
				goto out;

			offset += ret;
			len -= ret;
			/* Done, or the frag was only partially delivered. */
			if (len == 0 || ret != copy)
				goto out;
		}
		start = end;
	}

	/* Stage 3: skbs chained on the frag list, handled recursively. */
	skb_walk_frags(skb, frag_iter) {
		if (WARN_ON(start > offset + len))
			return -EFAULT;

		end = start + frag_iter->len;
		if (offset < end) {
			size_t count;

			copy = end - offset;
			if (copy > len)
				copy = len;

			off = offset - start;
			/*
			 * The nested call adjusts desc->count itself; restore it
			 * so the bytes are accounted only once, at "out" below.
			 */
			count = desc->count;
			ret = io_zcrx_recv_skb(desc, frag_iter, off, copy);
			desc->count = count;
			if (ret < 0)
				goto out;

			offset += ret;
			len -= ret;
			if (len == 0 || ret != copy)
				goto out;
		}
		start = end;
	}

out:
	/* Nothing consumed: report the status (0 or error) as is. */
	if (offset == start_off)
		return ret;
	desc->count -= (offset - start_off);
	return offset - start_off;
}
  1197. static int io_zcrx_tcp_recvmsg(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
  1198. struct sock *sk, int flags,
  1199. unsigned issue_flags, unsigned int *outlen)
  1200. {
  1201. unsigned int len = *outlen;
  1202. struct io_zcrx_args args = {
  1203. .req = req,
  1204. .ifq = ifq,
  1205. .sock = sk->sk_socket,
  1206. };
  1207. read_descriptor_t rd_desc = {
  1208. .count = len ? len : UINT_MAX,
  1209. .arg.data = &args,
  1210. };
  1211. int ret;
  1212. lock_sock(sk);
  1213. ret = tcp_read_sock(sk, &rd_desc, io_zcrx_recv_skb);
  1214. if (len && ret > 0)
  1215. *outlen = len - ret;
  1216. if (ret <= 0) {
  1217. if (ret < 0 || sock_flag(sk, SOCK_DONE))
  1218. goto out;
  1219. if (sk->sk_err)
  1220. ret = sock_error(sk);
  1221. else if (sk->sk_shutdown & RCV_SHUTDOWN)
  1222. goto out;
  1223. else if (sk->sk_state == TCP_CLOSE)
  1224. ret = -ENOTCONN;
  1225. else
  1226. ret = -EAGAIN;
  1227. } else if (unlikely(args.nr_skbs > IO_SKBS_PER_CALL_LIMIT) &&
  1228. (issue_flags & IO_URING_F_MULTISHOT)) {
  1229. ret = IOU_REQUEUE;
  1230. } else if (sock_flag(sk, SOCK_DONE)) {
  1231. /* Make it to retry until it finally gets 0. */
  1232. if (issue_flags & IO_URING_F_MULTISHOT)
  1233. ret = IOU_REQUEUE;
  1234. else
  1235. ret = -EAGAIN;
  1236. }
  1237. out:
  1238. release_sock(sk);
  1239. return ret;
  1240. }
  1241. int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
  1242. struct socket *sock, unsigned int flags,
  1243. unsigned issue_flags, unsigned int *len)
  1244. {
  1245. struct sock *sk = sock->sk;
  1246. const struct proto *prot = READ_ONCE(sk->sk_prot);
  1247. if (prot->recvmsg != tcp_recvmsg)
  1248. return -EPROTONOSUPPORT;
  1249. sock_rps_record_flow(sk);
  1250. return io_zcrx_tcp_recvmsg(req, ifq, sk, flags, issue_flags, len);
  1251. }