direct-io.c 34 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105
  1. // SPDX-License-Identifier: GPL-2.0
  2. #include <linux/fsverity.h>
  3. #include <linux/iomap.h>
  4. #include "ctree.h"
  5. #include "delalloc-space.h"
  6. #include "direct-io.h"
  7. #include "extent-tree.h"
  8. #include "file.h"
  9. #include "fs.h"
  10. #include "transaction.h"
  11. #include "volumes.h"
  12. #include "bio.h"
  13. #include "ordered-data.h"
/*
 * Per-call direct IO state shared by the iomap callbacks in this file. It is
 * reached through iomap_iter::private and is zeroed by btrfs_dio_iomap_begin()
 * before any reservation or lock is taken.
 */
struct btrfs_dio_data {
	/* Bytes of bios passed to btrfs_dio_submit_io() for the current mapping. */
	ssize_t submitted;
	/* Changeset handed to the data space reserve/free helpers. */
	struct extent_changeset *data_reserved;
	/* Ordered extent created for the current write mapping. */
	struct btrfs_ordered_extent *ordered;
	/* True when btrfs_check_data_free_space() succeeded in iomap_begin. */
	bool data_space_reserved;
	/* True when the write was set up on the NOCOW/prealloc path. */
	bool nocow_done;
};
/*
 * Per-bio direct IO context. btrfs_dio_submit_io() and btrfs_dio_end_io()
 * recover it from the embedded btrfs_bio with container_of().
 */
struct btrfs_dio_private {
	/* Range of I/O */
	/* File offset of the bio, recorded at submit time. */
	u64 file_offset;
	/* Size of the bio in bytes (bi_iter.bi_size at submit time). */
	u32 bytes;

	/* This must be last */
	struct btrfs_bio bbio;
};
/* Bio set handed to the iomap direct IO code through btrfs_dio_ops.bio_set. */
static struct bio_set btrfs_dio_bioset;
/*
 * Lock the file range [lockstart, lockend] for direct IO.
 *
 * Takes the DIO extent lock first and then the regular extent lock, retrying
 * until the range has no ordered extent and, for writes, no page in the page
 * cache.
 *
 * @inode:        inode the IO targets
 * @lockstart:    first byte of the range
 * @lockend:      last byte of the range (inclusive)
 * @cached_state: extent state cache passed to the io tree helpers
 * @iomap_flags:  iomap flags; IOMAP_WRITE and IOMAP_NOWAIT are inspected
 *
 * Returns 0 with both locks held on the range. On failure returns -EAGAIN
 * (a NOWAIT request would have to block) or -ENOTBLK (the caller should fall
 * back to buffered IO), with neither lock held.
 */
static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
			      struct extent_state **cached_state,
			      unsigned int iomap_flags)
{
	const bool writing = (iomap_flags & IOMAP_WRITE);
	const bool nowait = (iomap_flags & IOMAP_NOWAIT);
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct btrfs_ordered_extent *ordered;
	int ret = 0;

	/* Direct lock must be taken before the extent lock. */
	if (nowait) {
		if (!btrfs_try_lock_dio_extent(io_tree, lockstart, lockend, cached_state))
			return -EAGAIN;
	} else {
		btrfs_lock_dio_extent(io_tree, lockstart, lockend, cached_state);
	}

	while (1) {
		if (nowait) {
			if (!btrfs_try_lock_extent(io_tree, lockstart, lockend,
						   cached_state)) {
				ret = -EAGAIN;
				break;
			}
		} else {
			btrfs_lock_extent(io_tree, lockstart, lockend, cached_state);
		}

		/*
		 * We're concerned with the entire range that we're going to be
		 * doing DIO to, so we need to make sure there's no ordered
		 * extents in this range.
		 */
		ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
						     lockend - lockstart + 1);

		/*
		 * We need to make sure there are no buffered pages in this
		 * range either, we could have raced between the invalidate in
		 * generic_file_direct_write and locking the extent. The
		 * invalidate needs to happen so that reads after a write do not
		 * get stale data.
		 */
		if (!ordered &&
		    (!writing || !filemap_range_has_page(inode->i_mapping,
							 lockstart, lockend)))
			break;

		/* The range is not usable yet: drop the extent lock before retrying or failing. */
		btrfs_unlock_extent(io_tree, lockstart, lockend, cached_state);

		if (ordered) {
			if (nowait) {
				btrfs_put_ordered_extent(ordered);
				ret = -EAGAIN;
				break;
			}
			/*
			 * If we are doing a DIO read and the ordered extent we
			 * found is for a buffered write, we can not wait for it
			 * to complete and retry, because if we do so we can
			 * deadlock with concurrent buffered writes on page
			 * locks. This happens only if our DIO read covers more
			 * than one extent map, if at this point has already
			 * created an ordered extent for a previous extent map
			 * and locked its range in the inode's io tree, and a
			 * concurrent write against that previous extent map's
			 * range and this range started (we unlock the ranges
			 * in the io tree only when the bios complete and
			 * buffered writes always lock pages before attempting
			 * to lock range in the io tree).
			 */
			if (writing ||
			    test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
				btrfs_start_ordered_extent(ordered);
			else
				/* nowait was handled above, so this always yields -ENOTBLK. */
				ret = nowait ? -EAGAIN : -ENOTBLK;
			btrfs_put_ordered_extent(ordered);
		} else {
			/*
			 * We could trigger writeback for this range (and wait
			 * for it to complete) and then invalidate the pages for
			 * this range (through invalidate_inode_pages2_range()),
			 * but that can lead us to a deadlock with a concurrent
			 * call to readahead (a buffered read or a defrag call
			 * triggered a readahead) on a page lock due to an
			 * ordered dio extent we created before but did not have
			 * yet a corresponding bio submitted (whence it can not
			 * complete), which makes readahead wait for that
			 * ordered extent to complete while holding a lock on
			 * that page.
			 */
			ret = nowait ? -EAGAIN : -ENOTBLK;
		}

		if (ret)
			break;

		cond_resched();
	}

	/* Every failure path above already released the extent lock; drop the DIO lock too. */
	if (ret)
		btrfs_unlock_dio_extent(io_tree, lockstart, lockend, cached_state);
	return ret;
}
  125. static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
  126. struct btrfs_dio_data *dio_data,
  127. const u64 start,
  128. const struct btrfs_file_extent *file_extent,
  129. const int type)
  130. {
  131. struct extent_map *em = NULL;
  132. struct btrfs_ordered_extent *ordered;
  133. if (type != BTRFS_ORDERED_NOCOW) {
  134. em = btrfs_create_io_em(inode, start, file_extent, type);
  135. if (IS_ERR(em))
  136. goto out;
  137. }
  138. ordered = btrfs_alloc_ordered_extent(inode, start, file_extent,
  139. (1U << type) |
  140. (1U << BTRFS_ORDERED_DIRECT));
  141. if (IS_ERR(ordered)) {
  142. if (em) {
  143. btrfs_free_extent_map(em);
  144. btrfs_drop_extent_map_range(inode, start,
  145. start + file_extent->num_bytes - 1, false);
  146. }
  147. em = ERR_CAST(ordered);
  148. } else {
  149. ASSERT(!dio_data->ordered);
  150. dio_data->ordered = ordered;
  151. }
  152. out:
  153. return em;
  154. }
  155. static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
  156. struct btrfs_dio_data *dio_data,
  157. u64 start, u64 len)
  158. {
  159. struct btrfs_root *root = inode->root;
  160. struct btrfs_fs_info *fs_info = root->fs_info;
  161. struct btrfs_file_extent file_extent;
  162. struct extent_map *em;
  163. struct btrfs_key ins;
  164. u64 alloc_hint;
  165. int ret;
  166. alloc_hint = btrfs_get_extent_allocation_hint(inode, start, len);
  167. again:
  168. ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
  169. 0, alloc_hint, &ins, true, true);
  170. if (ret == -EAGAIN) {
  171. ASSERT(btrfs_is_zoned(fs_info));
  172. wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH,
  173. TASK_UNINTERRUPTIBLE);
  174. goto again;
  175. }
  176. if (ret)
  177. return ERR_PTR(ret);
  178. file_extent.disk_bytenr = ins.objectid;
  179. file_extent.disk_num_bytes = ins.offset;
  180. file_extent.num_bytes = ins.offset;
  181. file_extent.ram_bytes = ins.offset;
  182. file_extent.offset = 0;
  183. file_extent.compression = BTRFS_COMPRESS_NONE;
  184. em = btrfs_create_dio_extent(inode, dio_data, start, &file_extent,
  185. BTRFS_ORDERED_REGULAR);
  186. btrfs_dec_block_group_reservations(fs_info, ins.objectid);
  187. if (IS_ERR(em))
  188. btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true);
  189. return em;
  190. }
  191. static int btrfs_get_blocks_direct_write(struct extent_map **map,
  192. struct inode *inode,
  193. struct btrfs_dio_data *dio_data,
  194. u64 start, u64 *lenp,
  195. unsigned int iomap_flags)
  196. {
  197. const bool nowait = (iomap_flags & IOMAP_NOWAIT);
  198. struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
  199. struct btrfs_file_extent file_extent;
  200. struct extent_map *em = *map;
  201. int type;
  202. u64 block_start;
  203. struct btrfs_block_group *bg;
  204. bool can_nocow = false;
  205. bool space_reserved = false;
  206. u64 len = *lenp;
  207. u64 prev_len;
  208. int ret = 0;
  209. /*
  210. * We don't allocate a new extent in the following cases
  211. *
  212. * 1) The inode is marked as NODATACOW. In this case we'll just use the
  213. * existing extent.
  214. * 2) The extent is marked as PREALLOC. We're good to go here and can
  215. * just use the extent.
  216. *
  217. */
  218. if ((em->flags & EXTENT_FLAG_PREALLOC) ||
  219. ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
  220. em->disk_bytenr != EXTENT_MAP_HOLE)) {
  221. if (em->flags & EXTENT_FLAG_PREALLOC)
  222. type = BTRFS_ORDERED_PREALLOC;
  223. else
  224. type = BTRFS_ORDERED_NOCOW;
  225. len = min(len, em->len - (start - em->start));
  226. block_start = btrfs_extent_map_block_start(em) + (start - em->start);
  227. if (can_nocow_extent(BTRFS_I(inode), start, &len, &file_extent,
  228. false) == 1) {
  229. bg = btrfs_inc_nocow_writers(fs_info, block_start);
  230. if (bg)
  231. can_nocow = true;
  232. }
  233. }
  234. prev_len = len;
  235. if (can_nocow) {
  236. struct extent_map *em2;
  237. /* We can NOCOW, so only need to reserve metadata space. */
  238. ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
  239. nowait);
  240. if (ret < 0) {
  241. /* Our caller expects us to free the input extent map. */
  242. btrfs_free_extent_map(em);
  243. *map = NULL;
  244. btrfs_dec_nocow_writers(bg);
  245. if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
  246. ret = -EAGAIN;
  247. goto out;
  248. }
  249. space_reserved = true;
  250. em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start,
  251. &file_extent, type);
  252. btrfs_dec_nocow_writers(bg);
  253. if (type == BTRFS_ORDERED_PREALLOC) {
  254. btrfs_free_extent_map(em);
  255. *map = em2;
  256. em = em2;
  257. }
  258. if (IS_ERR(em2)) {
  259. ret = PTR_ERR(em2);
  260. goto out;
  261. }
  262. dio_data->nocow_done = true;
  263. } else {
  264. /* Our caller expects us to free the input extent map. */
  265. btrfs_free_extent_map(em);
  266. *map = NULL;
  267. if (nowait) {
  268. ret = -EAGAIN;
  269. goto out;
  270. }
  271. /*
  272. * If we could not allocate data space before locking the file
  273. * range and we can't do a NOCOW write, then we have to fail.
  274. */
  275. if (!dio_data->data_space_reserved) {
  276. ret = -ENOSPC;
  277. goto out;
  278. }
  279. /*
  280. * We have to COW and we have already reserved data space before,
  281. * so now we reserve only metadata.
  282. */
  283. ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
  284. false);
  285. if (ret < 0)
  286. goto out;
  287. space_reserved = true;
  288. em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len);
  289. if (IS_ERR(em)) {
  290. ret = PTR_ERR(em);
  291. goto out;
  292. }
  293. *map = em;
  294. len = min(len, em->len - (start - em->start));
  295. if (len < prev_len)
  296. btrfs_delalloc_release_metadata(BTRFS_I(inode),
  297. prev_len - len, true);
  298. }
  299. /*
  300. * We have created our ordered extent, so we can now release our reservation
  301. * for an outstanding extent.
  302. */
  303. btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len);
  304. /*
  305. * Need to update the i_size under the extent lock so buffered
  306. * readers will get the updated i_size when we unlock.
  307. */
  308. if (start + len > i_size_read(inode))
  309. i_size_write(inode, start + len);
  310. out:
  311. if (ret && space_reserved) {
  312. btrfs_delalloc_release_extents(BTRFS_I(inode), len);
  313. btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
  314. }
  315. *lenp = len;
  316. return ret;
  317. }
/*
 * iomap_begin callback for direct IO: lock the file range, look up (and for
 * writes, prepare) the extent at @start and describe it in @iomap.
 *
 * @inode:  inode the IO targets
 * @start:  file offset of the mapping request
 * @length: number of bytes requested
 * @flags:  iomap flags; IOMAP_WRITE and IOMAP_NOWAIT are inspected
 * @iomap:  mapping to fill in; it must be embedded in a struct iomap_iter
 *          whose ->private points to a struct btrfs_dio_data
 * @srcmap: not used by this function
 *
 * Returns 0 on success. Returns -ENOTBLK to request a fallback to buffered
 * IO, -EAGAIN when a NOWAIT request cannot proceed, or another negative errno
 * from the helpers called here.
 */
static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
		loff_t length, unsigned int flags, struct iomap *iomap,
		struct iomap *srcmap)
{
	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	struct extent_map *em;
	struct extent_state *cached_state = NULL;
	struct btrfs_dio_data *dio_data = iter->private;
	u64 lockstart, lockend;
	const bool write = !!(flags & IOMAP_WRITE);
	int ret = 0;
	u64 len = length;
	const u64 data_alloc_len = length;
	u32 unlock_bits = EXTENT_LOCKED;

	/*
	 * We could potentially fault if we have a buffer > PAGE_SIZE, and if
	 * we're NOWAIT we may submit a bio for a partial range and return
	 * EIOCBQUEUED, which would result in an errant short read.
	 *
	 * The best way to handle this would be to allow for partial completions
	 * of iocb's, so we could submit the partial bio, return and fault in
	 * the rest of the pages, and then submit the io for the rest of the
	 * range. However we don't have that currently, so simply return
	 * -EAGAIN at this point so that the normal path is used.
	 */
	if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE)
		return -EAGAIN;

	/*
	 * Cap the size of reads to that usually seen in buffered I/O as we need
	 * to allocate a contiguous array for the checksums.
	 */
	if (!write)
		len = min_t(u64, len, fs_info->sectorsize * BIO_MAX_VECS);

	lockstart = start;
	lockend = start + len - 1;

	/*
	 * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't
	 * enough if we've written compressed pages to this area, so we need to
	 * flush the dirty pages again to make absolutely sure that any
	 * outstanding dirty pages are on disk - the first flush only starts
	 * compression on the data, while keeping the pages locked, so by the
	 * time the second flush returns we know bios for the compressed pages
	 * were submitted and finished, and the pages no longer under writeback.
	 *
	 * If we have a NOWAIT request and we have any pages in the range that
	 * are locked, likely due to compression still in progress, we don't want
	 * to block on page locks. We also don't want to block on pages marked as
	 * dirty or under writeback (same as for the non-compression case).
	 * iomap_dio_rw() did the same check, but after that and before we got
	 * here, mmap'ed writes may have happened or buffered reads started
	 * (readpage() and readahead(), which lock pages), as we haven't locked
	 * the file range yet.
	 */
	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
		     &BTRFS_I(inode)->runtime_flags)) {
		if (flags & IOMAP_NOWAIT) {
			if (filemap_range_needs_writeback(inode->i_mapping,
							  lockstart, lockend))
				return -EAGAIN;
		} else {
			ret = filemap_fdatawrite_range(inode->i_mapping, start,
						       start + length - 1);
			if (ret)
				return ret;
		}
	}

	/* Start every mapping from clean per-call state. */
	memset(dio_data, 0, sizeof(*dio_data));

	/*
	 * We always try to allocate data space and must do it before locking
	 * the file range, to avoid deadlocks with concurrent writes to the same
	 * range if the range has several extents and the writes don't expand the
	 * current i_size (the inode lock is taken in shared mode). If we fail to
	 * allocate data space here we continue and later, after locking the
	 * file range, we fail with ENOSPC only if we figure out we can not do a
	 * NOCOW write.
	 */
	if (write && !(flags & IOMAP_NOWAIT)) {
		ret = btrfs_check_data_free_space(BTRFS_I(inode),
						  &dio_data->data_reserved,
						  start, data_alloc_len, false);
		if (!ret)
			dio_data->data_space_reserved = true;
		else if (!(BTRFS_I(inode)->flags &
			   (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
			goto err;
	}

	/*
	 * If this errors out it's because we couldn't invalidate pagecache for
	 * this range and we need to fallback to buffered IO, or we are doing a
	 * NOWAIT read/write and we need to block.
	 */
	ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags);
	if (ret < 0)
		goto err;

	em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len);
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		goto unlock_err;
	}

	/*
	 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
	 * io. INLINE is special, and we could probably kludge it in here, but
	 * it's still buffered so for safety lets just fall back to the generic
	 * buffered path.
	 *
	 * For COMPRESSED we _have_ to read the entire extent in so we can
	 * decompress it, so there will be buffering required no matter what we
	 * do, so go ahead and fallback to buffered.
	 *
	 * We return -ENOTBLK because that's what makes DIO go ahead and go back
	 * to buffered IO. Don't blame me, this is the price we pay for using
	 * the generic code.
	 */
	if (btrfs_extent_map_is_compressed(em) || em->disk_bytenr == EXTENT_MAP_INLINE) {
		btrfs_free_extent_map(em);
		/*
		 * If we are in a NOWAIT context, return -EAGAIN in order to
		 * fallback to buffered IO. This is not only because we can
		 * block with buffered IO (no support for NOWAIT semantics at
		 * the moment) but also to avoid returning short reads to user
		 * space - this happens if we were able to read some data from
		 * previous non-compressed extents and then when we fallback to
		 * buffered IO, at btrfs_file_read_iter() by calling
		 * filemap_read(), we fail to fault in pages for the read buffer,
		 * in which case filemap_read() returns a short read (the number
		 * of bytes previously read is > 0, so it does not return -EFAULT).
		 */
		ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK;
		goto unlock_err;
	}

	/* Never map past the end of the extent we found. */
	len = min(len, em->len - (start - em->start));

	/*
	 * If we have a NOWAIT request and the range contains multiple extents
	 * (or a mix of extents and holes), then we return -EAGAIN to make the
	 * caller fallback to a context where it can do a blocking (without
	 * NOWAIT) request. This way we avoid doing partial IO and returning
	 * success to the caller, which is not optimal for writes and for reads
	 * it can result in unexpected behaviour for an application.
	 *
	 * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
	 * iomap_dio_rw(), we can end up returning less data than what the caller
	 * asked for, resulting in an unexpected, and incorrect, short read.
	 * That is, the caller asked to read N bytes and we return less than that,
	 * which is wrong unless we are crossing EOF. This happens if we get a
	 * page fault error when trying to fault in pages for the buffer that is
	 * associated to the struct iov_iter passed to iomap_dio_rw(), and we
	 * have previously submitted bios for other extents in the range, in
	 * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
	 * those bios have completed by the time we get the page fault error,
	 * which we return back to our caller - we should only return EIOCBQUEUED
	 * after we have submitted bios for all the extents in the range.
	 */
	if ((flags & IOMAP_NOWAIT) && len < length) {
		btrfs_free_extent_map(em);
		ret = -EAGAIN;
		goto unlock_err;
	}

	if (write) {
		ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
						    start, &len, flags);
		if (ret < 0)
			goto unlock_err;
		/* Recalc len in case the new em is smaller than requested */
		len = min(len, em->len - (start - em->start));
		if (dio_data->data_space_reserved) {
			u64 release_offset;
			u64 release_len = 0;

			/*
			 * A NOCOW write needs none of the data space reserved
			 * up front; a COW write only needs the first len bytes.
			 */
			if (dio_data->nocow_done) {
				release_offset = start;
				release_len = data_alloc_len;
			} else if (len < data_alloc_len) {
				release_offset = start + len;
				release_len = data_alloc_len - len;
			}

			if (release_len > 0)
				btrfs_free_reserved_data_space(BTRFS_I(inode),
							       dio_data->data_reserved,
							       release_offset,
							       release_len);
		}
	}

	/*
	 * Translate extent map information to iomap.
	 * We trim the extents (and move the addr) even though iomap code does
	 * that, since we have locked only the parts we are performing I/O in.
	 */
	if ((em->disk_bytenr == EXTENT_MAP_HOLE) ||
	    ((em->flags & EXTENT_FLAG_PREALLOC) && !write)) {
		iomap->addr = IOMAP_NULL_ADDR;
		iomap->type = IOMAP_HOLE;
	} else {
		iomap->addr = btrfs_extent_map_block_start(em) + (start - em->start);
		iomap->type = IOMAP_MAPPED;
	}
	iomap->offset = start;
	iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
	iomap->length = len;
	btrfs_free_extent_map(em);

	/*
	 * Reads will hold the EXTENT_DIO_LOCKED bit until the io is completed,
	 * writes only hold it for this part. We hold the extent lock until
	 * we're completely done with the extent map to make sure it remains
	 * valid.
	 */
	if (write)
		unlock_bits |= EXTENT_DIO_LOCKED;

	btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
			       unlock_bits, &cached_state);

	/* We didn't use everything, unlock the dio extent for the remainder. */
	if (!write && (start + len) < lockend)
		btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, start + len,
					lockend, NULL);

	return 0;

unlock_err:
	/*
	 * Don't use EXTENT_LOCK_BITS here in case we extend it later and forget
	 * to update this, be explicit that we expect EXTENT_LOCKED and
	 * EXTENT_DIO_LOCKED to be set here, and so that's what we're clearing.
	 */
	btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
			       EXTENT_LOCKED | EXTENT_DIO_LOCKED, &cached_state);
err:
	/* Give back the data space reserved before the range was locked. */
	if (dio_data->data_space_reserved) {
		btrfs_free_reserved_data_space(BTRFS_I(inode),
					       dio_data->data_reserved,
					       start, data_alloc_len);
		extent_changeset_free(dio_data->data_reserved);
	}

	return ret;
}
  549. static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
  550. ssize_t written, unsigned int flags, struct iomap *iomap)
  551. {
  552. struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
  553. struct btrfs_dio_data *dio_data = iter->private;
  554. size_t submitted = dio_data->submitted;
  555. const bool write = !!(flags & IOMAP_WRITE);
  556. int ret = 0;
  557. if (!write && (iomap->type == IOMAP_HOLE)) {
  558. /* If reading from a hole, unlock and return */
  559. btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
  560. pos + length - 1, NULL);
  561. return 0;
  562. }
  563. if (submitted < length) {
  564. pos += submitted;
  565. length -= submitted;
  566. if (write)
  567. btrfs_finish_ordered_extent(dio_data->ordered, NULL,
  568. pos, length, false);
  569. else
  570. btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
  571. pos + length - 1, NULL);
  572. ret = -ENOTBLK;
  573. }
  574. if (write) {
  575. btrfs_put_ordered_extent(dio_data->ordered);
  576. dio_data->ordered = NULL;
  577. }
  578. if (write)
  579. extent_changeset_free(dio_data->data_reserved);
  580. return ret;
  581. }
  582. static void btrfs_dio_end_io(struct btrfs_bio *bbio)
  583. {
  584. struct btrfs_dio_private *dip =
  585. container_of(bbio, struct btrfs_dio_private, bbio);
  586. struct btrfs_inode *inode = bbio->inode;
  587. struct bio *bio = &bbio->bio;
  588. if (bio->bi_status) {
  589. btrfs_warn(inode->root->fs_info,
  590. "direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d",
  591. btrfs_ino(inode), bio->bi_opf,
  592. dip->file_offset, dip->bytes, bio->bi_status);
  593. }
  594. if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
  595. btrfs_finish_ordered_extent(bbio->ordered, NULL,
  596. dip->file_offset, dip->bytes,
  597. !bio->bi_status);
  598. } else {
  599. btrfs_unlock_dio_extent(&inode->io_tree, dip->file_offset,
  600. dip->file_offset + dip->bytes - 1, NULL);
  601. }
  602. bbio->bio.bi_private = bbio->private;
  603. iomap_dio_bio_end_io(bio);
  604. }
  605. static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
  606. struct btrfs_ordered_extent *ordered)
  607. {
  608. u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
  609. u64 len = bbio->bio.bi_iter.bi_size;
  610. struct btrfs_ordered_extent *new;
  611. int ret;
  612. /* Must always be called for the beginning of an ordered extent. */
  613. if (WARN_ON_ONCE(start != ordered->disk_bytenr))
  614. return -EINVAL;
  615. /* No need to split if the ordered extent covers the entire bio. */
  616. if (ordered->disk_num_bytes == len) {
  617. refcount_inc(&ordered->refs);
  618. bbio->ordered = ordered;
  619. return 0;
  620. }
  621. /*
  622. * Don't split the extent_map for NOCOW extents, as we're writing into
  623. * a pre-existing one.
  624. */
  625. if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
  626. ret = btrfs_split_extent_map(bbio->inode, bbio->file_offset,
  627. ordered->num_bytes, len,
  628. ordered->disk_bytenr);
  629. if (ret)
  630. return ret;
  631. }
  632. new = btrfs_split_ordered_extent(ordered, len);
  633. if (IS_ERR(new))
  634. return PTR_ERR(new);
  635. bbio->ordered = new;
  636. return 0;
  637. }
  638. static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
  639. loff_t file_offset)
  640. {
  641. struct btrfs_bio *bbio = btrfs_bio(bio);
  642. struct btrfs_dio_private *dip =
  643. container_of(bbio, struct btrfs_dio_private, bbio);
  644. struct btrfs_dio_data *dio_data = iter->private;
  645. btrfs_bio_init(bbio, BTRFS_I(iter->inode), file_offset,
  646. btrfs_dio_end_io, bio->bi_private);
  647. dip->file_offset = file_offset;
  648. dip->bytes = bio->bi_iter.bi_size;
  649. dio_data->submitted += bio->bi_iter.bi_size;
  650. /*
  651. * Check if we are doing a partial write. If we are, we need to split
  652. * the ordered extent to match the submitted bio. Hang on to the
  653. * remaining unfinishable ordered_extent in dio_data so that it can be
  654. * cancelled in iomap_end to avoid a deadlock wherein faulting the
  655. * remaining pages is blocked on the outstanding ordered extent.
  656. */
  657. if (iter->flags & IOMAP_WRITE) {
  658. int ret;
  659. ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered);
  660. if (ret) {
  661. btrfs_finish_ordered_extent(dio_data->ordered, NULL,
  662. file_offset, dip->bytes,
  663. !ret);
  664. bio->bi_status = errno_to_blk_status(ret);
  665. iomap_dio_bio_end_io(bio);
  666. return;
  667. }
  668. }
  669. btrfs_submit_bbio(bbio, 0);
  670. }
  671. static const struct iomap_ops btrfs_dio_iomap_ops = {
  672. .iomap_begin = btrfs_dio_iomap_begin,
  673. .iomap_end = btrfs_dio_iomap_end,
  674. };
  675. static const struct iomap_dio_ops btrfs_dio_ops = {
  676. .submit_io = btrfs_dio_submit_io,
  677. .bio_set = &btrfs_dio_bioset,
  678. };
  679. static ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter,
  680. size_t done_before)
  681. {
  682. struct btrfs_dio_data data = { 0 };
  683. return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
  684. IOMAP_DIO_PARTIAL | IOMAP_DIO_FSBLOCK_ALIGNED, &data, done_before);
  685. }
  686. static struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
  687. size_t done_before)
  688. {
  689. struct btrfs_dio_data data = { 0 };
  690. return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
  691. IOMAP_DIO_PARTIAL | IOMAP_DIO_FSBLOCK_ALIGNED, &data, done_before);
  692. }
  693. static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
  694. const struct iov_iter *iter, loff_t offset)
  695. {
  696. const u32 blocksize_mask = fs_info->sectorsize - 1;
  697. if (offset & blocksize_mask)
  698. return -EINVAL;
  699. if (iov_iter_alignment(iter) & blocksize_mask)
  700. return -EINVAL;
  701. return 0;
  702. }
  703. ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
  704. {
  705. struct file *file = iocb->ki_filp;
  706. struct inode *inode = file_inode(file);
  707. struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
  708. loff_t pos;
  709. ssize_t written = 0;
  710. ssize_t written_buffered;
  711. size_t prev_left = 0;
  712. loff_t endbyte;
  713. ssize_t ret;
  714. unsigned int ilock_flags = 0;
  715. struct iomap_dio *dio;
  716. const u64 data_profile = btrfs_data_alloc_profile(fs_info) &
  717. BTRFS_BLOCK_GROUP_PROFILE_MASK;
  718. if (iocb->ki_flags & IOCB_NOWAIT)
  719. ilock_flags |= BTRFS_ILOCK_TRY;
  720. /*
  721. * If the write DIO is within EOF, use a shared lock and also only if
  722. * security bits will likely not be dropped by file_remove_privs() called
  723. * from btrfs_write_check(). Either will need to be rechecked after the
  724. * lock was acquired.
  725. */
  726. if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode))
  727. ilock_flags |= BTRFS_ILOCK_SHARED;
  728. /*
  729. * If our data profile has duplication (either extra mirrors or RAID56),
  730. * we can not trust the direct IO buffer, the content may change during
  731. * writeback and cause different contents written to different mirrors.
  732. *
  733. * Thus only RAID0 and SINGLE can go true zero-copy direct IO.
  734. */
  735. if (data_profile != BTRFS_BLOCK_GROUP_RAID0 && data_profile != 0)
  736. goto buffered;
  737. relock:
  738. ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
  739. if (ret < 0)
  740. return ret;
  741. /* Shared lock cannot be used with security bits set. */
  742. if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) {
  743. btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
  744. ilock_flags &= ~BTRFS_ILOCK_SHARED;
  745. goto relock;
  746. }
  747. ret = generic_write_checks(iocb, from);
  748. if (ret <= 0) {
  749. btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
  750. return ret;
  751. }
  752. ret = btrfs_write_check(iocb, ret);
  753. if (ret < 0) {
  754. btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
  755. goto out;
  756. }
  757. pos = iocb->ki_pos;
  758. /*
  759. * Re-check since file size may have changed just before taking the
  760. * lock or pos may have changed because of O_APPEND in generic_write_check()
  761. */
  762. if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
  763. pos + iov_iter_count(from) > i_size_read(inode)) {
  764. btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
  765. ilock_flags &= ~BTRFS_ILOCK_SHARED;
  766. goto relock;
  767. }
  768. if (check_direct_IO(fs_info, from, pos)) {
  769. btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
  770. goto buffered;
  771. }
  772. /*
  773. * We can't control the folios being passed in, applications can write
  774. * to them while a direct IO write is in progress. This means the
  775. * content might change after we calculated the data checksum.
  776. * Therefore we can end up storing a checksum that doesn't match the
  777. * persisted data.
  778. *
  779. * To be extra safe and avoid false data checksum mismatch, if the
  780. * inode requires data checksum, just fallback to buffered IO.
  781. * For buffered IO we have full control of page cache and can ensure
  782. * no one is modifying the content during writeback.
  783. */
  784. if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
  785. btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
  786. goto buffered;
  787. }
  788. /*
  789. * The iov_iter can be mapped to the same file range we are writing to.
  790. * If that's the case, then we will deadlock in the iomap code, because
  791. * it first calls our callback btrfs_dio_iomap_begin(), which will create
  792. * an ordered extent, and after that it will fault in the pages that the
  793. * iov_iter refers to. During the fault in we end up in the readahead
  794. * pages code (starting at btrfs_readahead()), which will lock the range,
  795. * find that ordered extent and then wait for it to complete (at
  796. * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
  797. * obviously the ordered extent can never complete as we didn't submit
  798. * yet the respective bio(s). This always happens when the buffer is
  799. * memory mapped to the same file range, since the iomap DIO code always
  800. * invalidates pages in the target file range (after starting and waiting
  801. * for any writeback).
  802. *
  803. * So here we disable page faults in the iov_iter and then retry if we
  804. * got -EFAULT, faulting in the pages before the retry.
  805. */
  806. again:
  807. from->nofault = true;
  808. dio = btrfs_dio_write(iocb, from, written);
  809. from->nofault = false;
  810. if (IS_ERR_OR_NULL(dio)) {
  811. ret = PTR_ERR_OR_ZERO(dio);
  812. } else {
  813. /*
  814. * If we have a synchronous write, we must make sure the fsync
  815. * triggered by the iomap_dio_complete() call below doesn't
  816. * deadlock on the inode lock - we are already holding it and we
  817. * can't call it after unlocking because we may need to complete
  818. * partial writes due to the input buffer (or parts of it) not
  819. * being already faulted in.
  820. */
  821. ASSERT(current->journal_info == NULL);
  822. current->journal_info = BTRFS_TRANS_DIO_WRITE_STUB;
  823. ret = iomap_dio_complete(dio);
  824. current->journal_info = NULL;
  825. }
  826. /* No increment (+=) because iomap returns a cumulative value. */
  827. if (ret > 0)
  828. written = ret;
  829. if (iov_iter_count(from) > 0 && (ret == -EFAULT || ret > 0)) {
  830. const size_t left = iov_iter_count(from);
  831. /*
  832. * We have more data left to write. Try to fault in as many as
  833. * possible of the remainder pages and retry. We do this without
  834. * releasing and locking again the inode, to prevent races with
  835. * truncate.
  836. *
  837. * Also, in case the iov refers to pages in the file range of the
  838. * file we want to write to (due to a mmap), we could enter an
  839. * infinite loop if we retry after faulting the pages in, since
  840. * iomap will invalidate any pages in the range early on, before
  841. * it tries to fault in the pages of the iov. So we keep track of
  842. * how much was left of iov in the previous EFAULT and fallback
  843. * to buffered IO in case we haven't made any progress.
  844. */
  845. if (left == prev_left) {
  846. ret = -ENOTBLK;
  847. } else {
  848. fault_in_iov_iter_readable(from, left);
  849. prev_left = left;
  850. goto again;
  851. }
  852. }
  853. btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
  854. /*
  855. * If 'ret' is -ENOTBLK or we have not written all data, then it means
  856. * we must fallback to buffered IO.
  857. */
  858. if ((ret < 0 && ret != -ENOTBLK) || !iov_iter_count(from))
  859. goto out;
  860. buffered:
  861. /*
  862. * If we are in a NOWAIT context, then return -EAGAIN to signal the caller
  863. * it must retry the operation in a context where blocking is acceptable,
  864. * because even if we end up not blocking during the buffered IO attempt
  865. * below, we will block when flushing and waiting for the IO.
  866. */
  867. if (iocb->ki_flags & IOCB_NOWAIT) {
  868. ret = -EAGAIN;
  869. goto out;
  870. }
  871. pos = iocb->ki_pos;
  872. written_buffered = btrfs_buffered_write(iocb, from);
  873. if (written_buffered < 0) {
  874. ret = written_buffered;
  875. goto out;
  876. }
  877. /*
  878. * Ensure all data is persisted. We want the next direct IO read to be
  879. * able to read what was just written.
  880. */
  881. endbyte = pos + written_buffered - 1;
  882. ret = btrfs_fdatawrite_range(BTRFS_I(inode), pos, endbyte);
  883. if (ret)
  884. goto out;
  885. ret = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
  886. if (ret)
  887. goto out;
  888. written += written_buffered;
  889. iocb->ki_pos = pos + written_buffered;
  890. invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
  891. endbyte >> PAGE_SHIFT);
  892. out:
  893. return ret < 0 ? ret : written;
  894. }
  895. static int check_direct_read(struct btrfs_fs_info *fs_info,
  896. const struct iov_iter *iter, loff_t offset)
  897. {
  898. int ret;
  899. int i, seg;
  900. ret = check_direct_IO(fs_info, iter, offset);
  901. if (ret < 0)
  902. return ret;
  903. if (!iter_is_iovec(iter))
  904. return 0;
  905. for (seg = 0; seg < iter->nr_segs; seg++) {
  906. for (i = seg + 1; i < iter->nr_segs; i++) {
  907. const struct iovec *iov1 = iter_iov(iter) + seg;
  908. const struct iovec *iov2 = iter_iov(iter) + i;
  909. if (iov1->iov_base == iov2->iov_base)
  910. return -EINVAL;
  911. }
  912. }
  913. return 0;
  914. }
  915. ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
  916. {
  917. struct inode *inode = file_inode(iocb->ki_filp);
  918. size_t prev_left = 0;
  919. ssize_t read = 0;
  920. ssize_t ret;
  921. if (fsverity_active(inode))
  922. return 0;
  923. if (check_direct_read(inode_to_fs_info(inode), to, iocb->ki_pos))
  924. return 0;
  925. btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
  926. again:
  927. /*
  928. * This is similar to what we do for direct IO writes, see the comment
  929. * at btrfs_direct_write(), but we also disable page faults in addition
  930. * to disabling them only at the iov_iter level. This is because when
  931. * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
  932. * which can still trigger page fault ins despite having set ->nofault
  933. * to true of our 'to' iov_iter.
  934. *
  935. * The difference to direct IO writes is that we deadlock when trying
  936. * to lock the extent range in the inode's tree during he page reads
  937. * triggered by the fault in (while for writes it is due to waiting for
  938. * our own ordered extent). This is because for direct IO reads,
  939. * btrfs_dio_iomap_begin() returns with the extent range locked, which
  940. * is only unlocked in the endio callback (end_bio_extent_readpage()).
  941. */
  942. pagefault_disable();
  943. to->nofault = true;
  944. ret = btrfs_dio_read(iocb, to, read);
  945. to->nofault = false;
  946. pagefault_enable();
  947. /* No increment (+=) because iomap returns a cumulative value. */
  948. if (ret > 0)
  949. read = ret;
  950. if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
  951. const size_t left = iov_iter_count(to);
  952. if (left == prev_left) {
  953. /*
  954. * We didn't make any progress since the last attempt,
  955. * fallback to a buffered read for the remainder of the
  956. * range. This is just to avoid any possibility of looping
  957. * for too long.
  958. */
  959. ret = read;
  960. } else {
  961. /*
  962. * We made some progress since the last retry or this is
  963. * the first time we are retrying. Fault in as many pages
  964. * as possible and retry.
  965. */
  966. fault_in_iov_iter_writeable(to, left);
  967. prev_left = left;
  968. goto again;
  969. }
  970. }
  971. btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
  972. return ret < 0 ? ret : read;
  973. }
  974. int __init btrfs_init_dio(void)
  975. {
  976. if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
  977. offsetof(struct btrfs_dio_private, bbio.bio),
  978. BIOSET_NEED_BVECS))
  979. return -ENOMEM;
  980. return 0;
  981. }
  982. void __cold btrfs_destroy_dio(void)
  983. {
  984. bioset_exit(&btrfs_dio_bioset);
  985. }