reflink.c 29 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930
  1. // SPDX-License-Identifier: GPL-2.0
  2. #include <linux/blkdev.h>
  3. #include <linux/fscrypt.h>
  4. #include <linux/iversion.h>
  5. #include "ctree.h"
  6. #include "fs.h"
  7. #include "messages.h"
  8. #include "compression.h"
  9. #include "delalloc-space.h"
  10. #include "disk-io.h"
  11. #include "reflink.h"
  12. #include "transaction.h"
  13. #include "subpage.h"
  14. #include "accessors.h"
  15. #include "file-item.h"
  16. #include "file.h"
  17. #include "super.h"
  /* Largest length, in bytes, handed to a single btrfs_extent_same_range() call. */
#define BTRFS_MAX_DEDUPE_LEN	SZ_16M
  19. static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
  20. struct inode *inode,
  21. u64 endoff,
  22. const u64 destoff,
  23. const u64 olen,
  24. bool no_time_update)
  25. {
  26. int ret;
  27. inode_inc_iversion(inode);
  28. if (!no_time_update) {
  29. inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
  30. }
  31. /*
  32. * We round up to the block size at eof when determining which
  33. * extents to clone above, but shouldn't round up the file size.
  34. */
  35. if (endoff > destoff + olen)
  36. endoff = destoff + olen;
  37. if (endoff > inode->i_size) {
  38. i_size_write(inode, endoff);
  39. btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
  40. }
  41. ret = btrfs_update_inode(trans, BTRFS_I(inode));
  42. if (unlikely(ret)) {
  43. btrfs_abort_transaction(trans, ret);
  44. btrfs_end_transaction(trans);
  45. return ret;
  46. }
  47. return btrfs_end_transaction(trans);
  48. }
  49. static int copy_inline_to_page(struct btrfs_inode *inode,
  50. const u64 file_offset,
  51. char *inline_data,
  52. const u64 size,
  53. const u64 datal,
  54. const u8 comp_type)
  55. {
  56. struct btrfs_fs_info *fs_info = inode->root->fs_info;
  57. const u32 block_size = fs_info->sectorsize;
  58. const u64 range_end = file_offset + block_size - 1;
  59. const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0);
  60. char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0);
  61. struct extent_changeset *data_reserved = NULL;
  62. struct folio *folio = NULL;
  63. struct address_space *mapping = inode->vfs_inode.i_mapping;
  64. int ret;
  65. ASSERT(IS_ALIGNED(file_offset, block_size));
  66. /*
  67. * We have flushed and locked the ranges of the source and destination
  68. * inodes, we also have locked the inodes, so we are safe to do a
  69. * reservation here. Also we must not do the reservation while holding
  70. * a transaction open, otherwise we would deadlock.
  71. */
  72. ret = btrfs_delalloc_reserve_space(inode, &data_reserved, file_offset,
  73. block_size);
  74. if (ret)
  75. goto out;
  76. folio = __filemap_get_folio(mapping, file_offset >> PAGE_SHIFT,
  77. FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
  78. btrfs_alloc_write_mask(mapping));
  79. if (IS_ERR(folio)) {
  80. ret = PTR_ERR(folio);
  81. goto out_unlock;
  82. }
  83. ret = set_folio_extent_mapped(folio);
  84. if (ret < 0)
  85. goto out_unlock;
  86. btrfs_clear_extent_bit(&inode->io_tree, file_offset, range_end,
  87. EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, NULL);
  88. ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL);
  89. if (ret)
  90. goto out_unlock;
  91. /*
  92. * After dirtying the page our caller will need to start a transaction,
  93. * and if we are low on metadata free space, that can cause flushing of
  94. * delalloc for all inodes in order to get metadata space released.
  95. * However we are holding the range locked for the whole duration of
  96. * the clone/dedupe operation, so we may deadlock if that happens and no
  97. * other task releases enough space. So mark this inode as not being
  98. * possible to flush to avoid such deadlock. We will clear that flag
  99. * when we finish cloning all extents, since a transaction is started
  100. * after finding each extent to clone.
  101. */
  102. set_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags);
  103. if (comp_type == BTRFS_COMPRESS_NONE) {
  104. memcpy_to_folio(folio, offset_in_folio(folio, file_offset), data_start,
  105. datal);
  106. } else {
  107. ret = btrfs_decompress(comp_type, data_start, folio,
  108. offset_in_folio(folio, file_offset),
  109. inline_size, datal);
  110. if (ret)
  111. goto out_unlock;
  112. flush_dcache_folio(folio);
  113. }
  114. /*
  115. * If our inline data is smaller then the block/page size, then the
  116. * remaining of the block/page is equivalent to zeroes. We had something
  117. * like the following done:
  118. *
  119. * $ xfs_io -f -c "pwrite -S 0xab 0 500" file
  120. * $ sync # (or fsync)
  121. * $ xfs_io -c "falloc 0 4K" file
  122. * $ xfs_io -c "pwrite -S 0xcd 4K 4K"
  123. *
  124. * So what's in the range [500, 4095] corresponds to zeroes.
  125. */
  126. if (datal < block_size)
  127. folio_zero_range(folio, datal, block_size - datal);
  128. btrfs_folio_set_uptodate(fs_info, folio, file_offset, block_size);
  129. btrfs_folio_clear_checked(fs_info, folio, file_offset, block_size);
  130. btrfs_folio_set_dirty(fs_info, folio, file_offset, block_size);
  131. out_unlock:
  132. if (!IS_ERR(folio)) {
  133. folio_unlock(folio);
  134. folio_put(folio);
  135. }
  136. if (ret)
  137. btrfs_delalloc_release_space(inode, data_reserved, file_offset,
  138. block_size, true);
  139. btrfs_delalloc_release_extents(inode, block_size);
  140. out:
  141. extent_changeset_free(data_reserved);
  142. return ret;
  143. }
  144. /*
  145. * Deal with cloning of inline extents. We try to copy the inline extent from
  146. * the source inode to destination inode when possible. When not possible we
  147. * copy the inline extent's data into the respective page of the inode.
  148. */
  149. static int clone_copy_inline_extent(struct btrfs_inode *inode,
  150. struct btrfs_path *path,
  151. struct btrfs_key *new_key,
  152. const u64 drop_start,
  153. const u64 datal,
  154. const u64 size,
  155. const u8 comp_type,
  156. char *inline_data,
  157. struct btrfs_trans_handle **trans_out)
  158. {
  159. struct btrfs_root *root = inode->root;
  160. struct btrfs_fs_info *fs_info = root->fs_info;
  161. const u64 aligned_end = ALIGN(new_key->offset + datal,
  162. fs_info->sectorsize);
  163. struct btrfs_trans_handle *trans = NULL;
  164. struct btrfs_drop_extents_args drop_args = { 0 };
  165. int ret;
  166. struct btrfs_key key;
  167. if (new_key->offset > 0) {
  168. ret = copy_inline_to_page(inode, new_key->offset,
  169. inline_data, size, datal, comp_type);
  170. goto out;
  171. }
  172. key.objectid = btrfs_ino(inode);
  173. key.type = BTRFS_EXTENT_DATA_KEY;
  174. key.offset = 0;
  175. ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
  176. if (ret < 0) {
  177. return ret;
  178. } else if (ret > 0) {
  179. if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
  180. ret = btrfs_next_leaf(root, path);
  181. if (ret < 0)
  182. return ret;
  183. else if (ret > 0)
  184. goto copy_inline_extent;
  185. }
  186. btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
  187. if (key.objectid == btrfs_ino(inode) &&
  188. key.type == BTRFS_EXTENT_DATA_KEY) {
  189. /*
  190. * There's an implicit hole at file offset 0, copy the
  191. * inline extent's data to the page.
  192. */
  193. ASSERT(key.offset > 0);
  194. goto copy_to_page;
  195. }
  196. } else if (i_size_read(&inode->vfs_inode) <= datal) {
  197. struct btrfs_file_extent_item *ei;
  198. ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
  199. struct btrfs_file_extent_item);
  200. /*
  201. * If it's an inline extent replace it with the source inline
  202. * extent, otherwise copy the source inline extent data into
  203. * the respective page at the destination inode.
  204. */
  205. if (btrfs_file_extent_type(path->nodes[0], ei) ==
  206. BTRFS_FILE_EXTENT_INLINE)
  207. goto copy_inline_extent;
  208. goto copy_to_page;
  209. }
  210. copy_inline_extent:
  211. /*
  212. * We have no extent items, or we have an extent at offset 0 which may
  213. * or may not be inlined. All these cases are dealt the same way.
  214. */
  215. if (i_size_read(&inode->vfs_inode) > datal) {
  216. /*
  217. * At the destination offset 0 we have either a hole, a regular
  218. * extent or an inline extent larger then the one we want to
  219. * clone. Deal with all these cases by copying the inline extent
  220. * data into the respective page at the destination inode.
  221. */
  222. goto copy_to_page;
  223. }
  224. /*
  225. * Release path before starting a new transaction so we don't hold locks
  226. * that would confuse lockdep.
  227. */
  228. btrfs_release_path(path);
  229. /*
  230. * If we end up here it means were copy the inline extent into a leaf
  231. * of the destination inode. We know we will drop or adjust at most one
  232. * extent item in the destination root.
  233. *
  234. * 1 unit - adjusting old extent (we may have to split it)
  235. * 1 unit - add new extent
  236. * 1 unit - inode update
  237. */
  238. trans = btrfs_start_transaction(root, 3);
  239. if (IS_ERR(trans)) {
  240. ret = PTR_ERR(trans);
  241. trans = NULL;
  242. goto out;
  243. }
  244. drop_args.path = path;
  245. drop_args.start = drop_start;
  246. drop_args.end = aligned_end;
  247. drop_args.drop_cache = true;
  248. ret = btrfs_drop_extents(trans, root, inode, &drop_args);
  249. if (unlikely(ret)) {
  250. btrfs_abort_transaction(trans, ret);
  251. goto out;
  252. }
  253. ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
  254. if (unlikely(ret)) {
  255. btrfs_abort_transaction(trans, ret);
  256. goto out;
  257. }
  258. write_extent_buffer(path->nodes[0], inline_data,
  259. btrfs_item_ptr_offset(path->nodes[0],
  260. path->slots[0]),
  261. size);
  262. btrfs_update_inode_bytes(inode, datal, drop_args.bytes_found);
  263. btrfs_set_inode_full_sync(inode);
  264. ret = btrfs_inode_set_file_extent_range(inode, 0, aligned_end);
  265. if (unlikely(ret))
  266. btrfs_abort_transaction(trans, ret);
  267. out:
  268. if (!ret && !trans) {
  269. /*
  270. * No transaction here means we copied the inline extent into a
  271. * page of the destination inode.
  272. *
  273. * 1 unit to update inode item
  274. */
  275. trans = btrfs_start_transaction(root, 1);
  276. if (IS_ERR(trans)) {
  277. ret = PTR_ERR(trans);
  278. trans = NULL;
  279. }
  280. }
  281. if (ret && trans)
  282. btrfs_end_transaction(trans);
  283. if (!ret)
  284. *trans_out = trans;
  285. return ret;
  286. copy_to_page:
  287. /*
  288. * Release our path because we don't need it anymore and also because
  289. * copy_inline_to_page() needs to reserve data and metadata, which may
  290. * need to flush delalloc when we are low on available space and
  291. * therefore cause a deadlock if writeback of an inline extent needs to
  292. * write to the same leaf or an ordered extent completion needs to write
  293. * to the same leaf.
  294. */
  295. btrfs_release_path(path);
  296. ret = copy_inline_to_page(inode, new_key->offset,
  297. inline_data, size, datal, comp_type);
  298. goto out;
  299. }
  300. /*
  301. * Clone a range from inode file to another.
  302. *
  303. * @src: Inode to clone from
  304. * @inode: Inode to clone to
  305. * @off: Offset within source to start clone from
  306. * @olen: Original length, passed by user, of range to clone
  307. * @olen_aligned: Block-aligned value of olen
  308. * @destoff: Offset within @inode to start clone
  309. * @no_time_update: Whether to update mtime/ctime on the target inode
  310. */
  311. static int btrfs_clone(struct inode *src, struct inode *inode,
  312. const u64 off, const u64 olen, const u64 olen_aligned,
  313. const u64 destoff, bool no_time_update)
  314. {
  315. struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
  316. BTRFS_PATH_AUTO_FREE(path);
  317. struct extent_buffer *leaf;
  318. struct btrfs_trans_handle *trans;
  319. char AUTO_KVFREE(buf);
  320. struct btrfs_key key;
  321. u32 nritems;
  322. int slot;
  323. int ret;
  324. const u64 len = olen_aligned;
  325. u64 last_dest_end = destoff;
  326. u64 prev_extent_end = off;
  327. ret = -ENOMEM;
  328. buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
  329. if (!buf)
  330. return ret;
  331. path = btrfs_alloc_path();
  332. if (!path)
  333. return ret;
  334. path->reada = READA_FORWARD;
  335. /* Clone data */
  336. key.objectid = btrfs_ino(BTRFS_I(src));
  337. key.type = BTRFS_EXTENT_DATA_KEY;
  338. key.offset = off;
  339. while (1) {
  340. struct btrfs_file_extent_item *extent;
  341. u64 extent_gen;
  342. int type;
  343. u32 size;
  344. struct btrfs_key new_key;
  345. u64 disko = 0, diskl = 0;
  346. u64 datao = 0, datal = 0;
  347. u8 comp;
  348. u64 drop_start;
  349. /* Note the key will change type as we walk through the tree */
  350. ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path,
  351. 0, 0);
  352. if (ret < 0)
  353. goto out;
  354. /*
  355. * First search, if no extent item that starts at offset off was
  356. * found but the previous item is an extent item, it's possible
  357. * it might overlap our target range, therefore process it.
  358. */
  359. if (key.offset == off && ret > 0 && path->slots[0] > 0) {
  360. btrfs_item_key_to_cpu(path->nodes[0], &key,
  361. path->slots[0] - 1);
  362. if (key.type == BTRFS_EXTENT_DATA_KEY)
  363. path->slots[0]--;
  364. }
  365. nritems = btrfs_header_nritems(path->nodes[0]);
  366. process_slot:
  367. if (path->slots[0] >= nritems) {
  368. ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
  369. if (ret < 0)
  370. goto out;
  371. if (ret > 0)
  372. break;
  373. nritems = btrfs_header_nritems(path->nodes[0]);
  374. }
  375. leaf = path->nodes[0];
  376. slot = path->slots[0];
  377. btrfs_item_key_to_cpu(leaf, &key, slot);
  378. if (key.type > BTRFS_EXTENT_DATA_KEY ||
  379. key.objectid != btrfs_ino(BTRFS_I(src)))
  380. break;
  381. ASSERT(key.type == BTRFS_EXTENT_DATA_KEY);
  382. extent = btrfs_item_ptr(leaf, slot,
  383. struct btrfs_file_extent_item);
  384. extent_gen = btrfs_file_extent_generation(leaf, extent);
  385. comp = btrfs_file_extent_compression(leaf, extent);
  386. type = btrfs_file_extent_type(leaf, extent);
  387. if (type == BTRFS_FILE_EXTENT_REG ||
  388. type == BTRFS_FILE_EXTENT_PREALLOC) {
  389. disko = btrfs_file_extent_disk_bytenr(leaf, extent);
  390. diskl = btrfs_file_extent_disk_num_bytes(leaf, extent);
  391. datao = btrfs_file_extent_offset(leaf, extent);
  392. datal = btrfs_file_extent_num_bytes(leaf, extent);
  393. } else if (type == BTRFS_FILE_EXTENT_INLINE) {
  394. /* Take upper bound, may be compressed */
  395. datal = btrfs_file_extent_ram_bytes(leaf, extent);
  396. }
  397. /*
  398. * The first search might have left us at an extent item that
  399. * ends before our target range's start, can happen if we have
  400. * holes and NO_HOLES feature enabled.
  401. *
  402. * Subsequent searches may leave us on a file range we have
  403. * processed before - this happens due to a race with ordered
  404. * extent completion for a file range that is outside our source
  405. * range, but that range was part of a file extent item that
  406. * also covered a leading part of our source range.
  407. */
  408. if (key.offset + datal <= prev_extent_end) {
  409. path->slots[0]++;
  410. goto process_slot;
  411. } else if (key.offset >= off + len) {
  412. break;
  413. }
  414. prev_extent_end = key.offset + datal;
  415. size = btrfs_item_size(leaf, slot);
  416. read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot),
  417. size);
  418. btrfs_release_path(path);
  419. memcpy(&new_key, &key, sizeof(new_key));
  420. new_key.objectid = btrfs_ino(BTRFS_I(inode));
  421. if (off <= key.offset)
  422. new_key.offset = key.offset + destoff - off;
  423. else
  424. new_key.offset = destoff;
  425. /*
  426. * Deal with a hole that doesn't have an extent item that
  427. * represents it (NO_HOLES feature enabled).
  428. * This hole is either in the middle of the cloning range or at
  429. * the beginning (fully overlaps it or partially overlaps it).
  430. */
  431. if (new_key.offset != last_dest_end)
  432. drop_start = last_dest_end;
  433. else
  434. drop_start = new_key.offset;
  435. if (type == BTRFS_FILE_EXTENT_REG ||
  436. type == BTRFS_FILE_EXTENT_PREALLOC) {
  437. struct btrfs_replace_extent_info clone_info;
  438. /*
  439. * a | --- range to clone ---| b
  440. * | ------------- extent ------------- |
  441. */
  442. /* Subtract range b */
  443. if (key.offset + datal > off + len)
  444. datal = off + len - key.offset;
  445. /* Subtract range a */
  446. if (off > key.offset) {
  447. datao += off - key.offset;
  448. datal -= off - key.offset;
  449. }
  450. clone_info.disk_offset = disko;
  451. clone_info.disk_len = diskl;
  452. clone_info.data_offset = datao;
  453. clone_info.data_len = datal;
  454. clone_info.file_offset = new_key.offset;
  455. clone_info.extent_buf = buf;
  456. clone_info.is_new_extent = false;
  457. clone_info.update_times = !no_time_update;
  458. ret = btrfs_replace_file_extents(BTRFS_I(inode), path,
  459. drop_start, new_key.offset + datal - 1,
  460. &clone_info, &trans);
  461. if (ret)
  462. goto out;
  463. } else {
  464. ASSERT(type == BTRFS_FILE_EXTENT_INLINE);
  465. /*
  466. * Inline extents always have to start at file offset 0
  467. * and can never be bigger then the sector size. We can
  468. * never clone only parts of an inline extent, since all
  469. * reflink operations must start at a sector size aligned
  470. * offset, and the length must be aligned too or end at
  471. * the i_size (which implies the whole inlined data).
  472. */
  473. ASSERT(key.offset == 0);
  474. ASSERT(datal <= fs_info->sectorsize);
  475. if (WARN_ON(type != BTRFS_FILE_EXTENT_INLINE) ||
  476. WARN_ON(key.offset != 0) ||
  477. WARN_ON(datal > fs_info->sectorsize)) {
  478. ret = -EUCLEAN;
  479. goto out;
  480. }
  481. ret = clone_copy_inline_extent(BTRFS_I(inode), path, &new_key,
  482. drop_start, datal, size,
  483. comp, buf, &trans);
  484. if (ret)
  485. goto out;
  486. }
  487. btrfs_release_path(path);
  488. /*
  489. * Whenever we share an extent we update the last_reflink_trans
  490. * of each inode to the current transaction. This is needed to
  491. * make sure fsync does not log multiple checksum items with
  492. * overlapping ranges (because some extent items might refer
  493. * only to sections of the original extent). For the destination
  494. * inode we do this regardless of the generation of the extents
  495. * or even if they are inline extents or explicit holes, to make
  496. * sure a full fsync does not skip them. For the source inode,
  497. * we only need to update last_reflink_trans in case it's a new
  498. * extent that is not a hole or an inline extent, to deal with
  499. * the checksums problem on fsync.
  500. */
  501. if (extent_gen == trans->transid && disko > 0)
  502. BTRFS_I(src)->last_reflink_trans = trans->transid;
  503. BTRFS_I(inode)->last_reflink_trans = trans->transid;
  504. last_dest_end = ALIGN(new_key.offset + datal,
  505. fs_info->sectorsize);
  506. ret = clone_finish_inode_update(trans, inode, last_dest_end,
  507. destoff, olen, no_time_update);
  508. if (ret)
  509. goto out;
  510. if (new_key.offset + datal >= destoff + len)
  511. break;
  512. btrfs_release_path(path);
  513. key.offset = prev_extent_end;
  514. if (fatal_signal_pending(current)) {
  515. ret = -EINTR;
  516. goto out;
  517. }
  518. cond_resched();
  519. }
  520. ret = 0;
  521. if (last_dest_end < destoff + len) {
  522. /*
  523. * We have an implicit hole that fully or partially overlaps our
  524. * cloning range at its end. This means that we either have the
  525. * NO_HOLES feature enabled or the implicit hole happened due to
  526. * mixing buffered and direct IO writes against this file.
  527. */
  528. btrfs_release_path(path);
  529. /*
  530. * When using NO_HOLES and we are cloning a range that covers
  531. * only a hole (no extents) into a range beyond the current
  532. * i_size, punching a hole in the target range will not create
  533. * an extent map defining a hole, because the range starts at or
  534. * beyond current i_size. If the file previously had an i_size
  535. * greater than the new i_size set by this clone operation, we
  536. * need to make sure the next fsync is a full fsync, so that it
  537. * detects and logs a hole covering a range from the current
  538. * i_size to the new i_size. If the clone range covers extents,
  539. * besides a hole, then we know the full sync flag was already
  540. * set by previous calls to btrfs_replace_file_extents() that
  541. * replaced file extent items.
  542. */
  543. if (last_dest_end >= i_size_read(inode))
  544. btrfs_set_inode_full_sync(BTRFS_I(inode));
  545. ret = btrfs_replace_file_extents(BTRFS_I(inode), path,
  546. last_dest_end, destoff + len - 1, NULL, &trans);
  547. if (ret)
  548. goto out;
  549. ret = clone_finish_inode_update(trans, inode, destoff + len,
  550. destoff, olen, no_time_update);
  551. }
  552. out:
  553. clear_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &BTRFS_I(inode)->runtime_flags);
  554. return ret;
  555. }
  556. static void btrfs_double_mmap_lock(struct btrfs_inode *inode1, struct btrfs_inode *inode2)
  557. {
  558. if (inode1 < inode2)
  559. swap(inode1, inode2);
  560. down_write(&inode1->i_mmap_lock);
  561. down_write_nested(&inode2->i_mmap_lock, SINGLE_DEPTH_NESTING);
  562. }
  563. static void btrfs_double_mmap_unlock(struct btrfs_inode *inode1, struct btrfs_inode *inode2)
  564. {
  565. up_write(&inode1->i_mmap_lock);
  566. up_write(&inode2->i_mmap_lock);
  567. }
  568. static int btrfs_extent_same_range(struct btrfs_inode *src, u64 loff, u64 len,
  569. struct btrfs_inode *dst, u64 dst_loff)
  570. {
  571. const u64 end = dst_loff + len - 1;
  572. struct extent_state *cached_state = NULL;
  573. struct btrfs_fs_info *fs_info = src->root->fs_info;
  574. const u64 bs = fs_info->sectorsize;
  575. int ret;
  576. /*
  577. * Lock destination range to serialize with concurrent readahead(), and
  578. * we are safe from concurrency with relocation of source extents
  579. * because we have already locked the inode's i_mmap_lock in exclusive
  580. * mode.
  581. */
  582. btrfs_lock_extent(&dst->io_tree, dst_loff, end, &cached_state);
  583. ret = btrfs_clone(&src->vfs_inode, &dst->vfs_inode, loff, len,
  584. ALIGN(len, bs), dst_loff, 1);
  585. btrfs_unlock_extent(&dst->io_tree, dst_loff, end, &cached_state);
  586. btrfs_btree_balance_dirty(fs_info);
  587. return ret;
  588. }
  589. static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
  590. struct inode *dst, u64 dst_loff)
  591. {
  592. int ret = 0;
  593. u64 i, tail_len, chunk_count;
  594. struct btrfs_root *root_dst = BTRFS_I(dst)->root;
  595. spin_lock(&root_dst->root_item_lock);
  596. if (root_dst->send_in_progress) {
  597. btrfs_warn_rl(root_dst->fs_info,
  598. "cannot deduplicate to root %llu while send operations are using it (%d in progress)",
  599. btrfs_root_id(root_dst),
  600. root_dst->send_in_progress);
  601. spin_unlock(&root_dst->root_item_lock);
  602. return -EAGAIN;
  603. }
  604. root_dst->dedupe_in_progress++;
  605. spin_unlock(&root_dst->root_item_lock);
  606. tail_len = olen % BTRFS_MAX_DEDUPE_LEN;
  607. chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);
  608. for (i = 0; i < chunk_count; i++) {
  609. ret = btrfs_extent_same_range(BTRFS_I(src), loff, BTRFS_MAX_DEDUPE_LEN,
  610. BTRFS_I(dst), dst_loff);
  611. if (ret)
  612. goto out;
  613. loff += BTRFS_MAX_DEDUPE_LEN;
  614. dst_loff += BTRFS_MAX_DEDUPE_LEN;
  615. }
  616. if (tail_len > 0)
  617. ret = btrfs_extent_same_range(BTRFS_I(src), loff, tail_len,
  618. BTRFS_I(dst), dst_loff);
  619. out:
  620. spin_lock(&root_dst->root_item_lock);
  621. root_dst->dedupe_in_progress--;
  622. spin_unlock(&root_dst->root_item_lock);
  623. return ret;
  624. }
  625. static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
  626. u64 off, u64 olen, u64 destoff)
  627. {
  628. struct extent_state *cached_state = NULL;
  629. struct inode *inode = file_inode(file);
  630. struct inode *src = file_inode(file_src);
  631. struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
  632. int ret;
  633. u64 len = olen;
  634. u64 bs = fs_info->sectorsize;
  635. u64 end;
  636. /*
  637. * VFS's generic_remap_file_range_prep() protects us from cloning the
  638. * eof block into the middle of a file, which would result in corruption
  639. * if the file size is not blocksize aligned. So we don't need to check
  640. * for that case here.
  641. */
  642. if (off + len == src->i_size)
  643. len = ALIGN(src->i_size, bs) - off;
  644. if (destoff > inode->i_size) {
  645. const u64 wb_start = ALIGN_DOWN(inode->i_size, bs);
  646. ret = btrfs_cont_expand(BTRFS_I(inode), inode->i_size, destoff);
  647. if (ret)
  648. return ret;
  649. /*
  650. * We may have truncated the last block if the inode's size is
  651. * not sector size aligned, so we need to wait for writeback to
  652. * complete before proceeding further, otherwise we can race
  653. * with cloning and attempt to increment a reference to an
  654. * extent that no longer exists (writeback completed right after
  655. * we found the previous extent covering eof and before we
  656. * attempted to increment its reference count).
  657. */
  658. ret = btrfs_wait_ordered_range(BTRFS_I(inode), wb_start,
  659. destoff - wb_start);
  660. if (ret)
  661. return ret;
  662. }
  663. /*
  664. * Lock destination range to serialize with concurrent readahead(), and
  665. * we are safe from concurrency with relocation of source extents
  666. * because we have already locked the inode's i_mmap_lock in exclusive
  667. * mode.
  668. */
  669. end = destoff + len - 1;
  670. btrfs_lock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state);
  671. ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
  672. btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state);
  673. if (ret < 0)
  674. return ret;
  675. /*
  676. * We may have copied an inline extent into a page of the destination
  677. * range. So flush delalloc and wait for ordered extent completion.
  678. * This is to ensure the invalidation below does not fail, as if for
  679. * example it finds a dirty folio, our folio release callback
  680. * (btrfs_release_folio()) returns false, which makes the invalidation
  681. * return an -EBUSY error. We can't ignore such failures since they
  682. * could come from some range other than the copied inline extent's
  683. * destination range and we have no way to know that.
  684. */
  685. ret = btrfs_wait_ordered_range(BTRFS_I(inode), destoff, len);
  686. if (ret < 0)
  687. return ret;
  688. /*
  689. * Invalidate page cache so that future reads will see the cloned data
  690. * immediately and not the previous data.
  691. */
  692. ret = filemap_invalidate_inode(inode, false, destoff, end);
  693. if (ret < 0)
  694. return ret;
  695. btrfs_btree_balance_dirty(fs_info);
  696. return 0;
  697. }
  698. static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
  699. struct file *file_out, loff_t pos_out,
  700. loff_t *len, unsigned int remap_flags)
  701. {
  702. struct btrfs_inode *inode_in = BTRFS_I(file_inode(file_in));
  703. struct btrfs_inode *inode_out = BTRFS_I(file_inode(file_out));
  704. u64 bs = inode_out->root->fs_info->sectorsize;
  705. u64 wb_len;
  706. int ret;
  707. if (!(remap_flags & REMAP_FILE_DEDUP)) {
  708. struct btrfs_root *root_out = inode_out->root;
  709. if (btrfs_root_readonly(root_out))
  710. return -EROFS;
  711. ASSERT(inode_in->vfs_inode.i_sb == inode_out->vfs_inode.i_sb);
  712. }
  713. /* Can only reflink encrypted files if both files are encrypted. */
  714. if (IS_ENCRYPTED(&inode_in->vfs_inode) != IS_ENCRYPTED(&inode_out->vfs_inode))
  715. return -EINVAL;
  716. /* Don't make the dst file partly checksummed */
  717. if ((inode_in->flags & BTRFS_INODE_NODATASUM) !=
  718. (inode_out->flags & BTRFS_INODE_NODATASUM)) {
  719. return -EINVAL;
  720. }
  721. /*
  722. * Now that the inodes are locked, we need to start writeback ourselves
  723. * and can not rely on the writeback from the VFS's generic helper
  724. * generic_remap_file_range_prep() because:
  725. *
  726. * 1) For compression we must call filemap_fdatawrite_range() range
  727. * twice (btrfs_fdatawrite_range() does it for us), and the generic
  728. * helper only calls it once;
  729. *
  730. * 2) filemap_fdatawrite_range(), called by the generic helper only
  731. * waits for the writeback to complete, i.e. for IO to be done, and
  732. * not for the ordered extents to complete. We need to wait for them
  733. * to complete so that new file extent items are in the fs tree.
  734. */
  735. if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP))
  736. wb_len = ALIGN(inode_in->vfs_inode.i_size, bs) - ALIGN_DOWN(pos_in, bs);
  737. else
  738. wb_len = ALIGN(*len, bs);
  739. /*
  740. * Workaround to make sure NOCOW buffered write reach disk as NOCOW.
  741. *
  742. * Btrfs' back references do not have a block level granularity, they
  743. * work at the whole extent level.
  744. * NOCOW buffered write without data space reserved may not be able
  745. * to fall back to CoW due to lack of data space, thus could cause
  746. * data loss.
  747. *
  748. * Here we take a shortcut by flushing the whole inode, so that all
  749. * nocow write should reach disk as nocow before we increase the
  750. * reference of the extent. We could do better by only flushing NOCOW
  751. * data, but that needs extra accounting.
  752. *
  753. * Also we don't need to check ASYNC_EXTENT, as async extent will be
  754. * CoWed anyway, not affecting nocow part.
  755. */
  756. ret = filemap_flush(inode_in->vfs_inode.i_mapping);
  757. if (ret < 0)
  758. return ret;
  759. ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs), wb_len);
  760. if (ret < 0)
  761. return ret;
  762. ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs), wb_len);
  763. if (ret < 0)
  764. return ret;
  765. return generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
  766. len, remap_flags);
  767. }
  768. static bool file_sync_write(const struct file *file)
  769. {
  770. if (file->f_flags & (__O_SYNC | O_DSYNC))
  771. return true;
  772. if (IS_SYNC(file_inode(file)))
  773. return true;
  774. return false;
  775. }
  776. loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
  777. struct file *dst_file, loff_t destoff, loff_t len,
  778. unsigned int remap_flags)
  779. {
  780. struct btrfs_inode *src_inode = BTRFS_I(file_inode(src_file));
  781. struct btrfs_inode *dst_inode = BTRFS_I(file_inode(dst_file));
  782. bool same_inode = dst_inode == src_inode;
  783. int ret;
  784. if (btrfs_is_shutdown(inode_to_fs_info(file_inode(src_file))))
  785. return -EIO;
  786. if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
  787. return -EINVAL;
  788. if (same_inode) {
  789. btrfs_inode_lock(src_inode, BTRFS_ILOCK_MMAP);
  790. } else {
  791. lock_two_nondirectories(&src_inode->vfs_inode, &dst_inode->vfs_inode);
  792. btrfs_double_mmap_lock(src_inode, dst_inode);
  793. }
  794. ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff,
  795. &len, remap_flags);
  796. if (ret < 0 || len == 0)
  797. goto out_unlock;
  798. if (remap_flags & REMAP_FILE_DEDUP)
  799. ret = btrfs_extent_same(&src_inode->vfs_inode, off, len,
  800. &dst_inode->vfs_inode, destoff);
  801. else
  802. ret = btrfs_clone_files(dst_file, src_file, off, len, destoff);
  803. out_unlock:
  804. if (same_inode) {
  805. btrfs_inode_unlock(src_inode, BTRFS_ILOCK_MMAP);
  806. } else {
  807. btrfs_double_mmap_unlock(src_inode, dst_inode);
  808. unlock_two_nondirectories(&src_inode->vfs_inode,
  809. &dst_inode->vfs_inode);
  810. }
  811. /*
  812. * If either the source or the destination file was opened with O_SYNC,
  813. * O_DSYNC or has the S_SYNC attribute, fsync both the destination and
  814. * source files/ranges, so that after a successful return (0) followed
  815. * by a power failure results in the reflinked data to be readable from
  816. * both files/ranges.
  817. */
  818. if (ret == 0 && len > 0 &&
  819. (file_sync_write(src_file) || file_sync_write(dst_file))) {
  820. ret = btrfs_sync_file(src_file, off, off + len - 1, 0);
  821. if (ret == 0)
  822. ret = btrfs_sync_file(dst_file, destoff,
  823. destoff + len - 1, 0);
  824. }
  825. return ret < 0 ? ret : len;
  826. }