fiemap.c 28 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928
  1. // SPDX-License-Identifier: GPL-2.0
  2. #include "backref.h"
  3. #include "btrfs_inode.h"
  4. #include "fiemap.h"
  5. #include "file.h"
  6. #include "file-item.h"
  7. struct btrfs_fiemap_entry {
  8. u64 offset;
  9. u64 phys;
  10. u64 len;
  11. u32 flags;
  12. };
  13. /*
  14. * Indicate the caller of emit_fiemap_extent() that it needs to unlock the file
  15. * range from the inode's io tree, unlock the subvolume tree search path, flush
  16. * the fiemap cache and relock the file range and research the subvolume tree.
  17. * The value here is something negative that can't be confused with a valid
  18. * errno value and different from 1 because that's also a return value from
  19. * fiemap_fill_next_extent() and also it's often used to mean some btree search
  20. * did not find a key, so make it some distinct negative value.
  21. */
  22. #define BTRFS_FIEMAP_FLUSH_CACHE (-(MAX_ERRNO + 1))
  23. /*
  24. * Used to:
  25. *
  26. * - Cache the next entry to be emitted to the fiemap buffer, so that we can
  27. * merge extents that are contiguous and can be grouped as a single one;
  28. *
  29. * - Store extents ready to be written to the fiemap buffer in an intermediary
  30. * buffer. This intermediary buffer is to ensure that in case the fiemap
  31. * buffer is memory mapped to the fiemap target file, we don't deadlock
  32. * during btrfs_page_mkwrite(). This is because during fiemap we are locking
  33. * an extent range in order to prevent races with delalloc flushing and
  34. * ordered extent completion, which is needed in order to reliably detect
  35. * delalloc in holes and prealloc extents. And this can lead to a deadlock
  36. * if the fiemap buffer is memory mapped to the file we are running fiemap
  37. * against (a silly, useless in practice scenario, but possible) because
  38. * btrfs_page_mkwrite() will try to lock the same extent range.
  39. */
  40. struct fiemap_cache {
  41. /* An array of ready fiemap entries. */
  42. struct btrfs_fiemap_entry *entries;
  43. /* Number of entries in the entries array. */
  44. int entries_size;
  45. /* Index of the next entry in the entries array to write to. */
  46. int entries_pos;
  47. /*
  48. * Once the entries array is full, this indicates what's the offset for
  49. * the next file extent item we must search for in the inode's subvolume
  50. * tree after unlocking the extent range in the inode's io tree and
  51. * releasing the search path.
  52. */
  53. u64 next_search_offset;
  54. /*
  55. * This matches struct fiemap_extent_info::fi_mapped_extents, we use it
  56. * to count ourselves emitted extents and stop instead of relying on
  57. * fiemap_fill_next_extent() because we buffer ready fiemap entries at
  58. * the @entries array, and we want to stop as soon as we hit the max
  59. * amount of extents to map, not just to save time but also to make the
  60. * logic at extent_fiemap() simpler.
  61. */
  62. unsigned int extents_mapped;
  63. /* Fields for the cached extent (unsubmitted, not ready, extent). */
  64. u64 offset;
  65. u64 phys;
  66. u64 len;
  67. u32 flags;
  68. bool cached;
  69. };
  70. static int flush_fiemap_cache(struct fiemap_extent_info *fieinfo,
  71. struct fiemap_cache *cache)
  72. {
  73. for (int i = 0; i < cache->entries_pos; i++) {
  74. struct btrfs_fiemap_entry *entry = &cache->entries[i];
  75. int ret;
  76. ret = fiemap_fill_next_extent(fieinfo, entry->offset,
  77. entry->phys, entry->len,
  78. entry->flags);
  79. /*
  80. * Ignore 1 (reached max entries) because we keep track of that
  81. * ourselves in emit_fiemap_extent().
  82. */
  83. if (ret < 0)
  84. return ret;
  85. }
  86. cache->entries_pos = 0;
  87. return 0;
  88. }
  89. /*
  90. * Helper to submit fiemap extent.
  91. *
  92. * Will try to merge current fiemap extent specified by @offset, @phys,
  93. * @len and @flags with cached one.
  94. * And only when we fails to merge, cached one will be submitted as
  95. * fiemap extent.
  96. *
  97. * Return value is the same as fiemap_fill_next_extent().
  98. */
  99. static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
  100. struct fiemap_cache *cache,
  101. u64 offset, u64 phys, u64 len, u32 flags)
  102. {
  103. struct btrfs_fiemap_entry *entry;
  104. u64 cache_end;
  105. /* Set at the end of extent_fiemap(). */
  106. ASSERT((flags & FIEMAP_EXTENT_LAST) == 0);
  107. if (!cache->cached)
  108. goto assign;
  109. /*
  110. * When iterating the extents of the inode, at extent_fiemap(), we may
  111. * find an extent that starts at an offset behind the end offset of the
  112. * previous extent we processed. This happens if fiemap is called
  113. * without FIEMAP_FLAG_SYNC and there are ordered extents completing
  114. * after we had to unlock the file range, release the search path, emit
  115. * the fiemap extents stored in the buffer (cache->entries array) and
  116. * the lock the remainder of the range and re-search the btree.
  117. *
  118. * For example we are in leaf X processing its last item, which is the
  119. * file extent item for file range [512K, 1M[, and after
  120. * btrfs_next_leaf() releases the path, there's an ordered extent that
  121. * completes for the file range [768K, 2M[, and that results in trimming
  122. * the file extent item so that it now corresponds to the file range
  123. * [512K, 768K[ and a new file extent item is inserted for the file
  124. * range [768K, 2M[, which may end up as the last item of leaf X or as
  125. * the first item of the next leaf - in either case btrfs_next_leaf()
  126. * will leave us with a path pointing to the new extent item, for the
  127. * file range [768K, 2M[, since that's the first key that follows the
  128. * last one we processed. So in order not to report overlapping extents
  129. * to user space, we trim the length of the previously cached extent and
  130. * emit it.
  131. *
  132. * Upon calling btrfs_next_leaf() we may also find an extent with an
  133. * offset smaller than or equals to cache->offset, and this happens
  134. * when we had a hole or prealloc extent with several delalloc ranges in
  135. * it, but after btrfs_next_leaf() released the path, delalloc was
  136. * flushed and the resulting ordered extents were completed, so we can
  137. * now have found a file extent item for an offset that is smaller than
  138. * or equals to what we have in cache->offset. We deal with this as
  139. * described below.
  140. */
  141. cache_end = cache->offset + cache->len;
  142. if (cache_end > offset) {
  143. if (offset == cache->offset) {
  144. /*
  145. * We cached a delalloc range (found in the io tree) for
  146. * a hole or prealloc extent and we have now found a
  147. * file extent item for the same offset. What we have
  148. * now is more recent and up to date, so discard what
  149. * we had in the cache and use what we have just found.
  150. */
  151. goto assign;
  152. } else if (offset > cache->offset) {
  153. /*
  154. * The extent range we previously found ends after the
  155. * offset of the file extent item we found and that
  156. * offset falls somewhere in the middle of that previous
  157. * extent range. So adjust the range we previously found
  158. * to end at the offset of the file extent item we have
  159. * just found, since this extent is more up to date.
  160. * Emit that adjusted range and cache the file extent
  161. * item we have just found. This corresponds to the case
  162. * where a previously found file extent item was split
  163. * due to an ordered extent completing.
  164. */
  165. cache->len = offset - cache->offset;
  166. goto emit;
  167. } else {
  168. const u64 range_end = offset + len;
  169. /*
  170. * The offset of the file extent item we have just found
  171. * is behind the cached offset. This means we were
  172. * processing a hole or prealloc extent for which we
  173. * have found delalloc ranges (in the io tree), so what
  174. * we have in the cache is the last delalloc range we
  175. * found while the file extent item we found can be
  176. * either for a whole delalloc range we previously
  177. * emitted or only a part of that range.
  178. *
  179. * We have two cases here:
  180. *
  181. * 1) The file extent item's range ends at or behind the
  182. * cached extent's end. In this case just ignore the
  183. * current file extent item because we don't want to
  184. * overlap with previous ranges that may have been
  185. * emitted already;
  186. *
  187. * 2) The file extent item starts behind the currently
  188. * cached extent but its end offset goes beyond the
  189. * end offset of the cached extent. We don't want to
  190. * overlap with a previous range that may have been
  191. * emitted already, so we emit the currently cached
  192. * extent and then partially store the current file
  193. * extent item's range in the cache, for the subrange
  194. * going the cached extent's end to the end of the
  195. * file extent item.
  196. */
  197. if (range_end <= cache_end)
  198. return 0;
  199. if (!(flags & (FIEMAP_EXTENT_ENCODED | FIEMAP_EXTENT_DELALLOC)))
  200. phys += cache_end - offset;
  201. offset = cache_end;
  202. len = range_end - cache_end;
  203. goto emit;
  204. }
  205. }
  206. /*
  207. * Only merges fiemap extents if
  208. * 1) Their logical addresses are continuous
  209. *
  210. * 2) Their physical addresses are continuous
  211. * So truly compressed (physical size smaller than logical size)
  212. * extents won't get merged with each other
  213. *
  214. * 3) Share same flags
  215. */
  216. if (cache->offset + cache->len == offset &&
  217. cache->phys + cache->len == phys &&
  218. cache->flags == flags) {
  219. cache->len += len;
  220. return 0;
  221. }
  222. emit:
  223. /* Not mergeable, need to submit cached one */
  224. if (cache->entries_pos == cache->entries_size) {
  225. /*
  226. * We will need to research for the end offset of the last
  227. * stored extent and not from the current offset, because after
  228. * unlocking the range and releasing the path, if there's a hole
  229. * between that end offset and this current offset, a new extent
  230. * may have been inserted due to a new write, so we don't want
  231. * to miss it.
  232. */
  233. entry = &cache->entries[cache->entries_size - 1];
  234. cache->next_search_offset = entry->offset + entry->len;
  235. cache->cached = false;
  236. return BTRFS_FIEMAP_FLUSH_CACHE;
  237. }
  238. entry = &cache->entries[cache->entries_pos];
  239. entry->offset = cache->offset;
  240. entry->phys = cache->phys;
  241. entry->len = cache->len;
  242. entry->flags = cache->flags;
  243. cache->entries_pos++;
  244. cache->extents_mapped++;
  245. if (cache->extents_mapped == fieinfo->fi_extents_max) {
  246. cache->cached = false;
  247. return 1;
  248. }
  249. assign:
  250. cache->cached = true;
  251. cache->offset = offset;
  252. cache->phys = phys;
  253. cache->len = len;
  254. cache->flags = flags;
  255. return 0;
  256. }
  257. /*
  258. * Emit last fiemap cache
  259. *
  260. * The last fiemap cache may still be cached in the following case:
  261. * 0 4k 8k
  262. * |<- Fiemap range ->|
  263. * |<------------ First extent ----------->|
  264. *
  265. * In this case, the first extent range will be cached but not emitted.
  266. * So we must emit it before ending extent_fiemap().
  267. */
  268. static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
  269. struct fiemap_cache *cache)
  270. {
  271. int ret;
  272. if (!cache->cached)
  273. return 0;
  274. ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
  275. cache->len, cache->flags);
  276. cache->cached = false;
  277. if (ret > 0)
  278. ret = 0;
  279. return ret;
  280. }
  281. static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *path)
  282. {
  283. struct extent_buffer *clone = path->nodes[0];
  284. struct btrfs_key key;
  285. int slot;
  286. int ret;
  287. path->slots[0]++;
  288. if (path->slots[0] < btrfs_header_nritems(path->nodes[0]))
  289. return 0;
  290. /*
  291. * Add a temporary extra ref to an already cloned extent buffer to
  292. * prevent btrfs_next_leaf() freeing it, we want to reuse it to avoid
  293. * the cost of allocating a new one.
  294. */
  295. ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, &clone->bflags));
  296. refcount_inc(&clone->refs);
  297. ret = btrfs_next_leaf(inode->root, path);
  298. if (ret != 0)
  299. goto out;
  300. /*
  301. * Don't bother with cloning if there are no more file extent items for
  302. * our inode.
  303. */
  304. btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
  305. if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) {
  306. ret = 1;
  307. goto out;
  308. }
  309. /*
  310. * Important to preserve the start field, for the optimizations when
  311. * checking if extents are shared (see extent_fiemap()).
  312. *
  313. * We must set ->start before calling copy_extent_buffer_full(). If we
  314. * are on sub-pagesize blocksize, we use ->start to determine the offset
  315. * into the folio where our eb exists, and if we update ->start after
  316. * the fact then any subsequent reads of the eb may read from a
  317. * different offset in the folio than where we originally copied into.
  318. */
  319. clone->start = path->nodes[0]->start;
  320. /* See the comment at fiemap_search_slot() about why we clone. */
  321. copy_extent_buffer_full(clone, path->nodes[0]);
  322. slot = path->slots[0];
  323. btrfs_release_path(path);
  324. path->nodes[0] = clone;
  325. path->slots[0] = slot;
  326. out:
  327. if (ret)
  328. free_extent_buffer(clone);
  329. return ret;
  330. }
  331. /*
  332. * Search for the first file extent item that starts at a given file offset or
  333. * the one that starts immediately before that offset.
  334. * Returns: 0 on success, < 0 on error, 1 if not found.
  335. */
  336. static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path,
  337. u64 file_offset)
  338. {
  339. const u64 ino = btrfs_ino(inode);
  340. struct btrfs_root *root = inode->root;
  341. struct extent_buffer *clone;
  342. struct btrfs_key key;
  343. int slot;
  344. int ret;
  345. key.objectid = ino;
  346. key.type = BTRFS_EXTENT_DATA_KEY;
  347. key.offset = file_offset;
  348. ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
  349. if (ret < 0)
  350. return ret;
  351. if (ret > 0 && path->slots[0] > 0) {
  352. btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
  353. if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
  354. path->slots[0]--;
  355. }
  356. if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
  357. ret = btrfs_next_leaf(root, path);
  358. if (ret != 0)
  359. return ret;
  360. btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
  361. if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
  362. return 1;
  363. }
  364. /*
  365. * We clone the leaf and use it during fiemap. This is because while
  366. * using the leaf we do expensive things like checking if an extent is
  367. * shared, which can take a long time. In order to prevent blocking
  368. * other tasks for too long, we use a clone of the leaf. We have locked
  369. * the file range in the inode's io tree, so we know none of our file
  370. * extent items can change. This way we avoid blocking other tasks that
  371. * want to insert items for other inodes in the same leaf or b+tree
  372. * rebalance operations (triggered for example when someone is trying
  373. * to push items into this leaf when trying to insert an item in a
  374. * neighbour leaf).
  375. * We also need the private clone because holding a read lock on an
  376. * extent buffer of the subvolume's b+tree will make lockdep unhappy
  377. * when we check if extents are shared, as backref walking may need to
  378. * lock the same leaf we are processing.
  379. */
  380. clone = btrfs_clone_extent_buffer(path->nodes[0]);
  381. if (!clone)
  382. return -ENOMEM;
  383. slot = path->slots[0];
  384. btrfs_release_path(path);
  385. path->nodes[0] = clone;
  386. path->slots[0] = slot;
  387. return 0;
  388. }
  389. /*
  390. * Process a range which is a hole or a prealloc extent in the inode's subvolume
  391. * btree. If @disk_bytenr is 0, we are dealing with a hole, otherwise a prealloc
  392. * extent. The end offset (@end) is inclusive.
  393. */
  394. static int fiemap_process_hole(struct btrfs_inode *inode,
  395. struct fiemap_extent_info *fieinfo,
  396. struct fiemap_cache *cache,
  397. struct extent_state **delalloc_cached_state,
  398. struct btrfs_backref_share_check_ctx *backref_ctx,
  399. u64 disk_bytenr, u64 extent_offset,
  400. u64 extent_gen,
  401. u64 start, u64 end)
  402. {
  403. const u64 i_size = i_size_read(&inode->vfs_inode);
  404. u64 cur_offset = start;
  405. u64 last_delalloc_end = 0;
  406. u32 prealloc_flags = FIEMAP_EXTENT_UNWRITTEN;
  407. bool checked_extent_shared = false;
  408. int ret;
  409. /*
  410. * There can be no delalloc past i_size, so don't waste time looking for
  411. * it beyond i_size.
  412. */
  413. while (cur_offset < end && cur_offset < i_size) {
  414. u64 delalloc_start;
  415. u64 delalloc_end;
  416. u64 prealloc_start;
  417. u64 prealloc_len = 0;
  418. bool delalloc;
  419. delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end,
  420. delalloc_cached_state,
  421. &delalloc_start,
  422. &delalloc_end);
  423. if (!delalloc)
  424. break;
  425. /*
  426. * If this is a prealloc extent we have to report every section
  427. * of it that has no delalloc.
  428. */
  429. if (disk_bytenr != 0) {
  430. if (last_delalloc_end == 0) {
  431. prealloc_start = start;
  432. prealloc_len = delalloc_start - start;
  433. } else {
  434. prealloc_start = last_delalloc_end + 1;
  435. prealloc_len = delalloc_start - prealloc_start;
  436. }
  437. }
  438. if (prealloc_len > 0) {
  439. if (!checked_extent_shared && fieinfo->fi_extents_max) {
  440. ret = btrfs_is_data_extent_shared(inode,
  441. disk_bytenr,
  442. extent_gen,
  443. backref_ctx);
  444. if (ret < 0)
  445. return ret;
  446. else if (ret > 0)
  447. prealloc_flags |= FIEMAP_EXTENT_SHARED;
  448. checked_extent_shared = true;
  449. }
  450. ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
  451. disk_bytenr + extent_offset,
  452. prealloc_len, prealloc_flags);
  453. if (ret)
  454. return ret;
  455. extent_offset += prealloc_len;
  456. }
  457. ret = emit_fiemap_extent(fieinfo, cache, delalloc_start, 0,
  458. delalloc_end + 1 - delalloc_start,
  459. FIEMAP_EXTENT_DELALLOC |
  460. FIEMAP_EXTENT_UNKNOWN);
  461. if (ret)
  462. return ret;
  463. last_delalloc_end = delalloc_end;
  464. cur_offset = delalloc_end + 1;
  465. extent_offset += cur_offset - delalloc_start;
  466. cond_resched();
  467. }
  468. /*
  469. * Either we found no delalloc for the whole prealloc extent or we have
  470. * a prealloc extent that spans i_size or starts at or after i_size.
  471. */
  472. if (disk_bytenr != 0 && last_delalloc_end < end) {
  473. u64 prealloc_start;
  474. u64 prealloc_len;
  475. if (last_delalloc_end == 0) {
  476. prealloc_start = start;
  477. prealloc_len = end + 1 - start;
  478. } else {
  479. prealloc_start = last_delalloc_end + 1;
  480. prealloc_len = end + 1 - prealloc_start;
  481. }
  482. if (!checked_extent_shared && fieinfo->fi_extents_max) {
  483. ret = btrfs_is_data_extent_shared(inode,
  484. disk_bytenr,
  485. extent_gen,
  486. backref_ctx);
  487. if (ret < 0)
  488. return ret;
  489. else if (ret > 0)
  490. prealloc_flags |= FIEMAP_EXTENT_SHARED;
  491. }
  492. ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
  493. disk_bytenr + extent_offset,
  494. prealloc_len, prealloc_flags);
  495. if (ret)
  496. return ret;
  497. }
  498. return 0;
  499. }
  500. static int fiemap_find_last_extent_offset(struct btrfs_inode *inode,
  501. struct btrfs_path *path,
  502. u64 *last_extent_end_ret)
  503. {
  504. const u64 ino = btrfs_ino(inode);
  505. struct btrfs_root *root = inode->root;
  506. struct extent_buffer *leaf;
  507. struct btrfs_file_extent_item *ei;
  508. struct btrfs_key key;
  509. u64 disk_bytenr;
  510. int ret;
  511. /*
  512. * Lookup the last file extent. We're not using i_size here because
  513. * there might be preallocation past i_size.
  514. */
  515. ret = btrfs_lookup_file_extent(NULL, root, path, ino, (u64)-1, 0);
  516. /* There can't be a file extent item at offset (u64)-1 */
  517. ASSERT(ret != 0);
  518. if (ret < 0)
  519. return ret;
  520. /*
  521. * For a non-existing key, btrfs_search_slot() always leaves us at a
  522. * slot > 0, except if the btree is empty, which is impossible because
  523. * at least it has the inode item for this inode and all the items for
  524. * the root inode 256.
  525. */
  526. ASSERT(path->slots[0] > 0);
  527. path->slots[0]--;
  528. leaf = path->nodes[0];
  529. btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
  530. if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
  531. /* No file extent items in the subvolume tree. */
  532. *last_extent_end_ret = 0;
  533. return 0;
  534. }
  535. /*
  536. * For an inline extent, the disk_bytenr is where inline data starts at,
  537. * so first check if we have an inline extent item before checking if we
  538. * have an implicit hole (disk_bytenr == 0).
  539. */
  540. ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
  541. if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) {
  542. *last_extent_end_ret = btrfs_file_extent_end(path);
  543. return 0;
  544. }
  545. /*
  546. * Find the last file extent item that is not a hole (when NO_HOLES is
  547. * not enabled). This should take at most 2 iterations in the worst
  548. * case: we have one hole file extent item at slot 0 of a leaf and
  549. * another hole file extent item as the last item in the previous leaf.
  550. * This is because we merge file extent items that represent holes.
  551. */
  552. disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
  553. while (disk_bytenr == 0) {
  554. ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
  555. if (ret < 0) {
  556. return ret;
  557. } else if (ret > 0) {
  558. /* No file extent items that are not holes. */
  559. *last_extent_end_ret = 0;
  560. return 0;
  561. }
  562. leaf = path->nodes[0];
  563. ei = btrfs_item_ptr(leaf, path->slots[0],
  564. struct btrfs_file_extent_item);
  565. disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
  566. }
  567. *last_extent_end_ret = btrfs_file_extent_end(path);
  568. return 0;
  569. }
  570. static int extent_fiemap(struct btrfs_inode *inode,
  571. struct fiemap_extent_info *fieinfo,
  572. u64 start, u64 len)
  573. {
  574. const u64 ino = btrfs_ino(inode);
  575. struct extent_state *cached_state = NULL;
  576. struct extent_state *delalloc_cached_state = NULL;
  577. BTRFS_PATH_AUTO_FREE(path);
  578. struct fiemap_cache cache = { 0 };
  579. struct btrfs_backref_share_check_ctx *backref_ctx;
  580. u64 last_extent_end = 0;
  581. u64 prev_extent_end;
  582. u64 range_start;
  583. u64 range_end;
  584. const u64 sectorsize = inode->root->fs_info->sectorsize;
  585. bool stopped = false;
  586. int ret;
  587. cache.entries_size = PAGE_SIZE / sizeof(struct btrfs_fiemap_entry);
  588. cache.entries = kmalloc_objs(struct btrfs_fiemap_entry,
  589. cache.entries_size);
  590. backref_ctx = btrfs_alloc_backref_share_check_ctx();
  591. path = btrfs_alloc_path();
  592. if (!cache.entries || !backref_ctx || !path) {
  593. ret = -ENOMEM;
  594. goto out;
  595. }
  596. restart:
  597. range_start = round_down(start, sectorsize);
  598. range_end = round_up(start + len, sectorsize);
  599. prev_extent_end = range_start;
  600. btrfs_lock_extent(&inode->io_tree, range_start, range_end, &cached_state);
  601. ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end);
  602. if (ret < 0)
  603. goto out_unlock;
  604. btrfs_release_path(path);
  605. path->reada = READA_FORWARD;
  606. ret = fiemap_search_slot(inode, path, range_start);
  607. if (ret < 0) {
  608. goto out_unlock;
  609. } else if (ret > 0) {
  610. /*
  611. * No file extent item found, but we may have delalloc between
  612. * the current offset and i_size. So check for that.
  613. */
  614. ret = 0;
  615. goto check_eof_delalloc;
  616. }
  617. while (prev_extent_end < range_end) {
  618. struct extent_buffer *leaf = path->nodes[0];
  619. struct btrfs_file_extent_item *ei;
  620. struct btrfs_key key;
  621. u64 extent_end;
  622. u64 extent_len;
  623. u64 extent_offset = 0;
  624. u64 extent_gen;
  625. u64 disk_bytenr = 0;
  626. u64 flags = 0;
  627. int extent_type;
  628. u8 compression;
  629. btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
  630. if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
  631. break;
  632. extent_end = btrfs_file_extent_end(path);
  633. /*
  634. * The first iteration can leave us at an extent item that ends
  635. * before our range's start. Move to the next item.
  636. */
  637. if (extent_end <= range_start)
  638. goto next_item;
  639. backref_ctx->curr_leaf_bytenr = leaf->start;
  640. /* We have in implicit hole (NO_HOLES feature enabled). */
  641. if (prev_extent_end < key.offset) {
  642. const u64 hole_end = min(key.offset, range_end) - 1;
  643. ret = fiemap_process_hole(inode, fieinfo, &cache,
  644. &delalloc_cached_state,
  645. backref_ctx, 0, 0, 0,
  646. prev_extent_end, hole_end);
  647. if (ret < 0) {
  648. goto out_unlock;
  649. } else if (ret > 0) {
  650. /* fiemap_fill_next_extent() told us to stop. */
  651. stopped = true;
  652. break;
  653. }
  654. /* We've reached the end of the fiemap range, stop. */
  655. if (key.offset >= range_end) {
  656. stopped = true;
  657. break;
  658. }
  659. }
  660. extent_len = extent_end - key.offset;
  661. ei = btrfs_item_ptr(leaf, path->slots[0],
  662. struct btrfs_file_extent_item);
  663. compression = btrfs_file_extent_compression(leaf, ei);
  664. extent_type = btrfs_file_extent_type(leaf, ei);
  665. extent_gen = btrfs_file_extent_generation(leaf, ei);
  666. if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
  667. disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
  668. if (compression == BTRFS_COMPRESS_NONE)
  669. extent_offset = btrfs_file_extent_offset(leaf, ei);
  670. }
  671. if (compression != BTRFS_COMPRESS_NONE)
  672. flags |= FIEMAP_EXTENT_ENCODED;
  673. if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
  674. flags |= FIEMAP_EXTENT_DATA_INLINE;
  675. flags |= FIEMAP_EXTENT_NOT_ALIGNED;
  676. ret = emit_fiemap_extent(fieinfo, &cache, key.offset, 0,
  677. extent_len, flags);
  678. } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
  679. ret = fiemap_process_hole(inode, fieinfo, &cache,
  680. &delalloc_cached_state,
  681. backref_ctx,
  682. disk_bytenr, extent_offset,
  683. extent_gen, key.offset,
  684. extent_end - 1);
  685. } else if (disk_bytenr == 0) {
  686. /* We have an explicit hole. */
  687. ret = fiemap_process_hole(inode, fieinfo, &cache,
  688. &delalloc_cached_state,
  689. backref_ctx, 0, 0, 0,
  690. key.offset, extent_end - 1);
  691. } else {
  692. /* We have a regular extent. */
  693. if (fieinfo->fi_extents_max) {
  694. ret = btrfs_is_data_extent_shared(inode,
  695. disk_bytenr,
  696. extent_gen,
  697. backref_ctx);
  698. if (ret < 0)
  699. goto out_unlock;
  700. else if (ret > 0)
  701. flags |= FIEMAP_EXTENT_SHARED;
  702. }
  703. ret = emit_fiemap_extent(fieinfo, &cache, key.offset,
  704. disk_bytenr + extent_offset,
  705. extent_len, flags);
  706. }
  707. if (ret < 0) {
  708. goto out_unlock;
  709. } else if (ret > 0) {
  710. /* emit_fiemap_extent() told us to stop. */
  711. stopped = true;
  712. break;
  713. }
  714. prev_extent_end = extent_end;
  715. next_item:
  716. if (fatal_signal_pending(current)) {
  717. ret = -EINTR;
  718. goto out_unlock;
  719. }
  720. ret = fiemap_next_leaf_item(inode, path);
  721. if (ret < 0) {
  722. goto out_unlock;
  723. } else if (ret > 0) {
  724. /* No more file extent items for this inode. */
  725. break;
  726. }
  727. cond_resched();
  728. }
  729. check_eof_delalloc:
  730. if (!stopped && prev_extent_end < range_end) {
  731. ret = fiemap_process_hole(inode, fieinfo, &cache,
  732. &delalloc_cached_state, backref_ctx,
  733. 0, 0, 0, prev_extent_end, range_end - 1);
  734. if (ret < 0)
  735. goto out_unlock;
  736. prev_extent_end = range_end;
  737. }
  738. if (cache.cached && cache.offset + cache.len >= last_extent_end) {
  739. const u64 i_size = i_size_read(&inode->vfs_inode);
  740. if (prev_extent_end < i_size) {
  741. u64 delalloc_start;
  742. u64 delalloc_end;
  743. bool delalloc;
  744. delalloc = btrfs_find_delalloc_in_range(inode,
  745. prev_extent_end,
  746. i_size - 1,
  747. &delalloc_cached_state,
  748. &delalloc_start,
  749. &delalloc_end);
  750. if (!delalloc)
  751. cache.flags |= FIEMAP_EXTENT_LAST;
  752. } else {
  753. cache.flags |= FIEMAP_EXTENT_LAST;
  754. }
  755. }
  756. out_unlock:
  757. btrfs_unlock_extent(&inode->io_tree, range_start, range_end, &cached_state);
  758. if (ret == BTRFS_FIEMAP_FLUSH_CACHE) {
  759. btrfs_release_path(path);
  760. ret = flush_fiemap_cache(fieinfo, &cache);
  761. if (ret)
  762. goto out;
  763. len -= cache.next_search_offset - start;
  764. start = cache.next_search_offset;
  765. goto restart;
  766. } else if (ret < 0) {
  767. goto out;
  768. }
  769. /*
  770. * Must free the path before emitting to the fiemap buffer because we
  771. * may have a non-cloned leaf and if the fiemap buffer is memory mapped
  772. * to a file, a write into it (through btrfs_page_mkwrite()) may trigger
  773. * waiting for an ordered extent that in order to complete needs to
  774. * modify that leaf, therefore leading to a deadlock.
  775. */
  776. btrfs_free_path(path);
  777. path = NULL;
  778. ret = flush_fiemap_cache(fieinfo, &cache);
  779. if (ret)
  780. goto out;
  781. ret = emit_last_fiemap_cache(fieinfo, &cache);
  782. out:
  783. btrfs_free_extent_state(delalloc_cached_state);
  784. kfree(cache.entries);
  785. btrfs_free_backref_share_ctx(backref_ctx);
  786. return ret;
  787. }
  788. int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
  789. u64 start, u64 len)
  790. {
  791. struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
  792. int ret;
  793. ret = fiemap_prep(inode, fieinfo, start, &len, 0);
  794. if (ret)
  795. return ret;
  796. /*
  797. * fiemap_prep() called filemap_write_and_wait() for the whole possible
  798. * file range (0 to LLONG_MAX), but that is not enough if we have
  799. * compression enabled. The first filemap_fdatawrite_range() only kicks
  800. * in the compression of data (in an async thread) and will return
  801. * before the compression is done and writeback is started. A second
  802. * filemap_fdatawrite_range() is needed to wait for the compression to
  803. * complete and writeback to start. We also need to wait for ordered
  804. * extents to complete, because our fiemap implementation uses mainly
  805. * file extent items to list the extents, searching for extent maps
  806. * only for file ranges with holes or prealloc extents to figure out
  807. * if we have delalloc in those ranges.
  808. */
  809. if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
  810. ret = btrfs_wait_ordered_range(btrfs_inode, 0, LLONG_MAX);
  811. if (ret)
  812. return ret;
  813. }
  814. btrfs_inode_lock(btrfs_inode, BTRFS_ILOCK_SHARED);
  815. /*
  816. * We did an initial flush to avoid holding the inode's lock while
  817. * triggering writeback and waiting for the completion of IO and ordered
  818. * extents. Now after we locked the inode we do it again, because it's
  819. * possible a new write may have happened in between those two steps.
  820. */
  821. if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
  822. ret = btrfs_wait_ordered_range(btrfs_inode, 0, LLONG_MAX);
  823. if (ret) {
  824. btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
  825. return ret;
  826. }
  827. }
  828. ret = extent_fiemap(btrfs_inode, fieinfo, start, len);
  829. btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
  830. return ret;
  831. }