ordered-data.c 40 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Copyright (C) 2007 Oracle. All rights reserved.
  4. */
  5. #include <linux/slab.h>
  6. #include <linux/blkdev.h>
  7. #include <linux/writeback.h>
  8. #include <linux/sched/mm.h>
  9. #include "messages.h"
  10. #include "misc.h"
  11. #include "ctree.h"
  12. #include "transaction.h"
  13. #include "btrfs_inode.h"
  14. #include "extent_io.h"
  15. #include "disk-io.h"
  16. #include "compression.h"
  17. #include "delalloc-space.h"
  18. #include "qgroup.h"
  19. #include "subpage.h"
  20. #include "file.h"
  21. #include "block-group.h"
  22. static struct kmem_cache *btrfs_ordered_extent_cache;
  23. static u64 entry_end(struct btrfs_ordered_extent *entry)
  24. {
  25. if (entry->file_offset + entry->num_bytes < entry->file_offset)
  26. return (u64)-1;
  27. return entry->file_offset + entry->num_bytes;
  28. }
  29. /* returns NULL if the insertion worked, or it returns the node it did find
  30. * in the tree
  31. */
  32. static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
  33. struct rb_node *node)
  34. {
  35. struct rb_node **p = &root->rb_node;
  36. struct rb_node *parent = NULL;
  37. struct btrfs_ordered_extent *entry;
  38. while (*p) {
  39. parent = *p;
  40. entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);
  41. if (file_offset < entry->file_offset)
  42. p = &(*p)->rb_left;
  43. else if (file_offset >= entry_end(entry))
  44. p = &(*p)->rb_right;
  45. else
  46. return parent;
  47. }
  48. rb_link_node(node, parent, p);
  49. rb_insert_color(node, root);
  50. return NULL;
  51. }
  52. /*
  53. * look for a given offset in the tree, and if it can't be found return the
  54. * first lesser offset
  55. */
  56. static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
  57. struct rb_node **prev_ret)
  58. {
  59. struct rb_node *n = root->rb_node;
  60. struct rb_node *prev = NULL;
  61. struct rb_node *test;
  62. struct btrfs_ordered_extent *entry;
  63. struct btrfs_ordered_extent *prev_entry = NULL;
  64. while (n) {
  65. entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
  66. prev = n;
  67. prev_entry = entry;
  68. if (file_offset < entry->file_offset)
  69. n = n->rb_left;
  70. else if (file_offset >= entry_end(entry))
  71. n = n->rb_right;
  72. else
  73. return n;
  74. }
  75. if (!prev_ret)
  76. return NULL;
  77. while (prev && file_offset >= entry_end(prev_entry)) {
  78. test = rb_next(prev);
  79. if (!test)
  80. break;
  81. prev_entry = rb_entry(test, struct btrfs_ordered_extent,
  82. rb_node);
  83. if (file_offset < entry_end(prev_entry))
  84. break;
  85. prev = test;
  86. }
  87. if (prev)
  88. prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
  89. rb_node);
  90. while (prev && file_offset < entry_end(prev_entry)) {
  91. test = rb_prev(prev);
  92. if (!test)
  93. break;
  94. prev_entry = rb_entry(test, struct btrfs_ordered_extent,
  95. rb_node);
  96. prev = test;
  97. }
  98. *prev_ret = prev;
  99. return NULL;
  100. }
  101. static int btrfs_range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
  102. u64 len)
  103. {
  104. if (file_offset + len <= entry->file_offset ||
  105. entry->file_offset + entry->num_bytes <= file_offset)
  106. return 0;
  107. return 1;
  108. }
  109. /*
  110. * look find the first ordered struct that has this offset, otherwise
  111. * the first one less than this offset
  112. */
  113. static inline struct rb_node *ordered_tree_search(struct btrfs_inode *inode,
  114. u64 file_offset)
  115. {
  116. struct rb_node *prev = NULL;
  117. struct rb_node *ret;
  118. struct btrfs_ordered_extent *entry;
  119. if (inode->ordered_tree_last) {
  120. entry = rb_entry(inode->ordered_tree_last, struct btrfs_ordered_extent,
  121. rb_node);
  122. if (in_range(file_offset, entry->file_offset, entry->num_bytes))
  123. return inode->ordered_tree_last;
  124. }
  125. ret = __tree_search(&inode->ordered_tree, file_offset, &prev);
  126. if (!ret)
  127. ret = prev;
  128. if (ret)
  129. inode->ordered_tree_last = ret;
  130. return ret;
  131. }
  132. static struct btrfs_ordered_extent *alloc_ordered_extent(
  133. struct btrfs_inode *inode, u64 file_offset, u64 num_bytes,
  134. u64 ram_bytes, u64 disk_bytenr, u64 disk_num_bytes,
  135. u64 offset, unsigned long flags, int compress_type)
  136. {
  137. struct btrfs_ordered_extent *entry;
  138. int ret;
  139. u64 qgroup_rsv = 0;
  140. const bool is_nocow = (flags &
  141. ((1U << BTRFS_ORDERED_NOCOW) | (1U << BTRFS_ORDERED_PREALLOC)));
  142. /*
  143. * For a NOCOW write we can free the qgroup reserve right now. For a COW
  144. * one we transfer the reserved space from the inode's iotree into the
  145. * ordered extent by calling btrfs_qgroup_release_data() and tracking
  146. * the qgroup reserved amount in the ordered extent, so that later after
  147. * completing the ordered extent, when running the data delayed ref it
  148. * creates, we free the reserved data with btrfs_qgroup_free_refroot().
  149. */
  150. if (is_nocow)
  151. ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes, &qgroup_rsv);
  152. else
  153. ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes, &qgroup_rsv);
  154. if (ret < 0)
  155. return ERR_PTR(ret);
  156. entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
  157. if (!entry) {
  158. entry = ERR_PTR(-ENOMEM);
  159. goto out;
  160. }
  161. entry->file_offset = file_offset;
  162. entry->num_bytes = num_bytes;
  163. entry->ram_bytes = ram_bytes;
  164. entry->disk_bytenr = disk_bytenr;
  165. entry->disk_num_bytes = disk_num_bytes;
  166. entry->offset = offset;
  167. entry->bytes_left = num_bytes;
  168. if (WARN_ON_ONCE(!igrab(&inode->vfs_inode))) {
  169. kmem_cache_free(btrfs_ordered_extent_cache, entry);
  170. entry = ERR_PTR(-ESTALE);
  171. goto out;
  172. }
  173. entry->inode = inode;
  174. entry->compress_type = compress_type;
  175. entry->truncated_len = (u64)-1;
  176. entry->qgroup_rsv = qgroup_rsv;
  177. entry->flags = flags;
  178. refcount_set(&entry->refs, 1);
  179. init_waitqueue_head(&entry->wait);
  180. INIT_LIST_HEAD(&entry->list);
  181. INIT_LIST_HEAD(&entry->log_list);
  182. INIT_LIST_HEAD(&entry->root_extent_list);
  183. INIT_LIST_HEAD(&entry->work_list);
  184. INIT_LIST_HEAD(&entry->bioc_list);
  185. init_completion(&entry->completion);
  186. /*
  187. * We don't need the count_max_extents here, we can assume that all of
  188. * that work has been done at higher layers, so this is truly the
  189. * smallest the extent is going to get.
  190. */
  191. spin_lock(&inode->lock);
  192. btrfs_mod_outstanding_extents(inode, 1);
  193. spin_unlock(&inode->lock);
  194. out:
  195. if (IS_ERR(entry) && !is_nocow)
  196. btrfs_qgroup_free_refroot(inode->root->fs_info,
  197. btrfs_root_id(inode->root),
  198. qgroup_rsv, BTRFS_QGROUP_RSV_DATA);
  199. return entry;
  200. }
  201. static void insert_ordered_extent(struct btrfs_ordered_extent *entry)
  202. {
  203. struct btrfs_inode *inode = entry->inode;
  204. struct btrfs_root *root = inode->root;
  205. struct btrfs_fs_info *fs_info = root->fs_info;
  206. struct rb_node *node;
  207. trace_btrfs_ordered_extent_add(inode, entry);
  208. percpu_counter_add_batch(&fs_info->ordered_bytes, entry->num_bytes,
  209. fs_info->delalloc_batch);
  210. /* One ref for the tree. */
  211. refcount_inc(&entry->refs);
  212. spin_lock(&inode->ordered_tree_lock);
  213. node = tree_insert(&inode->ordered_tree, entry->file_offset,
  214. &entry->rb_node);
  215. if (unlikely(node))
  216. btrfs_panic(fs_info, -EEXIST,
  217. "inconsistency in ordered tree at offset %llu",
  218. entry->file_offset);
  219. spin_unlock(&inode->ordered_tree_lock);
  220. spin_lock(&root->ordered_extent_lock);
  221. list_add_tail(&entry->root_extent_list,
  222. &root->ordered_extents);
  223. root->nr_ordered_extents++;
  224. if (root->nr_ordered_extents == 1) {
  225. spin_lock(&fs_info->ordered_root_lock);
  226. BUG_ON(!list_empty(&root->ordered_root));
  227. list_add_tail(&root->ordered_root, &fs_info->ordered_roots);
  228. spin_unlock(&fs_info->ordered_root_lock);
  229. }
  230. spin_unlock(&root->ordered_extent_lock);
  231. }
  232. /*
  233. * Add an ordered extent to the per-inode tree.
  234. *
  235. * @inode: Inode that this extent is for.
  236. * @file_offset: Logical offset in file where the extent starts.
  237. * @num_bytes: Logical length of extent in file.
  238. * @ram_bytes: Full length of unencoded data.
  239. * @disk_bytenr: Offset of extent on disk.
  240. * @disk_num_bytes: Size of extent on disk.
  241. * @offset: Offset into unencoded data where file data starts.
  242. * @flags: Flags specifying type of extent (1U << BTRFS_ORDERED_*).
  243. * @compress_type: Compression algorithm used for data.
  244. *
  245. * Most of these parameters correspond to &struct btrfs_file_extent_item. The
  246. * tree is given a single reference on the ordered extent that was inserted, and
  247. * the returned pointer is given a second reference.
  248. *
  249. * Return: the new ordered extent or error pointer.
  250. */
  251. struct btrfs_ordered_extent *btrfs_alloc_ordered_extent(
  252. struct btrfs_inode *inode, u64 file_offset,
  253. const struct btrfs_file_extent *file_extent, unsigned long flags)
  254. {
  255. struct btrfs_ordered_extent *entry;
  256. ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0);
  257. /*
  258. * For regular writes, we just use the members in @file_extent.
  259. *
  260. * For NOCOW, we don't really care about the numbers except @start and
  261. * file_extent->num_bytes, as we won't insert a file extent item at all.
  262. *
  263. * For PREALLOC, we do not use ordered extent members, but
  264. * btrfs_mark_extent_written() handles everything.
  265. *
  266. * So here we always pass 0 as offset for NOCOW/PREALLOC ordered extents,
  267. * or btrfs_split_ordered_extent() cannot handle it correctly.
  268. */
  269. if (flags & ((1U << BTRFS_ORDERED_NOCOW) | (1U << BTRFS_ORDERED_PREALLOC)))
  270. entry = alloc_ordered_extent(inode, file_offset,
  271. file_extent->num_bytes,
  272. file_extent->num_bytes,
  273. file_extent->disk_bytenr + file_extent->offset,
  274. file_extent->num_bytes, 0, flags,
  275. file_extent->compression);
  276. else
  277. entry = alloc_ordered_extent(inode, file_offset,
  278. file_extent->num_bytes,
  279. file_extent->ram_bytes,
  280. file_extent->disk_bytenr,
  281. file_extent->disk_num_bytes,
  282. file_extent->offset, flags,
  283. file_extent->compression);
  284. if (!IS_ERR(entry))
  285. insert_ordered_extent(entry);
  286. return entry;
  287. }
  288. /*
  289. * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
  290. * when an ordered extent is finished. If the list covers more than one
  291. * ordered extent, it is split across multiples.
  292. */
  293. void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
  294. struct btrfs_ordered_sum *sum)
  295. {
  296. struct btrfs_inode *inode = entry->inode;
  297. spin_lock(&inode->ordered_tree_lock);
  298. list_add_tail(&sum->list, &entry->list);
  299. spin_unlock(&inode->ordered_tree_lock);
  300. }
  301. void btrfs_mark_ordered_extent_error(struct btrfs_ordered_extent *ordered)
  302. {
  303. if (!test_and_set_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
  304. mapping_set_error(ordered->inode->vfs_inode.i_mapping, -EIO);
  305. }
  306. static void finish_ordered_fn(struct btrfs_work *work)
  307. {
  308. struct btrfs_ordered_extent *ordered_extent;
  309. ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
  310. btrfs_finish_ordered_io(ordered_extent);
  311. }
  312. static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
  313. struct folio *folio, u64 file_offset,
  314. u64 len, bool uptodate)
  315. {
  316. struct btrfs_inode *inode = ordered->inode;
  317. struct btrfs_fs_info *fs_info = inode->root->fs_info;
  318. lockdep_assert_held(&inode->ordered_tree_lock);
  319. if (folio) {
  320. ASSERT(folio->mapping);
  321. ASSERT(folio_pos(folio) <= file_offset);
  322. ASSERT(file_offset + len <= folio_next_pos(folio));
  323. /*
  324. * Ordered flag indicates whether we still have
  325. * pending io unfinished for the ordered extent.
  326. *
  327. * If it's not set, we need to skip to next range.
  328. */
  329. if (!btrfs_folio_test_ordered(fs_info, folio, file_offset, len))
  330. return false;
  331. btrfs_folio_clear_ordered(fs_info, folio, file_offset, len);
  332. }
  333. /* Now we're fine to update the accounting. */
  334. if (WARN_ON_ONCE(len > ordered->bytes_left)) {
  335. btrfs_crit(fs_info,
  336. "bad ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu to_dec=%llu left=%llu",
  337. btrfs_root_id(inode->root), btrfs_ino(inode),
  338. ordered->file_offset, ordered->num_bytes,
  339. len, ordered->bytes_left);
  340. ordered->bytes_left = 0;
  341. } else {
  342. ordered->bytes_left -= len;
  343. }
  344. if (!uptodate)
  345. set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
  346. if (ordered->bytes_left)
  347. return false;
  348. /*
  349. * All the IO of the ordered extent is finished, we need to queue
  350. * the finish_func to be executed.
  351. */
  352. set_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags);
  353. cond_wake_up(&ordered->wait);
  354. refcount_inc(&ordered->refs);
  355. trace_btrfs_ordered_extent_mark_finished(inode, ordered);
  356. return true;
  357. }
  358. static void btrfs_queue_ordered_fn(struct btrfs_ordered_extent *ordered)
  359. {
  360. struct btrfs_inode *inode = ordered->inode;
  361. struct btrfs_fs_info *fs_info = inode->root->fs_info;
  362. struct btrfs_workqueue *wq = btrfs_is_free_space_inode(inode) ?
  363. fs_info->endio_freespace_worker : fs_info->endio_write_workers;
  364. btrfs_init_work(&ordered->work, finish_ordered_fn, NULL);
  365. btrfs_queue_work(wq, &ordered->work);
  366. }
  367. void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
  368. struct folio *folio, u64 file_offset, u64 len,
  369. bool uptodate)
  370. {
  371. struct btrfs_inode *inode = ordered->inode;
  372. bool ret;
  373. trace_btrfs_finish_ordered_extent(inode, file_offset, len, uptodate);
  374. spin_lock(&inode->ordered_tree_lock);
  375. ret = can_finish_ordered_extent(ordered, folio, file_offset, len,
  376. uptodate);
  377. spin_unlock(&inode->ordered_tree_lock);
  378. /*
  379. * If this is a COW write it means we created new extent maps for the
  380. * range and they point to unwritten locations if we got an error either
  381. * before submitting a bio or during IO.
  382. *
  383. * We have marked the ordered extent with BTRFS_ORDERED_IOERR, and we
  384. * are queuing its completion below. During completion, at
  385. * btrfs_finish_one_ordered(), we will drop the extent maps for the
  386. * unwritten extents.
  387. *
  388. * However because completion runs in a work queue we can end up having
  389. * a fast fsync running before that. In the case of direct IO, once we
  390. * unlock the inode the fsync might start, and we queue the completion
  391. * before unlocking the inode. In the case of buffered IO when writeback
  392. * finishes (end_bbio_data_write()) we queue the completion, so if the
  393. * writeback was triggered by a fast fsync, the fsync might start
  394. * logging before ordered extent completion runs in the work queue.
  395. *
  396. * The fast fsync will log file extent items based on the extent maps it
  397. * finds, so if by the time it collects extent maps the ordered extent
  398. * completion didn't happen yet, it will log file extent items that
  399. * point to unwritten extents, resulting in a corruption if a crash
  400. * happens and the log tree is replayed. Note that a fast fsync does not
  401. * wait for completion of ordered extents in order to reduce latency.
  402. *
  403. * Set a flag in the inode so that the next fast fsync will wait for
  404. * ordered extents to complete before starting to log.
  405. */
  406. if (!uptodate && !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
  407. set_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags);
  408. if (ret)
  409. btrfs_queue_ordered_fn(ordered);
  410. }
  411. /*
  412. * Mark all ordered extents io inside the specified range finished.
  413. *
  414. * @folio: The involved folio for the operation.
  415. * For uncompressed buffered IO, the folio status also needs to be
  416. * updated to indicate whether the pending ordered io is finished.
  417. * Can be NULL for direct IO and compressed write.
  418. * For these cases, callers are ensured they won't execute the
  419. * endio function twice.
  420. *
  421. * This function is called for endio, thus the range must have ordered
  422. * extent(s) covering it.
  423. */
  424. void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
  425. struct folio *folio, u64 file_offset,
  426. u64 num_bytes, bool uptodate)
  427. {
  428. struct rb_node *node;
  429. struct btrfs_ordered_extent *entry = NULL;
  430. u64 cur = file_offset;
  431. const u64 end = file_offset + num_bytes;
  432. trace_btrfs_writepage_end_io_hook(inode, file_offset, end - 1, uptodate);
  433. spin_lock(&inode->ordered_tree_lock);
  434. while (cur < end) {
  435. u64 entry_end;
  436. u64 this_end;
  437. u64 len;
  438. node = ordered_tree_search(inode, cur);
  439. /* No ordered extents at all */
  440. if (!node)
  441. break;
  442. entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
  443. entry_end = entry->file_offset + entry->num_bytes;
  444. /*
  445. * |<-- OE --->| |
  446. * cur
  447. * Go to next OE.
  448. */
  449. if (cur >= entry_end) {
  450. node = rb_next(node);
  451. /* No more ordered extents, exit */
  452. if (!node)
  453. break;
  454. entry = rb_entry(node, struct btrfs_ordered_extent,
  455. rb_node);
  456. /* Go to next ordered extent and continue */
  457. cur = entry->file_offset;
  458. continue;
  459. }
  460. /*
  461. * | |<--- OE --->|
  462. * cur
  463. * Go to the start of OE.
  464. */
  465. if (cur < entry->file_offset) {
  466. cur = entry->file_offset;
  467. continue;
  468. }
  469. /*
  470. * Now we are definitely inside one ordered extent.
  471. *
  472. * |<--- OE --->|
  473. * |
  474. * cur
  475. */
  476. this_end = min(entry_end, end);
  477. len = this_end - cur;
  478. ASSERT(len < U32_MAX);
  479. if (can_finish_ordered_extent(entry, folio, cur, len, uptodate)) {
  480. spin_unlock(&inode->ordered_tree_lock);
  481. btrfs_queue_ordered_fn(entry);
  482. spin_lock(&inode->ordered_tree_lock);
  483. }
  484. cur += len;
  485. }
  486. spin_unlock(&inode->ordered_tree_lock);
  487. }
  488. /*
  489. * Finish IO for one ordered extent across a given range. The range can only
  490. * contain one ordered extent.
  491. *
  492. * @cached: The cached ordered extent. If not NULL, we can skip the tree
  493. * search and use the ordered extent directly.
  494. * Will be also used to store the finished ordered extent.
  495. * @file_offset: File offset for the finished IO
  496. * @io_size: Length of the finish IO range
  497. *
  498. * Return true if the ordered extent is finished in the range, and update
  499. * @cached.
  500. * Return false otherwise.
  501. *
  502. * NOTE: The range can NOT cross multiple ordered extents.
  503. * Thus caller should ensure the range doesn't cross ordered extents.
  504. */
  505. bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
  506. struct btrfs_ordered_extent **cached,
  507. u64 file_offset, u64 io_size)
  508. {
  509. struct rb_node *node;
  510. struct btrfs_ordered_extent *entry = NULL;
  511. bool finished = false;
  512. spin_lock(&inode->ordered_tree_lock);
  513. if (cached && *cached) {
  514. entry = *cached;
  515. goto have_entry;
  516. }
  517. node = ordered_tree_search(inode, file_offset);
  518. if (!node)
  519. goto out;
  520. entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
  521. have_entry:
  522. if (!in_range(file_offset, entry->file_offset, entry->num_bytes))
  523. goto out;
  524. if (io_size > entry->bytes_left)
  525. btrfs_crit(inode->root->fs_info,
  526. "bad ordered accounting left %llu size %llu",
  527. entry->bytes_left, io_size);
  528. entry->bytes_left -= io_size;
  529. if (entry->bytes_left == 0) {
  530. /*
  531. * Ensure only one caller can set the flag and finished_ret
  532. * accordingly
  533. */
  534. finished = !test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
  535. /* test_and_set_bit implies a barrier */
  536. cond_wake_up_nomb(&entry->wait);
  537. }
  538. out:
  539. if (finished && cached && entry) {
  540. *cached = entry;
  541. refcount_inc(&entry->refs);
  542. trace_btrfs_ordered_extent_dec_test_pending(inode, entry);
  543. }
  544. spin_unlock(&inode->ordered_tree_lock);
  545. return finished;
  546. }
  547. /*
  548. * used to drop a reference on an ordered extent. This will free
  549. * the extent if the last reference is dropped
  550. */
  551. void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
  552. {
  553. trace_btrfs_ordered_extent_put(entry->inode, entry);
  554. if (refcount_dec_and_test(&entry->refs)) {
  555. struct btrfs_ordered_sum *sum;
  556. struct btrfs_ordered_sum *tmp;
  557. ASSERT(list_empty(&entry->root_extent_list));
  558. ASSERT(list_empty(&entry->log_list));
  559. ASSERT(RB_EMPTY_NODE(&entry->rb_node));
  560. btrfs_add_delayed_iput(entry->inode);
  561. list_for_each_entry_safe(sum, tmp, &entry->list, list)
  562. kvfree(sum);
  563. kmem_cache_free(btrfs_ordered_extent_cache, entry);
  564. }
  565. }
  566. /*
  567. * remove an ordered extent from the tree. No references are dropped
  568. * and waiters are woken up.
  569. */
  570. void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
  571. struct btrfs_ordered_extent *entry)
  572. {
  573. struct btrfs_root *root = btrfs_inode->root;
  574. struct btrfs_fs_info *fs_info = root->fs_info;
  575. struct rb_node *node;
  576. bool pending;
  577. bool freespace_inode;
  578. /*
  579. * If this is a free space inode the thread has not acquired the ordered
  580. * extents lockdep map.
  581. */
  582. freespace_inode = btrfs_is_free_space_inode(btrfs_inode);
  583. btrfs_lockdep_acquire(fs_info, btrfs_trans_pending_ordered);
  584. /* This is paired with alloc_ordered_extent(). */
  585. spin_lock(&btrfs_inode->lock);
  586. btrfs_mod_outstanding_extents(btrfs_inode, -1);
  587. spin_unlock(&btrfs_inode->lock);
  588. if (root != fs_info->tree_root) {
  589. u64 release;
  590. if (test_bit(BTRFS_ORDERED_ENCODED, &entry->flags))
  591. release = entry->disk_num_bytes;
  592. else
  593. release = entry->num_bytes;
  594. btrfs_delalloc_release_metadata(btrfs_inode, release,
  595. test_bit(BTRFS_ORDERED_IOERR,
  596. &entry->flags));
  597. }
  598. percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes,
  599. fs_info->delalloc_batch);
  600. spin_lock(&btrfs_inode->ordered_tree_lock);
  601. node = &entry->rb_node;
  602. rb_erase(node, &btrfs_inode->ordered_tree);
  603. RB_CLEAR_NODE(node);
  604. if (btrfs_inode->ordered_tree_last == node)
  605. btrfs_inode->ordered_tree_last = NULL;
  606. set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
  607. pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags);
  608. spin_unlock(&btrfs_inode->ordered_tree_lock);
  609. /*
  610. * The current running transaction is waiting on us, we need to let it
  611. * know that we're complete and wake it up.
  612. */
  613. if (pending) {
  614. struct btrfs_transaction *trans;
  615. /*
  616. * The checks for trans are just a formality, it should be set,
  617. * but if it isn't we don't want to deref/assert under the spin
  618. * lock, so be nice and check if trans is set, but ASSERT() so
  619. * if it isn't set a developer will notice.
  620. */
  621. spin_lock(&fs_info->trans_lock);
  622. trans = fs_info->running_transaction;
  623. if (trans)
  624. refcount_inc(&trans->use_count);
  625. spin_unlock(&fs_info->trans_lock);
  626. ASSERT(trans || BTRFS_FS_ERROR(fs_info));
  627. if (trans) {
  628. if (atomic_dec_and_test(&trans->pending_ordered))
  629. wake_up(&trans->pending_wait);
  630. btrfs_put_transaction(trans);
  631. }
  632. }
  633. btrfs_lockdep_release(fs_info, btrfs_trans_pending_ordered);
  634. spin_lock(&root->ordered_extent_lock);
  635. list_del_init(&entry->root_extent_list);
  636. root->nr_ordered_extents--;
  637. trace_btrfs_ordered_extent_remove(btrfs_inode, entry);
  638. if (!root->nr_ordered_extents) {
  639. spin_lock(&fs_info->ordered_root_lock);
  640. BUG_ON(list_empty(&root->ordered_root));
  641. list_del_init(&root->ordered_root);
  642. spin_unlock(&fs_info->ordered_root_lock);
  643. }
  644. spin_unlock(&root->ordered_extent_lock);
  645. wake_up(&entry->wait);
  646. if (!freespace_inode)
  647. btrfs_lockdep_release(fs_info, btrfs_ordered_extent);
  648. }
  649. static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
  650. {
  651. struct btrfs_ordered_extent *ordered;
  652. ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
  653. btrfs_start_ordered_extent(ordered);
  654. complete(&ordered->completion);
  655. }
  656. /*
  657. * Wait for all the ordered extents in a root. Use @bg as range or do whole
  658. * range if it's NULL.
  659. */
  660. u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
  661. const struct btrfs_block_group *bg)
  662. {
  663. struct btrfs_fs_info *fs_info = root->fs_info;
  664. LIST_HEAD(splice);
  665. LIST_HEAD(skipped);
  666. LIST_HEAD(works);
  667. struct btrfs_ordered_extent *ordered, *next;
  668. u64 count = 0;
  669. u64 range_start, range_len;
  670. u64 range_end;
  671. if (bg) {
  672. range_start = bg->start;
  673. range_len = bg->length;
  674. } else {
  675. range_start = 0;
  676. range_len = U64_MAX;
  677. }
  678. range_end = range_start + range_len;
  679. mutex_lock(&root->ordered_extent_mutex);
  680. spin_lock(&root->ordered_extent_lock);
  681. list_splice_init(&root->ordered_extents, &splice);
  682. while (!list_empty(&splice) && nr) {
  683. ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
  684. root_extent_list);
  685. if (range_end <= ordered->disk_bytenr ||
  686. ordered->disk_bytenr + ordered->disk_num_bytes <= range_start) {
  687. list_move_tail(&ordered->root_extent_list, &skipped);
  688. cond_resched_lock(&root->ordered_extent_lock);
  689. continue;
  690. }
  691. list_move_tail(&ordered->root_extent_list,
  692. &root->ordered_extents);
  693. refcount_inc(&ordered->refs);
  694. spin_unlock(&root->ordered_extent_lock);
  695. btrfs_init_work(&ordered->flush_work,
  696. btrfs_run_ordered_extent_work, NULL);
  697. list_add_tail(&ordered->work_list, &works);
  698. btrfs_queue_work(fs_info->flush_workers, &ordered->flush_work);
  699. cond_resched();
  700. if (nr != U64_MAX)
  701. nr--;
  702. count++;
  703. spin_lock(&root->ordered_extent_lock);
  704. }
  705. list_splice_tail(&skipped, &root->ordered_extents);
  706. list_splice_tail(&splice, &root->ordered_extents);
  707. spin_unlock(&root->ordered_extent_lock);
  708. list_for_each_entry_safe(ordered, next, &works, work_list) {
  709. list_del_init(&ordered->work_list);
  710. wait_for_completion(&ordered->completion);
  711. btrfs_put_ordered_extent(ordered);
  712. cond_resched();
  713. }
  714. mutex_unlock(&root->ordered_extent_mutex);
  715. return count;
  716. }
  717. /*
  718. * Wait for @nr ordered extents that intersect the @bg, or the whole range of
  719. * the filesystem if @bg is NULL.
  720. */
  721. void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
  722. const struct btrfs_block_group *bg)
  723. {
  724. struct btrfs_root *root;
  725. LIST_HEAD(splice);
  726. u64 done;
  727. mutex_lock(&fs_info->ordered_operations_mutex);
  728. spin_lock(&fs_info->ordered_root_lock);
  729. list_splice_init(&fs_info->ordered_roots, &splice);
  730. while (!list_empty(&splice) && nr) {
  731. root = list_first_entry(&splice, struct btrfs_root,
  732. ordered_root);
  733. root = btrfs_grab_root(root);
  734. BUG_ON(!root);
  735. list_move_tail(&root->ordered_root,
  736. &fs_info->ordered_roots);
  737. spin_unlock(&fs_info->ordered_root_lock);
  738. done = btrfs_wait_ordered_extents(root, nr, bg);
  739. btrfs_put_root(root);
  740. if (nr != U64_MAX)
  741. nr -= done;
  742. spin_lock(&fs_info->ordered_root_lock);
  743. }
  744. list_splice_tail(&splice, &fs_info->ordered_roots);
  745. spin_unlock(&fs_info->ordered_root_lock);
  746. mutex_unlock(&fs_info->ordered_operations_mutex);
  747. }
  748. /*
  749. * Start IO and wait for a given ordered extent to finish.
  750. *
  751. * Wait on page writeback for all the pages in the extent but not in
  752. * [@nowriteback_start, @nowriteback_start + @nowriteback_len) and the
  753. * IO completion code to insert metadata into the btree corresponding to the extent.
  754. */
  755. void btrfs_start_ordered_extent_nowriteback(struct btrfs_ordered_extent *entry,
  756. u64 nowriteback_start, u32 nowriteback_len)
  757. {
  758. u64 start = entry->file_offset;
  759. u64 end = start + entry->num_bytes - 1;
  760. struct btrfs_inode *inode = entry->inode;
  761. bool freespace_inode;
  762. trace_btrfs_ordered_extent_start(inode, entry);
  763. /*
  764. * If this is a free space inode do not take the ordered extents lockdep
  765. * map.
  766. */
  767. freespace_inode = btrfs_is_free_space_inode(inode);
  768. /*
  769. * pages in the range can be dirty, clean or writeback. We
  770. * start IO on any dirty ones so the wait doesn't stall waiting
  771. * for the flusher thread to find them
  772. */
  773. if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) {
  774. if (!nowriteback_len) {
  775. filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end);
  776. } else {
  777. if (start < nowriteback_start)
  778. filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start,
  779. nowriteback_start - 1);
  780. if (nowriteback_start + nowriteback_len < end)
  781. filemap_fdatawrite_range(inode->vfs_inode.i_mapping,
  782. nowriteback_start + nowriteback_len,
  783. end);
  784. }
  785. }
  786. if (!freespace_inode)
  787. btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent);
  788. wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags));
  789. }
  790. /*
  791. * Used to wait on ordered extents across a large range of bytes.
  792. */
  793. int btrfs_wait_ordered_range(struct btrfs_inode *inode, u64 start, u64 len)
  794. {
  795. int ret = 0;
  796. int ret_wb = 0;
  797. u64 end;
  798. u64 orig_end;
  799. struct btrfs_ordered_extent *ordered;
  800. if (start + len < start) {
  801. orig_end = OFFSET_MAX;
  802. } else {
  803. orig_end = start + len - 1;
  804. if (orig_end > OFFSET_MAX)
  805. orig_end = OFFSET_MAX;
  806. }
  807. /* start IO across the range first to instantiate any delalloc
  808. * extents
  809. */
  810. ret = btrfs_fdatawrite_range(inode, start, orig_end);
  811. if (ret)
  812. return ret;
  813. /*
  814. * If we have a writeback error don't return immediately. Wait first
  815. * for any ordered extents that haven't completed yet. This is to make
  816. * sure no one can dirty the same page ranges and call writepages()
  817. * before the ordered extents complete - to avoid failures (-EEXIST)
  818. * when adding the new ordered extents to the ordered tree.
  819. */
  820. ret_wb = filemap_fdatawait_range(inode->vfs_inode.i_mapping, start, orig_end);
  821. end = orig_end;
  822. while (1) {
  823. ordered = btrfs_lookup_first_ordered_extent(inode, end);
  824. if (!ordered)
  825. break;
  826. if (ordered->file_offset > orig_end) {
  827. btrfs_put_ordered_extent(ordered);
  828. break;
  829. }
  830. if (ordered->file_offset + ordered->num_bytes <= start) {
  831. btrfs_put_ordered_extent(ordered);
  832. break;
  833. }
  834. btrfs_start_ordered_extent(ordered);
  835. end = ordered->file_offset;
  836. /*
  837. * If the ordered extent had an error save the error but don't
  838. * exit without waiting first for all other ordered extents in
  839. * the range to complete.
  840. */
  841. if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
  842. ret = -EIO;
  843. btrfs_put_ordered_extent(ordered);
  844. if (end == 0 || end == start)
  845. break;
  846. end--;
  847. }
  848. return ret_wb ? ret_wb : ret;
  849. }
  850. /*
  851. * find an ordered extent corresponding to file_offset. return NULL if
  852. * nothing is found, otherwise take a reference on the extent and return it
  853. */
  854. struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
  855. u64 file_offset)
  856. {
  857. struct rb_node *node;
  858. struct btrfs_ordered_extent *entry = NULL;
  859. spin_lock(&inode->ordered_tree_lock);
  860. node = ordered_tree_search(inode, file_offset);
  861. if (!node)
  862. goto out;
  863. entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
  864. if (!in_range(file_offset, entry->file_offset, entry->num_bytes))
  865. entry = NULL;
  866. if (entry) {
  867. refcount_inc(&entry->refs);
  868. trace_btrfs_ordered_extent_lookup(inode, entry);
  869. }
  870. out:
  871. spin_unlock(&inode->ordered_tree_lock);
  872. return entry;
  873. }
  874. /* Since the DIO code tries to lock a wide area we need to look for any ordered
  875. * extents that exist in the range, rather than just the start of the range.
  876. */
  877. struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
  878. struct btrfs_inode *inode, u64 file_offset, u64 len)
  879. {
  880. struct rb_node *node;
  881. struct btrfs_ordered_extent *entry = NULL;
  882. spin_lock(&inode->ordered_tree_lock);
  883. node = ordered_tree_search(inode, file_offset);
  884. if (!node) {
  885. node = ordered_tree_search(inode, file_offset + len);
  886. if (!node)
  887. goto out;
  888. }
  889. while (1) {
  890. entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
  891. if (btrfs_range_overlaps(entry, file_offset, len))
  892. break;
  893. if (entry->file_offset >= file_offset + len) {
  894. entry = NULL;
  895. break;
  896. }
  897. entry = NULL;
  898. node = rb_next(node);
  899. if (!node)
  900. break;
  901. }
  902. out:
  903. if (entry) {
  904. refcount_inc(&entry->refs);
  905. trace_btrfs_ordered_extent_lookup_range(inode, entry);
  906. }
  907. spin_unlock(&inode->ordered_tree_lock);
  908. return entry;
  909. }
  910. /*
  911. * Adds all ordered extents to the given list. The list ends up sorted by the
  912. * file_offset of the ordered extents.
  913. */
  914. void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
  915. struct list_head *list)
  916. {
  917. struct rb_node *n;
  918. btrfs_assert_inode_locked(inode);
  919. spin_lock(&inode->ordered_tree_lock);
  920. for (n = rb_first(&inode->ordered_tree); n; n = rb_next(n)) {
  921. struct btrfs_ordered_extent *ordered;
  922. ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
  923. if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
  924. continue;
  925. ASSERT(list_empty(&ordered->log_list));
  926. list_add_tail(&ordered->log_list, list);
  927. refcount_inc(&ordered->refs);
  928. trace_btrfs_ordered_extent_lookup_for_logging(inode, ordered);
  929. }
  930. spin_unlock(&inode->ordered_tree_lock);
  931. }
  932. /*
  933. * lookup and return any extent before 'file_offset'. NULL is returned
  934. * if none is found
  935. */
  936. struct btrfs_ordered_extent *
  937. btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
  938. {
  939. struct rb_node *node;
  940. struct btrfs_ordered_extent *entry = NULL;
  941. spin_lock(&inode->ordered_tree_lock);
  942. node = ordered_tree_search(inode, file_offset);
  943. if (!node)
  944. goto out;
  945. entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
  946. refcount_inc(&entry->refs);
  947. trace_btrfs_ordered_extent_lookup_first(inode, entry);
  948. out:
  949. spin_unlock(&inode->ordered_tree_lock);
  950. return entry;
  951. }
  952. /*
  953. * Lookup the first ordered extent that overlaps the range
  954. * [@file_offset, @file_offset + @len).
  955. *
  956. * The difference between this and btrfs_lookup_first_ordered_extent() is
  957. * that this one won't return any ordered extent that does not overlap the range.
  958. * And the difference against btrfs_lookup_ordered_extent() is, this function
  959. * ensures the first ordered extent gets returned.
  960. */
  961. struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range(
  962. struct btrfs_inode *inode, u64 file_offset, u64 len)
  963. {
  964. struct rb_node *node;
  965. struct rb_node *cur;
  966. struct rb_node *prev;
  967. struct rb_node *next;
  968. struct btrfs_ordered_extent *entry = NULL;
  969. spin_lock(&inode->ordered_tree_lock);
  970. node = inode->ordered_tree.rb_node;
  971. /*
  972. * Here we don't want to use tree_search() which will use tree->last
  973. * and screw up the search order.
  974. * And __tree_search() can't return the adjacent ordered extents
  975. * either, thus here we do our own search.
  976. */
  977. while (node) {
  978. entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
  979. if (file_offset < entry->file_offset) {
  980. node = node->rb_left;
  981. } else if (file_offset >= entry_end(entry)) {
  982. node = node->rb_right;
  983. } else {
  984. /*
  985. * Direct hit, got an ordered extent that starts at
  986. * @file_offset
  987. */
  988. goto out;
  989. }
  990. }
  991. if (!entry) {
  992. /* Empty tree */
  993. goto out;
  994. }
  995. cur = &entry->rb_node;
  996. /* We got an entry around @file_offset, check adjacent entries */
  997. if (entry->file_offset < file_offset) {
  998. prev = cur;
  999. next = rb_next(cur);
  1000. } else {
  1001. prev = rb_prev(cur);
  1002. next = cur;
  1003. }
  1004. if (prev) {
  1005. entry = rb_entry(prev, struct btrfs_ordered_extent, rb_node);
  1006. if (btrfs_range_overlaps(entry, file_offset, len))
  1007. goto out;
  1008. }
  1009. if (next) {
  1010. entry = rb_entry(next, struct btrfs_ordered_extent, rb_node);
  1011. if (btrfs_range_overlaps(entry, file_offset, len))
  1012. goto out;
  1013. }
  1014. /* No ordered extent in the range */
  1015. entry = NULL;
  1016. out:
  1017. if (entry) {
  1018. refcount_inc(&entry->refs);
  1019. trace_btrfs_ordered_extent_lookup_first_range(inode, entry);
  1020. }
  1021. spin_unlock(&inode->ordered_tree_lock);
  1022. return entry;
  1023. }
  1024. /*
  1025. * Lock the passed range and ensures all pending ordered extents in it are run
  1026. * to completion.
  1027. *
  1028. * @inode: Inode whose ordered tree is to be searched
  1029. * @start: Beginning of range to flush
  1030. * @end: Last byte of range to lock
  1031. * @cached_state: If passed, will return the extent state responsible for the
  1032. * locked range. It's the caller's responsibility to free the
  1033. * cached state.
  1034. *
  1035. * Always return with the given range locked, ensuring after it's called no
  1036. * order extent can be pending.
  1037. */
  1038. void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
  1039. u64 end,
  1040. struct extent_state **cached_state)
  1041. {
  1042. struct btrfs_ordered_extent *ordered;
  1043. struct extent_state *cache = NULL;
  1044. struct extent_state **cachedp = &cache;
  1045. if (cached_state)
  1046. cachedp = cached_state;
  1047. while (1) {
  1048. btrfs_lock_extent(&inode->io_tree, start, end, cachedp);
  1049. ordered = btrfs_lookup_ordered_range(inode, start,
  1050. end - start + 1);
  1051. if (!ordered) {
  1052. /*
  1053. * If no external cached_state has been passed then
  1054. * decrement the extra ref taken for cachedp since we
  1055. * aren't exposing it outside of this function
  1056. */
  1057. if (!cached_state)
  1058. refcount_dec(&cache->refs);
  1059. break;
  1060. }
  1061. btrfs_unlock_extent(&inode->io_tree, start, end, cachedp);
  1062. btrfs_start_ordered_extent(ordered);
  1063. btrfs_put_ordered_extent(ordered);
  1064. }
  1065. }
  1066. /*
  1067. * Lock the passed range and ensure all pending ordered extents in it are run
  1068. * to completion in nowait mode.
  1069. *
  1070. * Return true if btrfs_lock_ordered_range does not return any extents,
  1071. * otherwise false.
  1072. */
  1073. bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end,
  1074. struct extent_state **cached_state)
  1075. {
  1076. struct btrfs_ordered_extent *ordered;
  1077. if (!btrfs_try_lock_extent(&inode->io_tree, start, end, cached_state))
  1078. return false;
  1079. ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1);
  1080. if (!ordered)
  1081. return true;
  1082. btrfs_put_ordered_extent(ordered);
  1083. btrfs_unlock_extent(&inode->io_tree, start, end, cached_state);
  1084. return false;
  1085. }
  1086. /* Split out a new ordered extent for this first @len bytes of @ordered. */
  1087. struct btrfs_ordered_extent *btrfs_split_ordered_extent(
  1088. struct btrfs_ordered_extent *ordered, u64 len)
  1089. {
  1090. struct btrfs_inode *inode = ordered->inode;
  1091. struct btrfs_root *root = inode->root;
  1092. struct btrfs_fs_info *fs_info = root->fs_info;
  1093. u64 file_offset = ordered->file_offset;
  1094. u64 disk_bytenr = ordered->disk_bytenr;
  1095. unsigned long flags = ordered->flags;
  1096. struct btrfs_ordered_sum *sum, *tmpsum;
  1097. struct btrfs_ordered_extent *new;
  1098. struct rb_node *node;
  1099. u64 offset = 0;
  1100. trace_btrfs_ordered_extent_split(inode, ordered);
  1101. ASSERT(!(flags & (1U << BTRFS_ORDERED_COMPRESSED)));
  1102. /*
  1103. * The entire bio must be covered by the ordered extent, but we can't
  1104. * reduce the original extent to a zero length either.
  1105. */
  1106. if (WARN_ON_ONCE(len >= ordered->num_bytes))
  1107. return ERR_PTR(-EINVAL);
  1108. /*
  1109. * If our ordered extent had an error there's no point in continuing.
  1110. * The error may have come from a transaction abort done either by this
  1111. * task or some other concurrent task, and the transaction abort path
  1112. * iterates over all existing ordered extents and sets the flag
  1113. * BTRFS_ORDERED_IOERR on them.
  1114. */
  1115. if (unlikely(flags & (1U << BTRFS_ORDERED_IOERR))) {
  1116. const int fs_error = BTRFS_FS_ERROR(fs_info);
  1117. return fs_error ? ERR_PTR(fs_error) : ERR_PTR(-EIO);
  1118. }
  1119. /* We cannot split partially completed ordered extents. */
  1120. if (ordered->bytes_left) {
  1121. ASSERT(!(flags & ~BTRFS_ORDERED_TYPE_FLAGS));
  1122. if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes))
  1123. return ERR_PTR(-EINVAL);
  1124. }
  1125. /* We cannot split a compressed ordered extent. */
  1126. if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes))
  1127. return ERR_PTR(-EINVAL);
  1128. new = alloc_ordered_extent(inode, file_offset, len, len, disk_bytenr,
  1129. len, 0, flags, ordered->compress_type);
  1130. if (IS_ERR(new))
  1131. return new;
  1132. /* One ref for the tree. */
  1133. refcount_inc(&new->refs);
  1134. /*
  1135. * Take the root's ordered_extent_lock to avoid a race with
  1136. * btrfs_wait_ordered_extents() when updating the disk_bytenr and
  1137. * disk_num_bytes fields of the ordered extent below.
  1138. *
  1139. * There's no concern about a previous caller of
  1140. * btrfs_wait_ordered_extents() getting the trimmed ordered extent
  1141. * before we insert the new one, because even if it gets the ordered
  1142. * extent before it's trimmed and the new one inserted, right before it
  1143. * uses it or during its use, the ordered extent might have been
  1144. * trimmed in the meanwhile, and it missed the new ordered extent.
  1145. * There's no way around this and it's harmless for current use cases,
  1146. * so we take the root's ordered_extent_lock to fix that race during
  1147. * trimming and silence tools like KCSAN.
  1148. */
  1149. spin_lock_irq(&root->ordered_extent_lock);
  1150. spin_lock(&inode->ordered_tree_lock);
  1151. /*
  1152. * We don't have overlapping ordered extents (that would imply double
  1153. * allocation of extents) and we checked above that the split length
  1154. * does not cross the ordered extent's num_bytes field, so there's
  1155. * no need to remove it and re-insert it in the tree.
  1156. */
  1157. ordered->file_offset += len;
  1158. ordered->disk_bytenr += len;
  1159. ordered->num_bytes -= len;
  1160. ordered->disk_num_bytes -= len;
  1161. ordered->ram_bytes -= len;
  1162. if (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)) {
  1163. ASSERT(ordered->bytes_left == 0);
  1164. new->bytes_left = 0;
  1165. } else {
  1166. ordered->bytes_left -= len;
  1167. }
  1168. if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags)) {
  1169. if (ordered->truncated_len > len) {
  1170. ordered->truncated_len -= len;
  1171. } else {
  1172. new->truncated_len = ordered->truncated_len;
  1173. ordered->truncated_len = 0;
  1174. }
  1175. }
  1176. list_for_each_entry_safe(sum, tmpsum, &ordered->list, list) {
  1177. if (offset == len)
  1178. break;
  1179. list_move_tail(&sum->list, &new->list);
  1180. offset += sum->len;
  1181. }
  1182. node = tree_insert(&inode->ordered_tree, new->file_offset, &new->rb_node);
  1183. if (unlikely(node))
  1184. btrfs_panic(fs_info, -EEXIST,
  1185. "inconsistency in ordered tree at offset %llu after split",
  1186. new->file_offset);
  1187. spin_unlock(&inode->ordered_tree_lock);
  1188. list_add_tail(&new->root_extent_list, &root->ordered_extents);
  1189. root->nr_ordered_extents++;
  1190. spin_unlock_irq(&root->ordered_extent_lock);
  1191. return new;
  1192. }
  1193. int __init ordered_data_init(void)
  1194. {
  1195. btrfs_ordered_extent_cache = KMEM_CACHE(btrfs_ordered_extent, 0);
  1196. if (!btrfs_ordered_extent_cache)
  1197. return -ENOMEM;
  1198. return 0;
  1199. }
  1200. void __cold ordered_data_exit(void)
  1201. {
  1202. kmem_cache_destroy(btrfs_ordered_extent_cache);
  1203. }