extent-io-tree.c 52 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956
  1. // SPDX-License-Identifier: GPL-2.0
  2. #include <linux/slab.h>
  3. #include <trace/events/btrfs.h>
  4. #include "messages.h"
  5. #include "ctree.h"
  6. #include "extent_io.h"
  7. #include "extent-io-tree.h"
  8. #include "btrfs_inode.h"
/* Slab cache that struct extent_state records in this file are allocated from and freed to. */
static struct kmem_cache *extent_state_cache;
  10. static inline bool extent_state_in_tree(const struct extent_state *state)
  11. {
  12. return !RB_EMPTY_NODE(&state->rb_node);
  13. }
#ifdef CONFIG_BTRFS_DEBUG
/* List of extent states, linked through their leak_list member, used for leak reporting. */
static LIST_HEAD(states);
/* Taken around additions to and removals from the 'states' list. */
static DEFINE_SPINLOCK(leak_lock);
  17. static inline void btrfs_leak_debug_add_state(struct extent_state *state)
  18. {
  19. unsigned long flags;
  20. spin_lock_irqsave(&leak_lock, flags);
  21. list_add(&state->leak_list, &states);
  22. spin_unlock_irqrestore(&leak_lock, flags);
  23. }
  24. static inline void btrfs_leak_debug_del_state(struct extent_state *state)
  25. {
  26. unsigned long flags;
  27. spin_lock_irqsave(&leak_lock, flags);
  28. list_del(&state->leak_list);
  29. spin_unlock_irqrestore(&leak_lock, flags);
  30. }
  31. static inline void btrfs_extent_state_leak_debug_check(void)
  32. {
  33. struct extent_state *state;
  34. while (!list_empty(&states)) {
  35. state = list_first_entry(&states, struct extent_state, leak_list);
  36. btrfs_err(NULL,
  37. "state leak: start %llu end %llu state %u in tree %d refs %d",
  38. state->start, state->end, state->state,
  39. extent_state_in_tree(state),
  40. refcount_read(&state->refs));
  41. list_del(&state->leak_list);
  42. WARN_ON_ONCE(1);
  43. kmem_cache_free(extent_state_cache, state);
  44. }
  45. }
/* Forward the range check together with the name of the calling function (__func__). */
#define btrfs_debug_check_extent_io_range(tree, start, end) \
	__btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
  48. static inline void __btrfs_debug_check_extent_io_range(const char *caller,
  49. struct extent_io_tree *tree,
  50. u64 start, u64 end)
  51. {
  52. const struct btrfs_inode *inode = tree->inode;
  53. u64 isize;
  54. if (tree->owner != IO_TREE_INODE_IO)
  55. return;
  56. isize = i_size_read(&inode->vfs_inode);
  57. if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
  58. btrfs_debug_rl(inode->root->fs_info,
  59. "%s: ino %llu isize %llu odd range [%llu,%llu]",
  60. caller, btrfs_ino(inode), isize, start, end);
  61. }
  62. }
#else
/* Without CONFIG_BTRFS_DEBUG all the debug helpers expand to empty statements. */
#define btrfs_leak_debug_add_state(state) do {} while (0)
#define btrfs_leak_debug_del_state(state) do {} while (0)
#define btrfs_extent_state_leak_debug_check() do {} while (0)
#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0)
#endif
  69. /* Read-only access to the inode. */
  70. const struct btrfs_inode *btrfs_extent_io_tree_to_inode(const struct extent_io_tree *tree)
  71. {
  72. if (tree->owner == IO_TREE_INODE_IO)
  73. return tree->inode;
  74. return NULL;
  75. }
  76. /* For read-only access to fs_info. */
  77. const struct btrfs_fs_info *btrfs_extent_io_tree_to_fs_info(const struct extent_io_tree *tree)
  78. {
  79. if (tree->owner == IO_TREE_INODE_IO)
  80. return tree->inode->root->fs_info;
  81. return tree->fs_info;
  82. }
  83. void btrfs_extent_io_tree_init(struct btrfs_fs_info *fs_info,
  84. struct extent_io_tree *tree, unsigned int owner)
  85. {
  86. tree->state = RB_ROOT;
  87. spin_lock_init(&tree->lock);
  88. tree->fs_info = fs_info;
  89. tree->owner = owner;
  90. }
  91. /*
  92. * Empty an io tree, removing and freeing every extent state record from the
  93. * tree. This should be called once we are sure no other task can access the
  94. * tree anymore, so no tree updates happen after we empty the tree and there
  95. * aren't any waiters on any extent state record (EXTENT_LOCK_BITS are never
  96. * set on any extent state when calling this function).
  97. */
  98. void btrfs_extent_io_tree_release(struct extent_io_tree *tree)
  99. {
  100. struct rb_root root;
  101. struct extent_state *state;
  102. struct extent_state *tmp;
  103. spin_lock(&tree->lock);
  104. root = tree->state;
  105. tree->state = RB_ROOT;
  106. rbtree_postorder_for_each_entry_safe(state, tmp, &root, rb_node) {
  107. /* Clear node to keep free_extent_state() happy. */
  108. RB_CLEAR_NODE(&state->rb_node);
  109. ASSERT(!(state->state & EXTENT_LOCK_BITS));
  110. /*
  111. * No need for a memory barrier here, as we are holding the tree
  112. * lock and we only change the waitqueue while holding that lock
  113. * (see wait_extent_bit()).
  114. */
  115. ASSERT(!waitqueue_active(&state->wq));
  116. btrfs_free_extent_state(state);
  117. cond_resched_lock(&tree->lock);
  118. }
  119. /*
  120. * Should still be empty even after a reschedule, no other task should
  121. * be accessing the tree anymore.
  122. */
  123. ASSERT(RB_EMPTY_ROOT(&tree->state));
  124. spin_unlock(&tree->lock);
  125. }
  126. static struct extent_state *alloc_extent_state(gfp_t mask)
  127. {
  128. struct extent_state *state;
  129. /*
  130. * The given mask might be not appropriate for the slab allocator,
  131. * drop the unsupported bits
  132. */
  133. mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);
  134. state = kmem_cache_alloc(extent_state_cache, mask);
  135. if (!state)
  136. return state;
  137. state->state = 0;
  138. RB_CLEAR_NODE(&state->rb_node);
  139. btrfs_leak_debug_add_state(state);
  140. refcount_set(&state->refs, 1);
  141. init_waitqueue_head(&state->wq);
  142. trace_btrfs_alloc_extent_state(state, mask, _RET_IP_);
  143. return state;
  144. }
  145. static struct extent_state *alloc_extent_state_atomic(struct extent_state *prealloc)
  146. {
  147. if (!prealloc)
  148. prealloc = alloc_extent_state(GFP_ATOMIC);
  149. return prealloc;
  150. }
  151. void btrfs_free_extent_state(struct extent_state *state)
  152. {
  153. if (!state)
  154. return;
  155. if (refcount_dec_and_test(&state->refs)) {
  156. WARN_ON(extent_state_in_tree(state));
  157. btrfs_leak_debug_del_state(state);
  158. trace_btrfs_free_extent_state(state, _RET_IP_);
  159. kmem_cache_free(extent_state_cache, state);
  160. }
  161. }
  162. static int add_extent_changeset(struct extent_state *state, u32 bits,
  163. struct extent_changeset *changeset,
  164. int set)
  165. {
  166. if (!changeset)
  167. return 0;
  168. if (set && (state->state & bits) == bits)
  169. return 0;
  170. if (!set && (state->state & bits) == 0)
  171. return 0;
  172. changeset->bytes_changed += state->end - state->start + 1;
  173. return ulist_add(&changeset->range_changed, state->start, state->end, GFP_ATOMIC);
  174. }
  175. static inline struct extent_state *next_state(struct extent_state *state)
  176. {
  177. struct rb_node *next = rb_next(&state->rb_node);
  178. return rb_entry_safe(next, struct extent_state, rb_node);
  179. }
  180. static inline struct extent_state *prev_state(struct extent_state *state)
  181. {
  182. struct rb_node *next = rb_prev(&state->rb_node);
  183. return rb_entry_safe(next, struct extent_state, rb_node);
  184. }
  185. /*
  186. * Search @tree for an entry that contains @offset or if none exists for the
  187. * first entry that starts and ends after that offset.
  188. *
  189. * @tree: the tree to search
  190. * @offset: search offset
  191. * @node_ret: pointer where new node should be anchored (used when inserting an
  192. * entry in the tree)
  193. * @parent_ret: points to entry which would have been the parent of the entry,
  194. * containing @offset
  195. *
  196. * Return a pointer to the entry that contains @offset byte address.
  197. *
  198. * If no such entry exists, return the first entry that starts and ends after
  199. * @offset if one exists, otherwise NULL.
  200. *
  201. * If the returned entry starts at @offset, then @node_ret and @parent_ret
  202. * aren't changed.
  203. */
  204. static inline struct extent_state *tree_search_for_insert(struct extent_io_tree *tree,
  205. u64 offset,
  206. struct rb_node ***node_ret,
  207. struct rb_node **parent_ret)
  208. {
  209. struct rb_root *root = &tree->state;
  210. struct rb_node **node = &root->rb_node;
  211. struct rb_node *prev = NULL;
  212. struct extent_state *entry = NULL;
  213. while (*node) {
  214. prev = *node;
  215. entry = rb_entry(prev, struct extent_state, rb_node);
  216. if (offset < entry->start)
  217. node = &(*node)->rb_left;
  218. else if (offset > entry->end)
  219. node = &(*node)->rb_right;
  220. else
  221. return entry;
  222. }
  223. if (node_ret)
  224. *node_ret = node;
  225. if (parent_ret)
  226. *parent_ret = prev;
  227. /*
  228. * Return either the current entry if it contains offset (it ends after
  229. * or at offset) or the first entry that starts and ends after offset if
  230. * one exists, or NULL.
  231. */
  232. while (entry && offset > entry->end)
  233. entry = next_state(entry);
  234. return entry;
  235. }
  236. /*
  237. * Search offset in the tree or fill neighbor rbtree node pointers.
  238. *
  239. * @tree: the tree to search
  240. * @offset: offset that should fall within an entry in @tree
  241. * @next_ret: pointer to the first entry whose range ends after @offset
  242. * @prev_ret: pointer to the first entry whose range begins before @offset
  243. *
  244. * Return a pointer to the entry that contains @offset byte address. If no
  245. * such entry exists, then return NULL and fill @prev_ret and @next_ret.
  246. * Otherwise return the found entry and other pointers are left untouched.
  247. */
  248. static struct extent_state *tree_search_prev_next(struct extent_io_tree *tree,
  249. u64 offset,
  250. struct extent_state **prev_ret,
  251. struct extent_state **next_ret)
  252. {
  253. struct rb_root *root = &tree->state;
  254. struct rb_node **node = &root->rb_node;
  255. struct extent_state *orig_prev;
  256. struct extent_state *entry = NULL;
  257. ASSERT(prev_ret);
  258. ASSERT(next_ret);
  259. while (*node) {
  260. entry = rb_entry(*node, struct extent_state, rb_node);
  261. if (offset < entry->start)
  262. node = &(*node)->rb_left;
  263. else if (offset > entry->end)
  264. node = &(*node)->rb_right;
  265. else
  266. return entry;
  267. }
  268. orig_prev = entry;
  269. while (entry && offset > entry->end)
  270. entry = next_state(entry);
  271. *next_ret = entry;
  272. entry = orig_prev;
  273. while (entry && offset < entry->start)
  274. entry = prev_state(entry);
  275. *prev_ret = entry;
  276. return NULL;
  277. }
  278. /*
  279. * Inexact rb-tree search, return the next entry if @offset is not found
  280. */
  281. static inline struct extent_state *tree_search(struct extent_io_tree *tree, u64 offset)
  282. {
  283. return tree_search_for_insert(tree, offset, NULL, NULL);
  284. }
  285. static void __cold extent_io_tree_panic(const struct extent_io_tree *tree,
  286. const struct extent_state *state,
  287. const char *opname,
  288. int err)
  289. {
  290. btrfs_panic(btrfs_extent_io_tree_to_fs_info(tree), err,
  291. "extent io tree error on %s state start %llu end %llu",
  292. opname, state->start, state->end);
  293. }
  294. static void merge_prev_state(struct extent_io_tree *tree, struct extent_state *state)
  295. {
  296. struct extent_state *prev;
  297. prev = prev_state(state);
  298. if (prev && prev->end == state->start - 1 && prev->state == state->state) {
  299. if (tree->owner == IO_TREE_INODE_IO)
  300. btrfs_merge_delalloc_extent(tree->inode, state, prev);
  301. state->start = prev->start;
  302. rb_erase(&prev->rb_node, &tree->state);
  303. RB_CLEAR_NODE(&prev->rb_node);
  304. btrfs_free_extent_state(prev);
  305. }
  306. }
  307. static void merge_next_state(struct extent_io_tree *tree, struct extent_state *state)
  308. {
  309. struct extent_state *next;
  310. next = next_state(state);
  311. if (next && next->start == state->end + 1 && next->state == state->state) {
  312. if (tree->owner == IO_TREE_INODE_IO)
  313. btrfs_merge_delalloc_extent(tree->inode, state, next);
  314. state->end = next->end;
  315. rb_erase(&next->rb_node, &tree->state);
  316. RB_CLEAR_NODE(&next->rb_node);
  317. btrfs_free_extent_state(next);
  318. }
  319. }
  320. /*
  321. * Utility function to look for merge candidates inside a given range. Any
  322. * extents with matching state are merged together into a single extent in the
  323. * tree. Extents with EXTENT_IO in their state field are not merged because
  324. * the end_io handlers need to be able to do operations on them without
  325. * sleeping (or doing allocations/splits).
  326. *
  327. * This should be called with the tree lock held.
  328. */
  329. static void merge_state(struct extent_io_tree *tree, struct extent_state *state)
  330. {
  331. if (state->state & (EXTENT_LOCK_BITS | EXTENT_BOUNDARY))
  332. return;
  333. merge_prev_state(tree, state);
  334. merge_next_state(tree, state);
  335. }
  336. static void set_state_bits(struct extent_io_tree *tree,
  337. struct extent_state *state,
  338. u32 bits, struct extent_changeset *changeset)
  339. {
  340. u32 bits_to_set = bits & ~EXTENT_CTLBITS;
  341. int ret;
  342. if (tree->owner == IO_TREE_INODE_IO)
  343. btrfs_set_delalloc_extent(tree->inode, state, bits);
  344. ret = add_extent_changeset(state, bits_to_set, changeset, 1);
  345. BUG_ON(ret < 0);
  346. state->state |= bits_to_set;
  347. }
  348. /*
  349. * Insert an extent_state struct into the tree. 'bits' are set on the
  350. * struct before it is inserted.
  351. *
  352. * Returns a pointer to the struct extent_state record containing the range
  353. * requested for insertion, which may be the same as the given struct or it
  354. * may be an existing record in the tree that was expanded to accommodate the
  355. * requested range. In case of an extent_state different from the one that was
  356. * given, the later can be freed or reused by the caller.
  357. *
  358. * On error it returns an error pointer.
  359. *
  360. * The tree lock is not taken internally. This is a utility function and
  361. * probably isn't what you want to call (see set/clear_extent_bit).
  362. */
  363. static struct extent_state *insert_state(struct extent_io_tree *tree,
  364. struct extent_state *state,
  365. u32 bits,
  366. struct extent_changeset *changeset)
  367. {
  368. struct rb_node **node;
  369. struct rb_node *parent = NULL;
  370. const u64 start = state->start - 1;
  371. const u64 end = state->end + 1;
  372. const bool try_merge = !(bits & (EXTENT_LOCK_BITS | EXTENT_BOUNDARY));
  373. set_state_bits(tree, state, bits, changeset);
  374. node = &tree->state.rb_node;
  375. while (*node) {
  376. struct extent_state *entry;
  377. parent = *node;
  378. entry = rb_entry(parent, struct extent_state, rb_node);
  379. if (state->end < entry->start) {
  380. if (try_merge && end == entry->start &&
  381. state->state == entry->state) {
  382. if (tree->owner == IO_TREE_INODE_IO)
  383. btrfs_merge_delalloc_extent(tree->inode,
  384. state, entry);
  385. entry->start = state->start;
  386. merge_prev_state(tree, entry);
  387. state->state = 0;
  388. return entry;
  389. }
  390. node = &(*node)->rb_left;
  391. } else if (state->end > entry->end) {
  392. if (try_merge && entry->end == start &&
  393. state->state == entry->state) {
  394. if (tree->owner == IO_TREE_INODE_IO)
  395. btrfs_merge_delalloc_extent(tree->inode,
  396. state, entry);
  397. entry->end = state->end;
  398. merge_next_state(tree, entry);
  399. state->state = 0;
  400. return entry;
  401. }
  402. node = &(*node)->rb_right;
  403. } else {
  404. return ERR_PTR(-EEXIST);
  405. }
  406. }
  407. rb_link_node(&state->rb_node, parent, node);
  408. rb_insert_color(&state->rb_node, &tree->state);
  409. return state;
  410. }
  411. /*
  412. * Insert state to @tree to the location given by @node and @parent.
  413. */
  414. static void insert_state_fast(struct extent_io_tree *tree,
  415. struct extent_state *state, struct rb_node **node,
  416. struct rb_node *parent, unsigned bits,
  417. struct extent_changeset *changeset)
  418. {
  419. set_state_bits(tree, state, bits, changeset);
  420. rb_link_node(&state->rb_node, parent, node);
  421. rb_insert_color(&state->rb_node, &tree->state);
  422. merge_state(tree, state);
  423. }
  424. /*
  425. * Split a given extent state struct in two, inserting the preallocated
  426. * struct 'prealloc' as the newly created second half. 'split' indicates an
  427. * offset inside 'orig' where it should be split.
  428. *
  429. * Before calling,
  430. * the tree has 'orig' at [orig->start, orig->end]. After calling, there
  431. * are two extent state structs in the tree:
  432. * prealloc: [orig->start, split - 1]
  433. * orig: [ split, orig->end ]
  434. *
  435. * The tree locks are not taken by this function. They need to be held
  436. * by the caller.
  437. */
  438. static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
  439. struct extent_state *prealloc, u64 split)
  440. {
  441. struct rb_node *parent = NULL;
  442. struct rb_node **node;
  443. if (tree->owner == IO_TREE_INODE_IO)
  444. btrfs_split_delalloc_extent(tree->inode, orig, split);
  445. prealloc->start = orig->start;
  446. prealloc->end = split - 1;
  447. prealloc->state = orig->state;
  448. orig->start = split;
  449. parent = &orig->rb_node;
  450. node = &parent;
  451. while (*node) {
  452. struct extent_state *entry;
  453. parent = *node;
  454. entry = rb_entry(parent, struct extent_state, rb_node);
  455. if (prealloc->end < entry->start) {
  456. node = &(*node)->rb_left;
  457. } else if (prealloc->end > entry->end) {
  458. node = &(*node)->rb_right;
  459. } else {
  460. btrfs_free_extent_state(prealloc);
  461. return -EEXIST;
  462. }
  463. }
  464. rb_link_node(&prealloc->rb_node, parent, node);
  465. rb_insert_color(&prealloc->rb_node, &tree->state);
  466. return 0;
  467. }
  468. /*
  469. * Use this during tree iteration to avoid doing next node searches when it's
  470. * not needed (the current record ends at or after the target range's end).
  471. */
  472. static inline struct extent_state *next_search_state(struct extent_state *state, u64 end)
  473. {
  474. if (state->end < end)
  475. return next_state(state);
  476. return NULL;
  477. }
  478. /*
  479. * Utility function to clear some bits in an extent state struct. It will
  480. * optionally wake up anyone waiting on this state (wake == 1).
  481. *
  482. * If no bits are set on the state struct after clearing things, the
  483. * struct is freed and removed from the tree
  484. */
  485. static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
  486. struct extent_state *state,
  487. u32 bits, int wake, u64 end,
  488. struct extent_changeset *changeset)
  489. {
  490. struct extent_state *next;
  491. u32 bits_to_clear = bits & ~EXTENT_CTLBITS;
  492. int ret;
  493. if (tree->owner == IO_TREE_INODE_IO)
  494. btrfs_clear_delalloc_extent(tree->inode, state, bits);
  495. ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
  496. BUG_ON(ret < 0);
  497. state->state &= ~bits_to_clear;
  498. if (wake)
  499. wake_up(&state->wq);
  500. if (state->state == 0) {
  501. next = next_search_state(state, end);
  502. if (extent_state_in_tree(state)) {
  503. rb_erase(&state->rb_node, &tree->state);
  504. RB_CLEAR_NODE(&state->rb_node);
  505. btrfs_free_extent_state(state);
  506. } else {
  507. WARN_ON(1);
  508. }
  509. } else {
  510. merge_state(tree, state);
  511. next = next_search_state(state, end);
  512. }
  513. return next;
  514. }
  515. /*
  516. * Detect if extent bits request NOWAIT semantics and set the gfp mask accordingly,
  517. * unset the EXTENT_NOWAIT bit.
  518. */
  519. static void set_gfp_mask_from_bits(u32 *bits, gfp_t *mask)
  520. {
  521. *mask = (*bits & EXTENT_NOWAIT ? GFP_NOWAIT : GFP_NOFS);
  522. *bits &= EXTENT_NOWAIT - 1;
  523. }
  524. /*
  525. * Clear some bits on a range in the tree. This may require splitting or
  526. * inserting elements in the tree, so the gfp mask is used to indicate which
  527. * allocations or sleeping are allowed.
  528. *
  529. * The range [start, end] is inclusive.
  530. *
  531. * This takes the tree lock, and returns 0 on success and < 0 on error.
  532. */
  533. int btrfs_clear_extent_bit_changeset(struct extent_io_tree *tree, u64 start, u64 end,
  534. u32 bits, struct extent_state **cached_state,
  535. struct extent_changeset *changeset)
  536. {
  537. struct extent_state *state;
  538. struct extent_state *cached;
  539. struct extent_state *prealloc = NULL;
  540. u64 last_end;
  541. int ret = 0;
  542. bool clear;
  543. bool wake;
  544. const bool delete = (bits & EXTENT_CLEAR_ALL_BITS);
  545. gfp_t mask;
  546. set_gfp_mask_from_bits(&bits, &mask);
  547. btrfs_debug_check_extent_io_range(tree, start, end);
  548. trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits);
  549. if (delete)
  550. bits |= ~EXTENT_CTLBITS;
  551. if (bits & EXTENT_DELALLOC)
  552. bits |= EXTENT_NORESERVE;
  553. wake = (bits & EXTENT_LOCK_BITS);
  554. clear = (bits & (EXTENT_LOCK_BITS | EXTENT_BOUNDARY));
  555. again:
  556. if (!prealloc) {
  557. /*
  558. * Don't care for allocation failure here because we might end
  559. * up not needing the pre-allocated extent state at all, which
  560. * is the case if we only have in the tree extent states that
  561. * cover our input range and don't cover too any other range.
  562. * If we end up needing a new extent state we allocate it later.
  563. */
  564. prealloc = alloc_extent_state(mask);
  565. }
  566. spin_lock(&tree->lock);
  567. if (cached_state) {
  568. cached = *cached_state;
  569. if (clear) {
  570. *cached_state = NULL;
  571. cached_state = NULL;
  572. }
  573. if (cached && extent_state_in_tree(cached) &&
  574. cached->start <= start && cached->end > start) {
  575. if (clear)
  576. refcount_dec(&cached->refs);
  577. state = cached;
  578. goto hit_next;
  579. }
  580. if (clear)
  581. btrfs_free_extent_state(cached);
  582. }
  583. /* This search will find the extents that end after our range starts. */
  584. state = tree_search(tree, start);
  585. if (!state)
  586. goto out;
  587. hit_next:
  588. if (state->start > end)
  589. goto out;
  590. WARN_ON(state->end < start);
  591. last_end = state->end;
  592. /* The state doesn't have the wanted bits, go ahead. */
  593. if (!(state->state & bits)) {
  594. state = next_search_state(state, end);
  595. goto next;
  596. }
  597. /*
  598. * | ---- desired range ---- |
  599. * | state | or
  600. * | ------------- state -------------- |
  601. *
  602. * We need to split the extent we found, and may flip bits on second
  603. * half.
  604. *
  605. * If the extent we found extends past our range, we just split and
  606. * search again. It'll get split again the next time though.
  607. *
  608. * If the extent we found is inside our range, we clear the desired bit
  609. * on it.
  610. */
  611. if (state->start < start) {
  612. prealloc = alloc_extent_state_atomic(prealloc);
  613. if (!prealloc)
  614. goto search_again;
  615. ret = split_state(tree, state, prealloc, start);
  616. prealloc = NULL;
  617. if (ret) {
  618. extent_io_tree_panic(tree, state, "split", ret);
  619. goto out;
  620. }
  621. if (state->end <= end) {
  622. state = clear_state_bit(tree, state, bits, wake, end,
  623. changeset);
  624. goto next;
  625. }
  626. if (need_resched())
  627. goto search_again;
  628. /*
  629. * Fallthrough and try atomic extent state allocation if needed.
  630. * If it fails we'll jump to 'search_again' retry the allocation
  631. * in non-atomic mode and start the search again.
  632. */
  633. }
  634. /*
  635. * | ---- desired range ---- |
  636. * | state |
  637. * We need to split the extent, and clear the bit on the first half.
  638. */
  639. if (state->start <= end && state->end > end) {
  640. prealloc = alloc_extent_state_atomic(prealloc);
  641. if (!prealloc)
  642. goto search_again;
  643. ret = split_state(tree, state, prealloc, end + 1);
  644. if (ret) {
  645. extent_io_tree_panic(tree, state, "split", ret);
  646. prealloc = NULL;
  647. goto out;
  648. }
  649. if (wake)
  650. wake_up(&state->wq);
  651. clear_state_bit(tree, prealloc, bits, wake, end, changeset);
  652. prealloc = NULL;
  653. goto out;
  654. }
  655. state = clear_state_bit(tree, state, bits, wake, end, changeset);
  656. next:
  657. if (last_end >= end)
  658. goto out;
  659. start = last_end + 1;
  660. if (state && !need_resched())
  661. goto hit_next;
  662. search_again:
  663. spin_unlock(&tree->lock);
  664. if (gfpflags_allow_blocking(mask))
  665. cond_resched();
  666. goto again;
  667. out:
  668. spin_unlock(&tree->lock);
  669. btrfs_free_extent_state(prealloc);
  670. return ret;
  671. }
  672. /*
  673. * Wait for one or more bits to clear on a range in the state tree.
  674. * The range [start, end] is inclusive.
  675. * The tree lock is taken by this function
  676. */
  677. static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
  678. u32 bits, struct extent_state **cached_state)
  679. {
  680. struct extent_state *state;
  681. btrfs_debug_check_extent_io_range(tree, start, end);
  682. spin_lock(&tree->lock);
  683. again:
  684. /*
  685. * Maintain cached_state, as we may not remove it from the tree if there
  686. * are more bits than the bits we're waiting on set on this state.
  687. */
  688. if (cached_state && *cached_state) {
  689. state = *cached_state;
  690. if (extent_state_in_tree(state) &&
  691. state->start <= start && start < state->end)
  692. goto process_node;
  693. }
  694. while (1) {
  695. /*
  696. * This search will find all the extents that end after our
  697. * range starts.
  698. */
  699. state = tree_search(tree, start);
  700. process_node:
  701. if (!state)
  702. break;
  703. if (state->start > end)
  704. goto out;
  705. if (state->state & bits) {
  706. DEFINE_WAIT(wait);
  707. start = state->start;
  708. refcount_inc(&state->refs);
  709. prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
  710. spin_unlock(&tree->lock);
  711. schedule();
  712. spin_lock(&tree->lock);
  713. finish_wait(&state->wq, &wait);
  714. btrfs_free_extent_state(state);
  715. goto again;
  716. }
  717. start = state->end + 1;
  718. if (start > end)
  719. break;
  720. if (!cond_resched_lock(&tree->lock)) {
  721. state = next_state(state);
  722. goto process_node;
  723. }
  724. }
  725. out:
  726. /* This state is no longer useful, clear it and free it up. */
  727. if (cached_state && *cached_state) {
  728. state = *cached_state;
  729. *cached_state = NULL;
  730. btrfs_free_extent_state(state);
  731. }
  732. spin_unlock(&tree->lock);
  733. }
  734. static void cache_state_if_flags(struct extent_state *state,
  735. struct extent_state **cached_ptr,
  736. unsigned flags)
  737. {
  738. if (cached_ptr && !(*cached_ptr)) {
  739. if (!flags || (state->state & flags)) {
  740. *cached_ptr = state;
  741. refcount_inc(&state->refs);
  742. }
  743. }
  744. }
  745. static void cache_state(struct extent_state *state,
  746. struct extent_state **cached_ptr)
  747. {
  748. return cache_state_if_flags(state, cached_ptr, EXTENT_LOCK_BITS | EXTENT_BOUNDARY);
  749. }
  750. /*
  751. * Find the first state struct with 'bits' set after 'start', and return it.
  752. * tree->lock must be held. NULL will returned if nothing was found after
  753. * 'start'.
  754. */
  755. static struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
  756. u64 start, u32 bits)
  757. {
  758. struct extent_state *state;
  759. /*
  760. * This search will find all the extents that end after our range
  761. * starts.
  762. */
  763. state = tree_search(tree, start);
  764. while (state) {
  765. if (state->state & bits)
  766. return state;
  767. state = next_state(state);
  768. }
  769. return NULL;
  770. }
  771. /*
  772. * Find the first offset in the io tree with one or more @bits set.
  773. *
  774. * Note: If there are multiple bits set in @bits, any of them will match.
  775. *
  776. * Return true if we find something, and update @start_ret and @end_ret.
  777. * Return false if we found nothing.
  778. */
  779. bool btrfs_find_first_extent_bit(struct extent_io_tree *tree, u64 start,
  780. u64 *start_ret, u64 *end_ret, u32 bits,
  781. struct extent_state **cached_state)
  782. {
  783. struct extent_state *state;
  784. bool ret = false;
  785. spin_lock(&tree->lock);
  786. if (cached_state && *cached_state) {
  787. state = *cached_state;
  788. if (state->end == start - 1 && extent_state_in_tree(state)) {
  789. while ((state = next_state(state)) != NULL) {
  790. if (state->state & bits)
  791. break;
  792. }
  793. /*
  794. * If we found the next extent state, clear cached_state
  795. * so that we can cache the next extent state below and
  796. * avoid future calls going over the same extent state
  797. * again. If we haven't found any, clear as well since
  798. * it's now useless.
  799. */
  800. btrfs_free_extent_state(*cached_state);
  801. *cached_state = NULL;
  802. if (state)
  803. goto got_it;
  804. goto out;
  805. }
  806. btrfs_free_extent_state(*cached_state);
  807. *cached_state = NULL;
  808. }
  809. state = find_first_extent_bit_state(tree, start, bits);
  810. got_it:
  811. if (state) {
  812. cache_state_if_flags(state, cached_state, 0);
  813. *start_ret = state->start;
  814. *end_ret = state->end;
  815. ret = true;
  816. }
  817. out:
  818. spin_unlock(&tree->lock);
  819. return ret;
  820. }
  821. /*
  822. * Find a contiguous area of bits
  823. *
  824. * @tree: io tree to check
  825. * @start: offset to start the search from
  826. * @start_ret: the first offset we found with the bits set
  827. * @end_ret: the final contiguous range of the bits that were set
  828. * @bits: bits to look for
  829. *
  830. * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges
  831. * to set bits appropriately, and then merge them again. During this time it
  832. * will drop the tree->lock, so use this helper if you want to find the actual
  833. * contiguous area for given bits. We will search to the first bit we find, and
  834. * then walk down the tree until we find a non-contiguous area. The area
  835. * returned will be the full contiguous area with the bits set.
  836. *
  837. * Returns true if we found a range with the given bits set, in which case
  838. * @start_ret and @end_ret are updated, or false if no range was found.
  839. */
  840. bool btrfs_find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
  841. u64 *start_ret, u64 *end_ret, u32 bits)
  842. {
  843. struct extent_state *state;
  844. bool ret = false;
  845. ASSERT(!btrfs_fs_incompat(btrfs_extent_io_tree_to_fs_info(tree), NO_HOLES));
  846. spin_lock(&tree->lock);
  847. state = find_first_extent_bit_state(tree, start, bits);
  848. if (state) {
  849. *start_ret = state->start;
  850. *end_ret = state->end;
  851. while ((state = next_state(state)) != NULL) {
  852. if (state->start > (*end_ret + 1))
  853. break;
  854. *end_ret = state->end;
  855. }
  856. ret = true;
  857. }
  858. spin_unlock(&tree->lock);
  859. return ret;
  860. }
  861. /*
  862. * Find a contiguous range of bytes in the file marked as delalloc, not more
  863. * than 'max_bytes'. start and end are used to return the range,
  864. *
  865. * True is returned if we find something, false if nothing was in the tree.
  866. */
  867. bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
  868. u64 *end, u64 max_bytes,
  869. struct extent_state **cached_state)
  870. {
  871. struct extent_state *state;
  872. u64 cur_start = *start;
  873. bool found = false;
  874. u64 total_bytes = 0;
  875. spin_lock(&tree->lock);
  876. /*
  877. * This search will find all the extents that end after our range
  878. * starts.
  879. */
  880. state = tree_search(tree, cur_start);
  881. if (!state) {
  882. *end = (u64)-1;
  883. goto out;
  884. }
  885. while (state) {
  886. if (found && (state->start != cur_start ||
  887. (state->state & EXTENT_BOUNDARY))) {
  888. goto out;
  889. }
  890. if (!(state->state & EXTENT_DELALLOC)) {
  891. if (!found)
  892. *end = state->end;
  893. goto out;
  894. }
  895. if (!found) {
  896. *start = state->start;
  897. *cached_state = state;
  898. refcount_inc(&state->refs);
  899. }
  900. found = true;
  901. *end = state->end;
  902. cur_start = state->end + 1;
  903. total_bytes += state->end - state->start + 1;
  904. if (total_bytes >= max_bytes)
  905. break;
  906. state = next_state(state);
  907. }
  908. out:
  909. spin_unlock(&tree->lock);
  910. return found;
  911. }
  912. /*
  913. * Set some bits on a range in the tree. This may require allocations or
  914. * sleeping. By default all allocations use GFP_NOFS, use EXTENT_NOWAIT for
  915. * GFP_NOWAIT.
  916. *
  917. * If any of the exclusive bits are set, this will fail with -EEXIST if some
  918. * part of the range already has the desired bits set. The extent_state of the
  919. * existing range is returned in failed_state in this case, and the start of the
  920. * existing range is returned in failed_start. failed_state is used as an
  921. * optimization for wait_extent_bit, failed_start must be used as the source of
  922. * truth as failed_state may have changed since we returned.
  923. *
  924. * [start, end] is inclusive This takes the tree lock.
  925. */
  926. static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
  927. u32 bits, u64 *failed_start,
  928. struct extent_state **failed_state,
  929. struct extent_state **cached_state,
  930. struct extent_changeset *changeset)
  931. {
  932. struct extent_state *state;
  933. struct extent_state *prealloc = NULL;
  934. struct rb_node **p = NULL;
  935. struct rb_node *parent = NULL;
  936. int ret = 0;
  937. u64 last_start;
  938. u64 last_end;
  939. u32 exclusive_bits = (bits & EXTENT_LOCK_BITS);
  940. gfp_t mask;
  941. set_gfp_mask_from_bits(&bits, &mask);
  942. btrfs_debug_check_extent_io_range(tree, start, end);
  943. trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);
  944. if (exclusive_bits)
  945. ASSERT(failed_start);
  946. else
  947. ASSERT(failed_start == NULL && failed_state == NULL);
  948. again:
  949. if (!prealloc) {
  950. /*
  951. * Don't care for allocation failure here because we might end
  952. * up not needing the pre-allocated extent state at all, which
  953. * is the case if we only have in the tree extent states that
  954. * cover our input range and don't cover too any other range.
  955. * If we end up needing a new extent state we allocate it later.
  956. */
  957. prealloc = alloc_extent_state(mask);
  958. }
  959. /* Optimistically preallocate the extent changeset ulist node. */
  960. if (changeset)
  961. extent_changeset_prealloc(changeset, mask);
  962. spin_lock(&tree->lock);
  963. if (cached_state && *cached_state) {
  964. state = *cached_state;
  965. if (state->start <= start && state->end > start &&
  966. extent_state_in_tree(state))
  967. goto hit_next;
  968. }
  969. /*
  970. * This search will find all the extents that end after our range
  971. * starts.
  972. */
  973. state = tree_search_for_insert(tree, start, &p, &parent);
  974. if (!state) {
  975. prealloc = alloc_extent_state_atomic(prealloc);
  976. if (!prealloc)
  977. goto search_again;
  978. prealloc->start = start;
  979. prealloc->end = end;
  980. insert_state_fast(tree, prealloc, p, parent, bits, changeset);
  981. cache_state(prealloc, cached_state);
  982. prealloc = NULL;
  983. goto out;
  984. }
  985. hit_next:
  986. last_start = state->start;
  987. last_end = state->end;
  988. /*
  989. * | ---- desired range ---- |
  990. * | state |
  991. *
  992. * Just lock what we found and keep going
  993. */
  994. if (state->start == start && state->end <= end) {
  995. if (state->state & exclusive_bits) {
  996. *failed_start = state->start;
  997. cache_state(state, failed_state);
  998. ret = -EEXIST;
  999. goto out;
  1000. }
  1001. set_state_bits(tree, state, bits, changeset);
  1002. cache_state(state, cached_state);
  1003. merge_state(tree, state);
  1004. if (last_end >= end)
  1005. goto out;
  1006. start = last_end + 1;
  1007. state = next_state(state);
  1008. if (state && state->start == start && !need_resched())
  1009. goto hit_next;
  1010. goto search_again;
  1011. }
  1012. /*
  1013. * | ---- desired range ---- |
  1014. * | state |
  1015. * or
  1016. * | ------------- state -------------- |
  1017. *
  1018. * We need to split the extent we found, and may flip bits on second
  1019. * half.
  1020. *
  1021. * If the extent we found extends past our range, we just split and
  1022. * search again. It'll get split again the next time though.
  1023. *
  1024. * If the extent we found is inside our range, we set the desired bit
  1025. * on it.
  1026. */
  1027. if (state->start < start) {
  1028. if (state->state & exclusive_bits) {
  1029. *failed_start = start;
  1030. cache_state(state, failed_state);
  1031. ret = -EEXIST;
  1032. goto out;
  1033. }
  1034. /*
  1035. * If this extent already has all the bits we want set, then
  1036. * skip it, not necessary to split it or do anything with it.
  1037. */
  1038. if ((state->state & bits) == bits) {
  1039. start = state->end + 1;
  1040. cache_state(state, cached_state);
  1041. goto search_again;
  1042. }
  1043. prealloc = alloc_extent_state_atomic(prealloc);
  1044. if (!prealloc)
  1045. goto search_again;
  1046. ret = split_state(tree, state, prealloc, start);
  1047. if (ret)
  1048. extent_io_tree_panic(tree, state, "split", ret);
  1049. prealloc = NULL;
  1050. if (ret)
  1051. goto out;
  1052. if (state->end <= end) {
  1053. set_state_bits(tree, state, bits, changeset);
  1054. cache_state(state, cached_state);
  1055. merge_state(tree, state);
  1056. if (last_end >= end)
  1057. goto out;
  1058. start = last_end + 1;
  1059. state = next_state(state);
  1060. if (state && state->start == start && !need_resched())
  1061. goto hit_next;
  1062. }
  1063. goto search_again;
  1064. }
  1065. /*
  1066. * | ---- desired range ---- |
  1067. * | state | or | state |
  1068. *
  1069. * There's a hole, we need to insert something in it and ignore the
  1070. * extent we found.
  1071. */
  1072. if (state->start > start) {
  1073. struct extent_state *inserted_state;
  1074. prealloc = alloc_extent_state_atomic(prealloc);
  1075. if (!prealloc)
  1076. goto search_again;
  1077. /*
  1078. * Avoid to free 'prealloc' if it can be merged with the later
  1079. * extent.
  1080. */
  1081. prealloc->start = start;
  1082. if (end < last_start)
  1083. prealloc->end = end;
  1084. else
  1085. prealloc->end = last_start - 1;
  1086. inserted_state = insert_state(tree, prealloc, bits, changeset);
  1087. if (IS_ERR(inserted_state)) {
  1088. ret = PTR_ERR(inserted_state);
  1089. extent_io_tree_panic(tree, prealloc, "insert", ret);
  1090. goto out;
  1091. }
  1092. cache_state(inserted_state, cached_state);
  1093. if (inserted_state == prealloc)
  1094. prealloc = NULL;
  1095. start = inserted_state->end + 1;
  1096. /* Beyond target range, stop. */
  1097. if (start > end)
  1098. goto out;
  1099. if (need_resched())
  1100. goto search_again;
  1101. state = next_search_state(inserted_state, end);
  1102. /*
  1103. * If there's a next state, whether contiguous or not, we don't
  1104. * need to unlock and start search again. If it's not contiguous
  1105. * we will end up here and try to allocate a prealloc state and insert.
  1106. */
  1107. if (state)
  1108. goto hit_next;
  1109. goto search_again;
  1110. }
  1111. /*
  1112. * | ---- desired range ---- |
  1113. * | state |
  1114. *
  1115. * We need to split the extent, and set the bit on the first half
  1116. */
  1117. if (state->start <= end && state->end > end) {
  1118. if (state->state & exclusive_bits) {
  1119. *failed_start = start;
  1120. cache_state(state, failed_state);
  1121. ret = -EEXIST;
  1122. goto out;
  1123. }
  1124. prealloc = alloc_extent_state_atomic(prealloc);
  1125. if (!prealloc)
  1126. goto search_again;
  1127. ret = split_state(tree, state, prealloc, end + 1);
  1128. if (ret) {
  1129. extent_io_tree_panic(tree, state, "split", ret);
  1130. prealloc = NULL;
  1131. goto out;
  1132. }
  1133. set_state_bits(tree, prealloc, bits, changeset);
  1134. cache_state(prealloc, cached_state);
  1135. merge_state(tree, prealloc);
  1136. prealloc = NULL;
  1137. goto out;
  1138. }
  1139. search_again:
  1140. if (start > end)
  1141. goto out;
  1142. spin_unlock(&tree->lock);
  1143. if (gfpflags_allow_blocking(mask))
  1144. cond_resched();
  1145. goto again;
  1146. out:
  1147. spin_unlock(&tree->lock);
  1148. btrfs_free_extent_state(prealloc);
  1149. return ret;
  1150. }
  1151. int btrfs_set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
  1152. u32 bits, struct extent_state **cached_state)
  1153. {
  1154. return set_extent_bit(tree, start, end, bits, NULL, NULL, cached_state, NULL);
  1155. }
  1156. /*
  1157. * Convert all bits in a given range from one bit to another
  1158. *
  1159. * @tree: the io tree to search
  1160. * @start: the start offset in bytes
  1161. * @end: the end offset in bytes (inclusive)
  1162. * @bits: the bits to set in this range
  1163. * @clear_bits: the bits to clear in this range
  1164. * @cached_state: state that we're going to cache
  1165. *
  1166. * This will go through and set bits for the given range. If any states exist
  1167. * already in this range they are set with the given bit and cleared of the
  1168. * clear_bits. This is only meant to be used by things that are mergeable, ie.
  1169. * converting from say DELALLOC to DIRTY. This is not meant to be used with
  1170. * boundary bits like LOCK.
  1171. *
  1172. * All allocations are done with GFP_NOFS.
  1173. */
  1174. int btrfs_convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
  1175. u32 bits, u32 clear_bits,
  1176. struct extent_state **cached_state)
  1177. {
  1178. struct extent_state *state;
  1179. struct extent_state *prealloc = NULL;
  1180. struct rb_node **p = NULL;
  1181. struct rb_node *parent = NULL;
  1182. int ret = 0;
  1183. u64 last_start;
  1184. u64 last_end;
  1185. bool first_iteration = true;
  1186. btrfs_debug_check_extent_io_range(tree, start, end);
  1187. trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits,
  1188. clear_bits);
  1189. again:
  1190. if (!prealloc) {
  1191. /*
  1192. * Best effort, don't worry if extent state allocation fails
  1193. * here for the first iteration. We might have a cached state
  1194. * that matches exactly the target range, in which case no
  1195. * extent state allocations are needed. We'll only know this
  1196. * after locking the tree.
  1197. */
  1198. prealloc = alloc_extent_state(GFP_NOFS);
  1199. if (!prealloc && !first_iteration)
  1200. return -ENOMEM;
  1201. }
  1202. spin_lock(&tree->lock);
  1203. if (cached_state && *cached_state) {
  1204. state = *cached_state;
  1205. if (state->start <= start && state->end > start &&
  1206. extent_state_in_tree(state))
  1207. goto hit_next;
  1208. }
  1209. /*
  1210. * This search will find all the extents that end after our range
  1211. * starts.
  1212. */
  1213. state = tree_search_for_insert(tree, start, &p, &parent);
  1214. if (!state) {
  1215. prealloc = alloc_extent_state_atomic(prealloc);
  1216. if (!prealloc) {
  1217. ret = -ENOMEM;
  1218. goto out;
  1219. }
  1220. prealloc->start = start;
  1221. prealloc->end = end;
  1222. insert_state_fast(tree, prealloc, p, parent, bits, NULL);
  1223. cache_state(prealloc, cached_state);
  1224. prealloc = NULL;
  1225. goto out;
  1226. }
  1227. hit_next:
  1228. last_start = state->start;
  1229. last_end = state->end;
  1230. /*
  1231. * | ---- desired range ---- |
  1232. * | state |
  1233. *
  1234. * Just lock what we found and keep going.
  1235. */
  1236. if (state->start == start && state->end <= end) {
  1237. set_state_bits(tree, state, bits, NULL);
  1238. cache_state(state, cached_state);
  1239. state = clear_state_bit(tree, state, clear_bits, 0, end, NULL);
  1240. if (last_end >= end)
  1241. goto out;
  1242. start = last_end + 1;
  1243. if (state && state->start == start && !need_resched())
  1244. goto hit_next;
  1245. goto search_again;
  1246. }
  1247. /*
  1248. * | ---- desired range ---- |
  1249. * | state |
  1250. * or
  1251. * | ------------- state -------------- |
  1252. *
  1253. * We need to split the extent we found, and may flip bits on second
  1254. * half.
  1255. *
  1256. * If the extent we found extends past our range, we just split and
  1257. * search again. It'll get split again the next time though.
  1258. *
  1259. * If the extent we found is inside our range, we set the desired bit
  1260. * on it.
  1261. */
  1262. if (state->start < start) {
  1263. prealloc = alloc_extent_state_atomic(prealloc);
  1264. if (!prealloc) {
  1265. ret = -ENOMEM;
  1266. goto out;
  1267. }
  1268. ret = split_state(tree, state, prealloc, start);
  1269. prealloc = NULL;
  1270. if (ret) {
  1271. extent_io_tree_panic(tree, state, "split", ret);
  1272. goto out;
  1273. }
  1274. if (state->end <= end) {
  1275. set_state_bits(tree, state, bits, NULL);
  1276. cache_state(state, cached_state);
  1277. state = clear_state_bit(tree, state, clear_bits, 0, end, NULL);
  1278. if (last_end >= end)
  1279. goto out;
  1280. start = last_end + 1;
  1281. if (state && state->start == start && !need_resched())
  1282. goto hit_next;
  1283. }
  1284. goto search_again;
  1285. }
  1286. /*
  1287. * | ---- desired range ---- |
  1288. * | state | or | state |
  1289. *
  1290. * There's a hole, we need to insert something in it and ignore the
  1291. * extent we found.
  1292. */
  1293. if (state->start > start) {
  1294. struct extent_state *inserted_state;
  1295. prealloc = alloc_extent_state_atomic(prealloc);
  1296. if (!prealloc) {
  1297. ret = -ENOMEM;
  1298. goto out;
  1299. }
  1300. /*
  1301. * Avoid to free 'prealloc' if it can be merged with the later
  1302. * extent.
  1303. */
  1304. prealloc->start = start;
  1305. if (end < last_start)
  1306. prealloc->end = end;
  1307. else
  1308. prealloc->end = last_start - 1;
  1309. inserted_state = insert_state(tree, prealloc, bits, NULL);
  1310. if (IS_ERR(inserted_state)) {
  1311. ret = PTR_ERR(inserted_state);
  1312. extent_io_tree_panic(tree, prealloc, "insert", ret);
  1313. goto out;
  1314. }
  1315. cache_state(inserted_state, cached_state);
  1316. if (inserted_state == prealloc)
  1317. prealloc = NULL;
  1318. start = inserted_state->end + 1;
  1319. /* Beyond target range, stop. */
  1320. if (start > end)
  1321. goto out;
  1322. if (need_resched())
  1323. goto search_again;
  1324. state = next_search_state(inserted_state, end);
  1325. /*
  1326. * If there's a next state, whether contiguous or not, we don't
  1327. * need to unlock and start search again. If it's not contiguous
  1328. * we will end up here and try to allocate a prealloc state and insert.
  1329. */
  1330. if (state)
  1331. goto hit_next;
  1332. goto search_again;
  1333. }
  1334. /*
  1335. * | ---- desired range ---- |
  1336. * | state |
  1337. *
  1338. * We need to split the extent, and set the bit on the first half.
  1339. */
  1340. if (state->start <= end && state->end > end) {
  1341. prealloc = alloc_extent_state_atomic(prealloc);
  1342. if (!prealloc) {
  1343. ret = -ENOMEM;
  1344. goto out;
  1345. }
  1346. ret = split_state(tree, state, prealloc, end + 1);
  1347. if (ret) {
  1348. extent_io_tree_panic(tree, state, "split", ret);
  1349. prealloc = NULL;
  1350. goto out;
  1351. }
  1352. set_state_bits(tree, prealloc, bits, NULL);
  1353. cache_state(prealloc, cached_state);
  1354. clear_state_bit(tree, prealloc, clear_bits, 0, end, NULL);
  1355. prealloc = NULL;
  1356. goto out;
  1357. }
  1358. search_again:
  1359. if (start > end)
  1360. goto out;
  1361. spin_unlock(&tree->lock);
  1362. cond_resched();
  1363. first_iteration = false;
  1364. goto again;
  1365. out:
  1366. spin_unlock(&tree->lock);
  1367. btrfs_free_extent_state(prealloc);
  1368. return ret;
  1369. }
  1370. /*
  1371. * Find the first range that has @bits not set. This range could start before
  1372. * @start.
  1373. *
  1374. * @tree: the tree to search
  1375. * @start: offset at/after which the found extent should start
  1376. * @start_ret: records the beginning of the range
  1377. * @end_ret: records the end of the range (inclusive)
  1378. * @bits: the set of bits which must be unset
  1379. *
  1380. * Since unallocated range is also considered one which doesn't have the bits
  1381. * set it's possible that @end_ret contains -1, this happens in case the range
  1382. * spans (last_range_end, end of device]. In this case it's up to the caller to
  1383. * trim @end_ret to the appropriate size.
  1384. */
  1385. void btrfs_find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
  1386. u64 *start_ret, u64 *end_ret, u32 bits)
  1387. {
  1388. struct extent_state *state;
  1389. struct extent_state *prev = NULL, *next = NULL;
  1390. spin_lock(&tree->lock);
  1391. /* Find first extent with bits cleared */
  1392. while (1) {
  1393. state = tree_search_prev_next(tree, start, &prev, &next);
  1394. if (!state && !next && !prev) {
  1395. /*
  1396. * Tree is completely empty, send full range and let
  1397. * caller deal with it
  1398. */
  1399. *start_ret = 0;
  1400. *end_ret = -1;
  1401. goto out;
  1402. } else if (!state && !next) {
  1403. /*
  1404. * We are past the last allocated chunk, set start at
  1405. * the end of the last extent.
  1406. */
  1407. *start_ret = prev->end + 1;
  1408. *end_ret = -1;
  1409. goto out;
  1410. } else if (!state) {
  1411. state = next;
  1412. }
  1413. /*
  1414. * At this point 'state' either contains 'start' or start is
  1415. * before 'state'
  1416. */
  1417. if (in_range(start, state->start, state->end - state->start + 1)) {
  1418. if (state->state & bits) {
  1419. /*
  1420. * |--range with bits sets--|
  1421. * |
  1422. * start
  1423. */
  1424. start = state->end + 1;
  1425. } else {
  1426. /*
  1427. * 'start' falls within a range that doesn't
  1428. * have the bits set, so take its start as the
  1429. * beginning of the desired range
  1430. *
  1431. * |--range with bits cleared----|
  1432. * |
  1433. * start
  1434. */
  1435. *start_ret = state->start;
  1436. break;
  1437. }
  1438. } else {
  1439. /*
  1440. * |---prev range---|---hole/unset---|---node range---|
  1441. * |
  1442. * start
  1443. *
  1444. * or
  1445. *
  1446. * |---hole/unset--||--first node--|
  1447. * 0 |
  1448. * start
  1449. */
  1450. if (prev)
  1451. *start_ret = prev->end + 1;
  1452. else
  1453. *start_ret = 0;
  1454. break;
  1455. }
  1456. }
  1457. /*
  1458. * Find the longest stretch from start until an entry which has the
  1459. * bits set
  1460. */
  1461. while (state) {
  1462. if (state->end >= start && !(state->state & bits)) {
  1463. *end_ret = state->end;
  1464. } else {
  1465. *end_ret = state->start - 1;
  1466. break;
  1467. }
  1468. state = next_state(state);
  1469. }
  1470. out:
  1471. spin_unlock(&tree->lock);
  1472. }
  1473. /*
  1474. * Count the number of bytes in the tree that have a given bit(s) set for a
  1475. * given range.
  1476. *
  1477. * @tree: The io tree to search.
  1478. * @start: The start offset of the range. This value is updated to the
  1479. * offset of the first byte found with the given bit(s), so it
  1480. * can end up being bigger than the initial value.
  1481. * @search_end: The end offset (inclusive value) of the search range.
  1482. * @max_bytes: The maximum byte count we are interested. The search stops
  1483. * once it reaches this count.
  1484. * @bits: The bits the range must have in order to be accounted for.
  1485. * If multiple bits are set, then only subranges that have all
  1486. * the bits set are accounted for.
  1487. * @contig: Indicate if we should ignore holes in the range or not. If
  1488. * this is true, then stop once we find a hole.
  1489. * @cached_state: A cached state to be used across multiple calls to this
  1490. * function in order to speedup searches. Use NULL if this is
  1491. * called only once or if each call does not start where the
  1492. * previous one ended.
  1493. *
  1494. * Returns the total number of bytes found within the given range that have
  1495. * all given bits set. If the returned number of bytes is greater than zero
  1496. * then @start is updated with the offset of the first byte with the bits set.
  1497. */
  1498. u64 btrfs_count_range_bits(struct extent_io_tree *tree,
  1499. u64 *start, u64 search_end, u64 max_bytes,
  1500. u32 bits, bool contig,
  1501. struct extent_state **cached_state)
  1502. {
  1503. struct extent_state *state = NULL;
  1504. struct extent_state *cached;
  1505. u64 cur_start = *start;
  1506. u64 total_bytes = 0;
  1507. u64 last = 0;
  1508. int found = 0;
  1509. if (WARN_ON(search_end < cur_start))
  1510. return 0;
  1511. spin_lock(&tree->lock);
  1512. if (!cached_state || !*cached_state)
  1513. goto search;
  1514. cached = *cached_state;
  1515. if (!extent_state_in_tree(cached))
  1516. goto search;
  1517. if (cached->start <= cur_start && cur_start <= cached->end) {
  1518. state = cached;
  1519. } else if (cached->start > cur_start) {
  1520. struct extent_state *prev;
  1521. /*
  1522. * The cached state starts after our search range's start. Check
  1523. * if the previous state record starts at or before the range we
  1524. * are looking for, and if so, use it - this is a common case
  1525. * when there are holes between records in the tree. If there is
  1526. * no previous state record, we can start from our cached state.
  1527. */
  1528. prev = prev_state(cached);
  1529. if (!prev)
  1530. state = cached;
  1531. else if (prev->start <= cur_start && cur_start <= prev->end)
  1532. state = prev;
  1533. }
  1534. /*
  1535. * This search will find all the extents that end after our range
  1536. * starts.
  1537. */
  1538. search:
  1539. if (!state)
  1540. state = tree_search(tree, cur_start);
  1541. while (state) {
  1542. if (state->start > search_end)
  1543. break;
  1544. if (contig && found && state->start > last + 1)
  1545. break;
  1546. if (state->end >= cur_start && (state->state & bits) == bits) {
  1547. total_bytes += min(search_end, state->end) + 1 -
  1548. max(cur_start, state->start);
  1549. if (total_bytes >= max_bytes)
  1550. break;
  1551. if (!found) {
  1552. *start = max(cur_start, state->start);
  1553. found = 1;
  1554. }
  1555. last = state->end;
  1556. } else if (contig && found) {
  1557. break;
  1558. }
  1559. state = next_state(state);
  1560. }
  1561. if (cached_state) {
  1562. btrfs_free_extent_state(*cached_state);
  1563. *cached_state = state;
  1564. if (state)
  1565. refcount_inc(&state->refs);
  1566. }
  1567. spin_unlock(&tree->lock);
  1568. return total_bytes;
  1569. }
  1570. /*
  1571. * Check if the single @bit exists in the given range.
  1572. */
  1573. bool btrfs_test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit)
  1574. {
  1575. struct extent_state *state;
  1576. bool bitset = false;
  1577. ASSERT(is_power_of_2(bit));
  1578. spin_lock(&tree->lock);
  1579. state = tree_search(tree, start);
  1580. while (state) {
  1581. if (state->start > end)
  1582. break;
  1583. if (state->state & bit) {
  1584. bitset = true;
  1585. break;
  1586. }
  1587. if (state->end >= end)
  1588. break;
  1589. state = next_state(state);
  1590. }
  1591. spin_unlock(&tree->lock);
  1592. return bitset;
  1593. }
  1594. void btrfs_get_range_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 *bits,
  1595. struct extent_state **cached_state)
  1596. {
  1597. struct extent_state *state;
  1598. /*
  1599. * The cached state is currently mandatory and not used to start the
  1600. * search, only to cache the first state record found in the range.
  1601. */
  1602. ASSERT(cached_state != NULL);
  1603. ASSERT(*cached_state == NULL);
  1604. *bits = 0;
  1605. spin_lock(&tree->lock);
  1606. state = tree_search(tree, start);
  1607. if (state && state->start < end) {
  1608. *cached_state = state;
  1609. refcount_inc(&state->refs);
  1610. }
  1611. while (state) {
  1612. if (state->start > end)
  1613. break;
  1614. *bits |= state->state;
  1615. if (state->end >= end)
  1616. break;
  1617. state = next_state(state);
  1618. }
  1619. spin_unlock(&tree->lock);
  1620. }
  1621. /*
  1622. * Check if the whole range [@start,@end) contains the single @bit set.
  1623. */
  1624. bool btrfs_test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit,
  1625. struct extent_state *cached)
  1626. {
  1627. struct extent_state *state;
  1628. bool bitset = true;
  1629. ASSERT(is_power_of_2(bit));
  1630. ASSERT(start < end);
  1631. spin_lock(&tree->lock);
  1632. if (cached && extent_state_in_tree(cached) && cached->start <= start &&
  1633. cached->end > start)
  1634. state = cached;
  1635. else
  1636. state = tree_search(tree, start);
  1637. while (state) {
  1638. if (state->start > start) {
  1639. bitset = false;
  1640. break;
  1641. }
  1642. if ((state->state & bit) == 0) {
  1643. bitset = false;
  1644. break;
  1645. }
  1646. if (state->end >= end)
  1647. break;
  1648. /* Next state must start where this one ends. */
  1649. start = state->end + 1;
  1650. state = next_state(state);
  1651. }
  1652. /* We ran out of states and were still inside of our range. */
  1653. if (!state)
  1654. bitset = false;
  1655. spin_unlock(&tree->lock);
  1656. return bitset;
  1657. }
  1658. /* Wrappers around set/clear extent bit */
  1659. int btrfs_set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
  1660. u32 bits, struct extent_changeset *changeset)
  1661. {
  1662. /*
  1663. * We don't support EXTENT_LOCK_BITS yet, as current changeset will
  1664. * record any bits changed, so for EXTENT_LOCK_BITS case, it will either
  1665. * fail with -EEXIST or changeset will record the whole range.
  1666. */
  1667. ASSERT(!(bits & EXTENT_LOCK_BITS));
  1668. return set_extent_bit(tree, start, end, bits, NULL, NULL, NULL, changeset);
  1669. }
  1670. int btrfs_clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
  1671. u32 bits, struct extent_changeset *changeset)
  1672. {
  1673. /*
  1674. * Don't support EXTENT_LOCK_BITS case, same reason as
  1675. * set_record_extent_bits().
  1676. */
  1677. ASSERT(!(bits & EXTENT_LOCK_BITS));
  1678. return btrfs_clear_extent_bit_changeset(tree, start, end, bits, NULL, changeset);
  1679. }
  1680. bool btrfs_try_lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
  1681. u32 bits, struct extent_state **cached)
  1682. {
  1683. int ret;
  1684. u64 failed_start;
  1685. ret = set_extent_bit(tree, start, end, bits, &failed_start, NULL, cached, NULL);
  1686. if (ret == -EEXIST) {
  1687. if (failed_start > start)
  1688. btrfs_clear_extent_bit(tree, start, failed_start - 1,
  1689. bits, cached);
  1690. return 0;
  1691. }
  1692. return 1;
  1693. }
  1694. /*
  1695. * Either insert or lock state struct between start and end use mask to tell
  1696. * us if waiting is desired.
  1697. */
  1698. int btrfs_lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
  1699. struct extent_state **cached_state)
  1700. {
  1701. struct extent_state *failed_state = NULL;
  1702. int ret;
  1703. u64 failed_start;
  1704. ret = set_extent_bit(tree, start, end, bits, &failed_start,
  1705. &failed_state, cached_state, NULL);
  1706. while (ret == -EEXIST) {
  1707. if (failed_start != start)
  1708. btrfs_clear_extent_bit(tree, start, failed_start - 1,
  1709. bits, cached_state);
  1710. wait_extent_bit(tree, failed_start, end, bits, &failed_state);
  1711. ret = set_extent_bit(tree, start, end, bits, &failed_start,
  1712. &failed_state, cached_state, NULL);
  1713. }
  1714. return ret;
  1715. }
  1716. /*
  1717. * Get the extent state that follows the given extent state.
  1718. * This is meant to be used in a context where we know no other tasks can
  1719. * concurrently modify the tree.
  1720. */
  1721. struct extent_state *btrfs_next_extent_state(struct extent_io_tree *tree,
  1722. struct extent_state *state)
  1723. {
  1724. struct extent_state *next;
  1725. spin_lock(&tree->lock);
  1726. ASSERT(extent_state_in_tree(state));
  1727. next = next_state(state);
  1728. if (next)
  1729. refcount_inc(&next->refs);
  1730. spin_unlock(&tree->lock);
  1731. return next;
  1732. }
  1733. void __cold btrfs_extent_state_free_cachep(void)
  1734. {
  1735. btrfs_extent_state_leak_debug_check();
  1736. kmem_cache_destroy(extent_state_cache);
  1737. }
  1738. int __init btrfs_extent_state_init_cachep(void)
  1739. {
  1740. extent_state_cache = kmem_cache_create("btrfs_extent_state",
  1741. sizeof(struct extent_state), 0, 0,
  1742. NULL);
  1743. if (!extent_state_cache)
  1744. return -ENOMEM;
  1745. return 0;
  1746. }