discard.c 25 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828
  1. // SPDX-License-Identifier: GPL-2.0
  2. #include <linux/jiffies.h>
  3. #include <linux/kernel.h>
  4. #include <linux/ktime.h>
  5. #include <linux/list.h>
  6. #include <linux/math64.h>
  7. #include <linux/sizes.h>
  8. #include <linux/workqueue.h>
  9. #include "ctree.h"
  10. #include "block-group.h"
  11. #include "discard.h"
  12. #include "free-space-cache.h"
  13. #include "fs.h"
  14. /*
  15. * This contains the logic to handle async discard.
  16. *
  17. * Async discard manages trimming of free space outside of transaction commit.
  18. * Discarding is done by managing the block_groups on a LRU list based on free
  19. * space recency. Two passes are used to first prioritize discarding extents
  20. * and then allow for trimming in the bitmap the best opportunity to coalesce.
  21. * The block_groups are maintained on multiple lists to allow for multiple
  22. * passes with different discard filter requirements. A delayed work item is
  23. * used to manage discarding with timeout determined by a max of the delay
  24. * incurred by the iops rate limit, the byte rate limit, and the max delay of
  25. * BTRFS_DISCARD_MAX_DELAY.
  26. *
  27. * Note, this only keeps track of block_groups that are explicitly for data.
  28. * Mixed block_groups are not supported.
  29. *
  30. * The first list is special to manage discarding of fully free block groups.
  31. * This is necessary because we issue a final trim for a full free block group
  32. * after forgetting it. When a block group becomes unused, instead of directly
  33. * being added to the unused_bgs list, we add it to this first list. Then
  34. * from there, if it becomes fully discarded, we place it onto the unused_bgs
  35. * list.
  36. *
  37. * The in-memory free space cache serves as the backing state for discard.
  38. * Consequently this means there is no persistence. We opt to load all the
  39. * block groups in as not discarded, so the mount case degenerates to the
  40. * crashing case.
  41. *
  42. * As the free space cache uses bitmaps, there exists a tradeoff between
  43. * ease/efficiency for find_free_extent() and the accuracy of discard state.
  44. * Here we opt to let untrimmed regions merge with everything while only letting
  45. * trimmed regions merge with other trimmed regions. This can cause
  46. * overtrimming, but the coalescing benefit seems to be worth it. Additionally,
  47. * bitmap state is tracked as a whole. If we're able to fully trim a bitmap,
  48. * the trimmed flag is set on the bitmap. Otherwise, if an allocation comes in,
  49. * this resets the state and we will retry trimming the whole bitmap. This is a
  50. * tradeoff between discard state accuracy and the cost of accounting.
  51. */
  /*
   * Initial delay before a freshly queued block group becomes eligible for
   * discard, to give some chance for block reuse.
   */
  #define BTRFS_DISCARD_DELAY (120ULL * NSEC_PER_SEC)
  /* Eligibility delay used when a block group is put on the unused list */
  #define BTRFS_DISCARD_UNUSED_DELAY (10ULL * NSEC_PER_SEC)
  /* Lower clamp, in ms, of the base delay when an iops limit is set */
  #define BTRFS_DISCARD_MIN_DELAY_MSEC (1UL)
  /* Upper clamp, in ms, of the base delay; also the initial delay_ms */
  #define BTRFS_DISCARD_MAX_DELAY_MSEC (1000UL)
  /* Initial value of discard_ctl->iops_limit */
  #define BTRFS_DISCARD_MAX_IOPS (1000U)
  58. /* Monotonically decreasing minimum length filters after index 0 */
  59. static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = {
  60. 0,
  61. BTRFS_ASYNC_DISCARD_MAX_FILTER,
  62. BTRFS_ASYNC_DISCARD_MIN_FILTER
  63. };
  64. static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl,
  65. const struct btrfs_block_group *block_group)
  66. {
  67. return &discard_ctl->discard_list[block_group->discard_index];
  68. }
  69. /*
  70. * Determine if async discard should be running.
  71. *
  72. * @discard_ctl: discard control
  73. *
  74. * Check if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set.
  75. */
  76. static bool btrfs_run_discard_work(const struct btrfs_discard_ctl *discard_ctl)
  77. {
  78. struct btrfs_fs_info *fs_info = container_of(discard_ctl,
  79. struct btrfs_fs_info,
  80. discard_ctl);
  81. return (!(fs_info->sb->s_flags & SB_RDONLY) &&
  82. test_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags));
  83. }
  84. static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
  85. struct btrfs_block_group *block_group)
  86. {
  87. lockdep_assert_held(&discard_ctl->lock);
  88. if (list_empty(&block_group->discard_list) ||
  89. block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) {
  90. if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED)
  91. block_group->discard_index = BTRFS_DISCARD_INDEX_START;
  92. block_group->discard_eligible_time = (ktime_get_ns() +
  93. BTRFS_DISCARD_DELAY);
  94. block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;
  95. }
  96. if (list_empty(&block_group->discard_list))
  97. btrfs_get_block_group(block_group);
  98. list_move_tail(&block_group->discard_list,
  99. get_discard_list(discard_ctl, block_group));
  100. }
  101. static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
  102. struct btrfs_block_group *block_group)
  103. {
  104. if (!btrfs_is_block_group_data_only(block_group))
  105. return;
  106. if (!btrfs_run_discard_work(discard_ctl))
  107. return;
  108. spin_lock(&discard_ctl->lock);
  109. __add_to_discard_list(discard_ctl, block_group);
  110. spin_unlock(&discard_ctl->lock);
  111. }
  112. static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl,
  113. struct btrfs_block_group *block_group)
  114. {
  115. bool queued;
  116. spin_lock(&discard_ctl->lock);
  117. queued = !list_empty(&block_group->discard_list);
  118. if (!btrfs_run_discard_work(discard_ctl)) {
  119. spin_unlock(&discard_ctl->lock);
  120. return;
  121. }
  122. list_del_init(&block_group->discard_list);
  123. block_group->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
  124. block_group->discard_eligible_time = (ktime_get_ns() +
  125. BTRFS_DISCARD_UNUSED_DELAY);
  126. block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;
  127. if (!queued)
  128. btrfs_get_block_group(block_group);
  129. list_add_tail(&block_group->discard_list,
  130. &discard_ctl->discard_list[BTRFS_DISCARD_INDEX_UNUSED]);
  131. spin_unlock(&discard_ctl->lock);
  132. }
  133. static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl,
  134. struct btrfs_block_group *block_group)
  135. {
  136. bool running = false;
  137. bool queued = false;
  138. spin_lock(&discard_ctl->lock);
  139. if (block_group == discard_ctl->block_group) {
  140. running = true;
  141. discard_ctl->block_group = NULL;
  142. }
  143. block_group->discard_eligible_time = 0;
  144. queued = !list_empty(&block_group->discard_list);
  145. list_del_init(&block_group->discard_list);
  146. if (queued)
  147. btrfs_put_block_group(block_group);
  148. spin_unlock(&discard_ctl->lock);
  149. return running;
  150. }
  151. /*
  152. * Find block_group that's up next for discarding.
  153. *
  154. * @discard_ctl: discard control
  155. * @now: current time
  156. *
  157. * Iterate over the discard lists to find the next block_group up for
  158. * discarding checking the discard_eligible_time of block_group.
  159. */
  160. static struct btrfs_block_group *find_next_block_group(
  161. struct btrfs_discard_ctl *discard_ctl,
  162. u64 now)
  163. {
  164. struct btrfs_block_group *ret_block_group = NULL, *block_group;
  165. int i;
  166. for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) {
  167. struct list_head *discard_list = &discard_ctl->discard_list[i];
  168. if (!list_empty(discard_list)) {
  169. block_group = list_first_entry(discard_list,
  170. struct btrfs_block_group,
  171. discard_list);
  172. if (!ret_block_group)
  173. ret_block_group = block_group;
  174. if (ret_block_group->discard_eligible_time < now)
  175. break;
  176. if (ret_block_group->discard_eligible_time >
  177. block_group->discard_eligible_time)
  178. ret_block_group = block_group;
  179. }
  180. }
  181. return ret_block_group;
  182. }
  183. /*
  184. * Check whether a block group is empty.
  185. *
  186. * "Empty" here means that there are no extents physically located within the
  187. * device extents corresponding to this block group.
  188. *
  189. * For a remapped block group, this means that all of its identity remaps have
  190. * been removed. For a non-remapped block group, this means that no extents
  191. * have an address within its range, and that nothing has been remapped to be
  192. * within it.
  193. */
  194. static bool block_group_is_empty(const struct btrfs_block_group *bg)
  195. {
  196. if (bg->flags & BTRFS_BLOCK_GROUP_REMAPPED)
  197. return bg->identity_remap_count == 0;
  198. return bg->used == 0 && bg->remap_bytes == 0;
  199. }
  200. /*
  201. * Look up next block group and set it for use.
  202. *
  203. * @discard_ctl: discard control
  204. * @discard_state: the discard_state of the block_group after state management
  205. * @discard_index: the discard_index of the block_group after state management
  206. * @now: time when discard was invoked, in ns
  207. *
  208. * Wrap find_next_block_group() and set the block_group to be in use.
  209. * @discard_state's control flow is managed here. Variables related to
  210. * @discard_state are reset here as needed (eg. @discard_cursor). @discard_state
  211. * and @discard_index are remembered as it may change while we're discarding,
  212. * but we want the discard to execute in the context determined here.
  213. */
  214. static struct btrfs_block_group *peek_discard_list(
  215. struct btrfs_discard_ctl *discard_ctl,
  216. enum btrfs_discard_state *discard_state,
  217. int *discard_index, u64 now)
  218. {
  219. struct btrfs_block_group *block_group;
  220. spin_lock(&discard_ctl->lock);
  221. again:
  222. block_group = find_next_block_group(discard_ctl, now);
  223. if (block_group && now >= block_group->discard_eligible_time) {
  224. const bool empty = block_group_is_empty(block_group);
  225. if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED &&
  226. !empty) {
  227. if (btrfs_is_block_group_data_only(block_group)) {
  228. __add_to_discard_list(discard_ctl, block_group);
  229. /*
  230. * The block group must have been moved to other
  231. * discard list even if discard was disabled in
  232. * the meantime or a transaction abort happened,
  233. * otherwise we can end up in an infinite loop,
  234. * always jumping into the 'again' label and
  235. * keep getting this block group over and over
  236. * in case there are no other block groups in
  237. * the discard lists.
  238. */
  239. ASSERT(block_group->discard_index !=
  240. BTRFS_DISCARD_INDEX_UNUSED,
  241. "discard_index=%d",
  242. block_group->discard_index);
  243. } else {
  244. list_del_init(&block_group->discard_list);
  245. btrfs_put_block_group(block_group);
  246. }
  247. goto again;
  248. }
  249. if (block_group->discard_state == BTRFS_DISCARD_RESET_CURSOR) {
  250. block_group->discard_cursor = block_group->start;
  251. if (block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED && empty) {
  252. block_group->discard_state = BTRFS_DISCARD_FULLY_REMAPPED;
  253. } else {
  254. block_group->discard_state = BTRFS_DISCARD_EXTENTS;
  255. }
  256. }
  257. }
  258. if (block_group) {
  259. btrfs_get_block_group(block_group);
  260. discard_ctl->block_group = block_group;
  261. *discard_state = block_group->discard_state;
  262. *discard_index = block_group->discard_index;
  263. }
  264. spin_unlock(&discard_ctl->lock);
  265. return block_group;
  266. }
  267. /*
  268. * Update a block group's filters.
  269. *
  270. * @block_group: block group of interest
  271. * @bytes: recently freed region size after coalescing
  272. *
  273. * Async discard maintains multiple lists with progressively smaller filters
  274. * to prioritize discarding based on size. Should a free space that matches
  275. * a larger filter be returned to the free_space_cache, prioritize that discard
  276. * by moving @block_group to the proper filter.
  277. */
  278. void btrfs_discard_check_filter(struct btrfs_block_group *block_group,
  279. u64 bytes)
  280. {
  281. struct btrfs_discard_ctl *discard_ctl;
  282. if (!block_group ||
  283. !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
  284. return;
  285. discard_ctl = &block_group->fs_info->discard_ctl;
  286. if (block_group->discard_index > BTRFS_DISCARD_INDEX_START &&
  287. bytes >= discard_minlen[block_group->discard_index - 1]) {
  288. int i;
  289. remove_from_discard_list(discard_ctl, block_group);
  290. for (i = BTRFS_DISCARD_INDEX_START; i < BTRFS_NR_DISCARD_LISTS;
  291. i++) {
  292. if (bytes >= discard_minlen[i]) {
  293. block_group->discard_index = i;
  294. add_to_discard_list(discard_ctl, block_group);
  295. break;
  296. }
  297. }
  298. }
  299. }
  300. /*
  301. * Move a block group along the discard lists.
  302. *
  303. * @discard_ctl: discard control
  304. * @block_group: block_group of interest
  305. *
  306. * Increment @block_group's discard_index. If it falls of the list, let it be.
  307. * Otherwise add it back to the appropriate list.
  308. */
  309. static void btrfs_update_discard_index(struct btrfs_discard_ctl *discard_ctl,
  310. struct btrfs_block_group *block_group)
  311. {
  312. block_group->discard_index++;
  313. if (block_group->discard_index == BTRFS_NR_DISCARD_LISTS) {
  314. block_group->discard_index = 1;
  315. return;
  316. }
  317. add_to_discard_list(discard_ctl, block_group);
  318. }
  319. /*
  320. * Remove a block_group from the discard lists.
  321. *
  322. * @discard_ctl: discard control
  323. * @block_group: block_group of interest
  324. *
  325. * Remove @block_group from the discard lists. If necessary, wait on the
  326. * current work and then reschedule the delayed work.
  327. */
  328. void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl,
  329. struct btrfs_block_group *block_group)
  330. {
  331. if (remove_from_discard_list(discard_ctl, block_group)) {
  332. cancel_delayed_work_sync(&discard_ctl->work);
  333. btrfs_discard_schedule_work(discard_ctl, true);
  334. }
  335. }
  336. /*
  337. * Handles queuing the block_groups.
  338. *
  339. * @discard_ctl: discard control
  340. * @block_group: block_group of interest
  341. *
  342. * Maintain the LRU order of the discard lists.
  343. */
  344. void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl,
  345. struct btrfs_block_group *block_group)
  346. {
  347. if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
  348. return;
  349. if (block_group_is_empty(block_group))
  350. add_to_discard_unused_list(discard_ctl, block_group);
  351. else
  352. add_to_discard_list(discard_ctl, block_group);
  353. if (!delayed_work_pending(&discard_ctl->work))
  354. btrfs_discard_schedule_work(discard_ctl, false);
  355. }
  356. static void __btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
  357. u64 now, bool override)
  358. {
  359. struct btrfs_block_group *block_group;
  360. if (!btrfs_run_discard_work(discard_ctl))
  361. return;
  362. if (!override && delayed_work_pending(&discard_ctl->work))
  363. return;
  364. block_group = find_next_block_group(discard_ctl, now);
  365. if (block_group) {
  366. u64 delay = discard_ctl->delay_ms * NSEC_PER_MSEC;
  367. u32 kbps_limit = READ_ONCE(discard_ctl->kbps_limit);
  368. /*
  369. * A single delayed workqueue item is responsible for
  370. * discarding, so we can manage the bytes rate limit by keeping
  371. * track of the previous discard.
  372. */
  373. if (kbps_limit && discard_ctl->prev_discard) {
  374. u64 bps_limit = ((u64)kbps_limit) * SZ_1K;
  375. u64 bps_delay = div64_u64(discard_ctl->prev_discard *
  376. NSEC_PER_SEC, bps_limit);
  377. delay = max(delay, bps_delay);
  378. }
  379. /*
  380. * This timeout is to hopefully prevent immediate discarding
  381. * in a recently allocated block group.
  382. */
  383. if (now < block_group->discard_eligible_time) {
  384. u64 bg_timeout = block_group->discard_eligible_time - now;
  385. delay = max(delay, bg_timeout);
  386. }
  387. if (override && discard_ctl->prev_discard) {
  388. u64 elapsed = now - discard_ctl->prev_discard_time;
  389. if (delay > elapsed)
  390. delay -= elapsed;
  391. else
  392. delay = 0;
  393. }
  394. mod_delayed_work(discard_ctl->discard_workers,
  395. &discard_ctl->work, nsecs_to_jiffies(delay));
  396. }
  397. }
  398. /*
  399. * Responsible for scheduling the discard work.
  400. *
  401. * @discard_ctl: discard control
  402. * @override: override the current timer
  403. *
  404. * Discards are issued by a delayed workqueue item. @override is used to
  405. * update the current delay as the baseline delay interval is reevaluated on
  406. * transaction commit. This is also maxed with any other rate limit.
  407. */
  408. void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
  409. bool override)
  410. {
  411. const u64 now = ktime_get_ns();
  412. spin_lock(&discard_ctl->lock);
  413. __btrfs_discard_schedule_work(discard_ctl, now, override);
  414. spin_unlock(&discard_ctl->lock);
  415. }
  /*
   * Determine next step of a block_group.
   *
   * @discard_ctl: discard control
   * @block_group: block_group of interest
   *
   * Called once a block group has finished a pass on a discard list. The
   * block group is first removed from the discard lists. A non-empty block
   * group advances to the next filter list (or falls off). An empty one is
   * marked unused when its free space is fully trimmed, and otherwise goes
   * back onto the unused discard list.
   */
  static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl,
  				      struct btrfs_block_group *block_group)
  {
  	remove_from_discard_list(discard_ctl, block_group);

  	if (!block_group_is_empty(block_group)) {
  		btrfs_update_discard_index(discard_ctl, block_group);
  		return;
  	}

  	if (btrfs_is_free_space_trimmed(block_group))
  		btrfs_mark_bg_unused(block_group);
  	else
  		add_to_discard_unused_list(discard_ctl, block_group);
  }
  440. /*
  441. * Discard work queue callback
  442. *
  443. * @work: work
  444. *
  445. * Find the next block_group to start discarding and then discard a single
  446. * region. It does this in a two-pass fashion: first extents and second
  447. * bitmaps. Completely discarded block groups are sent to the unused_bgs path.
  448. */
  449. static void btrfs_discard_workfn(struct work_struct *work)
  450. {
  451. struct btrfs_discard_ctl *discard_ctl;
  452. struct btrfs_block_group *block_group;
  453. enum btrfs_discard_state discard_state;
  454. int discard_index = 0;
  455. u64 trimmed = 0;
  456. u64 minlen = 0;
  457. u64 now = ktime_get_ns();
  458. discard_ctl = container_of(work, struct btrfs_discard_ctl, work.work);
  459. block_group = peek_discard_list(discard_ctl, &discard_state,
  460. &discard_index, now);
  461. if (!block_group)
  462. return;
  463. if (!btrfs_run_discard_work(discard_ctl)) {
  464. spin_lock(&discard_ctl->lock);
  465. btrfs_put_block_group(block_group);
  466. discard_ctl->block_group = NULL;
  467. spin_unlock(&discard_ctl->lock);
  468. return;
  469. }
  470. if (now < block_group->discard_eligible_time) {
  471. spin_lock(&discard_ctl->lock);
  472. btrfs_put_block_group(block_group);
  473. discard_ctl->block_group = NULL;
  474. spin_unlock(&discard_ctl->lock);
  475. btrfs_discard_schedule_work(discard_ctl, false);
  476. return;
  477. }
  478. /* Perform discarding */
  479. minlen = discard_minlen[discard_index];
  480. switch (discard_state) {
  481. case BTRFS_DISCARD_BITMAPS: {
  482. u64 maxlen = 0;
  483. /*
  484. * Use the previous levels minimum discard length as the max
  485. * length filter. In the case something is added to make a
  486. * region go beyond the max filter, the entire bitmap is set
  487. * back to BTRFS_TRIM_STATE_UNTRIMMED.
  488. */
  489. if (discard_index != BTRFS_DISCARD_INDEX_UNUSED)
  490. maxlen = discard_minlen[discard_index - 1];
  491. btrfs_trim_block_group_bitmaps(block_group, &trimmed,
  492. block_group->discard_cursor,
  493. btrfs_block_group_end(block_group),
  494. minlen, maxlen, true);
  495. discard_ctl->discard_bitmap_bytes += trimmed;
  496. break;
  497. }
  498. case BTRFS_DISCARD_FULLY_REMAPPED:
  499. btrfs_trim_fully_remapped_block_group(block_group);
  500. break;
  501. default:
  502. btrfs_trim_block_group_extents(block_group, &trimmed,
  503. block_group->discard_cursor,
  504. btrfs_block_group_end(block_group),
  505. minlen, true);
  506. discard_ctl->discard_extent_bytes += trimmed;
  507. break;
  508. }
  509. /* Determine next steps for a block_group */
  510. if (block_group->discard_cursor >= btrfs_block_group_end(block_group)) {
  511. if (discard_state == BTRFS_DISCARD_BITMAPS ||
  512. discard_state == BTRFS_DISCARD_FULLY_REMAPPED) {
  513. btrfs_finish_discard_pass(discard_ctl, block_group);
  514. } else {
  515. block_group->discard_cursor = block_group->start;
  516. spin_lock(&discard_ctl->lock);
  517. if (block_group->discard_state !=
  518. BTRFS_DISCARD_RESET_CURSOR)
  519. block_group->discard_state =
  520. BTRFS_DISCARD_BITMAPS;
  521. spin_unlock(&discard_ctl->lock);
  522. }
  523. }
  524. now = ktime_get_ns();
  525. spin_lock(&discard_ctl->lock);
  526. discard_ctl->prev_discard = trimmed;
  527. discard_ctl->prev_discard_time = now;
  528. btrfs_put_block_group(block_group);
  529. discard_ctl->block_group = NULL;
  530. __btrfs_discard_schedule_work(discard_ctl, now, false);
  531. spin_unlock(&discard_ctl->lock);
  532. }
  533. /*
  534. * Recalculate the base delay.
  535. *
  536. * @discard_ctl: discard control
  537. *
  538. * Recalculate the base delay which is based off the total number of
  539. * discardable_extents. Clamp this between the lower_limit (iops_limit or 1ms)
  540. * and the upper_limit (BTRFS_DISCARD_MAX_DELAY_MSEC).
  541. */
  542. void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl)
  543. {
  544. s32 discardable_extents;
  545. s64 discardable_bytes;
  546. u32 iops_limit;
  547. unsigned long min_delay = BTRFS_DISCARD_MIN_DELAY_MSEC;
  548. unsigned long delay;
  549. discardable_extents = atomic_read(&discard_ctl->discardable_extents);
  550. if (!discardable_extents)
  551. return;
  552. spin_lock(&discard_ctl->lock);
  553. /*
  554. * The following is to fix a potential -1 discrepancy that we're not
  555. * sure how to reproduce. But given that this is the only place that
  556. * utilizes these numbers and this is only called by from
  557. * btrfs_finish_extent_commit() which is synchronized, we can correct
  558. * here.
  559. */
  560. if (discardable_extents < 0)
  561. atomic_add(-discardable_extents,
  562. &discard_ctl->discardable_extents);
  563. discardable_bytes = atomic64_read(&discard_ctl->discardable_bytes);
  564. if (discardable_bytes < 0)
  565. atomic64_add(-discardable_bytes,
  566. &discard_ctl->discardable_bytes);
  567. if (discardable_extents <= 0) {
  568. spin_unlock(&discard_ctl->lock);
  569. return;
  570. }
  571. iops_limit = READ_ONCE(discard_ctl->iops_limit);
  572. if (iops_limit) {
  573. delay = MSEC_PER_SEC / iops_limit;
  574. } else {
  575. /*
  576. * Unset iops_limit means go as fast as possible, so allow a
  577. * delay of 0.
  578. */
  579. delay = 0;
  580. min_delay = 0;
  581. }
  582. delay = clamp(delay, min_delay, BTRFS_DISCARD_MAX_DELAY_MSEC);
  583. discard_ctl->delay_ms = delay;
  584. spin_unlock(&discard_ctl->lock);
  585. }
  586. /*
  587. * Propagate discard counters.
  588. *
  589. * @block_group: block_group of interest
  590. *
  591. * Propagate deltas of counters up to the discard_ctl. It maintains a current
  592. * counter and a previous counter passing the delta up to the global stat.
  593. * Then the current counter value becomes the previous counter value.
  594. */
  595. void btrfs_discard_update_discardable(struct btrfs_block_group *block_group)
  596. {
  597. struct btrfs_free_space_ctl *ctl;
  598. struct btrfs_discard_ctl *discard_ctl;
  599. s32 extents_delta;
  600. s64 bytes_delta;
  601. if (!block_group ||
  602. !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC) ||
  603. !btrfs_is_block_group_data_only(block_group))
  604. return;
  605. ctl = block_group->free_space_ctl;
  606. discard_ctl = &block_group->fs_info->discard_ctl;
  607. lockdep_assert_held(&ctl->tree_lock);
  608. extents_delta = ctl->discardable_extents[BTRFS_STAT_CURR] -
  609. ctl->discardable_extents[BTRFS_STAT_PREV];
  610. if (extents_delta) {
  611. atomic_add(extents_delta, &discard_ctl->discardable_extents);
  612. ctl->discardable_extents[BTRFS_STAT_PREV] =
  613. ctl->discardable_extents[BTRFS_STAT_CURR];
  614. }
  615. bytes_delta = ctl->discardable_bytes[BTRFS_STAT_CURR] -
  616. ctl->discardable_bytes[BTRFS_STAT_PREV];
  617. if (bytes_delta) {
  618. atomic64_add(bytes_delta, &discard_ctl->discardable_bytes);
  619. ctl->discardable_bytes[BTRFS_STAT_PREV] =
  620. ctl->discardable_bytes[BTRFS_STAT_CURR];
  621. }
  622. }
  623. /*
  624. * Punt unused_bgs list to discard lists.
  625. *
  626. * @fs_info: fs_info of interest
  627. *
  628. * The unused_bgs list needs to be punted to the discard lists because the
  629. * order of operations is changed. In the normal synchronous discard path, the
  630. * block groups are trimmed via a single large trim in transaction commit. This
  631. * is ultimately what we are trying to avoid with asynchronous discard. Thus,
  632. * it must be done before going down the unused_bgs path.
  633. */
  634. void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info)
  635. {
  636. struct btrfs_block_group *block_group, *next;
  637. spin_lock(&fs_info->unused_bgs_lock);
  638. /* We enabled async discard, so punt all to the queue */
  639. list_for_each_entry_safe(block_group, next, &fs_info->unused_bgs,
  640. bg_list) {
  641. list_del_init(&block_group->bg_list);
  642. btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
  643. /*
  644. * This put is for the get done by btrfs_mark_bg_unused.
  645. * Queueing discard incremented it for discard's reference.
  646. */
  647. btrfs_put_block_group(block_group);
  648. }
  649. spin_unlock(&fs_info->unused_bgs_lock);
  650. }
  651. /*
  652. * Purge discard lists.
  653. *
  654. * @discard_ctl: discard control
  655. *
  656. * If we are disabling async discard, we may have intercepted block groups that
  657. * are completely free and ready for the unused_bgs path. As discarding will
  658. * now happen in transaction commit or not at all, we can safely mark the
  659. * corresponding block groups as unused and they will be sent on their merry
  660. * way to the unused_bgs list.
  661. */
  662. static void btrfs_discard_purge_list(struct btrfs_discard_ctl *discard_ctl)
  663. {
  664. struct btrfs_block_group *block_group, *next;
  665. int i;
  666. spin_lock(&discard_ctl->lock);
  667. for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) {
  668. list_for_each_entry_safe(block_group, next,
  669. &discard_ctl->discard_list[i],
  670. discard_list) {
  671. list_del_init(&block_group->discard_list);
  672. spin_unlock(&discard_ctl->lock);
  673. if (block_group->used == 0)
  674. btrfs_mark_bg_unused(block_group);
  675. spin_lock(&discard_ctl->lock);
  676. btrfs_put_block_group(block_group);
  677. }
  678. }
  679. spin_unlock(&discard_ctl->lock);
  680. }
  681. void btrfs_discard_resume(struct btrfs_fs_info *fs_info)
  682. {
  683. if (!btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
  684. btrfs_discard_cleanup(fs_info);
  685. return;
  686. }
  687. btrfs_discard_punt_unused_bgs_list(fs_info);
  688. set_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags);
  689. }
  /*
   * Stop async discard.
   *
   * @fs_info: fs_info of interest
   *
   * Clears BTRFS_FS_DISCARD_RUNNING, which makes btrfs_run_discard_work()
   * return false. Queued block groups and the delayed work are not touched
   * here.
   */
  void btrfs_discard_stop(struct btrfs_fs_info *fs_info)
  {
  	clear_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags);
  }
  694. void btrfs_discard_init(struct btrfs_fs_info *fs_info)
  695. {
  696. struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl;
  697. int i;
  698. spin_lock_init(&discard_ctl->lock);
  699. INIT_DELAYED_WORK(&discard_ctl->work, btrfs_discard_workfn);
  700. for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++)
  701. INIT_LIST_HEAD(&discard_ctl->discard_list[i]);
  702. discard_ctl->prev_discard = 0;
  703. discard_ctl->prev_discard_time = 0;
  704. atomic_set(&discard_ctl->discardable_extents, 0);
  705. atomic64_set(&discard_ctl->discardable_bytes, 0);
  706. discard_ctl->max_discard_size = BTRFS_ASYNC_DISCARD_DEFAULT_MAX_SIZE;
  707. discard_ctl->delay_ms = BTRFS_DISCARD_MAX_DELAY_MSEC;
  708. discard_ctl->iops_limit = BTRFS_DISCARD_MAX_IOPS;
  709. discard_ctl->kbps_limit = 0;
  710. discard_ctl->discard_extent_bytes = 0;
  711. discard_ctl->discard_bitmap_bytes = 0;
  712. atomic64_set(&discard_ctl->discard_bytes_saved, 0);
  713. }
  714. void btrfs_discard_cleanup(struct btrfs_fs_info *fs_info)
  715. {
  716. btrfs_discard_stop(fs_info);
  717. cancel_delayed_work_sync(&fs_info->discard_ctl.work);
  718. btrfs_discard_purge_list(&fs_info->discard_ctl);
  719. }