space-info.c 71 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131213221332134213521362137213821392140214121422143214421452146214721482149215021512152215321542155215621572158215921602161216221632164216521662167216821692170217121722173217421752176217721782179218021812182218321842185218621872188218921902191219221932194219521962197219821992200220122022203220422052206220722082209221022112212221322142215221622172218221922202221222222232224222522262227222822292230223122322233
  1. // SPDX-License-Identifier: GPL-2.0
  2. #include <linux/spinlock.h>
  3. #include <linux/minmax.h>
  4. #include "misc.h"
  5. #include "ctree.h"
  6. #include "space-info.h"
  7. #include "sysfs.h"
  8. #include "volumes.h"
  9. #include "free-space-cache.h"
  10. #include "ordered-data.h"
  11. #include "transaction.h"
  12. #include "block-group.h"
  13. #include "fs.h"
  14. #include "accessors.h"
  15. #include "extent-tree.h"
  16. #include "zoned.h"
  17. #include "delayed-inode.h"
  18. /*
  19. * HOW DOES SPACE RESERVATION WORK
  20. *
  21. * If you want to know about delalloc specifically, there is a separate comment
  22. * for that with the delalloc code. This comment is about how the whole system
  23. * works generally.
  24. *
  25. * BASIC CONCEPTS
  26. *
  27. * 1) space_info. This is the ultimate arbiter of how much space we can use.
  28. * There's a description of the bytes_ fields with the struct declaration,
  29. * refer to that for specifics on each field. Suffice it to say that for
  30. * reservations we care about total_bytes - SUM(space_info->bytes_) when
  31. * determining if there is space to make an allocation. There is a space_info
  32. * for METADATA, SYSTEM, and DATA areas.
  33. *
  34. * 2) block_rsv's. These are basically buckets for every different type of
  35. * metadata reservation we have. You can see the comment in the block_rsv
  36. * code on the rules for each type, but generally block_rsv->reserved is how
  37. * much space is accounted for in space_info->bytes_may_use.
  38. *
  39. * 3) btrfs_calc*_size. These are the worst case calculations we used based
  40. * on the number of items we will want to modify. We have one for changing
  41. * items, and one for inserting new items. Generally we use these helpers to
  42. * determine the size of the block reserves, and then use the actual bytes
  43. * values to adjust the space_info counters.
  44. *
  45. * MAKING RESERVATIONS, THE NORMAL CASE
  46. *
  47. * We call into either btrfs_reserve_data_bytes() or
  48. * btrfs_reserve_metadata_bytes(), depending on which we're looking for, with
  49. * num_bytes we want to reserve.
  50. *
  51. * ->reserve
  52. * space_info->bytes_may_use += num_bytes
  53. *
  54. * ->extent allocation
  55. * Call btrfs_add_reserved_bytes() which does
  56. * space_info->bytes_may_use -= num_bytes
  57. * space_info->bytes_reserved += extent_bytes
  58. *
  59. * ->insert reference
  60. * Call btrfs_update_block_group() which does
  61. * space_info->bytes_reserved -= extent_bytes
  62. * space_info->bytes_used += extent_bytes
  63. *
  64. * MAKING RESERVATIONS, FLUSHING NORMALLY (non-priority)
  65. *
  66. * Assume we are unable to simply make the reservation because we do not have
  67. * enough space
  68. *
  69. * -> reserve_bytes
  70. * create a reserve_ticket with ->bytes set to our reservation, add it to
  71. * the tail of space_info->tickets, kick async flush thread
  72. *
  73. * ->handle_reserve_ticket
  74. * wait on ticket->wait for ->bytes to be reduced to 0, or ->error to be set
  75. * on the ticket.
  76. *
  77. * -> btrfs_async_reclaim_metadata_space/btrfs_async_reclaim_data_space
  78. * Flushes various things attempting to free up space.
  79. *
  80. * -> btrfs_try_granting_tickets()
  81. * This is called by anything that either subtracts space from
  82. * space_info->bytes_may_use, ->bytes_pinned, etc, or adds to the
  83. * space_info->total_bytes. This loops through the ->priority_tickets and
  84. * then the ->tickets list checking to see if the reservation can be
  85. * completed. If it can the space is added to space_info->bytes_may_use and
  86. * the ticket is woken up.
  87. *
  88. * -> ticket wakeup
  89. * Check if ->bytes == 0, if it does we got our reservation and we can carry
  90. * on, if not return the appropriate error (ENOSPC, but can be EINTR if we
  91. * were interrupted.)
  92. *
  93. * MAKING RESERVATIONS, FLUSHING HIGH PRIORITY
  94. *
  95. * Same as the above, except we add ourselves to the
  96. * space_info->priority_tickets, and we do not use ticket->wait, we simply
  97. * call flush_space() ourselves for the states that are safe for us to call
  98. * without deadlocking and hope for the best.
  99. *
  100. * THE FLUSHING STATES
  101. *
  102. * Generally speaking we will have two cases for each state, a "nice" state
  103. * and a "ALL THE THINGS" state. In btrfs we delay a lot of work in order to
  104. * reduce the locking overhead on the various trees, and even to keep from
  105. * doing any work at all in the case of delayed refs. Each of these delayed
  106. * things however hold reservations, and so letting them run allows us to
  107. * reclaim space so we can make new reservations.
  108. *
  109. * FLUSH_DELAYED_ITEMS
  110. * Every inode has a delayed item to update the inode. Take a simple write
  111. * for example, we would update the inode item at write time to update the
  112. * mtime, and then again at finish_ordered_io() time in order to update the
  113. * isize or bytes. We keep these delayed items to coalesce these operations
  114. * into a single operation done on demand. These are an easy way to reclaim
  115. * metadata space.
  116. *
  117. * FLUSH_DELALLOC
  118. * Look at the delalloc comment to get an idea of how much space is reserved
  119. * for delayed allocation. We can reclaim some of this space simply by
  120. * running delalloc, but usually we need to wait for ordered extents to
  121. * reclaim the bulk of this space.
  122. *
  123. * FLUSH_DELAYED_REFS
  124. * We have a block reserve for the outstanding delayed refs space, and every
  125. * delayed ref operation holds a reservation. Running these is a quick way
  126. * to reclaim space, but we want to hold this until the end because COW can
  127. * churn a lot and we can avoid making some extent tree modifications if we
  128. * are able to delay for as long as possible.
  129. *
  130. * RESET_ZONES
  131. * This state works only for the zoned mode. On the zoned mode, we cannot
  132. * reuse once allocated then freed region until we reset the zone, due to
  133. * the sequential write zone requirement. The RESET_ZONES state resets the
  134. * zones of an unused block group and let us reuse the space. The reusing
  135. * is faster than removing the block group and allocating another block
  136. * group on the zones.
  137. *
  138. * ALLOC_CHUNK
  139. * We will skip this the first time through space reservation, because of
  140. * overcommit and we don't want to have a lot of useless metadata space when
  141. * our worst case reservations will likely never come true.
  142. *
  143. * RUN_DELAYED_IPUTS
  144. * If we're freeing inodes we're likely freeing checksums, file extent
  145. * items, and extent tree items. Loads of space could be freed up by these
  146. * operations, however they won't be usable until the transaction commits.
  147. *
  148. * COMMIT_TRANS
  149. * This will commit the transaction. Historically we had a lot of logic
  150. * surrounding whether or not we'd commit the transaction, but this was born
  151. * out of a pre-tickets era where we could end up committing the transaction
  152. * thousands of times in a row without making progress. Now thanks to our
  153. * ticketing system we know if we're not making progress and can error
  154. * everybody out after a few commits rather than burning the disk hoping for
  155. * a different answer.
  156. *
  157. * OVERCOMMIT
  158. *
  159. * Because we hold so many reservations for metadata we will allow you to
  160. * reserve more space than is currently free in the currently allocated
  161. * metadata space. This only happens with metadata, data does not allow
  162. * overcommitting.
  163. *
  164. * You can see the current logic for when we allow overcommit in
  165. * btrfs_can_overcommit(), but it only applies to unallocated space. If there
  166. * is no unallocated space to be had, all reservations are kept within the
  167. * free space in the allocated metadata chunks.
  168. *
  169. * Because of overcommitting, you generally want to use the
  170. * btrfs_can_overcommit() logic for metadata allocations, as it does the right
  171. * thing with or without extra unallocated space.
  172. */
  173. struct reserve_ticket {
  174. u64 bytes;
  175. int error;
  176. bool steal;
  177. struct list_head list;
  178. wait_queue_head_t wait;
  179. spinlock_t lock;
  180. };
  181. /*
  182. * after adding space to the filesystem, we need to clear the full flags
  183. * on all the space infos.
  184. */
  185. void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
  186. {
  187. struct list_head *head = &info->space_info;
  188. struct btrfs_space_info *found;
  189. list_for_each_entry(found, head, list)
  190. found->full = false;
  191. }
  192. /*
  193. * Block groups with more than this value (percents) of unusable space will be
  194. * scheduled for background reclaim.
  195. */
  196. #define BTRFS_DEFAULT_ZONED_RECLAIM_THRESH (75)
  197. #define BTRFS_UNALLOC_BLOCK_GROUP_TARGET (10ULL)
  198. /*
  199. * Calculate chunk size depending on volume type (regular or zoned).
  200. */
  201. static u64 calc_chunk_size(const struct btrfs_fs_info *fs_info, u64 flags)
  202. {
  203. if (btrfs_is_zoned(fs_info))
  204. return fs_info->zone_size;
  205. ASSERT(flags & BTRFS_BLOCK_GROUP_TYPE_MASK, "flags=%llu", flags);
  206. if (flags & BTRFS_BLOCK_GROUP_DATA)
  207. return BTRFS_MAX_DATA_CHUNK_SIZE;
  208. else if (flags & (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA_REMAP))
  209. return SZ_32M;
  210. /* Handle BTRFS_BLOCK_GROUP_METADATA */
  211. if (fs_info->fs_devices->total_rw_bytes > 50ULL * SZ_1G)
  212. return SZ_1G;
  213. return SZ_256M;
  214. }
  215. /*
  216. * Update default chunk size.
  217. */
  218. void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info,
  219. u64 chunk_size)
  220. {
  221. WRITE_ONCE(space_info->chunk_size, chunk_size);
  222. }
  223. static void init_space_info(struct btrfs_fs_info *info,
  224. struct btrfs_space_info *space_info, u64 flags)
  225. {
  226. space_info->fs_info = info;
  227. for (int i = 0; i < BTRFS_NR_RAID_TYPES; i++)
  228. INIT_LIST_HEAD(&space_info->block_groups[i]);
  229. init_rwsem(&space_info->groups_sem);
  230. spin_lock_init(&space_info->lock);
  231. space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
  232. space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
  233. INIT_LIST_HEAD(&space_info->ro_bgs);
  234. INIT_LIST_HEAD(&space_info->tickets);
  235. INIT_LIST_HEAD(&space_info->priority_tickets);
  236. space_info->clamp = 1;
  237. btrfs_update_space_info_chunk_size(space_info, calc_chunk_size(info, flags));
  238. space_info->subgroup_id = BTRFS_SUB_GROUP_PRIMARY;
  239. if (btrfs_is_zoned(info))
  240. space_info->bg_reclaim_threshold = BTRFS_DEFAULT_ZONED_RECLAIM_THRESH;
  241. }
  242. static int create_space_info_sub_group(struct btrfs_space_info *parent, u64 flags,
  243. enum btrfs_space_info_sub_group id, int index)
  244. {
  245. struct btrfs_fs_info *fs_info = parent->fs_info;
  246. struct btrfs_space_info *sub_group;
  247. int ret;
  248. ASSERT(parent->subgroup_id == BTRFS_SUB_GROUP_PRIMARY,
  249. "parent->subgroup_id=%d", parent->subgroup_id);
  250. ASSERT(id != BTRFS_SUB_GROUP_PRIMARY, "id=%d", id);
  251. sub_group = kzalloc_obj(*sub_group, GFP_NOFS);
  252. if (!sub_group)
  253. return -ENOMEM;
  254. init_space_info(fs_info, sub_group, flags);
  255. parent->sub_group[index] = sub_group;
  256. sub_group->parent = parent;
  257. sub_group->subgroup_id = id;
  258. ret = btrfs_sysfs_add_space_info_type(sub_group);
  259. if (ret) {
  260. kfree(sub_group);
  261. parent->sub_group[index] = NULL;
  262. }
  263. return ret;
  264. }
  265. static int create_space_info(struct btrfs_fs_info *info, u64 flags)
  266. {
  267. struct btrfs_space_info *space_info;
  268. int ret = 0;
  269. space_info = kzalloc_obj(*space_info, GFP_NOFS);
  270. if (!space_info)
  271. return -ENOMEM;
  272. init_space_info(info, space_info, flags);
  273. if (btrfs_is_zoned(info)) {
  274. if (flags & BTRFS_BLOCK_GROUP_DATA)
  275. ret = create_space_info_sub_group(space_info, flags,
  276. BTRFS_SUB_GROUP_DATA_RELOC,
  277. 0);
  278. else if (flags & BTRFS_BLOCK_GROUP_METADATA)
  279. ret = create_space_info_sub_group(space_info, flags,
  280. BTRFS_SUB_GROUP_TREELOG,
  281. 0);
  282. if (ret)
  283. goto out_free;
  284. }
  285. ret = btrfs_sysfs_add_space_info_type(space_info);
  286. if (ret)
  287. goto out_free;
  288. list_add(&space_info->list, &info->space_info);
  289. if (flags & BTRFS_BLOCK_GROUP_DATA)
  290. info->data_sinfo = space_info;
  291. return ret;
  292. out_free:
  293. kfree(space_info);
  294. return ret;
  295. }
  296. int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
  297. {
  298. struct btrfs_super_block *disk_super;
  299. u64 features;
  300. u64 flags;
  301. bool mixed = false;
  302. int ret;
  303. disk_super = fs_info->super_copy;
  304. if (!btrfs_super_root(disk_super))
  305. return -EINVAL;
  306. features = btrfs_super_incompat_flags(disk_super);
  307. if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
  308. mixed = true;
  309. flags = BTRFS_BLOCK_GROUP_SYSTEM;
  310. ret = create_space_info(fs_info, flags);
  311. if (ret)
  312. return ret;
  313. if (mixed) {
  314. flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
  315. ret = create_space_info(fs_info, flags);
  316. if (ret)
  317. return ret;
  318. } else {
  319. flags = BTRFS_BLOCK_GROUP_METADATA;
  320. ret = create_space_info(fs_info, flags);
  321. if (ret)
  322. return ret;
  323. flags = BTRFS_BLOCK_GROUP_DATA;
  324. ret = create_space_info(fs_info, flags);
  325. if (ret)
  326. return ret;
  327. }
  328. if (features & BTRFS_FEATURE_INCOMPAT_REMAP_TREE) {
  329. flags = BTRFS_BLOCK_GROUP_METADATA_REMAP;
  330. ret = create_space_info(fs_info, flags);
  331. }
  332. return ret;
  333. }
  334. void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info,
  335. struct btrfs_block_group *block_group)
  336. {
  337. struct btrfs_space_info *space_info = block_group->space_info;
  338. int factor, index;
  339. factor = btrfs_bg_type_to_factor(block_group->flags);
  340. spin_lock(&space_info->lock);
  341. if (!(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED) ||
  342. block_group->identity_remap_count != 0) {
  343. space_info->total_bytes += block_group->length;
  344. space_info->disk_total += block_group->length * factor;
  345. }
  346. space_info->bytes_used += block_group->used;
  347. space_info->disk_used += block_group->used * factor;
  348. space_info->bytes_readonly += block_group->bytes_super;
  349. btrfs_space_info_update_bytes_zone_unusable(space_info, block_group->zone_unusable);
  350. if (block_group->length > 0)
  351. space_info->full = false;
  352. btrfs_try_granting_tickets(space_info);
  353. spin_unlock(&space_info->lock);
  354. block_group->space_info = space_info;
  355. index = btrfs_bg_flags_to_raid_index(block_group->flags);
  356. down_write(&space_info->groups_sem);
  357. list_add_tail(&block_group->list, &space_info->block_groups[index]);
  358. up_write(&space_info->groups_sem);
  359. }
  360. struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
  361. u64 flags)
  362. {
  363. struct list_head *head = &info->space_info;
  364. struct btrfs_space_info *found;
  365. flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
  366. list_for_each_entry(found, head, list) {
  367. if (found->flags & flags)
  368. return found;
  369. }
  370. return NULL;
  371. }
  372. static u64 calc_effective_data_chunk_size(struct btrfs_fs_info *fs_info)
  373. {
  374. struct btrfs_space_info *data_sinfo;
  375. u64 data_chunk_size;
  376. /*
  377. * Calculate the data_chunk_size, space_info->chunk_size is the
  378. * "optimal" chunk size based on the fs size. However when we actually
  379. * allocate the chunk we will strip this down further, making it no
  380. * more than 10% of the disk or 1G, whichever is smaller.
  381. *
  382. * On the zoned mode, we need to use zone_size (= data_sinfo->chunk_size)
  383. * as it is.
  384. */
  385. data_sinfo = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
  386. if (btrfs_is_zoned(fs_info))
  387. return data_sinfo->chunk_size;
  388. data_chunk_size = min(data_sinfo->chunk_size,
  389. mult_perc(fs_info->fs_devices->total_rw_bytes, 10));
  390. return min_t(u64, data_chunk_size, SZ_1G);
  391. }
  392. static u64 calc_available_free_space(const struct btrfs_space_info *space_info,
  393. enum btrfs_reserve_flush_enum flush)
  394. {
  395. struct btrfs_fs_info *fs_info = space_info->fs_info;
  396. u64 profile;
  397. u64 avail;
  398. u64 data_chunk_size;
  399. int factor;
  400. if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM)
  401. profile = btrfs_system_alloc_profile(fs_info);
  402. else
  403. profile = btrfs_metadata_alloc_profile(fs_info);
  404. avail = atomic64_read(&fs_info->free_chunk_space);
  405. /*
  406. * If we have dup, raid1 or raid10 then only half of the free
  407. * space is actually usable. For raid56, the space info used
  408. * doesn't include the parity drive, so we don't have to
  409. * change the math
  410. */
  411. factor = btrfs_bg_type_to_factor(profile);
  412. avail = div_u64(avail, factor);
  413. if (avail == 0)
  414. return 0;
  415. data_chunk_size = calc_effective_data_chunk_size(fs_info);
  416. /*
  417. * Since data allocations immediately use block groups as part of the
  418. * reservation, because we assume that data reservations will == actual
  419. * usage, we could potentially overcommit and then immediately have that
  420. * available space used by a data allocation, which could put us in a
  421. * bind when we get close to filling the file system.
  422. *
  423. * To handle this simply remove the data_chunk_size from the available
  424. * space. If we are relatively empty this won't affect our ability to
  425. * overcommit much, and if we're very close to full it'll keep us from
  426. * getting into a position where we've given ourselves very little
  427. * metadata wiggle room.
  428. */
  429. if (avail <= data_chunk_size)
  430. return 0;
  431. avail -= data_chunk_size;
  432. /*
  433. * If we aren't flushing all things, let us overcommit up to
  434. * 1/2th of the space. If we can flush, don't let us overcommit
  435. * too much, let it overcommit up to 1/8 of the space.
  436. */
  437. if (flush == BTRFS_RESERVE_FLUSH_ALL)
  438. avail >>= 3;
  439. else
  440. avail >>= 1;
  441. /*
  442. * On the zoned mode, we always allocate one zone as one chunk.
  443. * Returning non-zone size aligned bytes here will result in
  444. * less pressure for the async metadata reclaim process, and it
  445. * will over-commit too much leading to ENOSPC. Align down to the
  446. * zone size to avoid that.
  447. */
  448. if (btrfs_is_zoned(fs_info))
  449. avail = ALIGN_DOWN(avail, fs_info->zone_size);
  450. return avail;
  451. }
  452. static inline bool check_can_overcommit(const struct btrfs_space_info *space_info,
  453. u64 space_info_used_bytes, u64 bytes,
  454. enum btrfs_reserve_flush_enum flush)
  455. {
  456. const u64 avail = calc_available_free_space(space_info, flush);
  457. return (space_info_used_bytes + bytes < space_info->total_bytes + avail);
  458. }
  459. static inline bool can_overcommit(const struct btrfs_space_info *space_info,
  460. u64 space_info_used_bytes, u64 bytes,
  461. enum btrfs_reserve_flush_enum flush)
  462. {
  463. /* Don't overcommit when in mixed mode. */
  464. if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
  465. return false;
  466. return check_can_overcommit(space_info, space_info_used_bytes, bytes, flush);
  467. }
  468. bool btrfs_can_overcommit(const struct btrfs_space_info *space_info, u64 bytes,
  469. enum btrfs_reserve_flush_enum flush)
  470. {
  471. u64 used;
  472. /* Don't overcommit when in mixed mode */
  473. if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
  474. return false;
  475. used = btrfs_space_info_used(space_info, true);
  476. return check_can_overcommit(space_info, used, bytes, flush);
  477. }
  478. static void remove_ticket(struct btrfs_space_info *space_info,
  479. struct reserve_ticket *ticket, int error)
  480. {
  481. lockdep_assert_held(&space_info->lock);
  482. if (!list_empty(&ticket->list)) {
  483. list_del_init(&ticket->list);
  484. ASSERT(space_info->reclaim_size >= ticket->bytes,
  485. "space_info->reclaim_size=%llu ticket->bytes=%llu",
  486. space_info->reclaim_size, ticket->bytes);
  487. space_info->reclaim_size -= ticket->bytes;
  488. }
  489. spin_lock(&ticket->lock);
  490. /*
  491. * If we are called from a task waiting on the ticket, it may happen
  492. * that before it sets an error on the ticket, a reclaim task was able
  493. * to satisfy the ticket. In that case ignore the error.
  494. */
  495. if (error && ticket->bytes > 0)
  496. ticket->error = error;
  497. else
  498. ticket->bytes = 0;
  499. wake_up(&ticket->wait);
  500. spin_unlock(&ticket->lock);
  501. }
  502. /*
  503. * This is for space we already have accounted in space_info->bytes_may_use, so
  504. * basically when we're returning space from block_rsv's.
  505. */
  506. void btrfs_try_granting_tickets(struct btrfs_space_info *space_info)
  507. {
  508. struct list_head *head;
  509. enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
  510. u64 used = btrfs_space_info_used(space_info, true);
  511. lockdep_assert_held(&space_info->lock);
  512. head = &space_info->priority_tickets;
  513. again:
  514. while (!list_empty(head)) {
  515. struct reserve_ticket *ticket;
  516. u64 used_after;
  517. ticket = list_first_entry(head, struct reserve_ticket, list);
  518. used_after = used + ticket->bytes;
  519. /* Check and see if our ticket can be satisfied now. */
  520. if (used_after <= space_info->total_bytes ||
  521. can_overcommit(space_info, used, ticket->bytes, flush)) {
  522. btrfs_space_info_update_bytes_may_use(space_info, ticket->bytes);
  523. remove_ticket(space_info, ticket, 0);
  524. space_info->tickets_id++;
  525. used = used_after;
  526. } else {
  527. break;
  528. }
  529. }
  530. if (head == &space_info->priority_tickets) {
  531. head = &space_info->tickets;
  532. flush = BTRFS_RESERVE_FLUSH_ALL;
  533. goto again;
  534. }
  535. }
  536. #define DUMP_BLOCK_RSV(fs_info, rsv_name) \
  537. do { \
  538. struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name; \
  539. spin_lock(&__rsv->lock); \
  540. btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu", \
  541. __rsv->size, __rsv->reserved); \
  542. spin_unlock(&__rsv->lock); \
  543. } while (0)
  544. static void dump_global_block_rsv(struct btrfs_fs_info *fs_info)
  545. {
  546. DUMP_BLOCK_RSV(fs_info, global_block_rsv);
  547. DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
  548. DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
  549. DUMP_BLOCK_RSV(fs_info, remap_block_rsv);
  550. DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
  551. DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
  552. }
  553. static void __btrfs_dump_space_info(const struct btrfs_space_info *info)
  554. {
  555. const struct btrfs_fs_info *fs_info = info->fs_info;
  556. const char *flag_str = btrfs_space_info_type_str(info);
  557. lockdep_assert_held(&info->lock);
  558. /* The free space could be negative in case of overcommit */
  559. btrfs_info(fs_info,
  560. "space_info %s (sub-group id %d) has %lld free, is %sfull",
  561. flag_str, info->subgroup_id,
  562. (s64)(info->total_bytes - btrfs_space_info_used(info, true)),
  563. info->full ? "" : "not ");
  564. btrfs_info(fs_info,
  565. "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu zone_unusable=%llu",
  566. info->total_bytes, info->bytes_used, info->bytes_pinned,
  567. info->bytes_reserved, info->bytes_may_use,
  568. info->bytes_readonly, info->bytes_zone_unusable);
  569. }
  570. void btrfs_dump_space_info(struct btrfs_space_info *info, u64 bytes,
  571. bool dump_block_groups)
  572. {
  573. struct btrfs_fs_info *fs_info = info->fs_info;
  574. struct btrfs_block_group *cache;
  575. u64 total_avail = 0;
  576. int index = 0;
  577. spin_lock(&info->lock);
  578. __btrfs_dump_space_info(info);
  579. dump_global_block_rsv(fs_info);
  580. spin_unlock(&info->lock);
  581. if (!dump_block_groups)
  582. return;
  583. down_read(&info->groups_sem);
  584. again:
  585. list_for_each_entry(cache, &info->block_groups[index], list) {
  586. u64 avail;
  587. spin_lock(&cache->lock);
  588. avail = btrfs_block_group_available_space(cache);
  589. btrfs_info(fs_info,
  590. "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %llu delalloc %llu super %llu zone_unusable (%llu bytes available) %s",
  591. cache->start, cache->length, cache->used, cache->pinned,
  592. cache->reserved, cache->delalloc_bytes,
  593. cache->bytes_super, cache->zone_unusable,
  594. avail, cache->ro ? "[readonly]" : "");
  595. spin_unlock(&cache->lock);
  596. btrfs_dump_free_space(cache, bytes);
  597. total_avail += avail;
  598. }
  599. if (++index < BTRFS_NR_RAID_TYPES)
  600. goto again;
  601. up_read(&info->groups_sem);
  602. btrfs_info(fs_info, "%llu bytes available across all block groups", total_avail);
  603. }
  604. static inline u64 calc_reclaim_items_nr(const struct btrfs_fs_info *fs_info,
  605. u64 to_reclaim)
  606. {
  607. u64 bytes;
  608. u64 nr;
  609. bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
  610. nr = div64_u64(to_reclaim, bytes);
  611. if (!nr)
  612. nr = 1;
  613. return nr;
  614. }
  615. /*
  616. * shrink metadata reservation for delalloc
  617. */
  618. static void shrink_delalloc(struct btrfs_space_info *space_info,
  619. u64 to_reclaim, bool wait_ordered,
  620. bool for_preempt)
  621. {
  622. struct btrfs_fs_info *fs_info = space_info->fs_info;
  623. struct btrfs_trans_handle *trans;
  624. u64 delalloc_bytes;
  625. u64 ordered_bytes;
  626. u64 items;
  627. long time_left;
  628. int loops;
  629. delalloc_bytes = percpu_counter_sum_positive(&fs_info->delalloc_bytes);
  630. ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes);
  631. if (delalloc_bytes == 0 && ordered_bytes == 0)
  632. return;
  633. /* Calc the number of the pages we need flush for space reservation */
  634. if (to_reclaim == U64_MAX) {
  635. items = U64_MAX;
  636. } else {
  637. /*
  638. * to_reclaim is set to however much metadata we need to
  639. * reclaim, but reclaiming that much data doesn't really track
  640. * exactly. What we really want to do is reclaim full inode's
  641. * worth of reservations, however that's not available to us
  642. * here. We will take a fraction of the delalloc bytes for our
  643. * flushing loops and hope for the best. Delalloc will expand
  644. * the amount we write to cover an entire dirty extent, which
  645. * will reclaim the metadata reservation for that range. If
  646. * it's not enough subsequent flush stages will be more
  647. * aggressive.
  648. */
  649. to_reclaim = max(to_reclaim, delalloc_bytes >> 3);
  650. items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2;
  651. }
  652. trans = current->journal_info;
  653. /*
  654. * If we are doing more ordered than delalloc we need to just wait on
  655. * ordered extents, otherwise we'll waste time trying to flush delalloc
  656. * that likely won't give us the space back we need.
  657. */
  658. if (ordered_bytes > delalloc_bytes && !for_preempt)
  659. wait_ordered = true;
  660. loops = 0;
  661. while ((delalloc_bytes || ordered_bytes) && loops < 3) {
  662. u64 temp = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
  663. long nr_pages = min_t(u64, temp, LONG_MAX);
  664. int async_pages;
  665. btrfs_start_delalloc_roots(fs_info, nr_pages, true);
  666. /*
  667. * We need to make sure any outstanding async pages are now
  668. * processed before we continue. This is because things like
  669. * sync_inode() try to be smart and skip writing if the inode is
  670. * marked clean. We don't use filemap_fwrite for flushing
  671. * because we want to control how many pages we write out at a
  672. * time, thus this is the only safe way to make sure we've
  673. * waited for outstanding compressed workers to have started
  674. * their jobs and thus have ordered extents set up properly.
  675. *
  676. * This exists because we do not want to wait for each
  677. * individual inode to finish its async work, we simply want to
  678. * start the IO on everybody, and then come back here and wait
  679. * for all of the async work to catch up. Once we're done with
  680. * that we know we'll have ordered extents for everything and we
  681. * can decide if we wait for that or not.
  682. *
  683. * If we choose to replace this in the future, make absolutely
  684. * sure that the proper waiting is being done in the async case,
  685. * as there have been bugs in that area before.
  686. */
  687. async_pages = atomic_read(&fs_info->async_delalloc_pages);
  688. if (!async_pages)
  689. goto skip_async;
  690. /*
  691. * We don't want to wait forever, if we wrote less pages in this
  692. * loop than we have outstanding, only wait for that number of
  693. * pages, otherwise we can wait for all async pages to finish
  694. * before continuing.
  695. */
  696. if (async_pages > nr_pages)
  697. async_pages -= nr_pages;
  698. else
  699. async_pages = 0;
  700. wait_event(fs_info->async_submit_wait,
  701. atomic_read(&fs_info->async_delalloc_pages) <=
  702. async_pages);
  703. skip_async:
  704. loops++;
  705. if (wait_ordered && !trans) {
  706. btrfs_wait_ordered_roots(fs_info, items, NULL);
  707. } else {
  708. time_left = schedule_timeout_killable(1);
  709. if (time_left)
  710. break;
  711. }
  712. /*
  713. * If we are for preemption we just want a one-shot of delalloc
  714. * flushing so we can stop flushing if we decide we don't need
  715. * to anymore.
  716. */
  717. if (for_preempt)
  718. break;
  719. spin_lock(&space_info->lock);
  720. if (list_empty(&space_info->tickets) &&
  721. list_empty(&space_info->priority_tickets)) {
  722. spin_unlock(&space_info->lock);
  723. break;
  724. }
  725. spin_unlock(&space_info->lock);
  726. delalloc_bytes = percpu_counter_sum_positive(
  727. &fs_info->delalloc_bytes);
  728. ordered_bytes = percpu_counter_sum_positive(
  729. &fs_info->ordered_bytes);
  730. }
  731. }
  732. /*
  733. * Try to flush some data based on policy set by @state. This is only advisory
  734. * and may fail for various reasons. The caller is supposed to examine the
  735. * state of @space_info to detect the outcome.
  736. */
  737. static void flush_space(struct btrfs_space_info *space_info, u64 num_bytes,
  738. enum btrfs_flush_state state, bool for_preempt)
  739. {
  740. struct btrfs_fs_info *fs_info = space_info->fs_info;
  741. struct btrfs_root *root = fs_info->tree_root;
  742. struct btrfs_trans_handle *trans;
  743. int nr;
  744. int ret = 0;
  745. switch (state) {
  746. case FLUSH_DELAYED_ITEMS_NR:
  747. case FLUSH_DELAYED_ITEMS:
  748. if (state == FLUSH_DELAYED_ITEMS_NR)
  749. nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
  750. else
  751. nr = -1;
  752. trans = btrfs_join_transaction_nostart(root);
  753. if (IS_ERR(trans)) {
  754. ret = PTR_ERR(trans);
  755. if (ret == -ENOENT)
  756. ret = 0;
  757. break;
  758. }
  759. ret = btrfs_run_delayed_items_nr(trans, nr);
  760. btrfs_end_transaction(trans);
  761. break;
  762. case FLUSH_DELALLOC:
  763. case FLUSH_DELALLOC_WAIT:
  764. case FLUSH_DELALLOC_FULL:
  765. if (state == FLUSH_DELALLOC_FULL)
  766. num_bytes = U64_MAX;
  767. shrink_delalloc(space_info, num_bytes,
  768. state != FLUSH_DELALLOC, for_preempt);
  769. break;
  770. case FLUSH_DELAYED_REFS_NR:
  771. case FLUSH_DELAYED_REFS:
  772. trans = btrfs_join_transaction_nostart(root);
  773. if (IS_ERR(trans)) {
  774. ret = PTR_ERR(trans);
  775. if (ret == -ENOENT)
  776. ret = 0;
  777. break;
  778. }
  779. if (state == FLUSH_DELAYED_REFS_NR)
  780. btrfs_run_delayed_refs(trans, num_bytes);
  781. else
  782. btrfs_run_delayed_refs(trans, 0);
  783. btrfs_end_transaction(trans);
  784. break;
  785. case ALLOC_CHUNK:
  786. case ALLOC_CHUNK_FORCE:
  787. trans = btrfs_join_transaction(root);
  788. if (IS_ERR(trans)) {
  789. ret = PTR_ERR(trans);
  790. break;
  791. }
  792. ret = btrfs_chunk_alloc(trans, space_info,
  793. btrfs_get_alloc_profile(fs_info, space_info->flags),
  794. (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
  795. CHUNK_ALLOC_FORCE);
  796. btrfs_end_transaction(trans);
  797. if (ret > 0 || ret == -ENOSPC)
  798. ret = 0;
  799. break;
  800. case RUN_DELAYED_IPUTS:
  801. /*
  802. * If we have pending delayed iputs then we could free up a
  803. * bunch of pinned space, so make sure we run the iputs before
  804. * we do our pinned bytes check below.
  805. */
  806. btrfs_run_delayed_iputs(fs_info);
  807. btrfs_wait_on_delayed_iputs(fs_info);
  808. break;
  809. case COMMIT_TRANS:
  810. ASSERT(current->journal_info == NULL);
  811. /*
  812. * We don't want to start a new transaction, just attach to the
  813. * current one or wait it fully commits in case its commit is
  814. * happening at the moment. Note: we don't use a nostart join
  815. * because that does not wait for a transaction to fully commit
  816. * (only for it to be unblocked, state TRANS_STATE_UNBLOCKED).
  817. */
  818. ret = btrfs_commit_current_transaction(root);
  819. break;
  820. case RESET_ZONES:
  821. ret = btrfs_reset_unused_block_groups(space_info, num_bytes);
  822. break;
  823. default:
  824. ret = -ENOSPC;
  825. break;
  826. }
  827. trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
  828. ret, for_preempt);
  829. return;
  830. }
  831. static u64 btrfs_calc_reclaim_metadata_size(const struct btrfs_space_info *space_info)
  832. {
  833. u64 used;
  834. u64 avail;
  835. u64 to_reclaim = space_info->reclaim_size;
  836. lockdep_assert_held(&space_info->lock);
  837. avail = calc_available_free_space(space_info, BTRFS_RESERVE_FLUSH_ALL);
  838. used = btrfs_space_info_used(space_info, true);
  839. /*
  840. * We may be flushing because suddenly we have less space than we had
  841. * before, and now we're well over-committed based on our current free
  842. * space. If that's the case add in our overage so we make sure to put
  843. * appropriate pressure on the flushing state machine.
  844. */
  845. if (space_info->total_bytes + avail < used)
  846. to_reclaim += used - (space_info->total_bytes + avail);
  847. return to_reclaim;
  848. }
  849. static bool need_preemptive_reclaim(const struct btrfs_space_info *space_info)
  850. {
  851. struct btrfs_fs_info *fs_info = space_info->fs_info;
  852. const u64 global_rsv_size = btrfs_block_rsv_reserved(&fs_info->global_block_rsv);
  853. u64 ordered, delalloc;
  854. u64 thresh;
  855. u64 used;
  856. lockdep_assert_held(&space_info->lock);
  857. /*
  858. * We have tickets queued, bail so we don't compete with the async
  859. * flushers.
  860. */
  861. if (space_info->reclaim_size)
  862. return false;
  863. thresh = mult_perc(space_info->total_bytes, 90);
  864. /* If we're just plain full then async reclaim just slows us down. */
  865. if ((space_info->bytes_used + space_info->bytes_reserved +
  866. global_rsv_size) >= thresh)
  867. return false;
  868. used = space_info->bytes_may_use + space_info->bytes_pinned;
  869. /* The total flushable belongs to the global rsv, don't flush. */
  870. if (global_rsv_size >= used)
  871. return false;
  872. /*
  873. * 128MiB is 1/4 of the maximum global rsv size. If we have less than
  874. * that devoted to other reservations then there's no sense in flushing,
  875. * we don't have a lot of things that need flushing.
  876. */
  877. if (used - global_rsv_size <= SZ_128M)
  878. return false;
  879. /*
  880. * If we have over half of the free space occupied by reservations or
  881. * pinned then we want to start flushing.
  882. *
  883. * We do not do the traditional thing here, which is to say
  884. *
  885. * if (used >= ((total_bytes + avail) / 2))
  886. * return 1;
  887. *
  888. * because this doesn't quite work how we want. If we had more than 50%
  889. * of the space_info used by bytes_used and we had 0 available we'd just
  890. * constantly run the background flusher. Instead we want it to kick in
  891. * if our reclaimable space exceeds our clamped free space.
  892. *
  893. * Our clamping range is 2^1 -> 2^8. Practically speaking that means
  894. * the following:
  895. *
  896. * Amount of RAM Minimum threshold Maximum threshold
  897. *
  898. * 256GiB 1GiB 128GiB
  899. * 128GiB 512MiB 64GiB
  900. * 64GiB 256MiB 32GiB
  901. * 32GiB 128MiB 16GiB
  902. * 16GiB 64MiB 8GiB
  903. *
  904. * These are the range our thresholds will fall in, corresponding to how
  905. * much delalloc we need for the background flusher to kick in.
  906. */
  907. thresh = calc_available_free_space(space_info, BTRFS_RESERVE_FLUSH_ALL);
  908. used = space_info->bytes_used + space_info->bytes_reserved +
  909. space_info->bytes_readonly + global_rsv_size;
  910. if (used < space_info->total_bytes)
  911. thresh += space_info->total_bytes - used;
  912. thresh >>= space_info->clamp;
  913. used = space_info->bytes_pinned;
  914. /*
  915. * If we have more ordered bytes than delalloc bytes then we're either
  916. * doing a lot of DIO, or we simply don't have a lot of delalloc waiting
  917. * around. Preemptive flushing is only useful in that it can free up
  918. * space before tickets need to wait for things to finish. In the case
  919. * of ordered extents, preemptively waiting on ordered extents gets us
  920. * nothing, if our reservations are tied up in ordered extents we'll
  921. * simply have to slow down writers by forcing them to wait on ordered
  922. * extents.
  923. *
  924. * In the case that ordered is larger than delalloc, only include the
  925. * block reserves that we would actually be able to directly reclaim
  926. * from. In this case if we're heavy on metadata operations this will
  927. * clearly be heavy enough to warrant preemptive flushing. In the case
  928. * of heavy DIO or ordered reservations, preemptive flushing will just
  929. * waste time and cause us to slow down.
  930. *
  931. * We want to make sure we truly are maxed out on ordered however, so
  932. * cut ordered in half, and if it's still higher than delalloc then we
  933. * can keep flushing. This is to avoid the case where we start
  934. * flushing, and now delalloc == ordered and we stop preemptively
  935. * flushing when we could still have several gigs of delalloc to flush.
  936. */
  937. ordered = percpu_counter_read_positive(&fs_info->ordered_bytes) >> 1;
  938. delalloc = percpu_counter_read_positive(&fs_info->delalloc_bytes);
  939. if (ordered >= delalloc)
  940. used += btrfs_block_rsv_reserved(&fs_info->delayed_refs_rsv) +
  941. btrfs_block_rsv_reserved(&fs_info->delayed_block_rsv);
  942. else
  943. used += space_info->bytes_may_use - global_rsv_size;
  944. return (used >= thresh && !btrfs_fs_closing(fs_info) &&
  945. !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
  946. }
  947. static bool steal_from_global_rsv(struct btrfs_space_info *space_info,
  948. struct reserve_ticket *ticket)
  949. {
  950. struct btrfs_fs_info *fs_info = space_info->fs_info;
  951. struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
  952. u64 min_bytes;
  953. lockdep_assert_held(&space_info->lock);
  954. if (!ticket->steal)
  955. return false;
  956. if (global_rsv->space_info != space_info)
  957. return false;
  958. spin_lock(&global_rsv->lock);
  959. min_bytes = mult_perc(global_rsv->size, 10);
  960. if (global_rsv->reserved < min_bytes + ticket->bytes) {
  961. spin_unlock(&global_rsv->lock);
  962. return false;
  963. }
  964. global_rsv->reserved -= ticket->bytes;
  965. if (global_rsv->reserved < global_rsv->size)
  966. global_rsv->full = false;
  967. spin_unlock(&global_rsv->lock);
  968. remove_ticket(space_info, ticket, 0);
  969. space_info->tickets_id++;
  970. return true;
  971. }
  972. /*
  973. * We've exhausted our flushing, start failing tickets.
  974. *
  975. * @space_info - the space info we were flushing
  976. *
  977. * We call this when we've exhausted our flushing ability and haven't made
  978. * progress in satisfying tickets. The reservation code handles tickets in
  979. * order, so if there is a large ticket first and then smaller ones we could
  980. * very well satisfy the smaller tickets. This will attempt to wake up any
  981. * tickets in the list to catch this case.
  982. *
  983. * This function returns true if it was able to make progress by clearing out
  984. * other tickets, or if it stumbles across a ticket that was smaller than the
  985. * first ticket.
  986. */
  987. static bool maybe_fail_all_tickets(struct btrfs_space_info *space_info)
  988. {
  989. struct btrfs_fs_info *fs_info = space_info->fs_info;
  990. struct reserve_ticket *ticket;
  991. u64 tickets_id = space_info->tickets_id;
  992. const int abort_error = BTRFS_FS_ERROR(fs_info);
  993. trace_btrfs_fail_all_tickets(fs_info, space_info);
  994. if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
  995. btrfs_info(fs_info, "cannot satisfy tickets, dumping space info");
  996. __btrfs_dump_space_info(space_info);
  997. }
  998. while (!list_empty(&space_info->tickets) &&
  999. tickets_id == space_info->tickets_id) {
  1000. ticket = list_first_entry(&space_info->tickets,
  1001. struct reserve_ticket, list);
  1002. if (unlikely(abort_error)) {
  1003. remove_ticket(space_info, ticket, abort_error);
  1004. } else {
  1005. if (steal_from_global_rsv(space_info, ticket))
  1006. return true;
  1007. if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
  1008. btrfs_info(fs_info, "failing ticket with %llu bytes",
  1009. ticket->bytes);
  1010. remove_ticket(space_info, ticket, -ENOSPC);
  1011. /*
  1012. * We're just throwing tickets away, so more flushing may
  1013. * not trip over btrfs_try_granting_tickets, so we need
  1014. * to call it here to see if we can make progress with
  1015. * the next ticket in the list.
  1016. */
  1017. btrfs_try_granting_tickets(space_info);
  1018. }
  1019. }
  1020. return (tickets_id != space_info->tickets_id);
  1021. }
  1022. static void do_async_reclaim_metadata_space(struct btrfs_space_info *space_info)
  1023. {
  1024. struct btrfs_fs_info *fs_info = space_info->fs_info;
  1025. u64 to_reclaim;
  1026. enum btrfs_flush_state flush_state;
  1027. int commit_cycles = 0;
  1028. u64 last_tickets_id;
  1029. enum btrfs_flush_state final_state;
  1030. if (btrfs_is_zoned(fs_info))
  1031. final_state = RESET_ZONES;
  1032. else
  1033. final_state = COMMIT_TRANS;
  1034. spin_lock(&space_info->lock);
  1035. to_reclaim = btrfs_calc_reclaim_metadata_size(space_info);
  1036. if (!to_reclaim) {
  1037. space_info->flush = false;
  1038. spin_unlock(&space_info->lock);
  1039. return;
  1040. }
  1041. last_tickets_id = space_info->tickets_id;
  1042. spin_unlock(&space_info->lock);
  1043. flush_state = FLUSH_DELAYED_ITEMS_NR;
  1044. do {
  1045. flush_space(space_info, to_reclaim, flush_state, false);
  1046. spin_lock(&space_info->lock);
  1047. if (list_empty(&space_info->tickets)) {
  1048. space_info->flush = false;
  1049. spin_unlock(&space_info->lock);
  1050. return;
  1051. }
  1052. to_reclaim = btrfs_calc_reclaim_metadata_size(space_info);
  1053. if (last_tickets_id == space_info->tickets_id) {
  1054. flush_state++;
  1055. } else {
  1056. last_tickets_id = space_info->tickets_id;
  1057. flush_state = FLUSH_DELAYED_ITEMS_NR;
  1058. if (commit_cycles)
  1059. commit_cycles--;
  1060. }
  1061. /*
  1062. * We do not want to empty the system of delalloc unless we're
  1063. * under heavy pressure, so allow one trip through the flushing
  1064. * logic before we start doing a FLUSH_DELALLOC_FULL.
  1065. */
  1066. if (flush_state == FLUSH_DELALLOC_FULL && !commit_cycles)
  1067. flush_state++;
  1068. /*
  1069. * We don't want to force a chunk allocation until we've tried
  1070. * pretty hard to reclaim space. Think of the case where we
  1071. * freed up a bunch of space and so have a lot of pinned space
  1072. * to reclaim. We would rather use that than possibly create a
  1073. * underutilized metadata chunk. So if this is our first run
  1074. * through the flushing state machine skip ALLOC_CHUNK_FORCE and
  1075. * commit the transaction. If nothing has changed the next go
  1076. * around then we can force a chunk allocation.
  1077. */
  1078. if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
  1079. flush_state++;
  1080. if (flush_state > final_state) {
  1081. commit_cycles++;
  1082. if (commit_cycles > 2) {
  1083. if (maybe_fail_all_tickets(space_info)) {
  1084. flush_state = FLUSH_DELAYED_ITEMS_NR;
  1085. commit_cycles--;
  1086. } else {
  1087. space_info->flush = false;
  1088. }
  1089. } else {
  1090. flush_state = FLUSH_DELAYED_ITEMS_NR;
  1091. }
  1092. }
  1093. spin_unlock(&space_info->lock);
  1094. } while (flush_state <= final_state);
  1095. }
  1096. /*
  1097. * This is for normal flushers, it can wait as much time as needed. We will
  1098. * loop and continuously try to flush as long as we are making progress. We
  1099. * count progress as clearing off tickets each time we have to loop.
  1100. */
  1101. static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
  1102. {
  1103. struct btrfs_fs_info *fs_info;
  1104. struct btrfs_space_info *space_info;
  1105. fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
  1106. space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
  1107. do_async_reclaim_metadata_space(space_info);
  1108. for (int i = 0; i < BTRFS_SPACE_INFO_SUB_GROUP_MAX; i++) {
  1109. if (space_info->sub_group[i])
  1110. do_async_reclaim_metadata_space(space_info->sub_group[i]);
  1111. }
  1112. }
  1113. /*
  1114. * This handles pre-flushing of metadata space before we get to the point that
  1115. * we need to start blocking threads on tickets. The logic here is different
  1116. * from the other flush paths because it doesn't rely on tickets to tell us how
  1117. * much we need to flush, instead it attempts to keep us below the 80% full
  1118. * watermark of space by flushing whichever reservation pool is currently the
  1119. * largest.
  1120. */
  1121. static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
  1122. {
  1123. struct btrfs_fs_info *fs_info;
  1124. struct btrfs_space_info *space_info;
  1125. struct btrfs_block_rsv *delayed_block_rsv;
  1126. struct btrfs_block_rsv *delayed_refs_rsv;
  1127. struct btrfs_block_rsv *global_rsv;
  1128. struct btrfs_block_rsv *trans_rsv;
  1129. int loops = 0;
  1130. fs_info = container_of(work, struct btrfs_fs_info,
  1131. preempt_reclaim_work);
  1132. space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
  1133. delayed_block_rsv = &fs_info->delayed_block_rsv;
  1134. delayed_refs_rsv = &fs_info->delayed_refs_rsv;
  1135. global_rsv = &fs_info->global_block_rsv;
  1136. trans_rsv = &fs_info->trans_block_rsv;
  1137. spin_lock(&space_info->lock);
  1138. while (need_preemptive_reclaim(space_info)) {
  1139. enum btrfs_flush_state flush;
  1140. u64 delalloc_size = 0;
  1141. u64 to_reclaim, block_rsv_size;
  1142. const u64 global_rsv_size = btrfs_block_rsv_reserved(global_rsv);
  1143. const u64 bytes_may_use = space_info->bytes_may_use;
  1144. const u64 bytes_pinned = space_info->bytes_pinned;
  1145. spin_unlock(&space_info->lock);
  1146. /*
  1147. * We don't have a precise counter for the metadata being
  1148. * reserved for delalloc, so we'll approximate it by subtracting
  1149. * out the block rsv's space from the bytes_may_use. If that
  1150. * amount is higher than the individual reserves, then we can
  1151. * assume it's tied up in delalloc reservations.
  1152. */
  1153. block_rsv_size = global_rsv_size +
  1154. btrfs_block_rsv_reserved(delayed_block_rsv) +
  1155. btrfs_block_rsv_reserved(delayed_refs_rsv) +
  1156. btrfs_block_rsv_reserved(trans_rsv);
  1157. if (block_rsv_size < bytes_may_use)
  1158. delalloc_size = bytes_may_use - block_rsv_size;
  1159. /*
  1160. * We don't want to include the global_rsv in our calculation,
  1161. * because that's space we can't touch. Subtract it from the
  1162. * block_rsv_size for the next checks.
  1163. */
  1164. block_rsv_size -= global_rsv_size;
  1165. /*
  1166. * We really want to avoid flushing delalloc too much, as it
  1167. * could result in poor allocation patterns, so only flush it if
  1168. * it's larger than the rest of the pools combined.
  1169. */
  1170. if (delalloc_size > block_rsv_size) {
  1171. to_reclaim = delalloc_size;
  1172. flush = FLUSH_DELALLOC;
  1173. } else if (bytes_pinned >
  1174. (btrfs_block_rsv_reserved(delayed_block_rsv) +
  1175. btrfs_block_rsv_reserved(delayed_refs_rsv))) {
  1176. to_reclaim = bytes_pinned;
  1177. flush = COMMIT_TRANS;
  1178. } else if (btrfs_block_rsv_reserved(delayed_block_rsv) >
  1179. btrfs_block_rsv_reserved(delayed_refs_rsv)) {
  1180. to_reclaim = btrfs_block_rsv_reserved(delayed_block_rsv);
  1181. flush = FLUSH_DELAYED_ITEMS_NR;
  1182. } else {
  1183. to_reclaim = btrfs_block_rsv_reserved(delayed_refs_rsv);
  1184. flush = FLUSH_DELAYED_REFS_NR;
  1185. }
  1186. loops++;
  1187. /*
  1188. * We don't want to reclaim everything, just a portion, so scale
  1189. * down the to_reclaim by 1/4. If it takes us down to 0,
  1190. * reclaim 1 items worth.
  1191. */
  1192. to_reclaim >>= 2;
  1193. if (!to_reclaim)
  1194. to_reclaim = btrfs_calc_insert_metadata_size(fs_info, 1);
  1195. flush_space(space_info, to_reclaim, flush, true);
  1196. cond_resched();
  1197. spin_lock(&space_info->lock);
  1198. }
  1199. /* We only went through once, back off our clamping. */
  1200. if (loops == 1 && !space_info->reclaim_size)
  1201. space_info->clamp = max(1, space_info->clamp - 1);
  1202. trace_btrfs_done_preemptive_reclaim(fs_info, space_info);
  1203. spin_unlock(&space_info->lock);
  1204. }
  1205. /*
  1206. * FLUSH_DELALLOC_WAIT:
  1207. * Space is freed from flushing delalloc in one of two ways.
  1208. *
  1209. * 1) compression is on and we allocate less space than we reserved
  1210. * 2) we are overwriting existing space
  1211. *
  1212. * For #1 that extra space is reclaimed as soon as the delalloc pages are
  1213. * COWed, by way of btrfs_add_reserved_bytes() which adds the actual extent
  1214. * length to ->bytes_reserved, and subtracts the reserved space from
  1215. * ->bytes_may_use.
  1216. *
  1217. * For #2 this is trickier. Once the ordered extent runs we will drop the
  1218. * extent in the range we are overwriting, which creates a delayed ref for
  1219. * that freed extent. This however is not reclaimed until the transaction
  1220. * commits, thus the next stages.
  1221. *
  1222. * RUN_DELAYED_IPUTS
  1223. * If we are freeing inodes, we want to make sure all delayed iputs have
  1224. * completed, because they could have been on an inode with i_nlink == 0, and
  1225. * thus have been truncated and freed up space. But again this space is not
  1226. * immediately reusable, it comes in the form of a delayed ref, which must be
  1227. * run and then the transaction must be committed.
  1228. *
  1229. * COMMIT_TRANS
  1230. * This is where we reclaim all of the pinned space generated by running the
  1231. * iputs
  1232. *
  1233. * RESET_ZONES
  1234. * This state works only for the zoned mode. We scan the unused block group
  1235. * list and reset the zones and reuse the block group.
  1236. *
  1237. * ALLOC_CHUNK_FORCE
  1238. * For data we start with alloc chunk force, however we could have been full
  1239. * before, and then the transaction commit could have freed new block groups,
  1240. * so if we now have space to allocate do the force chunk allocation.
  1241. */
  1242. static const enum btrfs_flush_state data_flush_states[] = {
  1243. FLUSH_DELALLOC_FULL,
  1244. RUN_DELAYED_IPUTS,
  1245. COMMIT_TRANS,
  1246. RESET_ZONES,
  1247. ALLOC_CHUNK_FORCE,
  1248. };
  1249. static void do_async_reclaim_data_space(struct btrfs_space_info *space_info)
  1250. {
  1251. struct btrfs_fs_info *fs_info = space_info->fs_info;
  1252. u64 last_tickets_id;
  1253. enum btrfs_flush_state flush_state = 0;
  1254. spin_lock(&space_info->lock);
  1255. if (list_empty(&space_info->tickets)) {
  1256. space_info->flush = false;
  1257. spin_unlock(&space_info->lock);
  1258. return;
  1259. }
  1260. last_tickets_id = space_info->tickets_id;
  1261. spin_unlock(&space_info->lock);
  1262. while (!space_info->full) {
  1263. flush_space(space_info, U64_MAX, ALLOC_CHUNK_FORCE, false);
  1264. spin_lock(&space_info->lock);
  1265. if (list_empty(&space_info->tickets)) {
  1266. space_info->flush = false;
  1267. spin_unlock(&space_info->lock);
  1268. return;
  1269. }
  1270. /* Something happened, fail everything and bail. */
  1271. if (unlikely(BTRFS_FS_ERROR(fs_info)))
  1272. goto aborted_fs;
  1273. last_tickets_id = space_info->tickets_id;
  1274. spin_unlock(&space_info->lock);
  1275. }
  1276. while (flush_state < ARRAY_SIZE(data_flush_states)) {
  1277. flush_space(space_info, U64_MAX,
  1278. data_flush_states[flush_state], false);
  1279. spin_lock(&space_info->lock);
  1280. if (list_empty(&space_info->tickets)) {
  1281. space_info->flush = false;
  1282. spin_unlock(&space_info->lock);
  1283. return;
  1284. }
  1285. if (last_tickets_id == space_info->tickets_id) {
  1286. flush_state++;
  1287. } else {
  1288. last_tickets_id = space_info->tickets_id;
  1289. flush_state = 0;
  1290. }
  1291. if (flush_state >= ARRAY_SIZE(data_flush_states)) {
  1292. if (space_info->full) {
  1293. if (maybe_fail_all_tickets(space_info))
  1294. flush_state = 0;
  1295. else
  1296. space_info->flush = false;
  1297. } else {
  1298. flush_state = 0;
  1299. }
  1300. /* Something happened, fail everything and bail. */
  1301. if (unlikely(BTRFS_FS_ERROR(fs_info)))
  1302. goto aborted_fs;
  1303. }
  1304. spin_unlock(&space_info->lock);
  1305. }
  1306. return;
  1307. aborted_fs:
  1308. maybe_fail_all_tickets(space_info);
  1309. space_info->flush = false;
  1310. spin_unlock(&space_info->lock);
  1311. }
  1312. static void btrfs_async_reclaim_data_space(struct work_struct *work)
  1313. {
  1314. struct btrfs_fs_info *fs_info;
  1315. struct btrfs_space_info *space_info;
  1316. fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work);
  1317. space_info = fs_info->data_sinfo;
  1318. do_async_reclaim_data_space(space_info);
  1319. for (int i = 0; i < BTRFS_SPACE_INFO_SUB_GROUP_MAX; i++)
  1320. if (space_info->sub_group[i])
  1321. do_async_reclaim_data_space(space_info->sub_group[i]);
  1322. }
  1323. void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info)
  1324. {
  1325. INIT_WORK(&fs_info->async_reclaim_work, btrfs_async_reclaim_metadata_space);
  1326. INIT_WORK(&fs_info->async_data_reclaim_work, btrfs_async_reclaim_data_space);
  1327. INIT_WORK(&fs_info->preempt_reclaim_work,
  1328. btrfs_preempt_reclaim_metadata_space);
  1329. }
  1330. static const enum btrfs_flush_state priority_flush_states[] = {
  1331. FLUSH_DELAYED_ITEMS_NR,
  1332. FLUSH_DELAYED_ITEMS,
  1333. RESET_ZONES,
  1334. ALLOC_CHUNK,
  1335. };
  1336. static const enum btrfs_flush_state evict_flush_states[] = {
  1337. FLUSH_DELAYED_ITEMS_NR,
  1338. FLUSH_DELAYED_ITEMS,
  1339. FLUSH_DELAYED_REFS_NR,
  1340. FLUSH_DELAYED_REFS,
  1341. FLUSH_DELALLOC,
  1342. FLUSH_DELALLOC_WAIT,
  1343. FLUSH_DELALLOC_FULL,
  1344. ALLOC_CHUNK,
  1345. COMMIT_TRANS,
  1346. RESET_ZONES,
  1347. };
  1348. static bool is_ticket_served(struct reserve_ticket *ticket)
  1349. {
  1350. bool ret;
  1351. spin_lock(&ticket->lock);
  1352. ret = (ticket->bytes == 0);
  1353. spin_unlock(&ticket->lock);
  1354. return ret;
  1355. }
  1356. static void priority_reclaim_metadata_space(struct btrfs_space_info *space_info,
  1357. struct reserve_ticket *ticket,
  1358. const enum btrfs_flush_state *states,
  1359. int states_nr)
  1360. {
  1361. struct btrfs_fs_info *fs_info = space_info->fs_info;
  1362. u64 to_reclaim;
  1363. int flush_state = 0;
  1364. /*
  1365. * This is the priority reclaim path, so to_reclaim could be >0 still
  1366. * because we may have only satisfied the priority tickets and still
  1367. * left non priority tickets on the list. We would then have
  1368. * to_reclaim but ->bytes == 0.
  1369. */
  1370. if (is_ticket_served(ticket))
  1371. return;
  1372. spin_lock(&space_info->lock);
  1373. to_reclaim = btrfs_calc_reclaim_metadata_size(space_info);
  1374. spin_unlock(&space_info->lock);
  1375. while (flush_state < states_nr) {
  1376. flush_space(space_info, to_reclaim, states[flush_state], false);
  1377. if (is_ticket_served(ticket))
  1378. return;
  1379. flush_state++;
  1380. }
  1381. spin_lock(&space_info->lock);
  1382. /*
  1383. * Attempt to steal from the global rsv if we can, except if the fs was
  1384. * turned into error mode due to a transaction abort when flushing space
  1385. * above, in that case fail with the abort error instead of returning
  1386. * success to the caller if we can steal from the global rsv - this is
  1387. * just to have caller fail immediately instead of later when trying to
  1388. * modify the fs, making it easier to debug -ENOSPC problems.
  1389. */
  1390. if (unlikely(BTRFS_FS_ERROR(fs_info)))
  1391. remove_ticket(space_info, ticket, BTRFS_FS_ERROR(fs_info));
  1392. else if (!steal_from_global_rsv(space_info, ticket))
  1393. remove_ticket(space_info, ticket, -ENOSPC);
  1394. /*
  1395. * We must run try_granting_tickets here because we could be a large
  1396. * ticket in front of a smaller ticket that can now be satisfied with
  1397. * the available space.
  1398. */
  1399. btrfs_try_granting_tickets(space_info);
  1400. spin_unlock(&space_info->lock);
  1401. }
  1402. static void priority_reclaim_data_space(struct btrfs_space_info *space_info,
  1403. struct reserve_ticket *ticket)
  1404. {
  1405. /* We could have been granted before we got here. */
  1406. if (is_ticket_served(ticket))
  1407. return;
  1408. spin_lock(&space_info->lock);
  1409. while (!space_info->full) {
  1410. spin_unlock(&space_info->lock);
  1411. flush_space(space_info, U64_MAX, ALLOC_CHUNK_FORCE, false);
  1412. if (is_ticket_served(ticket))
  1413. return;
  1414. spin_lock(&space_info->lock);
  1415. }
  1416. remove_ticket(space_info, ticket, -ENOSPC);
  1417. btrfs_try_granting_tickets(space_info);
  1418. spin_unlock(&space_info->lock);
  1419. }
  1420. static void wait_reserve_ticket(struct btrfs_space_info *space_info,
  1421. struct reserve_ticket *ticket)
  1422. {
  1423. DEFINE_WAIT(wait);
  1424. spin_lock(&ticket->lock);
  1425. while (ticket->bytes > 0 && ticket->error == 0) {
  1426. int ret;
  1427. ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
  1428. spin_unlock(&ticket->lock);
  1429. if (ret) {
  1430. /*
  1431. * Delete us from the list. After we unlock the space
  1432. * info, we don't want the async reclaim job to reserve
  1433. * space for this ticket. If that would happen, then the
  1434. * ticket's task would not known that space was reserved
  1435. * despite getting an error, resulting in a space leak
  1436. * (bytes_may_use counter of our space_info).
  1437. */
  1438. spin_lock(&space_info->lock);
  1439. remove_ticket(space_info, ticket, -EINTR);
  1440. spin_unlock(&space_info->lock);
  1441. return;
  1442. }
  1443. schedule();
  1444. finish_wait(&ticket->wait, &wait);
  1445. spin_lock(&ticket->lock);
  1446. }
  1447. spin_unlock(&ticket->lock);
  1448. }
  1449. /*
  1450. * Do the appropriate flushing and waiting for a ticket.
  1451. *
  1452. * @space_info: space info for the reservation
  1453. * @ticket: ticket for the reservation
  1454. * @start_ns: timestamp when the reservation started
  1455. * @orig_bytes: amount of bytes originally reserved
  1456. * @flush: how much we can flush
  1457. *
  1458. * This does the work of figuring out how to flush for the ticket, waiting for
  1459. * the reservation, and returning the appropriate error if there is one.
  1460. */
  1461. static int handle_reserve_ticket(struct btrfs_space_info *space_info,
  1462. struct reserve_ticket *ticket,
  1463. u64 start_ns, u64 orig_bytes,
  1464. enum btrfs_reserve_flush_enum flush)
  1465. {
  1466. int ret;
  1467. switch (flush) {
  1468. case BTRFS_RESERVE_FLUSH_DATA:
  1469. case BTRFS_RESERVE_FLUSH_ALL:
  1470. case BTRFS_RESERVE_FLUSH_ALL_STEAL:
  1471. wait_reserve_ticket(space_info, ticket);
  1472. break;
  1473. case BTRFS_RESERVE_FLUSH_LIMIT:
  1474. priority_reclaim_metadata_space(space_info, ticket,
  1475. priority_flush_states,
  1476. ARRAY_SIZE(priority_flush_states));
  1477. break;
  1478. case BTRFS_RESERVE_FLUSH_EVICT:
  1479. priority_reclaim_metadata_space(space_info, ticket,
  1480. evict_flush_states,
  1481. ARRAY_SIZE(evict_flush_states));
  1482. break;
  1483. case BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE:
  1484. priority_reclaim_data_space(space_info, ticket);
  1485. break;
  1486. default:
  1487. ASSERT(0, "flush=%d", flush);
  1488. break;
  1489. }
  1490. ret = ticket->error;
  1491. ASSERT(list_empty(&ticket->list));
  1492. /*
  1493. * Check that we can't have an error set if the reservation succeeded,
  1494. * as that would confuse tasks and lead them to error out without
  1495. * releasing reserved space (if an error happens the expectation is that
  1496. * space wasn't reserved at all).
  1497. */
  1498. ASSERT(!(ticket->bytes == 0 && ticket->error),
  1499. "ticket->bytes=%llu ticket->error=%d", ticket->bytes, ticket->error);
  1500. trace_btrfs_reserve_ticket(space_info->fs_info, space_info->flags,
  1501. orig_bytes, start_ns, flush, ticket->error);
  1502. return ret;
  1503. }
  1504. /*
  1505. * This returns true if this flush state will go through the ordinary flushing
  1506. * code.
  1507. */
  1508. static inline bool is_normal_flushing(enum btrfs_reserve_flush_enum flush)
  1509. {
  1510. return (flush == BTRFS_RESERVE_FLUSH_ALL) ||
  1511. (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL);
  1512. }
  1513. static inline void maybe_clamp_preempt(struct btrfs_space_info *space_info)
  1514. {
  1515. struct btrfs_fs_info *fs_info = space_info->fs_info;
  1516. u64 ordered = percpu_counter_sum_positive(&fs_info->ordered_bytes);
  1517. u64 delalloc = percpu_counter_sum_positive(&fs_info->delalloc_bytes);
  1518. /*
  1519. * If we're heavy on ordered operations then clamping won't help us. We
  1520. * need to clamp specifically to keep up with dirty'ing buffered
  1521. * writers, because there's not a 1:1 correlation of writing delalloc
  1522. * and freeing space, like there is with flushing delayed refs or
  1523. * delayed nodes. If we're already more ordered than delalloc then
  1524. * we're keeping up, otherwise we aren't and should probably clamp.
  1525. */
  1526. if (ordered < delalloc)
  1527. space_info->clamp = min(space_info->clamp + 1, 8);
  1528. }
  1529. static inline bool can_steal(enum btrfs_reserve_flush_enum flush)
  1530. {
  1531. return (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL ||
  1532. flush == BTRFS_RESERVE_FLUSH_EVICT);
  1533. }
  1534. /*
  1535. * NO_FLUSH and FLUSH_EMERGENCY don't want to create a ticket, they just want to
  1536. * fail as quickly as possible.
  1537. */
  1538. static inline bool can_ticket(enum btrfs_reserve_flush_enum flush)
  1539. {
  1540. return (flush != BTRFS_RESERVE_NO_FLUSH &&
  1541. flush != BTRFS_RESERVE_FLUSH_EMERGENCY);
  1542. }
  1543. /*
  1544. * Try to reserve bytes from the block_rsv's space.
  1545. *
  1546. * @space_info: space info we want to allocate from
  1547. * @orig_bytes: number of bytes we want
  1548. * @flush: whether or not we can flush to make our reservation
  1549. *
  1550. * This will reserve orig_bytes number of bytes from the space info associated
  1551. * with the block_rsv. If there is not enough space it will make an attempt to
  1552. * flush out space to make room. It will do this by flushing delalloc if
  1553. * possible or committing the transaction. If flush is 0 then no attempts to
  1554. * regain reservations will be made and this will fail if there is not enough
  1555. * space already.
  1556. */
  1557. static int reserve_bytes(struct btrfs_space_info *space_info, u64 orig_bytes,
  1558. enum btrfs_reserve_flush_enum flush)
  1559. {
  1560. struct btrfs_fs_info *fs_info = space_info->fs_info;
  1561. struct work_struct *async_work;
  1562. struct reserve_ticket ticket;
  1563. u64 start_ns = 0;
  1564. u64 used;
  1565. int ret = -ENOSPC;
  1566. bool pending_tickets;
  1567. ASSERT(orig_bytes, "orig_bytes=%llu", orig_bytes);
  1568. /*
  1569. * If have a transaction handle (current->journal_info != NULL), then
  1570. * the flush method can not be neither BTRFS_RESERVE_FLUSH_ALL* nor
  1571. * BTRFS_RESERVE_FLUSH_EVICT, as we could deadlock because those
  1572. * flushing methods can trigger transaction commits.
  1573. */
  1574. if (current->journal_info) {
  1575. /* One assert per line for easier debugging. */
  1576. ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL, "flush=%d", flush);
  1577. ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL_STEAL, "flush=%d", flush);
  1578. ASSERT(flush != BTRFS_RESERVE_FLUSH_EVICT, "flush=%d", flush);
  1579. }
  1580. if (flush == BTRFS_RESERVE_FLUSH_DATA)
  1581. async_work = &fs_info->async_data_reclaim_work;
  1582. else
  1583. async_work = &fs_info->async_reclaim_work;
  1584. spin_lock(&space_info->lock);
  1585. used = btrfs_space_info_used(space_info, true);
  1586. /*
  1587. * We don't want NO_FLUSH allocations to jump everybody, they can
  1588. * generally handle ENOSPC in a different way, so treat them the same as
  1589. * normal flushers when it comes to skipping pending tickets.
  1590. */
  1591. if (is_normal_flushing(flush) || (flush == BTRFS_RESERVE_NO_FLUSH))
  1592. pending_tickets = !list_empty(&space_info->tickets) ||
  1593. !list_empty(&space_info->priority_tickets);
  1594. else
  1595. pending_tickets = !list_empty(&space_info->priority_tickets);
  1596. /*
  1597. * Carry on if we have enough space (short-circuit) OR call
  1598. * can_overcommit() to ensure we can overcommit to continue.
  1599. */
  1600. if (!pending_tickets &&
  1601. ((used + orig_bytes <= space_info->total_bytes) ||
  1602. can_overcommit(space_info, used, orig_bytes, flush))) {
  1603. btrfs_space_info_update_bytes_may_use(space_info, orig_bytes);
  1604. ret = 0;
  1605. }
  1606. /*
  1607. * Things are dire, we need to make a reservation so we don't abort. We
  1608. * will let this reservation go through as long as we have actual space
  1609. * left to allocate for the block.
  1610. */
  1611. if (ret && unlikely(flush == BTRFS_RESERVE_FLUSH_EMERGENCY)) {
  1612. used -= space_info->bytes_may_use;
  1613. if (used + orig_bytes <= space_info->total_bytes) {
  1614. btrfs_space_info_update_bytes_may_use(space_info, orig_bytes);
  1615. ret = 0;
  1616. }
  1617. }
  1618. /*
  1619. * If we couldn't make a reservation then setup our reservation ticket
  1620. * and kick the async worker if it's not already running.
  1621. *
  1622. * If we are a priority flusher then we just need to add our ticket to
  1623. * the list and we will do our own flushing further down.
  1624. */
  1625. if (ret && can_ticket(flush)) {
  1626. ticket.bytes = orig_bytes;
  1627. ticket.error = 0;
  1628. space_info->reclaim_size += ticket.bytes;
  1629. init_waitqueue_head(&ticket.wait);
  1630. spin_lock_init(&ticket.lock);
  1631. ticket.steal = can_steal(flush);
  1632. if (trace_btrfs_reserve_ticket_enabled())
  1633. start_ns = ktime_get_ns();
  1634. if (flush == BTRFS_RESERVE_FLUSH_ALL ||
  1635. flush == BTRFS_RESERVE_FLUSH_ALL_STEAL ||
  1636. flush == BTRFS_RESERVE_FLUSH_DATA) {
  1637. list_add_tail(&ticket.list, &space_info->tickets);
  1638. if (!space_info->flush) {
  1639. /*
  1640. * We were forced to add a reserve ticket, so
  1641. * our preemptive flushing is unable to keep
  1642. * up. Clamp down on the threshold for the
  1643. * preemptive flushing in order to keep up with
  1644. * the workload.
  1645. */
  1646. maybe_clamp_preempt(space_info);
  1647. space_info->flush = true;
  1648. trace_btrfs_trigger_flush(fs_info,
  1649. space_info->flags,
  1650. orig_bytes, flush,
  1651. "enospc");
  1652. queue_work(system_dfl_wq, async_work);
  1653. }
  1654. } else {
  1655. list_add_tail(&ticket.list,
  1656. &space_info->priority_tickets);
  1657. }
  1658. } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
  1659. /*
  1660. * We will do the space reservation dance during log replay,
  1661. * which means we won't have fs_info->fs_root set, so don't do
  1662. * the async reclaim as we will panic.
  1663. */
  1664. if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
  1665. !work_busy(&fs_info->preempt_reclaim_work) &&
  1666. need_preemptive_reclaim(space_info)) {
  1667. trace_btrfs_trigger_flush(fs_info, space_info->flags,
  1668. orig_bytes, flush, "preempt");
  1669. queue_work(system_dfl_wq,
  1670. &fs_info->preempt_reclaim_work);
  1671. }
  1672. }
  1673. spin_unlock(&space_info->lock);
  1674. if (!ret || !can_ticket(flush))
  1675. return ret;
  1676. return handle_reserve_ticket(space_info, &ticket, start_ns, orig_bytes, flush);
  1677. }
  1678. /*
  1679. * Try to reserve metadata bytes from the block_rsv's space.
  1680. *
  1681. * @space_info: the space_info we're allocating for
  1682. * @orig_bytes: number of bytes we want
  1683. * @flush: whether or not we can flush to make our reservation
  1684. *
  1685. * This will reserve orig_bytes number of bytes from the space info associated
  1686. * with the block_rsv. If there is not enough space it will make an attempt to
  1687. * flush out space to make room. It will do this by flushing delalloc if
  1688. * possible or committing the transaction. If flush is 0 then no attempts to
  1689. * regain reservations will be made and this will fail if there is not enough
  1690. * space already.
  1691. */
  1692. int btrfs_reserve_metadata_bytes(struct btrfs_space_info *space_info,
  1693. u64 orig_bytes,
  1694. enum btrfs_reserve_flush_enum flush)
  1695. {
  1696. int ret;
  1697. ret = reserve_bytes(space_info, orig_bytes, flush);
  1698. if (ret == -ENOSPC) {
  1699. struct btrfs_fs_info *fs_info = space_info->fs_info;
  1700. trace_btrfs_space_reservation(fs_info, "space_info:enospc",
  1701. space_info->flags, orig_bytes, 1);
  1702. if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
  1703. btrfs_dump_space_info(space_info, orig_bytes, false);
  1704. }
  1705. return ret;
  1706. }
  1707. /*
  1708. * Try to reserve data bytes for an allocation.
  1709. *
  1710. * @space_info: the space_info we're allocating for
  1711. * @bytes: number of bytes we need
  1712. * @flush: how we are allowed to flush
  1713. *
  1714. * This will reserve bytes from the data space info. If there is not enough
  1715. * space then we will attempt to flush space as specified by flush.
  1716. */
  1717. int btrfs_reserve_data_bytes(struct btrfs_space_info *space_info, u64 bytes,
  1718. enum btrfs_reserve_flush_enum flush)
  1719. {
  1720. struct btrfs_fs_info *fs_info = space_info->fs_info;
  1721. int ret;
  1722. ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA ||
  1723. flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE ||
  1724. flush == BTRFS_RESERVE_NO_FLUSH, "flush=%d", flush);
  1725. ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA,
  1726. "current->journal_info=0x%lx flush=%d",
  1727. (unsigned long)current->journal_info, flush);
  1728. ret = reserve_bytes(space_info, bytes, flush);
  1729. if (ret == -ENOSPC) {
  1730. trace_btrfs_space_reservation(fs_info, "space_info:enospc",
  1731. space_info->flags, bytes, 1);
  1732. if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
  1733. btrfs_dump_space_info(space_info, bytes, false);
  1734. }
  1735. return ret;
  1736. }
  1737. /* Dump all the space infos when we abort a transaction due to ENOSPC. */
  1738. __cold void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info)
  1739. {
  1740. struct btrfs_space_info *space_info;
  1741. btrfs_info(fs_info, "dumping space info:");
  1742. list_for_each_entry(space_info, &fs_info->space_info, list) {
  1743. spin_lock(&space_info->lock);
  1744. __btrfs_dump_space_info(space_info);
  1745. spin_unlock(&space_info->lock);
  1746. }
  1747. dump_global_block_rsv(fs_info);
  1748. }
  1749. /*
  1750. * Account the unused space of all the readonly block group in the space_info.
  1751. * takes mirrors into account.
  1752. */
  1753. u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
  1754. {
  1755. struct btrfs_block_group *block_group;
  1756. u64 free_bytes = 0;
  1757. int factor;
  1758. /* It's df, we don't care if it's racy */
  1759. if (data_race(list_empty(&sinfo->ro_bgs)))
  1760. return 0;
  1761. spin_lock(&sinfo->lock);
  1762. list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
  1763. spin_lock(&block_group->lock);
  1764. if (!block_group->ro) {
  1765. spin_unlock(&block_group->lock);
  1766. continue;
  1767. }
  1768. factor = btrfs_bg_type_to_factor(block_group->flags);
  1769. free_bytes += (block_group->length -
  1770. block_group->used) * factor;
  1771. spin_unlock(&block_group->lock);
  1772. }
  1773. spin_unlock(&sinfo->lock);
  1774. return free_bytes;
  1775. }
  1776. static u64 calc_pct_ratio(u64 x, u64 y)
  1777. {
  1778. int ret;
  1779. if (!y)
  1780. return 0;
  1781. again:
  1782. ret = check_mul_overflow(100, x, &x);
  1783. if (ret)
  1784. goto lose_precision;
  1785. return div64_u64(x, y);
  1786. lose_precision:
  1787. x >>= 10;
  1788. y >>= 10;
  1789. if (!y)
  1790. y = 1;
  1791. goto again;
  1792. }
  1793. /*
  1794. * A reasonable buffer for unallocated space is 10 data block_groups.
  1795. * If we claw this back repeatedly, we can still achieve efficient
  1796. * utilization when near full, and not do too much reclaim while
  1797. * always maintaining a solid buffer for workloads that quickly
  1798. * allocate and pressure the unallocated space.
  1799. */
  1800. static u64 calc_unalloc_target(struct btrfs_fs_info *fs_info)
  1801. {
  1802. u64 chunk_sz = calc_effective_data_chunk_size(fs_info);
  1803. return BTRFS_UNALLOC_BLOCK_GROUP_TARGET * chunk_sz;
  1804. }
  1805. /*
  1806. * The fundamental goal of automatic reclaim is to protect the filesystem's
  1807. * unallocated space and thus minimize the probability of the filesystem going
  1808. * read only when a metadata allocation failure causes a transaction abort.
  1809. *
  1810. * However, relocations happen into the space_info's unused space, therefore
  1811. * automatic reclaim must also back off as that space runs low. There is no
  1812. * value in doing trivial "relocations" of re-writing the same block group
  1813. * into a fresh one.
  1814. *
  1815. * Furthermore, we want to avoid doing too much reclaim even if there are good
  1816. * candidates. This is because the allocator is pretty good at filling up the
  1817. * holes with writes. So we want to do just enough reclaim to try and stay
  1818. * safe from running out of unallocated space but not be wasteful about it.
  1819. *
  1820. * Therefore, the dynamic reclaim threshold is calculated as follows:
  1821. * - calculate a target unallocated amount of 5 block group sized chunks
  1822. * - ratchet up the intensity of reclaim depending on how far we are from
  1823. * that target by using a formula of unalloc / target to set the threshold.
  1824. *
  1825. * Typically with 10 block groups as the target, the discrete values this comes
  1826. * out to are 0, 10, 20, ... , 80, 90, and 99.
  1827. */
  1828. static int calc_dynamic_reclaim_threshold(const struct btrfs_space_info *space_info)
  1829. {
  1830. struct btrfs_fs_info *fs_info = space_info->fs_info;
  1831. u64 unalloc = atomic64_read(&fs_info->free_chunk_space);
  1832. u64 target = calc_unalloc_target(fs_info);
  1833. u64 alloc = space_info->total_bytes;
  1834. u64 used = btrfs_space_info_used(space_info, false);
  1835. u64 unused = alloc - used;
  1836. u64 want = target > unalloc ? target - unalloc : 0;
  1837. u64 data_chunk_size = calc_effective_data_chunk_size(fs_info);
  1838. /* If we have no unused space, don't bother, it won't work anyway. */
  1839. if (unused < data_chunk_size)
  1840. return 0;
  1841. /* Cast to int is OK because want <= target. */
  1842. return calc_pct_ratio(want, target);
  1843. }
  1844. int btrfs_calc_reclaim_threshold(const struct btrfs_space_info *space_info)
  1845. {
  1846. lockdep_assert_held(&space_info->lock);
  1847. if (READ_ONCE(space_info->dynamic_reclaim))
  1848. return calc_dynamic_reclaim_threshold(space_info);
  1849. return READ_ONCE(space_info->bg_reclaim_threshold);
  1850. }
  1851. /*
  1852. * Under "urgent" reclaim, we will reclaim even fresh block groups that have
  1853. * recently seen successful allocations, as we are desperate to reclaim
  1854. * whatever we can to avoid ENOSPC in a transaction leading to a readonly fs.
  1855. */
  1856. static bool is_reclaim_urgent(struct btrfs_space_info *space_info)
  1857. {
  1858. struct btrfs_fs_info *fs_info = space_info->fs_info;
  1859. u64 unalloc = atomic64_read(&fs_info->free_chunk_space);
  1860. u64 data_chunk_size = calc_effective_data_chunk_size(fs_info);
  1861. return unalloc < data_chunk_size;
  1862. }
  1863. static bool do_reclaim_sweep(struct btrfs_space_info *space_info, int raid)
  1864. {
  1865. struct btrfs_block_group *bg;
  1866. int thresh_pct;
  1867. bool will_reclaim = false;
  1868. bool urgent;
  1869. spin_lock(&space_info->lock);
  1870. urgent = is_reclaim_urgent(space_info);
  1871. thresh_pct = btrfs_calc_reclaim_threshold(space_info);
  1872. spin_unlock(&space_info->lock);
  1873. down_read(&space_info->groups_sem);
  1874. again:
  1875. list_for_each_entry(bg, &space_info->block_groups[raid], list) {
  1876. u64 thresh;
  1877. bool reclaim = false;
  1878. btrfs_get_block_group(bg);
  1879. spin_lock(&bg->lock);
  1880. thresh = mult_perc(bg->length, thresh_pct);
  1881. if (bg->used < thresh && bg->reclaim_mark) {
  1882. will_reclaim = true;
  1883. reclaim = true;
  1884. }
  1885. bg->reclaim_mark++;
  1886. spin_unlock(&bg->lock);
  1887. if (reclaim)
  1888. btrfs_mark_bg_to_reclaim(bg);
  1889. btrfs_put_block_group(bg);
  1890. }
  1891. /*
  1892. * In situations where we are very motivated to reclaim (low unalloc)
  1893. * use two passes to make the reclaim mark check best effort.
  1894. *
  1895. * If we have any staler groups, we don't touch the fresher ones, but if we
  1896. * really need a block group, do take a fresh one.
  1897. */
  1898. if (!will_reclaim && urgent) {
  1899. urgent = false;
  1900. goto again;
  1901. }
  1902. up_read(&space_info->groups_sem);
  1903. return will_reclaim;
  1904. }
  1905. void btrfs_space_info_update_reclaimable(struct btrfs_space_info *space_info, s64 bytes)
  1906. {
  1907. u64 chunk_sz = calc_effective_data_chunk_size(space_info->fs_info);
  1908. lockdep_assert_held(&space_info->lock);
  1909. space_info->reclaimable_bytes += bytes;
  1910. if (space_info->reclaimable_bytes > 0 &&
  1911. space_info->reclaimable_bytes >= chunk_sz)
  1912. btrfs_set_periodic_reclaim_ready(space_info, true);
  1913. }
  1914. void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool ready)
  1915. {
  1916. lockdep_assert_held(&space_info->lock);
  1917. if (!READ_ONCE(space_info->periodic_reclaim))
  1918. return;
  1919. if (ready != space_info->periodic_reclaim_ready) {
  1920. space_info->periodic_reclaim_ready = ready;
  1921. if (!ready)
  1922. space_info->reclaimable_bytes = 0;
  1923. }
  1924. }
  1925. static bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info)
  1926. {
  1927. bool ret;
  1928. if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM)
  1929. return false;
  1930. if (!READ_ONCE(space_info->periodic_reclaim))
  1931. return false;
  1932. spin_lock(&space_info->lock);
  1933. ret = space_info->periodic_reclaim_ready;
  1934. spin_unlock(&space_info->lock);
  1935. return ret;
  1936. }
  1937. void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info)
  1938. {
  1939. int raid;
  1940. struct btrfs_space_info *space_info;
  1941. list_for_each_entry(space_info, &fs_info->space_info, list) {
  1942. if (!btrfs_should_periodic_reclaim(space_info))
  1943. continue;
  1944. for (raid = 0; raid < BTRFS_NR_RAID_TYPES; raid++) {
  1945. if (do_reclaim_sweep(space_info, raid)) {
  1946. spin_lock(&space_info->lock);
  1947. btrfs_set_periodic_reclaim_ready(space_info, false);
  1948. spin_unlock(&space_info->lock);
  1949. }
  1950. }
  1951. }
  1952. }
  1953. void btrfs_return_free_space(struct btrfs_space_info *space_info, u64 len)
  1954. {
  1955. struct btrfs_fs_info *fs_info = space_info->fs_info;
  1956. struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
  1957. lockdep_assert_held(&space_info->lock);
  1958. /* Prioritize the global reservation to receive the freed space. */
  1959. if (global_rsv->space_info != space_info)
  1960. goto grant;
  1961. spin_lock(&global_rsv->lock);
  1962. if (!global_rsv->full) {
  1963. u64 to_add = min(len, global_rsv->size - global_rsv->reserved);
  1964. global_rsv->reserved += to_add;
  1965. btrfs_space_info_update_bytes_may_use(space_info, to_add);
  1966. if (global_rsv->reserved >= global_rsv->size)
  1967. global_rsv->full = true;
  1968. len -= to_add;
  1969. }
  1970. spin_unlock(&global_rsv->lock);
  1971. grant:
  1972. /* Add to any tickets we may have. */
  1973. if (len)
  1974. btrfs_try_granting_tickets(space_info);
  1975. }