qgroup.h 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457
  1. /* SPDX-License-Identifier: GPL-2.0 */
  2. /*
  3. * Copyright (C) 2014 Facebook. All rights reserved.
  4. */
  5. #ifndef BTRFS_QGROUP_H
  6. #define BTRFS_QGROUP_H
  7. #include <linux/types.h>
  8. #include <linux/spinlock.h>
  9. #include <linux/rbtree.h>
  10. #include <linux/kobject.h>
  11. #include <linux/list.h>
  12. #include <uapi/linux/btrfs_tree.h>
  13. struct extent_buffer;
  14. struct extent_changeset;
  15. struct btrfs_delayed_extent_op;
  16. struct btrfs_fs_info;
  17. struct btrfs_root;
  18. struct btrfs_ioctl_quota_ctl_args;
  19. struct btrfs_trans_handle;
  20. struct btrfs_delayed_ref_root;
  21. struct btrfs_inode;
  22. struct btrfs_transaction;
  23. struct btrfs_block_group;
  24. struct btrfs_qgroup_swapped_blocks;
  25. /*
  26. * Btrfs qgroup overview
  27. *
  28. * Btrfs qgroup is split into 3 main parts:
  29. * 1) Reserve
  30. * Reserve metadata/data space for incoming operations
  31. * Affect how qgroup limit works
  32. *
  33. * 2) Trace
  34. * Tell btrfs qgroup to trace dirty extents.
  35. *
  36. * Dirty extents include:
  37. * - Newly allocated extents
  38. * - Extents going to be deleted (in this trans)
  39. * - Extents whose owner is going to be modified
  40. *
  41. * This is the main part that affects whether qgroup numbers will stay
  42. * consistent.
  43. * Btrfs qgroup can trace clean extents and won't cause any problem,
  44. * but it will consume extra CPU time, it should be avoided if possible.
  45. *
  46. * 3) Account
  47. * Btrfs qgroup will update its numbers, based on dirty extents traced
  48. * in previous step.
  49. *
  50. * Normally at qgroup rescan and transaction commit time.
  51. */
  52. /*
  53. * Special performance optimization for balance.
  54. *
  55. * For balance, we need to swap subtree of subvolume and reloc trees.
  56. * In theory, we need to trace all subtree blocks of both subvolume and reloc
  57. * trees, since their owner has changed during such swap.
  58. *
  59. * However, since balance has ensured that both subtrees contain the
  60. * same contents and have the same tree structures, such swap won't cause
  61. * qgroup number change.
  62. *
  63. * But there is a race window between subtree swap and transaction commit,
  64. * during that window, if we increase/decrease tree level or merge/split tree
  65. * blocks, we still need to trace the original subtrees.
  66. *
  67. * So for balance, we use a delayed subtree tracing, whose workflow is:
  68. *
  69. * 1) Record the subtree root blocks that get swapped.
  70. *
  71. * During subtree swap:
  72. * O = Old tree blocks
  73. * N = New tree blocks
  74. *          reloc tree                     subvolume tree X
  75. *             Root                               Root
  76. *            /    \                             /    \
  77. *          NA     OB                          OA      OB
  78. *        /  |     |  \                      /  |      |  \
  79. *      NC  ND     OE  OF                   OC  OD     OE  OF
  80. *
  81. * In this case, NA and OA are going to be swapped, record (NA, OA) into
  82. * subvolume tree X.
  83. *
  84. * 2) After subtree swap.
  85. *          reloc tree                     subvolume tree X
  86. *             Root                               Root
  87. *            /    \                             /    \
  88. *          OA     OB                          NA      OB
  89. *        /  |     |  \                      /  |      |  \
  90. *      OC  OD     OE  OF                   NC  ND     OE  OF
  91. *
  92. * 3a) COW happens for OB
  93. * If we are going to COW tree block OB, we check OB's bytenr against
  94. * tree X's swapped_blocks structure.
  95. * If it doesn't fit any, nothing will happen.
  96. *
  97. * 3b) COW happens for NA
  98. * Check NA's bytenr against tree X's swapped_blocks, and get a hit.
  99. * Then we do subtree scan on both subtrees OA and NA.
  100. * Resulting 6 tree blocks to be scanned (OA, OC, OD, NA, NC, ND).
  101. *
  102. * Then no matter what we do to subvolume tree X, qgroup numbers will
  103. * still be correct.
  104. * Then NA's record gets removed from X's swapped_blocks.
  105. *
  106. * 4) Transaction commit
  107. * Any record in X's swapped_blocks gets removed, since there is no
  108. * modification to the swapped subtrees, no need to trigger heavy qgroup
  109. * subtree rescan for them.
  110. */
  111. /*
  112. * These flags share the flags field of the btrfs_qgroup_status_item with the
  113. * persisted flags defined in btrfs_tree.h.
  114. *
  115. * To minimize the chance of collision with new persisted status flags, these
  116. * count backwards from the MSB.
  117. */
  118. #define BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN (1ULL << 63)
  119. #define BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING (1ULL << 62)
  120. #define BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT (3)
  121. /*
  122. * Record a dirty extent, and info qgroup to update quota on it
  123. */
  124. struct btrfs_qgroup_extent_record {
  125. /*
  126. * The bytenr of the extent is given by its index in the dirty_extents
  127. * xarray of struct btrfs_delayed_ref_root left shifted by
  128. * fs_info->sectorsize_bits.
  129. */
  130. u64 num_bytes;
  131. /*
  132. * For qgroup reserved data space freeing.
  133. *
  134. * @data_rsv_refroot and @data_rsv will be recorded after
  135. * BTRFS_ADD_DELAYED_EXTENT is called.
  136. * And will be used to free reserved qgroup space at
  137. * transaction commit time.
  138. */
  139. u32 data_rsv; /* reserved data space needs to be freed */
  140. u64 data_rsv_refroot; /* which root the reserved data belongs to */
  141. struct ulist *old_roots;
  142. };
  143. struct btrfs_qgroup_swapped_block {
  144. struct rb_node node;
  145. int level;
  146. bool trace_leaf;
  147. /* bytenr/generation of the tree block in subvolume tree after swap */
  148. u64 subvol_bytenr;
  149. u64 subvol_generation;
  150. /* bytenr/generation of the tree block in reloc tree after swap */
  151. u64 reloc_bytenr;
  152. u64 reloc_generation;
  153. u64 last_snapshot;
  154. struct btrfs_key first_key;
  155. };
  156. /*
  157. * Qgroup reservation types:
  158. *
  159. * DATA:
  160. * space reserved for data
  161. *
  162. * META_PERTRANS:
  163. * Space reserved for metadata (per-transaction)
  164. * Due to the fact that qgroup data is only updated at transaction commit
  165. * time, reserved space for metadata must be kept until transaction
  166. * commits.
  167. * Any metadata reserved that are used in btrfs_start_transaction() should
  168. * be of this type.
  169. *
  170. * META_PREALLOC:
  171. * There are cases where metadata space is reserved before starting
  172. * transaction, and then btrfs_join_transaction() to get a trans handle.
  173. * Any metadata reserved for such usage should be of this type.
  174. * And after join_transaction() part (or all) of such reservation should
  175. * be converted into META_PERTRANS.
  176. */
  177. enum btrfs_qgroup_rsv_type {
  178. BTRFS_QGROUP_RSV_DATA,
  179. BTRFS_QGROUP_RSV_META_PERTRANS,
  180. BTRFS_QGROUP_RSV_META_PREALLOC,
  181. BTRFS_QGROUP_RSV_LAST,
  182. };
  183. /*
  184. * Represents how many bytes we have reserved for this qgroup.
  185. *
  186. * Each type should have different reservation behavior.
  187. * E.g, data follows its io_tree flag modification, while
  188. * *currently* meta is just reserve-and-clear during transaction.
  189. *
  190. * TODO: Add new type for reservation which can survive transaction commit.
  191. * Current metadata reservation behavior is not suitable for such case.
  192. */
  193. struct btrfs_qgroup_rsv {
  194. u64 values[BTRFS_QGROUP_RSV_LAST];
  195. };
  196. /*
  197. * one struct for each qgroup, organized in fs_info->qgroup_tree.
  198. */
  199. struct btrfs_qgroup {
  200. u64 qgroupid;
  201. /*
  202. * state
  203. */
  204. u64 rfer; /* referenced */
  205. u64 rfer_cmpr; /* referenced compressed */
  206. u64 excl; /* exclusive */
  207. u64 excl_cmpr; /* exclusive compressed */
  208. /*
  209. * limits
  210. */
  211. u64 lim_flags; /* which limits are set */
  212. u64 max_rfer;
  213. u64 max_excl;
  214. u64 rsv_rfer;
  215. u64 rsv_excl;
  216. /*
  217. * reservation tracking
  218. */
  219. struct btrfs_qgroup_rsv rsv;
  220. /*
  221. * lists
  222. */
  223. struct list_head groups; /* groups this group is member of */
  224. struct list_head members; /* groups that are members of this group */
  225. struct list_head dirty; /* dirty groups */
  226. /*
  227. * For qgroup iteration usage.
  228. *
  229. * The iteration list should always be empty until qgroup_iterator_add()
  230. * is called. And should be reset to empty after the iteration is
  231. * finished.
  232. */
  233. struct list_head iterator;
  234. /*
  235. * For nested iterator usage.
  236. *
  237. * Here we support at most one level of nested iterator calls like:
  238. *
  239. * LIST_HEAD(all_qgroups);
  240. * {
  241. * LIST_HEAD(local_qgroups);
  242. * qgroup_iterator_add(local_qgroups, qg);
  243. * qgroup_iterator_nested_add(all_qgroups, qg);
  244. * do_some_work(local_qgroups);
  245. * qgroup_iterator_clean(local_qgroups);
  246. * }
  247. * do_some_work(all_qgroups);
  248. * qgroup_iterator_nested_clean(all_qgroups);
  249. */
  250. struct list_head nested_iterator;
  251. struct rb_node node; /* tree of qgroups */
  252. /*
  253. * temp variables for accounting operations
  254. * Refer to qgroup_shared_accounting() for details.
  255. */
  256. u64 old_refcnt;
  257. u64 new_refcnt;
  258. /*
  259. * Sysfs kobjectid
  260. */
  261. struct kobject kobj;
  262. };
  263. /* Glue structure to represent the relations between qgroups. */
  264. struct btrfs_qgroup_list {
  265. struct list_head next_group;
  266. struct list_head next_member;
  267. struct btrfs_qgroup *group;
  268. struct btrfs_qgroup *member;
  269. };
  270. struct btrfs_squota_delta {
  271. /* The fstree root this delta counts against. */
  272. u64 root;
  273. /* The number of bytes in the extent being counted. */
  274. u64 num_bytes;
  275. /* The generation the extent was created in. */
  276. u64 generation;
  277. /* Whether we are using or freeing the extent. */
  278. bool is_inc;
  279. /* Whether the extent is data or metadata. */
  280. bool is_data;
  281. };
  282. static inline u64 btrfs_qgroup_subvolid(u64 qgroupid)
  283. {
  284. return (qgroupid & ((1ULL << BTRFS_QGROUP_LEVEL_SHIFT) - 1));
  285. }
  286. /*
  287. * For qgroup event trace points only
  288. */
  289. enum {
  290. ENUM_BIT(QGROUP_RESERVE),
  291. ENUM_BIT(QGROUP_RELEASE),
  292. ENUM_BIT(QGROUP_FREE),
  293. };
  294. enum btrfs_qgroup_mode {
  295. BTRFS_QGROUP_MODE_DISABLED,
  296. BTRFS_QGROUP_MODE_FULL,
  297. BTRFS_QGROUP_MODE_SIMPLE
  298. };
  299. enum btrfs_qgroup_mode btrfs_qgroup_mode(const struct btrfs_fs_info *fs_info);
  300. bool btrfs_qgroup_enabled(const struct btrfs_fs_info *fs_info);
  301. bool btrfs_qgroup_full_accounting(const struct btrfs_fs_info *fs_info);
  302. int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
  303. struct btrfs_ioctl_quota_ctl_args *quota_ctl_args);
  304. int btrfs_quota_disable(struct btrfs_fs_info *fs_info);
  305. int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
  306. void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info);
  307. int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
  308. bool interruptible);
  309. int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst,
  310. struct btrfs_qgroup_list *prealloc);
  311. int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
  312. u64 dst);
  313. int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid);
  314. int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid);
  315. int btrfs_qgroup_cleanup_dropped_subvolume(struct btrfs_fs_info *fs_info, u64 subvolid);
  316. int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
  317. struct btrfs_qgroup_limit *limit);
  318. int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
  319. void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
  320. int btrfs_qgroup_trace_extent_nolock(
  321. struct btrfs_fs_info *fs_info,
  322. struct btrfs_delayed_ref_root *delayed_refs,
  323. struct btrfs_qgroup_extent_record *record,
  324. u64 bytenr);
  325. int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
  326. struct btrfs_qgroup_extent_record *qrecord,
  327. u64 bytenr);
  328. int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
  329. u64 num_bytes);
  330. int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
  331. struct extent_buffer *eb);
  332. int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
  333. struct extent_buffer *root_eb,
  334. u64 root_gen, int root_level);
  335. int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
  336. u64 num_bytes, struct ulist *old_roots,
  337. struct ulist *new_roots);
  338. int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans);
  339. int btrfs_run_qgroups(struct btrfs_trans_handle *trans);
  340. int btrfs_qgroup_check_inherit(struct btrfs_fs_info *fs_info,
  341. struct btrfs_qgroup_inherit *inherit,
  342. size_t size);
  343. int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
  344. u64 objectid, u64 inode_rootid,
  345. struct btrfs_qgroup_inherit *inherit);
  346. void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
  347. u64 ref_root, u64 num_bytes,
  348. enum btrfs_qgroup_rsv_type type);
  349. #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
  350. int btrfs_verify_qgroup_counts(const struct btrfs_fs_info *fs_info, u64 qgroupid,
  351. u64 rfer, u64 excl);
  352. #endif
  353. /* New io_tree based accurate qgroup reserve API */
  354. int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
  355. struct extent_changeset **reserved, u64 start, u64 len);
  356. int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len, u64 *released);
  357. int btrfs_qgroup_free_data(struct btrfs_inode *inode,
  358. struct extent_changeset *reserved, u64 start,
  359. u64 len, u64 *freed);
  360. int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
  361. enum btrfs_qgroup_rsv_type type, bool enforce);
  362. int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
  363. enum btrfs_qgroup_rsv_type type, bool enforce,
  364. bool noflush);
  365. /* Reserve metadata space for pertrans and prealloc type */
  366. static inline int btrfs_qgroup_reserve_meta_pertrans(struct btrfs_root *root,
  367. int num_bytes, bool enforce)
  368. {
  369. return __btrfs_qgroup_reserve_meta(root, num_bytes,
  370. BTRFS_QGROUP_RSV_META_PERTRANS,
  371. enforce, false);
  372. }
  373. static inline int btrfs_qgroup_reserve_meta_prealloc(struct btrfs_root *root,
  374. int num_bytes, bool enforce,
  375. bool noflush)
  376. {
  377. return __btrfs_qgroup_reserve_meta(root, num_bytes,
  378. BTRFS_QGROUP_RSV_META_PREALLOC,
  379. enforce, noflush);
  380. }
  381. void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
  382. enum btrfs_qgroup_rsv_type type);
  383. /* Free per-transaction meta reservation for error handling */
  384. static inline void btrfs_qgroup_free_meta_pertrans(struct btrfs_root *root,
  385. int num_bytes)
  386. {
  387. __btrfs_qgroup_free_meta(root, num_bytes,
  388. BTRFS_QGROUP_RSV_META_PERTRANS);
  389. }
  390. /* Pre-allocated meta reservation can be freed at need */
  391. static inline void btrfs_qgroup_free_meta_prealloc(struct btrfs_root *root,
  392. int num_bytes)
  393. {
  394. __btrfs_qgroup_free_meta(root, num_bytes,
  395. BTRFS_QGROUP_RSV_META_PREALLOC);
  396. }
  397. void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root);
  398. void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes);
  399. void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode);
  400. /* btrfs_qgroup_swapped_blocks related functions */
  401. void btrfs_qgroup_init_swapped_blocks(
  402. struct btrfs_qgroup_swapped_blocks *swapped_blocks);
  403. void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root);
  404. int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root,
  405. struct btrfs_block_group *bg,
  406. struct extent_buffer *subvol_parent, int subvol_slot,
  407. struct extent_buffer *reloc_parent, int reloc_slot,
  408. u64 last_snapshot);
  409. int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
  410. struct btrfs_root *root, struct extent_buffer *eb);
  411. void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans);
  412. bool btrfs_check_quota_leak(const struct btrfs_fs_info *fs_info);
  413. int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info,
  414. const struct btrfs_squota_delta *delta);
  415. #endif