hugetlb_cgroup.c 23 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920
  1. // SPDX-License-Identifier: LGPL-2.1
  2. /*
  3. *
  4. * Copyright IBM Corporation, 2012
  5. * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
  6. *
  7. * Cgroup v2
  8. * Copyright (C) 2019 Red Hat, Inc.
  9. * Author: Giuseppe Scrivano <gscrivan@redhat.com>
  10. *
  11. */
  12. #include <linux/cgroup.h>
  13. #include <linux/page_counter.h>
  14. #include <linux/slab.h>
  15. #include <linux/hugetlb.h>
  16. #include <linux/hugetlb_cgroup.h>
  17. #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
  18. #define MEMFILE_IDX(val) (((val) >> 16) & 0xffff)
  19. #define MEMFILE_ATTR(val) ((val) & 0xffff)
  20. /* Use t->m[0] to encode the offset */
  21. #define MEMFILE_OFFSET(t, m0) (((offsetof(t, m0) << 16) | sizeof_field(t, m0)))
  22. #define MEMFILE_OFFSET0(val) (((val) >> 16) & 0xffff)
  23. #define MEMFILE_FIELD_SIZE(val) ((val) & 0xffff)
  24. #define DFL_TMPL_SIZE ARRAY_SIZE(hugetlb_dfl_tmpl)
  25. #define LEGACY_TMPL_SIZE ARRAY_SIZE(hugetlb_legacy_tmpl)
  26. static struct hugetlb_cgroup *root_h_cgroup __read_mostly;
  27. static struct cftype *dfl_files;
  28. static struct cftype *legacy_files;
  29. static inline struct page_counter *
  30. __hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx,
  31. bool rsvd)
  32. {
  33. if (rsvd)
  34. return &h_cg->rsvd_hugepage[idx];
  35. return &h_cg->hugepage[idx];
  36. }
  37. static inline struct page_counter *
  38. hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx)
  39. {
  40. return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, false);
  41. }
  42. static inline struct page_counter *
  43. hugetlb_cgroup_counter_from_cgroup_rsvd(struct hugetlb_cgroup *h_cg, int idx)
  44. {
  45. return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, true);
  46. }
  47. static inline
  48. struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s)
  49. {
  50. return s ? container_of(s, struct hugetlb_cgroup, css) : NULL;
  51. }
  52. static inline
  53. struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task)
  54. {
  55. return hugetlb_cgroup_from_css(task_css(task, hugetlb_cgrp_id));
  56. }
  57. static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
  58. {
  59. return (h_cg == root_h_cgroup);
  60. }
  61. static inline struct hugetlb_cgroup *
  62. parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg)
  63. {
  64. return hugetlb_cgroup_from_css(h_cg->css.parent);
  65. }
  66. static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
  67. {
  68. struct hstate *h;
  69. for_each_hstate(h) {
  70. if (page_counter_read(
  71. hugetlb_cgroup_counter_from_cgroup(h_cg, hstate_index(h))))
  72. return true;
  73. }
  74. return false;
  75. }
  76. static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
  77. struct hugetlb_cgroup *parent_h_cgroup)
  78. {
  79. int idx;
  80. for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) {
  81. struct page_counter *fault, *fault_parent = NULL;
  82. struct page_counter *rsvd, *rsvd_parent = NULL;
  83. unsigned long limit;
  84. if (parent_h_cgroup) {
  85. fault_parent = hugetlb_cgroup_counter_from_cgroup(
  86. parent_h_cgroup, idx);
  87. rsvd_parent = hugetlb_cgroup_counter_from_cgroup_rsvd(
  88. parent_h_cgroup, idx);
  89. }
  90. fault = hugetlb_cgroup_counter_from_cgroup(h_cgroup, idx);
  91. rsvd = hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx);
  92. page_counter_init(fault, fault_parent, false);
  93. page_counter_init(rsvd, rsvd_parent, false);
  94. if (!cgroup_subsys_on_dfl(hugetlb_cgrp_subsys)) {
  95. fault->track_failcnt = true;
  96. rsvd->track_failcnt = true;
  97. }
  98. limit = round_down(PAGE_COUNTER_MAX,
  99. pages_per_huge_page(&hstates[idx]));
  100. VM_BUG_ON(page_counter_set_max(fault, limit));
  101. VM_BUG_ON(page_counter_set_max(rsvd, limit));
  102. }
  103. }
  104. static void hugetlb_cgroup_free(struct hugetlb_cgroup *h_cgroup)
  105. {
  106. int node;
  107. for_each_node(node)
  108. kfree(h_cgroup->nodeinfo[node]);
  109. kfree(h_cgroup);
  110. }
  111. static struct cgroup_subsys_state *
  112. hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
  113. {
  114. struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css);
  115. struct hugetlb_cgroup *h_cgroup;
  116. int node;
  117. h_cgroup = kzalloc_flex(*h_cgroup, nodeinfo, nr_node_ids);
  118. if (!h_cgroup)
  119. return ERR_PTR(-ENOMEM);
  120. if (!parent_h_cgroup)
  121. root_h_cgroup = h_cgroup;
  122. /*
  123. * TODO: this routine can waste much memory for nodes which will
  124. * never be onlined. It's better to use memory hotplug callback
  125. * function.
  126. */
  127. for_each_node(node) {
  128. /* Set node_to_alloc to NUMA_NO_NODE for offline nodes. */
  129. int node_to_alloc =
  130. node_state(node, N_NORMAL_MEMORY) ? node : NUMA_NO_NODE;
  131. h_cgroup->nodeinfo[node] =
  132. kzalloc_node(sizeof(struct hugetlb_cgroup_per_node),
  133. GFP_KERNEL, node_to_alloc);
  134. if (!h_cgroup->nodeinfo[node])
  135. goto fail_alloc_nodeinfo;
  136. }
  137. hugetlb_cgroup_init(h_cgroup, parent_h_cgroup);
  138. return &h_cgroup->css;
  139. fail_alloc_nodeinfo:
  140. hugetlb_cgroup_free(h_cgroup);
  141. return ERR_PTR(-ENOMEM);
  142. }
  /* css_free callback: free the hugetlb cgroup that embeds @css. */
  static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
  {
      struct hugetlb_cgroup *h_cgroup = hugetlb_cgroup_from_css(css);

      hugetlb_cgroup_free(h_cgroup);
  }
  147. /*
  148. * Should be called with hugetlb_lock held.
  149. * Since we are holding hugetlb_lock, pages cannot get moved from
  150. * active list or uncharged from the cgroup, So no need to get
  151. * page reference and test for page active here. This function
  152. * cannot fail.
  153. */
  154. static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
  155. struct folio *folio)
  156. {
  157. unsigned int nr_pages;
  158. struct page_counter *counter;
  159. struct hugetlb_cgroup *hcg;
  160. struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);
  161. hcg = hugetlb_cgroup_from_folio(folio);
  162. /*
  163. * We can have pages in active list without any cgroup
  164. * ie, hugepage with less than 3 pages. We can safely
  165. * ignore those pages.
  166. */
  167. if (!hcg || hcg != h_cg)
  168. goto out;
  169. nr_pages = folio_nr_pages(folio);
  170. if (!parent) {
  171. parent = root_h_cgroup;
  172. /* root has no limit */
  173. page_counter_charge(&parent->hugepage[idx], nr_pages);
  174. }
  175. counter = &h_cg->hugepage[idx];
  176. /* Take the pages off the local counter */
  177. page_counter_cancel(counter, nr_pages);
  178. set_hugetlb_cgroup(folio, parent);
  179. out:
  180. return;
  181. }
  182. /*
  183. * Force the hugetlb cgroup to empty the hugetlb resources by moving them to
  184. * the parent cgroup.
  185. */
  186. static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
  187. {
  188. struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
  189. struct hstate *h;
  190. struct folio *folio;
  191. do {
  192. for_each_hstate(h) {
  193. spin_lock_irq(&hugetlb_lock);
  194. list_for_each_entry(folio, &h->hugepage_activelist, lru)
  195. hugetlb_cgroup_move_parent(hstate_index(h), h_cg, folio);
  196. spin_unlock_irq(&hugetlb_lock);
  197. }
  198. cond_resched();
  199. } while (hugetlb_cgroup_have_usage(h_cg));
  200. }
  201. static inline void hugetlb_event(struct hugetlb_cgroup *hugetlb, int idx,
  202. enum hugetlb_memory_event event)
  203. {
  204. atomic_long_inc(&hugetlb->events_local[idx][event]);
  205. cgroup_file_notify(&hugetlb->events_local_file[idx]);
  206. do {
  207. atomic_long_inc(&hugetlb->events[idx][event]);
  208. cgroup_file_notify(&hugetlb->events_file[idx]);
  209. } while ((hugetlb = parent_hugetlb_cgroup(hugetlb)) &&
  210. !hugetlb_cgroup_is_root(hugetlb));
  211. }
  212. static int __hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
  213. struct hugetlb_cgroup **ptr,
  214. bool rsvd)
  215. {
  216. int ret = 0;
  217. struct page_counter *counter;
  218. struct hugetlb_cgroup *h_cg = NULL;
  219. if (hugetlb_cgroup_disabled())
  220. goto done;
  221. again:
  222. rcu_read_lock();
  223. h_cg = hugetlb_cgroup_from_task(current);
  224. if (!css_tryget(&h_cg->css)) {
  225. rcu_read_unlock();
  226. goto again;
  227. }
  228. rcu_read_unlock();
  229. if (!page_counter_try_charge(
  230. __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd),
  231. nr_pages, &counter)) {
  232. ret = -ENOMEM;
  233. hugetlb_event(h_cg, idx, HUGETLB_MAX);
  234. css_put(&h_cg->css);
  235. goto done;
  236. }
  237. /* Reservations take a reference to the css because they do not get
  238. * reparented.
  239. */
  240. if (!rsvd)
  241. css_put(&h_cg->css);
  242. done:
  243. *ptr = h_cg;
  244. return ret;
  245. }
  246. int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
  247. struct hugetlb_cgroup **ptr)
  248. {
  249. return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, false);
  250. }
  251. int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages,
  252. struct hugetlb_cgroup **ptr)
  253. {
  254. return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, true);
  255. }
  256. /* Should be called with hugetlb_lock held */
  257. static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
  258. struct hugetlb_cgroup *h_cg,
  259. struct folio *folio, bool rsvd)
  260. {
  261. if (hugetlb_cgroup_disabled() || !h_cg)
  262. return;
  263. lockdep_assert_held(&hugetlb_lock);
  264. __set_hugetlb_cgroup(folio, h_cg, rsvd);
  265. if (!rsvd) {
  266. unsigned long usage =
  267. h_cg->nodeinfo[folio_nid(folio)]->usage[idx];
  268. /*
  269. * This write is not atomic due to fetching usage and writing
  270. * to it, but that's fine because we call this with
  271. * hugetlb_lock held anyway.
  272. */
  273. WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx],
  274. usage + nr_pages);
  275. }
  276. }
  277. void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
  278. struct hugetlb_cgroup *h_cg,
  279. struct folio *folio)
  280. {
  281. __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, false);
  282. }
  283. void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages,
  284. struct hugetlb_cgroup *h_cg,
  285. struct folio *folio)
  286. {
  287. __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, true);
  288. }
  289. /*
  290. * Should be called with hugetlb_lock held
  291. */
  292. static void __hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages,
  293. struct folio *folio, bool rsvd)
  294. {
  295. struct hugetlb_cgroup *h_cg;
  296. if (hugetlb_cgroup_disabled())
  297. return;
  298. lockdep_assert_held(&hugetlb_lock);
  299. h_cg = __hugetlb_cgroup_from_folio(folio, rsvd);
  300. if (unlikely(!h_cg))
  301. return;
  302. __set_hugetlb_cgroup(folio, NULL, rsvd);
  303. page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx,
  304. rsvd),
  305. nr_pages);
  306. if (rsvd)
  307. css_put(&h_cg->css);
  308. else {
  309. unsigned long usage =
  310. h_cg->nodeinfo[folio_nid(folio)]->usage[idx];
  311. /*
  312. * This write is not atomic due to fetching usage and writing
  313. * to it, but that's fine because we call this with
  314. * hugetlb_lock held anyway.
  315. */
  316. WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx],
  317. usage - nr_pages);
  318. }
  319. }
  320. void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages,
  321. struct folio *folio)
  322. {
  323. __hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, false);
  324. }
  325. void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages,
  326. struct folio *folio)
  327. {
  328. __hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, true);
  329. }
  330. static void __hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
  331. struct hugetlb_cgroup *h_cg,
  332. bool rsvd)
  333. {
  334. if (hugetlb_cgroup_disabled() || !h_cg)
  335. return;
  336. page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx,
  337. rsvd),
  338. nr_pages);
  339. if (rsvd)
  340. css_put(&h_cg->css);
  341. }
  342. void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
  343. struct hugetlb_cgroup *h_cg)
  344. {
  345. __hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, false);
  346. }
  347. void hugetlb_cgroup_uncharge_cgroup_rsvd(int idx, unsigned long nr_pages,
  348. struct hugetlb_cgroup *h_cg)
  349. {
  350. __hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, true);
  351. }
  352. void hugetlb_cgroup_uncharge_counter(struct resv_map *resv, unsigned long start,
  353. unsigned long end)
  354. {
  355. if (hugetlb_cgroup_disabled() || !resv || !resv->reservation_counter ||
  356. !resv->css)
  357. return;
  358. page_counter_uncharge(resv->reservation_counter,
  359. (end - start) * resv->pages_per_hpage);
  360. css_put(resv->css);
  361. }
  362. void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv,
  363. struct file_region *rg,
  364. unsigned long nr_pages,
  365. bool region_del)
  366. {
  367. if (hugetlb_cgroup_disabled() || !resv || !rg || !nr_pages)
  368. return;
  369. if (rg->reservation_counter && resv->pages_per_hpage &&
  370. !resv->reservation_counter) {
  371. page_counter_uncharge(rg->reservation_counter,
  372. nr_pages * resv->pages_per_hpage);
  373. /*
  374. * Only do css_put(rg->css) when we delete the entire region
  375. * because one file_region must hold exactly one css reference.
  376. */
  377. if (region_del)
  378. css_put(rg->css);
  379. }
  380. }
  /*
   * Attribute selectors stored in the low 16 bits of cftype->private.
   * Each attribute exists for the usage counter and, as RES_RSVD_*, for the
   * reservation counter.
   */
  enum {
      RES_USAGE = 0,          /* current counter value */
      RES_RSVD_USAGE,
      RES_LIMIT,              /* counter->max */
      RES_RSVD_LIMIT,
      RES_MAX_USAGE,          /* counter->watermark */
      RES_RSVD_MAX_USAGE,
      RES_FAILCNT,            /* counter->failcnt */
      RES_RSVD_FAILCNT,
  };
  391. static int hugetlb_cgroup_read_numa_stat(struct seq_file *seq, void *dummy)
  392. {
  393. int nid;
  394. struct cftype *cft = seq_cft(seq);
  395. int idx = MEMFILE_IDX(cft->private);
  396. bool legacy = !cgroup_subsys_on_dfl(hugetlb_cgrp_subsys);
  397. struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));
  398. struct cgroup_subsys_state *css;
  399. unsigned long usage;
  400. if (legacy) {
  401. /* Add up usage across all nodes for the non-hierarchical total. */
  402. usage = 0;
  403. for_each_node_state(nid, N_MEMORY)
  404. usage += READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]);
  405. seq_printf(seq, "total=%lu", usage * PAGE_SIZE);
  406. /* Simply print the per-node usage for the non-hierarchical total. */
  407. for_each_node_state(nid, N_MEMORY)
  408. seq_printf(seq, " N%d=%lu", nid,
  409. READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]) *
  410. PAGE_SIZE);
  411. seq_putc(seq, '\n');
  412. }
  413. /*
  414. * The hierarchical total is pretty much the value recorded by the
  415. * counter, so use that.
  416. */
  417. seq_printf(seq, "%stotal=%lu", legacy ? "hierarchical_" : "",
  418. page_counter_read(&h_cg->hugepage[idx]) * PAGE_SIZE);
  419. /*
  420. * For each node, transverse the css tree to obtain the hierarchical
  421. * node usage.
  422. */
  423. for_each_node_state(nid, N_MEMORY) {
  424. usage = 0;
  425. rcu_read_lock();
  426. css_for_each_descendant_pre(css, &h_cg->css) {
  427. usage += READ_ONCE(hugetlb_cgroup_from_css(css)
  428. ->nodeinfo[nid]
  429. ->usage[idx]);
  430. }
  431. rcu_read_unlock();
  432. seq_printf(seq, " N%d=%lu", nid, usage * PAGE_SIZE);
  433. }
  434. seq_putc(seq, '\n');
  435. return 0;
  436. }
  437. static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
  438. struct cftype *cft)
  439. {
  440. struct page_counter *counter;
  441. struct page_counter *rsvd_counter;
  442. struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
  443. counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)];
  444. rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(cft->private)];
  445. switch (MEMFILE_ATTR(cft->private)) {
  446. case RES_USAGE:
  447. return (u64)page_counter_read(counter) * PAGE_SIZE;
  448. case RES_RSVD_USAGE:
  449. return (u64)page_counter_read(rsvd_counter) * PAGE_SIZE;
  450. case RES_LIMIT:
  451. return (u64)counter->max * PAGE_SIZE;
  452. case RES_RSVD_LIMIT:
  453. return (u64)rsvd_counter->max * PAGE_SIZE;
  454. case RES_MAX_USAGE:
  455. return (u64)counter->watermark * PAGE_SIZE;
  456. case RES_RSVD_MAX_USAGE:
  457. return (u64)rsvd_counter->watermark * PAGE_SIZE;
  458. case RES_FAILCNT:
  459. return counter->failcnt;
  460. case RES_RSVD_FAILCNT:
  461. return rsvd_counter->failcnt;
  462. default:
  463. BUG();
  464. }
  465. }
  466. static int hugetlb_cgroup_read_u64_max(struct seq_file *seq, void *v)
  467. {
  468. int idx;
  469. u64 val;
  470. struct cftype *cft = seq_cft(seq);
  471. unsigned long limit;
  472. struct page_counter *counter;
  473. struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));
  474. idx = MEMFILE_IDX(cft->private);
  475. counter = &h_cg->hugepage[idx];
  476. limit = round_down(PAGE_COUNTER_MAX,
  477. pages_per_huge_page(&hstates[idx]));
  478. switch (MEMFILE_ATTR(cft->private)) {
  479. case RES_RSVD_USAGE:
  480. counter = &h_cg->rsvd_hugepage[idx];
  481. fallthrough;
  482. case RES_USAGE:
  483. val = (u64)page_counter_read(counter);
  484. seq_printf(seq, "%llu\n", val * PAGE_SIZE);
  485. break;
  486. case RES_RSVD_LIMIT:
  487. counter = &h_cg->rsvd_hugepage[idx];
  488. fallthrough;
  489. case RES_LIMIT:
  490. val = (u64)counter->max;
  491. if (val == limit)
  492. seq_puts(seq, "max\n");
  493. else
  494. seq_printf(seq, "%llu\n", val * PAGE_SIZE);
  495. break;
  496. default:
  497. BUG();
  498. }
  499. return 0;
  500. }
  501. static DEFINE_MUTEX(hugetlb_limit_mutex);
  502. static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
  503. char *buf, size_t nbytes, loff_t off,
  504. const char *max)
  505. {
  506. int ret, idx;
  507. unsigned long nr_pages;
  508. struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));
  509. bool rsvd = false;
  510. if (hugetlb_cgroup_is_root(h_cg)) /* Can't set limit on root */
  511. return -EINVAL;
  512. buf = strstrip(buf);
  513. ret = page_counter_memparse(buf, max, &nr_pages);
  514. if (ret)
  515. return ret;
  516. idx = MEMFILE_IDX(of_cft(of)->private);
  517. nr_pages = round_down(nr_pages, pages_per_huge_page(&hstates[idx]));
  518. switch (MEMFILE_ATTR(of_cft(of)->private)) {
  519. case RES_RSVD_LIMIT:
  520. rsvd = true;
  521. fallthrough;
  522. case RES_LIMIT:
  523. mutex_lock(&hugetlb_limit_mutex);
  524. ret = page_counter_set_max(
  525. __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd),
  526. nr_pages);
  527. mutex_unlock(&hugetlb_limit_mutex);
  528. break;
  529. default:
  530. ret = -EINVAL;
  531. break;
  532. }
  533. return ret ?: nbytes;
  534. }
  535. static ssize_t hugetlb_cgroup_write_legacy(struct kernfs_open_file *of,
  536. char *buf, size_t nbytes, loff_t off)
  537. {
  538. return hugetlb_cgroup_write(of, buf, nbytes, off, "-1");
  539. }
  540. static ssize_t hugetlb_cgroup_write_dfl(struct kernfs_open_file *of,
  541. char *buf, size_t nbytes, loff_t off)
  542. {
  543. return hugetlb_cgroup_write(of, buf, nbytes, off, "max");
  544. }
  545. static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of,
  546. char *buf, size_t nbytes, loff_t off)
  547. {
  548. int ret = 0;
  549. struct page_counter *counter, *rsvd_counter;
  550. struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));
  551. counter = &h_cg->hugepage[MEMFILE_IDX(of_cft(of)->private)];
  552. rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(of_cft(of)->private)];
  553. switch (MEMFILE_ATTR(of_cft(of)->private)) {
  554. case RES_MAX_USAGE:
  555. page_counter_reset_watermark(counter);
  556. break;
  557. case RES_RSVD_MAX_USAGE:
  558. page_counter_reset_watermark(rsvd_counter);
  559. break;
  560. case RES_FAILCNT:
  561. counter->failcnt = 0;
  562. break;
  563. case RES_RSVD_FAILCNT:
  564. rsvd_counter->failcnt = 0;
  565. break;
  566. default:
  567. ret = -EINVAL;
  568. break;
  569. }
  570. return ret ?: nbytes;
  571. }
  572. static char *mem_fmt(char *buf, int size, unsigned long hsize)
  573. {
  574. if (hsize >= SZ_1G)
  575. snprintf(buf, size, "%luGB", hsize / SZ_1G);
  576. else if (hsize >= SZ_1M)
  577. snprintf(buf, size, "%luMB", hsize / SZ_1M);
  578. else
  579. snprintf(buf, size, "%luKB", hsize / SZ_1K);
  580. return buf;
  581. }
  582. static int __hugetlb_events_show(struct seq_file *seq, bool local)
  583. {
  584. int idx;
  585. long max;
  586. struct cftype *cft = seq_cft(seq);
  587. struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));
  588. idx = MEMFILE_IDX(cft->private);
  589. if (local)
  590. max = atomic_long_read(&h_cg->events_local[idx][HUGETLB_MAX]);
  591. else
  592. max = atomic_long_read(&h_cg->events[idx][HUGETLB_MAX]);
  593. seq_printf(seq, "max %lu\n", max);
  594. return 0;
  595. }
  596. static int hugetlb_events_show(struct seq_file *seq, void *v)
  597. {
  598. return __hugetlb_events_show(seq, false);
  599. }
  600. static int hugetlb_events_local_show(struct seq_file *seq, void *v)
  601. {
  602. return __hugetlb_events_show(seq, true);
  603. }
  604. static struct cftype hugetlb_dfl_tmpl[] = {
  605. {
  606. .name = "max",
  607. .private = RES_LIMIT,
  608. .seq_show = hugetlb_cgroup_read_u64_max,
  609. .write = hugetlb_cgroup_write_dfl,
  610. .flags = CFTYPE_NOT_ON_ROOT,
  611. },
  612. {
  613. .name = "rsvd.max",
  614. .private = RES_RSVD_LIMIT,
  615. .seq_show = hugetlb_cgroup_read_u64_max,
  616. .write = hugetlb_cgroup_write_dfl,
  617. .flags = CFTYPE_NOT_ON_ROOT,
  618. },
  619. {
  620. .name = "current",
  621. .private = RES_USAGE,
  622. .seq_show = hugetlb_cgroup_read_u64_max,
  623. .flags = CFTYPE_NOT_ON_ROOT,
  624. },
  625. {
  626. .name = "rsvd.current",
  627. .private = RES_RSVD_USAGE,
  628. .seq_show = hugetlb_cgroup_read_u64_max,
  629. .flags = CFTYPE_NOT_ON_ROOT,
  630. },
  631. {
  632. .name = "events",
  633. .seq_show = hugetlb_events_show,
  634. .file_offset = MEMFILE_OFFSET(struct hugetlb_cgroup, events_file[0]),
  635. .flags = CFTYPE_NOT_ON_ROOT,
  636. },
  637. {
  638. .name = "events.local",
  639. .seq_show = hugetlb_events_local_show,
  640. .file_offset = MEMFILE_OFFSET(struct hugetlb_cgroup, events_local_file[0]),
  641. .flags = CFTYPE_NOT_ON_ROOT,
  642. },
  643. {
  644. .name = "numa_stat",
  645. .seq_show = hugetlb_cgroup_read_numa_stat,
  646. .flags = CFTYPE_NOT_ON_ROOT,
  647. },
  648. /* don't need terminator here */
  649. };
  650. static struct cftype hugetlb_legacy_tmpl[] = {
  651. {
  652. .name = "limit_in_bytes",
  653. .private = RES_LIMIT,
  654. .read_u64 = hugetlb_cgroup_read_u64,
  655. .write = hugetlb_cgroup_write_legacy,
  656. },
  657. {
  658. .name = "rsvd.limit_in_bytes",
  659. .private = RES_RSVD_LIMIT,
  660. .read_u64 = hugetlb_cgroup_read_u64,
  661. .write = hugetlb_cgroup_write_legacy,
  662. },
  663. {
  664. .name = "usage_in_bytes",
  665. .private = RES_USAGE,
  666. .read_u64 = hugetlb_cgroup_read_u64,
  667. },
  668. {
  669. .name = "rsvd.usage_in_bytes",
  670. .private = RES_RSVD_USAGE,
  671. .read_u64 = hugetlb_cgroup_read_u64,
  672. },
  673. {
  674. .name = "max_usage_in_bytes",
  675. .private = RES_MAX_USAGE,
  676. .write = hugetlb_cgroup_reset,
  677. .read_u64 = hugetlb_cgroup_read_u64,
  678. },
  679. {
  680. .name = "rsvd.max_usage_in_bytes",
  681. .private = RES_RSVD_MAX_USAGE,
  682. .write = hugetlb_cgroup_reset,
  683. .read_u64 = hugetlb_cgroup_read_u64,
  684. },
  685. {
  686. .name = "failcnt",
  687. .private = RES_FAILCNT,
  688. .write = hugetlb_cgroup_reset,
  689. .read_u64 = hugetlb_cgroup_read_u64,
  690. },
  691. {
  692. .name = "rsvd.failcnt",
  693. .private = RES_RSVD_FAILCNT,
  694. .write = hugetlb_cgroup_reset,
  695. .read_u64 = hugetlb_cgroup_read_u64,
  696. },
  697. {
  698. .name = "numa_stat",
  699. .seq_show = hugetlb_cgroup_read_numa_stat,
  700. },
  701. /* don't need terminator here */
  702. };
  703. static void __init
  704. hugetlb_cgroup_cfttypes_init(struct hstate *h, struct cftype *cft,
  705. struct cftype *tmpl, int tmpl_size)
  706. {
  707. char buf[32];
  708. int i, idx = hstate_index(h);
  709. /* format the size */
  710. mem_fmt(buf, sizeof(buf), huge_page_size(h));
  711. for (i = 0; i < tmpl_size; cft++, tmpl++, i++) {
  712. *cft = *tmpl;
  713. /* rebuild the name */
  714. scnprintf(cft->name, MAX_CFTYPE_NAME, "%s.%s", buf, tmpl->name);
  715. /* rebuild the private */
  716. cft->private = MEMFILE_PRIVATE(idx, tmpl->private);
  717. /* rebuild the file_offset */
  718. if (tmpl->file_offset) {
  719. unsigned int offset = tmpl->file_offset;
  720. cft->file_offset = MEMFILE_OFFSET0(offset) +
  721. MEMFILE_FIELD_SIZE(offset) * idx;
  722. }
  723. lockdep_register_key(&cft->lockdep_key);
  724. }
  725. }
  726. static void __init __hugetlb_cgroup_file_dfl_init(struct hstate *h)
  727. {
  728. int idx = hstate_index(h);
  729. hugetlb_cgroup_cfttypes_init(h, dfl_files + idx * DFL_TMPL_SIZE,
  730. hugetlb_dfl_tmpl, DFL_TMPL_SIZE);
  731. }
  732. static void __init __hugetlb_cgroup_file_legacy_init(struct hstate *h)
  733. {
  734. int idx = hstate_index(h);
  735. hugetlb_cgroup_cfttypes_init(h, legacy_files + idx * LEGACY_TMPL_SIZE,
  736. hugetlb_legacy_tmpl, LEGACY_TMPL_SIZE);
  737. }
  738. static void __init __hugetlb_cgroup_file_init(struct hstate *h)
  739. {
  740. __hugetlb_cgroup_file_dfl_init(h);
  741. __hugetlb_cgroup_file_legacy_init(h);
  742. }
  743. static void __init __hugetlb_cgroup_file_pre_init(void)
  744. {
  745. int cft_count;
  746. cft_count = hugetlb_max_hstate * DFL_TMPL_SIZE + 1; /* add terminator */
  747. dfl_files = kzalloc_objs(struct cftype, cft_count);
  748. BUG_ON(!dfl_files);
  749. cft_count = hugetlb_max_hstate * LEGACY_TMPL_SIZE + 1; /* add terminator */
  750. legacy_files = kzalloc_objs(struct cftype, cft_count);
  751. BUG_ON(!legacy_files);
  752. }
  753. static void __init __hugetlb_cgroup_file_post_init(void)
  754. {
  755. WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys,
  756. dfl_files));
  757. WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys,
  758. legacy_files));
  759. }
  760. void __init hugetlb_cgroup_file_init(void)
  761. {
  762. struct hstate *h;
  763. __hugetlb_cgroup_file_pre_init();
  764. for_each_hstate(h)
  765. __hugetlb_cgroup_file_init(h);
  766. __hugetlb_cgroup_file_post_init();
  767. }
  768. /*
  769. * hugetlb_lock will make sure a parallel cgroup rmdir won't happen
  770. * when we migrate hugepages
  771. */
  772. void hugetlb_cgroup_migrate(struct folio *old_folio, struct folio *new_folio)
  773. {
  774. struct hugetlb_cgroup *h_cg;
  775. struct hugetlb_cgroup *h_cg_rsvd;
  776. struct hstate *h = folio_hstate(old_folio);
  777. if (hugetlb_cgroup_disabled())
  778. return;
  779. spin_lock_irq(&hugetlb_lock);
  780. h_cg = hugetlb_cgroup_from_folio(old_folio);
  781. h_cg_rsvd = hugetlb_cgroup_from_folio_rsvd(old_folio);
  782. set_hugetlb_cgroup(old_folio, NULL);
  783. set_hugetlb_cgroup_rsvd(old_folio, NULL);
  784. /* move the h_cg details to new cgroup */
  785. set_hugetlb_cgroup(new_folio, h_cg);
  786. set_hugetlb_cgroup_rsvd(new_folio, h_cg_rsvd);
  787. list_move(&new_folio->lru, &h->hugepage_activelist);
  788. spin_unlock_irq(&hugetlb_lock);
  789. }
  790. static struct cftype hugetlb_files[] = {
  791. {} /* terminate */
  792. };
  793. struct cgroup_subsys hugetlb_cgrp_subsys = {
  794. .css_alloc = hugetlb_cgroup_css_alloc,
  795. .css_offline = hugetlb_cgroup_css_offline,
  796. .css_free = hugetlb_cgroup_css_free,
  797. .dfl_cftypes = hugetlb_files,
  798. .legacy_cftypes = hugetlb_files,
  799. };