vmstat.c 64 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * linux/mm/vmstat.c
  4. *
  5. * Manages VM statistics
  6. * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
  7. *
  8. * zoned VM statistics
  9. * Copyright (C) 2006 Silicon Graphics, Inc.,
  10. * Christoph Lameter <cl@gentwo.org>
  11. * Copyright (C) 2008-2014 Christoph Lameter
  12. */
  13. #include <linux/fs.h>
  14. #include <linux/mm.h>
  15. #include <linux/err.h>
  16. #include <linux/module.h>
  17. #include <linux/slab.h>
  18. #include <linux/cpu.h>
  19. #include <linux/cpumask.h>
  20. #include <linux/vmstat.h>
  21. #include <linux/proc_fs.h>
  22. #include <linux/seq_file.h>
  23. #include <linux/debugfs.h>
  24. #include <linux/sched.h>
  25. #include <linux/math64.h>
  26. #include <linux/writeback.h>
  27. #include <linux/compaction.h>
  28. #include <linux/mm_inline.h>
  29. #include <linux/page_owner.h>
  30. #include <linux/sched/isolation.h>
  31. #include "internal.h"
  32. #ifdef CONFIG_PROC_FS
  33. #ifdef CONFIG_NUMA
  34. #define ENABLE_NUMA_STAT 1
  35. static int sysctl_vm_numa_stat = ENABLE_NUMA_STAT;
  36. /* zero numa counters within a zone */
  37. static void zero_zone_numa_counters(struct zone *zone)
  38. {
  39. int item, cpu;
  40. for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) {
  41. atomic_long_set(&zone->vm_numa_event[item], 0);
  42. for_each_online_cpu(cpu) {
  43. per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_numa_event[item]
  44. = 0;
  45. }
  46. }
  47. }
  48. /* zero numa counters of all the populated zones */
  49. static void zero_zones_numa_counters(void)
  50. {
  51. struct zone *zone;
  52. for_each_populated_zone(zone)
  53. zero_zone_numa_counters(zone);
  54. }
  55. /* zero global numa counters */
  56. static void zero_global_numa_counters(void)
  57. {
  58. int item;
  59. for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
  60. atomic_long_set(&vm_numa_event[item], 0);
  61. }
  62. static void invalid_numa_statistics(void)
  63. {
  64. zero_zones_numa_counters();
  65. zero_global_numa_counters();
  66. }
  67. static DEFINE_MUTEX(vm_numa_stat_lock);
  68. static int sysctl_vm_numa_stat_handler(const struct ctl_table *table, int write,
  69. void *buffer, size_t *length, loff_t *ppos)
  70. {
  71. int ret, oldval;
  72. mutex_lock(&vm_numa_stat_lock);
  73. if (write)
  74. oldval = sysctl_vm_numa_stat;
  75. ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
  76. if (ret || !write)
  77. goto out;
  78. if (oldval == sysctl_vm_numa_stat)
  79. goto out;
  80. else if (sysctl_vm_numa_stat == ENABLE_NUMA_STAT) {
  81. static_branch_enable(&vm_numa_stat_key);
  82. pr_info("enable numa statistics\n");
  83. } else {
  84. static_branch_disable(&vm_numa_stat_key);
  85. invalid_numa_statistics();
  86. pr_info("disable numa statistics, and clear numa counters\n");
  87. }
  88. out:
  89. mutex_unlock(&vm_numa_stat_lock);
  90. return ret;
  91. }
  92. #endif
  93. #endif /* CONFIG_PROC_FS */
  94. #ifdef CONFIG_VM_EVENT_COUNTERS
  95. DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
  96. EXPORT_PER_CPU_SYMBOL(vm_event_states);
  97. static void sum_vm_events(unsigned long *ret)
  98. {
  99. int cpu;
  100. int i;
  101. memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
  102. for_each_online_cpu(cpu) {
  103. struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
  104. for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
  105. ret[i] += this->event[i];
  106. }
  107. }
  108. /*
  109. * Accumulate the vm event counters across all CPUs.
  110. * The result is unavoidably approximate - it can change
  111. * during and after execution of this function.
  112. */
  113. void all_vm_events(unsigned long *ret)
  114. {
  115. cpus_read_lock();
  116. sum_vm_events(ret);
  117. cpus_read_unlock();
  118. }
  119. EXPORT_SYMBOL_GPL(all_vm_events);
  120. /*
  121. * Fold the foreign cpu events into our own.
  122. *
  123. * This is adding to the events on one processor
  124. * but keeps the global counts constant.
  125. */
  126. void vm_events_fold_cpu(int cpu)
  127. {
  128. struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
  129. int i;
  130. for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
  131. count_vm_events(i, fold_state->event[i]);
  132. fold_state->event[i] = 0;
  133. }
  134. }
  135. #endif /* CONFIG_VM_EVENT_COUNTERS */
  136. /*
  137. * Manage combined zone based / global counters
  138. *
  139. * vm_stat contains the global counters
  140. */
  141. atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
  142. atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp;
  143. atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS] __cacheline_aligned_in_smp;
  144. EXPORT_SYMBOL(vm_zone_stat);
  145. EXPORT_SYMBOL(vm_node_stat);
  146. #ifdef CONFIG_NUMA
  147. static void fold_vm_zone_numa_events(struct zone *zone)
  148. {
  149. unsigned long zone_numa_events[NR_VM_NUMA_EVENT_ITEMS] = { 0, };
  150. int cpu;
  151. enum numa_stat_item item;
  152. for_each_online_cpu(cpu) {
  153. struct per_cpu_zonestat *pzstats;
  154. pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
  155. for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
  156. zone_numa_events[item] += xchg(&pzstats->vm_numa_event[item], 0);
  157. }
  158. for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
  159. zone_numa_event_add(zone_numa_events[item], zone, item);
  160. }
  161. void fold_vm_numa_events(void)
  162. {
  163. struct zone *zone;
  164. for_each_populated_zone(zone)
  165. fold_vm_zone_numa_events(zone);
  166. }
  167. #endif
  168. #ifdef CONFIG_SMP
  169. int calculate_pressure_threshold(struct zone *zone)
  170. {
  171. int threshold;
  172. int watermark_distance;
  173. /*
  174. * As vmstats are not up to date, there is drift between the estimated
  175. * and real values. For high thresholds and a high number of CPUs, it
  176. * is possible for the min watermark to be breached while the estimated
  177. * value looks fine. The pressure threshold is a reduced value such
  178. * that even the maximum amount of drift will not accidentally breach
  179. * the min watermark
  180. */
  181. watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
  182. threshold = max(1, (int)(watermark_distance / num_online_cpus()));
  183. /*
  184. * Maximum threshold is 125
  185. */
  186. threshold = min(125, threshold);
  187. return threshold;
  188. }
  189. int calculate_normal_threshold(struct zone *zone)
  190. {
  191. int threshold;
  192. int mem; /* memory in 128 MB units */
  193. /*
  194. * The threshold scales with the number of processors and the amount
  195. * of memory per zone. More memory means that we can defer updates for
  196. * longer, more processors could lead to more contention.
  197. * fls() is used to have a cheap way of logarithmic scaling.
  198. *
  199. * Some sample thresholds:
  200. *
  201. * Threshold Processors (fls) Zonesize fls(mem)+1
  202. * ------------------------------------------------------------------
  203. * 8 1 1 0.9-1 GB 4
  204. * 16 2 2 0.9-1 GB 4
  205. * 20 2 2 1-2 GB 5
  206. * 24 2 2 2-4 GB 6
  207. * 28 2 2 4-8 GB 7
  208. * 32 2 2 8-16 GB 8
  209. * 4 2 2 <128M 1
  210. * 30 4 3 2-4 GB 5
  211. * 48 4 3 8-16 GB 8
  212. * 32 8 4 1-2 GB 4
  213. * 32 8 4 0.9-1GB 4
  214. * 10 16 5 <128M 1
  215. * 40 16 5 900M 4
  216. * 70 64 7 2-4 GB 5
  217. * 84 64 7 4-8 GB 6
  218. * 108 512 9 4-8 GB 6
  219. * 125 1024 10 8-16 GB 8
  220. * 125 1024 10 16-32 GB 9
  221. */
  222. mem = zone_managed_pages(zone) >> (27 - PAGE_SHIFT);
  223. threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));
  224. /*
  225. * Maximum threshold is 125
  226. */
  227. threshold = min(125, threshold);
  228. return threshold;
  229. }
  230. /*
  231. * Refresh the thresholds for each zone.
  232. */
  233. void refresh_zone_stat_thresholds(void)
  234. {
  235. struct pglist_data *pgdat;
  236. struct zone *zone;
  237. int cpu;
  238. int threshold;
  239. /* Zero current pgdat thresholds */
  240. for_each_online_pgdat(pgdat) {
  241. for_each_online_cpu(cpu) {
  242. per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold = 0;
  243. }
  244. }
  245. for_each_populated_zone(zone) {
  246. struct pglist_data *pgdat = zone->zone_pgdat;
  247. unsigned long max_drift, tolerate_drift;
  248. threshold = calculate_normal_threshold(zone);
  249. for_each_online_cpu(cpu) {
  250. int pgdat_threshold;
  251. per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold
  252. = threshold;
  253. /* Base nodestat threshold on the largest populated zone. */
  254. pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold;
  255. per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold
  256. = max(threshold, pgdat_threshold);
  257. }
  258. /*
  259. * Only set percpu_drift_mark if there is a danger that
  260. * NR_FREE_PAGES reports the low watermark is ok when in fact
  261. * the min watermark could be breached by an allocation
  262. */
  263. tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
  264. max_drift = num_online_cpus() * threshold;
  265. if (max_drift > tolerate_drift)
  266. zone->percpu_drift_mark = high_wmark_pages(zone) +
  267. max_drift;
  268. }
  269. }
  270. void set_pgdat_percpu_threshold(pg_data_t *pgdat,
  271. int (*calculate_pressure)(struct zone *))
  272. {
  273. struct zone *zone;
  274. int cpu;
  275. int threshold;
  276. int i;
  277. for (i = 0; i < pgdat->nr_zones; i++) {
  278. zone = &pgdat->node_zones[i];
  279. if (!zone->percpu_drift_mark)
  280. continue;
  281. threshold = (*calculate_pressure)(zone);
  282. for_each_online_cpu(cpu)
  283. per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold
  284. = threshold;
  285. }
  286. }
  287. /*
  288. * For use when we know that interrupts are disabled,
  289. * or when we know that preemption is disabled and that
  290. * particular counter cannot be updated from interrupt context.
  291. */
  292. void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
  293. long delta)
  294. {
  295. struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
  296. s8 __percpu *p = pcp->vm_stat_diff + item;
  297. long x;
  298. long t;
  299. /*
  300. * Accurate vmstat updates require a RMW. On !PREEMPT_RT kernels,
  301. * atomicity is provided by IRQs being disabled -- either explicitly
  302. * or via local_lock_irq. On PREEMPT_RT, local_lock_irq only disables
  303. * CPU migrations and preemption potentially corrupts a counter so
  304. * disable preemption.
  305. */
  306. preempt_disable_nested();
  307. x = delta + __this_cpu_read(*p);
  308. t = __this_cpu_read(pcp->stat_threshold);
  309. if (unlikely(abs(x) > t)) {
  310. zone_page_state_add(x, zone, item);
  311. x = 0;
  312. }
  313. __this_cpu_write(*p, x);
  314. preempt_enable_nested();
  315. }
  316. EXPORT_SYMBOL(__mod_zone_page_state);
  317. void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
  318. long delta)
  319. {
  320. struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
  321. s8 __percpu *p = pcp->vm_node_stat_diff + item;
  322. long x;
  323. long t;
  324. if (vmstat_item_in_bytes(item)) {
  325. /*
  326. * Only cgroups use subpage accounting right now; at
  327. * the global level, these items still change in
  328. * multiples of whole pages. Store them as pages
  329. * internally to keep the per-cpu counters compact.
  330. */
  331. VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
  332. delta >>= PAGE_SHIFT;
  333. }
  334. /* See __mod_zone_page_state() */
  335. preempt_disable_nested();
  336. x = delta + __this_cpu_read(*p);
  337. t = __this_cpu_read(pcp->stat_threshold);
  338. if (unlikely(abs(x) > t)) {
  339. node_page_state_add(x, pgdat, item);
  340. x = 0;
  341. }
  342. __this_cpu_write(*p, x);
  343. preempt_enable_nested();
  344. }
  345. EXPORT_SYMBOL(__mod_node_page_state);
  346. /*
  347. * Optimized increment and decrement functions.
  348. *
  349. * These are only for a single page and therefore can take a struct page *
  350. * argument instead of struct zone *. This allows the inclusion of the code
  351. * generated for page_zone(page) into the optimized functions.
  352. *
  353. * No overflow check is necessary and therefore the differential can be
  354. * incremented or decremented in place which may allow the compilers to
  355. * generate better code.
  356. * The increment or decrement is known and therefore one boundary check can
  357. * be omitted.
  358. *
  359. * NOTE: These functions are very performance sensitive. Change only
  360. * with care.
  361. *
  362. * Some processors have inc/dec instructions that are atomic vs an interrupt.
  363. * However, the code must first determine the differential location in a zone
  364. * based on the processor number and then inc/dec the counter. There is no
  365. * guarantee without disabling preemption that the processor will not change
  366. * in between and therefore the atomicity vs. interrupt cannot be exploited
  367. * in a useful way here.
  368. */
  369. void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
  370. {
  371. struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
  372. s8 __percpu *p = pcp->vm_stat_diff + item;
  373. s8 v, t;
  374. /* See __mod_zone_page_state() */
  375. preempt_disable_nested();
  376. v = __this_cpu_inc_return(*p);
  377. t = __this_cpu_read(pcp->stat_threshold);
  378. if (unlikely(v > t)) {
  379. s8 overstep = t >> 1;
  380. zone_page_state_add(v + overstep, zone, item);
  381. __this_cpu_write(*p, -overstep);
  382. }
  383. preempt_enable_nested();
  384. }
  385. void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
  386. {
  387. struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
  388. s8 __percpu *p = pcp->vm_node_stat_diff + item;
  389. s8 v, t;
  390. VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
  391. /* See __mod_zone_page_state() */
  392. preempt_disable_nested();
  393. v = __this_cpu_inc_return(*p);
  394. t = __this_cpu_read(pcp->stat_threshold);
  395. if (unlikely(v > t)) {
  396. s8 overstep = t >> 1;
  397. node_page_state_add(v + overstep, pgdat, item);
  398. __this_cpu_write(*p, -overstep);
  399. }
  400. preempt_enable_nested();
  401. }
  402. void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
  403. {
  404. __inc_zone_state(page_zone(page), item);
  405. }
  406. EXPORT_SYMBOL(__inc_zone_page_state);
  407. void __inc_node_page_state(struct page *page, enum node_stat_item item)
  408. {
  409. __inc_node_state(page_pgdat(page), item);
  410. }
  411. EXPORT_SYMBOL(__inc_node_page_state);
  412. void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
  413. {
  414. struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
  415. s8 __percpu *p = pcp->vm_stat_diff + item;
  416. s8 v, t;
  417. /* See __mod_zone_page_state() */
  418. preempt_disable_nested();
  419. v = __this_cpu_dec_return(*p);
  420. t = __this_cpu_read(pcp->stat_threshold);
  421. if (unlikely(v < - t)) {
  422. s8 overstep = t >> 1;
  423. zone_page_state_add(v - overstep, zone, item);
  424. __this_cpu_write(*p, overstep);
  425. }
  426. preempt_enable_nested();
  427. }
  428. void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
  429. {
  430. struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
  431. s8 __percpu *p = pcp->vm_node_stat_diff + item;
  432. s8 v, t;
  433. VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
  434. /* See __mod_zone_page_state() */
  435. preempt_disable_nested();
  436. v = __this_cpu_dec_return(*p);
  437. t = __this_cpu_read(pcp->stat_threshold);
  438. if (unlikely(v < - t)) {
  439. s8 overstep = t >> 1;
  440. node_page_state_add(v - overstep, pgdat, item);
  441. __this_cpu_write(*p, overstep);
  442. }
  443. preempt_enable_nested();
  444. }
  445. void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
  446. {
  447. __dec_zone_state(page_zone(page), item);
  448. }
  449. EXPORT_SYMBOL(__dec_zone_page_state);
  450. void __dec_node_page_state(struct page *page, enum node_stat_item item)
  451. {
  452. __dec_node_state(page_pgdat(page), item);
  453. }
  454. EXPORT_SYMBOL(__dec_node_page_state);
  455. #ifdef CONFIG_HAVE_CMPXCHG_LOCAL
  456. /*
  457. * If we have cmpxchg_local support then we do not need to incur the overhead
  458. * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
  459. *
  460. * mod_state() modifies the zone counter state through atomic per cpu
  461. * operations.
  462. *
  463. * Overstep mode specifies how overstep should handled:
  464. * 0 No overstepping
  465. * 1 Overstepping half of threshold
  466. * -1 Overstepping minus half of threshold
  467. */
  468. static inline void mod_zone_state(struct zone *zone,
  469. enum zone_stat_item item, long delta, int overstep_mode)
  470. {
  471. struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
  472. s8 __percpu *p = pcp->vm_stat_diff + item;
  473. long n, t, z;
  474. s8 o;
  475. o = this_cpu_read(*p);
  476. do {
  477. z = 0; /* overflow to zone counters */
  478. /*
  479. * The fetching of the stat_threshold is racy. We may apply
  480. * a counter threshold to the wrong the cpu if we get
  481. * rescheduled while executing here. However, the next
  482. * counter update will apply the threshold again and
  483. * therefore bring the counter under the threshold again.
  484. *
  485. * Most of the time the thresholds are the same anyways
  486. * for all cpus in a zone.
  487. */
  488. t = this_cpu_read(pcp->stat_threshold);
  489. n = delta + (long)o;
  490. if (abs(n) > t) {
  491. int os = overstep_mode * (t >> 1) ;
  492. /* Overflow must be added to zone counters */
  493. z = n + os;
  494. n = -os;
  495. }
  496. } while (!this_cpu_try_cmpxchg(*p, &o, n));
  497. if (z)
  498. zone_page_state_add(z, zone, item);
  499. }
  500. void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
  501. long delta)
  502. {
  503. mod_zone_state(zone, item, delta, 0);
  504. }
  505. EXPORT_SYMBOL(mod_zone_page_state);
  506. void inc_zone_page_state(struct page *page, enum zone_stat_item item)
  507. {
  508. mod_zone_state(page_zone(page), item, 1, 1);
  509. }
  510. EXPORT_SYMBOL(inc_zone_page_state);
  511. void dec_zone_page_state(struct page *page, enum zone_stat_item item)
  512. {
  513. mod_zone_state(page_zone(page), item, -1, -1);
  514. }
  515. EXPORT_SYMBOL(dec_zone_page_state);
  516. static inline void mod_node_state(struct pglist_data *pgdat,
  517. enum node_stat_item item, int delta, int overstep_mode)
  518. {
  519. struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
  520. s8 __percpu *p = pcp->vm_node_stat_diff + item;
  521. long n, t, z;
  522. s8 o;
  523. if (vmstat_item_in_bytes(item)) {
  524. /*
  525. * Only cgroups use subpage accounting right now; at
  526. * the global level, these items still change in
  527. * multiples of whole pages. Store them as pages
  528. * internally to keep the per-cpu counters compact.
  529. */
  530. VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
  531. delta >>= PAGE_SHIFT;
  532. }
  533. o = this_cpu_read(*p);
  534. do {
  535. z = 0; /* overflow to node counters */
  536. /*
  537. * The fetching of the stat_threshold is racy. We may apply
  538. * a counter threshold to the wrong the cpu if we get
  539. * rescheduled while executing here. However, the next
  540. * counter update will apply the threshold again and
  541. * therefore bring the counter under the threshold again.
  542. *
  543. * Most of the time the thresholds are the same anyways
  544. * for all cpus in a node.
  545. */
  546. t = this_cpu_read(pcp->stat_threshold);
  547. n = delta + (long)o;
  548. if (abs(n) > t) {
  549. int os = overstep_mode * (t >> 1) ;
  550. /* Overflow must be added to node counters */
  551. z = n + os;
  552. n = -os;
  553. }
  554. } while (!this_cpu_try_cmpxchg(*p, &o, n));
  555. if (z)
  556. node_page_state_add(z, pgdat, item);
  557. }
  558. void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
  559. long delta)
  560. {
  561. mod_node_state(pgdat, item, delta, 0);
  562. }
  563. EXPORT_SYMBOL(mod_node_page_state);
  564. void inc_node_page_state(struct page *page, enum node_stat_item item)
  565. {
  566. mod_node_state(page_pgdat(page), item, 1, 1);
  567. }
  568. EXPORT_SYMBOL(inc_node_page_state);
  569. void dec_node_page_state(struct page *page, enum node_stat_item item)
  570. {
  571. mod_node_state(page_pgdat(page), item, -1, -1);
  572. }
  573. EXPORT_SYMBOL(dec_node_page_state);
  574. #else
  575. /*
  576. * Use interrupt disable to serialize counter updates
  577. */
  578. void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
  579. long delta)
  580. {
  581. unsigned long flags;
  582. local_irq_save(flags);
  583. __mod_zone_page_state(zone, item, delta);
  584. local_irq_restore(flags);
  585. }
  586. EXPORT_SYMBOL(mod_zone_page_state);
  587. void inc_zone_page_state(struct page *page, enum zone_stat_item item)
  588. {
  589. unsigned long flags;
  590. struct zone *zone;
  591. zone = page_zone(page);
  592. local_irq_save(flags);
  593. __inc_zone_state(zone, item);
  594. local_irq_restore(flags);
  595. }
  596. EXPORT_SYMBOL(inc_zone_page_state);
  597. void dec_zone_page_state(struct page *page, enum zone_stat_item item)
  598. {
  599. unsigned long flags;
  600. local_irq_save(flags);
  601. __dec_zone_page_state(page, item);
  602. local_irq_restore(flags);
  603. }
  604. EXPORT_SYMBOL(dec_zone_page_state);
  605. void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
  606. long delta)
  607. {
  608. unsigned long flags;
  609. local_irq_save(flags);
  610. __mod_node_page_state(pgdat, item, delta);
  611. local_irq_restore(flags);
  612. }
  613. EXPORT_SYMBOL(mod_node_page_state);
  614. void inc_node_page_state(struct page *page, enum node_stat_item item)
  615. {
  616. unsigned long flags;
  617. struct pglist_data *pgdat;
  618. pgdat = page_pgdat(page);
  619. local_irq_save(flags);
  620. __inc_node_state(pgdat, item);
  621. local_irq_restore(flags);
  622. }
  623. EXPORT_SYMBOL(inc_node_page_state);
  624. void dec_node_page_state(struct page *page, enum node_stat_item item)
  625. {
  626. unsigned long flags;
  627. local_irq_save(flags);
  628. __dec_node_page_state(page, item);
  629. local_irq_restore(flags);
  630. }
  631. EXPORT_SYMBOL(dec_node_page_state);
  632. #endif
  633. /*
  634. * Fold a differential into the global counters.
  635. * Returns whether counters were updated.
  636. */
  637. static int fold_diff(int *zone_diff, int *node_diff)
  638. {
  639. int i;
  640. bool changed = false;
  641. for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
  642. if (zone_diff[i]) {
  643. atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
  644. changed = true;
  645. }
  646. }
  647. for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
  648. if (node_diff[i]) {
  649. atomic_long_add(node_diff[i], &vm_node_stat[i]);
  650. changed = true;
  651. }
  652. }
  653. return changed;
  654. }
  655. /*
  656. * Update the zone counters for the current cpu.
  657. *
  658. * Note that refresh_cpu_vm_stats strives to only access
  659. * node local memory. The per cpu pagesets on remote zones are placed
  660. * in the memory local to the processor using that pageset. So the
  661. * loop over all zones will access a series of cachelines local to
  662. * the processor.
  663. *
  664. * The call to zone_page_state_add updates the cachelines with the
  665. * statistics in the remote zone struct as well as the global cachelines
  666. * with the global counters. These could cause remote node cache line
  667. * bouncing and will have to be only done when necessary.
  668. *
  669. * The function returns whether global counters were updated.
  670. */
  671. static bool refresh_cpu_vm_stats(bool do_pagesets)
  672. {
  673. struct pglist_data *pgdat;
  674. struct zone *zone;
  675. int i;
  676. int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
  677. int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
  678. bool changed = false;
  679. for_each_populated_zone(zone) {
  680. struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
  681. struct per_cpu_pages __percpu *pcp = zone->per_cpu_pageset;
  682. for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
  683. int v;
  684. v = this_cpu_xchg(pzstats->vm_stat_diff[i], 0);
  685. if (v) {
  686. atomic_long_add(v, &zone->vm_stat[i]);
  687. global_zone_diff[i] += v;
  688. #ifdef CONFIG_NUMA
  689. /* 3 seconds idle till flush */
  690. __this_cpu_write(pcp->expire, 3);
  691. #endif
  692. }
  693. }
  694. if (do_pagesets) {
  695. cond_resched();
  696. if (decay_pcp_high(zone, this_cpu_ptr(pcp)))
  697. changed = true;
  698. #ifdef CONFIG_NUMA
  699. /*
  700. * Deal with draining the remote pageset of this
  701. * processor
  702. *
  703. * Check if there are pages remaining in this pageset
  704. * if not then there is nothing to expire.
  705. */
  706. if (!__this_cpu_read(pcp->expire) ||
  707. !__this_cpu_read(pcp->count))
  708. continue;
  709. /*
  710. * We never drain zones local to this processor.
  711. */
  712. if (zone_to_nid(zone) == numa_node_id()) {
  713. __this_cpu_write(pcp->expire, 0);
  714. continue;
  715. }
  716. if (__this_cpu_dec_return(pcp->expire)) {
  717. changed = true;
  718. continue;
  719. }
  720. if (__this_cpu_read(pcp->count)) {
  721. drain_zone_pages(zone, this_cpu_ptr(pcp));
  722. changed = true;
  723. }
  724. #endif
  725. }
  726. }
  727. for_each_online_pgdat(pgdat) {
  728. struct per_cpu_nodestat __percpu *p = pgdat->per_cpu_nodestats;
  729. for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
  730. int v;
  731. v = this_cpu_xchg(p->vm_node_stat_diff[i], 0);
  732. if (v) {
  733. atomic_long_add(v, &pgdat->vm_stat[i]);
  734. global_node_diff[i] += v;
  735. }
  736. }
  737. }
  738. if (fold_diff(global_zone_diff, global_node_diff))
  739. changed = true;
  740. return changed;
  741. }
  742. /*
  743. * Fold the data for an offline cpu into the global array.
  744. * There cannot be any access by the offline cpu and therefore
  745. * synchronization is simplified.
  746. */
  747. void cpu_vm_stats_fold(int cpu)
  748. {
  749. struct pglist_data *pgdat;
  750. struct zone *zone;
  751. int i;
  752. int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
  753. int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
  754. for_each_populated_zone(zone) {
  755. struct per_cpu_zonestat *pzstats;
  756. pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
  757. for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
  758. if (pzstats->vm_stat_diff[i]) {
  759. int v;
  760. v = pzstats->vm_stat_diff[i];
  761. pzstats->vm_stat_diff[i] = 0;
  762. atomic_long_add(v, &zone->vm_stat[i]);
  763. global_zone_diff[i] += v;
  764. }
  765. }
  766. #ifdef CONFIG_NUMA
  767. for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) {
  768. if (pzstats->vm_numa_event[i]) {
  769. unsigned long v;
  770. v = pzstats->vm_numa_event[i];
  771. pzstats->vm_numa_event[i] = 0;
  772. zone_numa_event_add(v, zone, i);
  773. }
  774. }
  775. #endif
  776. }
  777. for_each_online_pgdat(pgdat) {
  778. struct per_cpu_nodestat *p;
  779. p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
  780. for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
  781. if (p->vm_node_stat_diff[i]) {
  782. int v;
  783. v = p->vm_node_stat_diff[i];
  784. p->vm_node_stat_diff[i] = 0;
  785. atomic_long_add(v, &pgdat->vm_stat[i]);
  786. global_node_diff[i] += v;
  787. }
  788. }
  789. fold_diff(global_zone_diff, global_node_diff);
  790. }
  791. /*
  792. * this is only called if !populated_zone(zone), which implies no other users of
  793. * pset->vm_stat_diff[] exist.
  794. */
  795. void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *pzstats)
  796. {
  797. unsigned long v;
  798. int i;
  799. for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
  800. if (pzstats->vm_stat_diff[i]) {
  801. v = pzstats->vm_stat_diff[i];
  802. pzstats->vm_stat_diff[i] = 0;
  803. zone_page_state_add(v, zone, i);
  804. }
  805. }
  806. #ifdef CONFIG_NUMA
  807. for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) {
  808. if (pzstats->vm_numa_event[i]) {
  809. v = pzstats->vm_numa_event[i];
  810. pzstats->vm_numa_event[i] = 0;
  811. zone_numa_event_add(v, zone, i);
  812. }
  813. }
  814. #endif
  815. }
  816. #endif
  817. #ifdef CONFIG_NUMA
  818. /*
  819. * Determine the per node value of a stat item. This function
  820. * is called frequently in a NUMA machine, so try to be as
  821. * frugal as possible.
  822. */
  823. unsigned long sum_zone_node_page_state(int node,
  824. enum zone_stat_item item)
  825. {
  826. struct zone *zones = NODE_DATA(node)->node_zones;
  827. int i;
  828. unsigned long count = 0;
  829. for (i = 0; i < MAX_NR_ZONES; i++)
  830. count += zone_page_state(zones + i, item);
  831. return count;
  832. }
  833. /* Determine the per node value of a numa stat item. */
  834. unsigned long sum_zone_numa_event_state(int node,
  835. enum numa_stat_item item)
  836. {
  837. struct zone *zones = NODE_DATA(node)->node_zones;
  838. unsigned long count = 0;
  839. int i;
  840. for (i = 0; i < MAX_NR_ZONES; i++)
  841. count += zone_numa_event_state(zones + i, item);
  842. return count;
  843. }
  844. /*
  845. * Determine the per node value of a stat item.
  846. */
  847. unsigned long node_page_state_pages(struct pglist_data *pgdat,
  848. enum node_stat_item item)
  849. {
  850. long x = atomic_long_read(&pgdat->vm_stat[item]);
  851. #ifdef CONFIG_SMP
  852. if (x < 0)
  853. x = 0;
  854. #endif
  855. return x;
  856. }
  857. unsigned long node_page_state(struct pglist_data *pgdat,
  858. enum node_stat_item item)
  859. {
  860. VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
  861. return node_page_state_pages(pgdat, item);
  862. }
  863. #endif
  864. /*
  865. * Count number of pages "struct page" and "struct page_ext" consume.
  866. * nr_memmap_boot_pages: # of pages allocated by boot allocator
  867. * nr_memmap_pages: # of pages that were allocated by buddy allocator
  868. */
  869. static atomic_long_t nr_memmap_boot_pages = ATOMIC_LONG_INIT(0);
  870. static atomic_long_t nr_memmap_pages = ATOMIC_LONG_INIT(0);
  871. void memmap_boot_pages_add(long delta)
  872. {
  873. atomic_long_add(delta, &nr_memmap_boot_pages);
  874. }
  875. void memmap_pages_add(long delta)
  876. {
  877. atomic_long_add(delta, &nr_memmap_pages);
  878. }
  879. #ifdef CONFIG_COMPACTION
  /* Free-memory summary of one zone, filled by fill_contig_page_info(). */
  struct contig_page_info {
  	/* free base pages: sum of nr_free << order over all orders */
  	unsigned long free_pages;
  	/* free blocks of any order */
  	unsigned long free_blocks_total;
  	/* free blocks counted in units of the requested order */
  	unsigned long free_blocks_suitable;
  };
  885. /*
  886. * Calculate the number of free pages in a zone, how many contiguous
  887. * pages are free and how many are large enough to satisfy an allocation of
  888. * the target size. Note that this function makes no attempt to estimate
  889. * how many suitable free blocks there *might* be if MOVABLE pages were
  890. * migrated. Calculating that is possible, but expensive and can be
  891. * figured out from userspace
  892. */
  893. static void fill_contig_page_info(struct zone *zone,
  894. unsigned int suitable_order,
  895. struct contig_page_info *info)
  896. {
  897. unsigned int order;
  898. info->free_pages = 0;
  899. info->free_blocks_total = 0;
  900. info->free_blocks_suitable = 0;
  901. for (order = 0; order < NR_PAGE_ORDERS; order++) {
  902. unsigned long blocks;
  903. /*
  904. * Count number of free blocks.
  905. *
  906. * Access to nr_free is lockless as nr_free is used only for
  907. * diagnostic purposes. Use data_race to avoid KCSAN warning.
  908. */
  909. blocks = data_race(zone->free_area[order].nr_free);
  910. info->free_blocks_total += blocks;
  911. /* Count free base pages */
  912. info->free_pages += blocks << order;
  913. /* Count the suitable free blocks */
  914. if (order >= suitable_order)
  915. info->free_blocks_suitable += blocks <<
  916. (order - suitable_order);
  917. }
  918. }
  919. /*
  920. * A fragmentation index only makes sense if an allocation of a requested
  921. * size would fail. If that is true, the fragmentation index indicates
  922. * whether external fragmentation or a lack of memory was the problem.
  923. * The value can be used to determine if page reclaim or compaction
  924. * should be used
  925. */
  926. static int __fragmentation_index(unsigned int order, struct contig_page_info *info)
  927. {
  928. unsigned long requested = 1UL << order;
  929. if (WARN_ON_ONCE(order > MAX_PAGE_ORDER))
  930. return 0;
  931. if (!info->free_blocks_total)
  932. return 0;
  933. /* Fragmentation index only makes sense when a request would fail */
  934. if (info->free_blocks_suitable)
  935. return -1000;
  936. /*
  937. * Index is between 0 and 1 so return within 3 decimal places
  938. *
  939. * 0 => allocation would fail due to lack of memory
  940. * 1 => allocation would fail due to fragmentation
  941. */
  942. return 1000 - div_u64( (1000+(div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total);
  943. }
  944. /*
  945. * Calculates external fragmentation within a zone wrt the given order.
  946. * It is defined as the percentage of pages found in blocks of size
  947. * less than 1 << order. It returns values in range [0, 100].
  948. */
  949. unsigned int extfrag_for_order(struct zone *zone, unsigned int order)
  950. {
  951. struct contig_page_info info;
  952. fill_contig_page_info(zone, order, &info);
  953. if (info.free_pages == 0)
  954. return 0;
  955. return div_u64((info.free_pages -
  956. (info.free_blocks_suitable << order)) * 100,
  957. info.free_pages);
  958. }
  959. /* Same as __fragmentation index but allocs contig_page_info on stack */
  960. int fragmentation_index(struct zone *zone, unsigned int order)
  961. {
  962. struct contig_page_info info;
  963. fill_contig_page_info(zone, order, &info);
  964. return __fragmentation_index(order, &info);
  965. }
  966. #endif
  967. #if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || \
  968. defined(CONFIG_NUMA) || defined(CONFIG_MEMCG)
  /*
   * Helpers for per-zone entries of a name table.
   *
   * Each TEXT_FOR_<ZONE>(idx, name) expands to a designated initializer
   * "[idx_<ZONE>] = name "_<zone>"," when that zone type is configured and
   * to nothing otherwise. TEXTS_FOR_ZONES() emits one entry per zone type;
   * the NORMAL and MOVABLE entries are unconditional.
   */
  #ifdef CONFIG_ZONE_DMA
  #define TEXT_FOR_DMA(idx, name) [idx##_DMA] = name "_dma",
  #else
  #define TEXT_FOR_DMA(idx, name)
  #endif

  #ifdef CONFIG_ZONE_DMA32
  #define TEXT_FOR_DMA32(idx, name) [idx##_DMA32] = name "_dma32",
  #else
  #define TEXT_FOR_DMA32(idx, name)
  #endif

  #ifdef CONFIG_HIGHMEM
  #define TEXT_FOR_HIGHMEM(idx, name) [idx##_HIGH] = name "_high",
  #else
  #define TEXT_FOR_HIGHMEM(idx, name)
  #endif

  #ifdef CONFIG_ZONE_DEVICE
  #define TEXT_FOR_DEVICE(idx, name) [idx##_DEVICE] = name "_device",
  #else
  #define TEXT_FOR_DEVICE(idx, name)
  #endif

  #define TEXTS_FOR_ZONES(idx, name)		\
  	TEXT_FOR_DMA(idx, name)			\
  	TEXT_FOR_DMA32(idx, name)		\
  	[idx##_NORMAL] = name "_normal",	\
  	TEXT_FOR_HIGHMEM(idx, name)		\
  	[idx##_MOVABLE] = name "_movable",	\
  	TEXT_FOR_DEVICE(idx, name)
  996. const char * const vmstat_text[] = {
  997. /* enum zone_stat_item counters */
  998. #define I(x) (x)
  999. [I(NR_FREE_PAGES)] = "nr_free_pages",
  1000. [I(NR_FREE_PAGES_BLOCKS)] = "nr_free_pages_blocks",
  1001. [I(NR_ZONE_INACTIVE_ANON)] = "nr_zone_inactive_anon",
  1002. [I(NR_ZONE_ACTIVE_ANON)] = "nr_zone_active_anon",
  1003. [I(NR_ZONE_INACTIVE_FILE)] = "nr_zone_inactive_file",
  1004. [I(NR_ZONE_ACTIVE_FILE)] = "nr_zone_active_file",
  1005. [I(NR_ZONE_UNEVICTABLE)] = "nr_zone_unevictable",
  1006. [I(NR_ZONE_WRITE_PENDING)] = "nr_zone_write_pending",
  1007. [I(NR_MLOCK)] = "nr_mlock",
  1008. #if IS_ENABLED(CONFIG_ZSMALLOC)
  1009. [I(NR_ZSPAGES)] = "nr_zspages",
  1010. #endif
  1011. [I(NR_FREE_CMA_PAGES)] = "nr_free_cma",
  1012. #ifdef CONFIG_UNACCEPTED_MEMORY
  1013. [I(NR_UNACCEPTED)] = "nr_unaccepted",
  1014. #endif
  1015. #undef I
  1016. /* enum numa_stat_item counters */
  1017. #define I(x) (NR_VM_ZONE_STAT_ITEMS + x)
  1018. #ifdef CONFIG_NUMA
  1019. [I(NUMA_HIT)] = "numa_hit",
  1020. [I(NUMA_MISS)] = "numa_miss",
  1021. [I(NUMA_FOREIGN)] = "numa_foreign",
  1022. [I(NUMA_INTERLEAVE_HIT)] = "numa_interleave",
  1023. [I(NUMA_LOCAL)] = "numa_local",
  1024. [I(NUMA_OTHER)] = "numa_other",
  1025. #endif
  1026. #undef I
  1027. /* enum node_stat_item counters */
  1028. #define I(x) (NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS + x)
  1029. [I(NR_INACTIVE_ANON)] = "nr_inactive_anon",
  1030. [I(NR_ACTIVE_ANON)] = "nr_active_anon",
  1031. [I(NR_INACTIVE_FILE)] = "nr_inactive_file",
  1032. [I(NR_ACTIVE_FILE)] = "nr_active_file",
  1033. [I(NR_UNEVICTABLE)] = "nr_unevictable",
  1034. [I(NR_SLAB_RECLAIMABLE_B)] = "nr_slab_reclaimable",
  1035. [I(NR_SLAB_UNRECLAIMABLE_B)] = "nr_slab_unreclaimable",
  1036. [I(NR_ISOLATED_ANON)] = "nr_isolated_anon",
  1037. [I(NR_ISOLATED_FILE)] = "nr_isolated_file",
  1038. [I(WORKINGSET_NODES)] = "workingset_nodes",
  1039. [I(WORKINGSET_REFAULT_ANON)] = "workingset_refault_anon",
  1040. [I(WORKINGSET_REFAULT_FILE)] = "workingset_refault_file",
  1041. [I(WORKINGSET_ACTIVATE_ANON)] = "workingset_activate_anon",
  1042. [I(WORKINGSET_ACTIVATE_FILE)] = "workingset_activate_file",
  1043. [I(WORKINGSET_RESTORE_ANON)] = "workingset_restore_anon",
  1044. [I(WORKINGSET_RESTORE_FILE)] = "workingset_restore_file",
  1045. [I(WORKINGSET_NODERECLAIM)] = "workingset_nodereclaim",
  1046. [I(NR_ANON_MAPPED)] = "nr_anon_pages",
  1047. [I(NR_FILE_MAPPED)] = "nr_mapped",
  1048. [I(NR_FILE_PAGES)] = "nr_file_pages",
  1049. [I(NR_FILE_DIRTY)] = "nr_dirty",
  1050. [I(NR_WRITEBACK)] = "nr_writeback",
  1051. [I(NR_SHMEM)] = "nr_shmem",
  1052. [I(NR_SHMEM_THPS)] = "nr_shmem_hugepages",
  1053. [I(NR_SHMEM_PMDMAPPED)] = "nr_shmem_pmdmapped",
  1054. [I(NR_FILE_THPS)] = "nr_file_hugepages",
  1055. [I(NR_FILE_PMDMAPPED)] = "nr_file_pmdmapped",
  1056. [I(NR_ANON_THPS)] = "nr_anon_transparent_hugepages",
  1057. [I(NR_VMSCAN_WRITE)] = "nr_vmscan_write",
  1058. [I(NR_VMSCAN_IMMEDIATE)] = "nr_vmscan_immediate_reclaim",
  1059. [I(NR_DIRTIED)] = "nr_dirtied",
  1060. [I(NR_WRITTEN)] = "nr_written",
  1061. [I(NR_THROTTLED_WRITTEN)] = "nr_throttled_written",
  1062. [I(NR_KERNEL_MISC_RECLAIMABLE)] = "nr_kernel_misc_reclaimable",
  1063. [I(NR_FOLL_PIN_ACQUIRED)] = "nr_foll_pin_acquired",
  1064. [I(NR_FOLL_PIN_RELEASED)] = "nr_foll_pin_released",
  1065. [I(NR_KERNEL_STACK_KB)] = "nr_kernel_stack",
  1066. #if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
  1067. [I(NR_KERNEL_SCS_KB)] = "nr_shadow_call_stack",
  1068. #endif
  1069. [I(NR_PAGETABLE)] = "nr_page_table_pages",
  1070. [I(NR_SECONDARY_PAGETABLE)] = "nr_sec_page_table_pages",
  1071. #ifdef CONFIG_IOMMU_SUPPORT
  1072. [I(NR_IOMMU_PAGES)] = "nr_iommu_pages",
  1073. #endif
  1074. #ifdef CONFIG_SWAP
  1075. [I(NR_SWAPCACHE)] = "nr_swapcached",
  1076. #endif
  1077. #ifdef CONFIG_NUMA_BALANCING
  1078. [I(PGPROMOTE_SUCCESS)] = "pgpromote_success",
  1079. [I(PGPROMOTE_CANDIDATE)] = "pgpromote_candidate",
  1080. [I(PGPROMOTE_CANDIDATE_NRL)] = "pgpromote_candidate_nrl",
  1081. #endif
  1082. [I(PGDEMOTE_KSWAPD)] = "pgdemote_kswapd",
  1083. [I(PGDEMOTE_DIRECT)] = "pgdemote_direct",
  1084. [I(PGDEMOTE_KHUGEPAGED)] = "pgdemote_khugepaged",
  1085. [I(PGDEMOTE_PROACTIVE)] = "pgdemote_proactive",
  1086. #ifdef CONFIG_HUGETLB_PAGE
  1087. [I(NR_HUGETLB)] = "nr_hugetlb",
  1088. #endif
  1089. [I(NR_BALLOON_PAGES)] = "nr_balloon_pages",
  1090. [I(NR_KERNEL_FILE_PAGES)] = "nr_kernel_file_pages",
  1091. #undef I
  1092. /* system-wide enum vm_stat_item counters */
  1093. #define I(x) (NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS + \
  1094. NR_VM_NODE_STAT_ITEMS + x)
  1095. [I(NR_DIRTY_THRESHOLD)] = "nr_dirty_threshold",
  1096. [I(NR_DIRTY_BG_THRESHOLD)] = "nr_dirty_background_threshold",
  1097. [I(NR_MEMMAP_PAGES)] = "nr_memmap_pages",
  1098. [I(NR_MEMMAP_BOOT_PAGES)] = "nr_memmap_boot_pages",
  1099. #undef I
  1100. #if defined(CONFIG_VM_EVENT_COUNTERS)
  1101. /* enum vm_event_item counters */
  1102. #define I(x) (NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS + \
  1103. NR_VM_NODE_STAT_ITEMS + NR_VM_STAT_ITEMS + x)
  1104. [I(PGPGIN)] = "pgpgin",
  1105. [I(PGPGOUT)] = "pgpgout",
  1106. [I(PSWPIN)] = "pswpin",
  1107. [I(PSWPOUT)] = "pswpout",
  1108. #define OFF (NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS + \
  1109. NR_VM_NODE_STAT_ITEMS + NR_VM_STAT_ITEMS)
  1110. TEXTS_FOR_ZONES(OFF+PGALLOC, "pgalloc")
  1111. TEXTS_FOR_ZONES(OFF+ALLOCSTALL, "allocstall")
  1112. TEXTS_FOR_ZONES(OFF+PGSCAN_SKIP, "pgskip")
  1113. #undef OFF
  1114. [I(PGFREE)] = "pgfree",
  1115. [I(PGACTIVATE)] = "pgactivate",
  1116. [I(PGDEACTIVATE)] = "pgdeactivate",
  1117. [I(PGLAZYFREE)] = "pglazyfree",
  1118. [I(PGFAULT)] = "pgfault",
  1119. [I(PGMAJFAULT)] = "pgmajfault",
  1120. [I(PGLAZYFREED)] = "pglazyfreed",
  1121. [I(PGREFILL)] = "pgrefill",
  1122. [I(PGREUSE)] = "pgreuse",
  1123. [I(PGSTEAL_KSWAPD)] = "pgsteal_kswapd",
  1124. [I(PGSTEAL_DIRECT)] = "pgsteal_direct",
  1125. [I(PGSTEAL_KHUGEPAGED)] = "pgsteal_khugepaged",
  1126. [I(PGSTEAL_PROACTIVE)] = "pgsteal_proactive",
  1127. [I(PGSCAN_KSWAPD)] = "pgscan_kswapd",
  1128. [I(PGSCAN_DIRECT)] = "pgscan_direct",
  1129. [I(PGSCAN_KHUGEPAGED)] = "pgscan_khugepaged",
  1130. [I(PGSCAN_PROACTIVE)] = "pgscan_proactive",
  1131. [I(PGSCAN_DIRECT_THROTTLE)] = "pgscan_direct_throttle",
  1132. [I(PGSCAN_ANON)] = "pgscan_anon",
  1133. [I(PGSCAN_FILE)] = "pgscan_file",
  1134. [I(PGSTEAL_ANON)] = "pgsteal_anon",
  1135. [I(PGSTEAL_FILE)] = "pgsteal_file",
  1136. #ifdef CONFIG_NUMA
  1137. [I(PGSCAN_ZONE_RECLAIM_SUCCESS)] = "zone_reclaim_success",
  1138. [I(PGSCAN_ZONE_RECLAIM_FAILED)] = "zone_reclaim_failed",
  1139. #endif
  1140. [I(PGINODESTEAL)] = "pginodesteal",
  1141. [I(SLABS_SCANNED)] = "slabs_scanned",
  1142. [I(KSWAPD_INODESTEAL)] = "kswapd_inodesteal",
  1143. [I(KSWAPD_LOW_WMARK_HIT_QUICKLY)] = "kswapd_low_wmark_hit_quickly",
  1144. [I(KSWAPD_HIGH_WMARK_HIT_QUICKLY)] = "kswapd_high_wmark_hit_quickly",
  1145. [I(PAGEOUTRUN)] = "pageoutrun",
  1146. [I(PGROTATED)] = "pgrotated",
  1147. [I(DROP_PAGECACHE)] = "drop_pagecache",
  1148. [I(DROP_SLAB)] = "drop_slab",
  1149. [I(OOM_KILL)] = "oom_kill",
  1150. #ifdef CONFIG_NUMA_BALANCING
  1151. [I(NUMA_PTE_UPDATES)] = "numa_pte_updates",
  1152. [I(NUMA_HUGE_PTE_UPDATES)] = "numa_huge_pte_updates",
  1153. [I(NUMA_HINT_FAULTS)] = "numa_hint_faults",
  1154. [I(NUMA_HINT_FAULTS_LOCAL)] = "numa_hint_faults_local",
  1155. [I(NUMA_PAGE_MIGRATE)] = "numa_pages_migrated",
  1156. #endif
  1157. #ifdef CONFIG_MIGRATION
  1158. [I(PGMIGRATE_SUCCESS)] = "pgmigrate_success",
  1159. [I(PGMIGRATE_FAIL)] = "pgmigrate_fail",
  1160. [I(THP_MIGRATION_SUCCESS)] = "thp_migration_success",
  1161. [I(THP_MIGRATION_FAIL)] = "thp_migration_fail",
  1162. [I(THP_MIGRATION_SPLIT)] = "thp_migration_split",
  1163. #endif
  1164. #ifdef CONFIG_COMPACTION
  1165. [I(COMPACTMIGRATE_SCANNED)] = "compact_migrate_scanned",
  1166. [I(COMPACTFREE_SCANNED)] = "compact_free_scanned",
  1167. [I(COMPACTISOLATED)] = "compact_isolated",
  1168. [I(COMPACTSTALL)] = "compact_stall",
  1169. [I(COMPACTFAIL)] = "compact_fail",
  1170. [I(COMPACTSUCCESS)] = "compact_success",
  1171. [I(KCOMPACTD_WAKE)] = "compact_daemon_wake",
  1172. [I(KCOMPACTD_MIGRATE_SCANNED)] = "compact_daemon_migrate_scanned",
  1173. [I(KCOMPACTD_FREE_SCANNED)] = "compact_daemon_free_scanned",
  1174. #endif
  1175. #ifdef CONFIG_HUGETLB_PAGE
  1176. [I(HTLB_BUDDY_PGALLOC)] = "htlb_buddy_alloc_success",
  1177. [I(HTLB_BUDDY_PGALLOC_FAIL)] = "htlb_buddy_alloc_fail",
  1178. #endif
  1179. #ifdef CONFIG_CMA
  1180. [I(CMA_ALLOC_SUCCESS)] = "cma_alloc_success",
  1181. [I(CMA_ALLOC_FAIL)] = "cma_alloc_fail",
  1182. #endif
  1183. [I(UNEVICTABLE_PGCULLED)] = "unevictable_pgs_culled",
  1184. [I(UNEVICTABLE_PGSCANNED)] = "unevictable_pgs_scanned",
  1185. [I(UNEVICTABLE_PGRESCUED)] = "unevictable_pgs_rescued",
  1186. [I(UNEVICTABLE_PGMLOCKED)] = "unevictable_pgs_mlocked",
  1187. [I(UNEVICTABLE_PGMUNLOCKED)] = "unevictable_pgs_munlocked",
  1188. [I(UNEVICTABLE_PGCLEARED)] = "unevictable_pgs_cleared",
  1189. [I(UNEVICTABLE_PGSTRANDED)] = "unevictable_pgs_stranded",
  1190. #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  1191. [I(THP_FAULT_ALLOC)] = "thp_fault_alloc",
  1192. [I(THP_FAULT_FALLBACK)] = "thp_fault_fallback",
  1193. [I(THP_FAULT_FALLBACK_CHARGE)] = "thp_fault_fallback_charge",
  1194. [I(THP_COLLAPSE_ALLOC)] = "thp_collapse_alloc",
  1195. [I(THP_COLLAPSE_ALLOC_FAILED)] = "thp_collapse_alloc_failed",
  1196. [I(THP_FILE_ALLOC)] = "thp_file_alloc",
  1197. [I(THP_FILE_FALLBACK)] = "thp_file_fallback",
  1198. [I(THP_FILE_FALLBACK_CHARGE)] = "thp_file_fallback_charge",
  1199. [I(THP_FILE_MAPPED)] = "thp_file_mapped",
  1200. [I(THP_SPLIT_PAGE)] = "thp_split_page",
  1201. [I(THP_SPLIT_PAGE_FAILED)] = "thp_split_page_failed",
  1202. [I(THP_DEFERRED_SPLIT_PAGE)] = "thp_deferred_split_page",
  1203. [I(THP_UNDERUSED_SPLIT_PAGE)] = "thp_underused_split_page",
  1204. [I(THP_SPLIT_PMD)] = "thp_split_pmd",
  1205. [I(THP_SCAN_EXCEED_NONE_PTE)] = "thp_scan_exceed_none_pte",
  1206. [I(THP_SCAN_EXCEED_SWAP_PTE)] = "thp_scan_exceed_swap_pte",
  1207. [I(THP_SCAN_EXCEED_SHARED_PTE)] = "thp_scan_exceed_share_pte",
  1208. #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
  1209. [I(THP_SPLIT_PUD)] = "thp_split_pud",
  1210. #endif
  1211. [I(THP_ZERO_PAGE_ALLOC)] = "thp_zero_page_alloc",
  1212. [I(THP_ZERO_PAGE_ALLOC_FAILED)] = "thp_zero_page_alloc_failed",
  1213. [I(THP_SWPOUT)] = "thp_swpout",
  1214. [I(THP_SWPOUT_FALLBACK)] = "thp_swpout_fallback",
  1215. #endif
  1216. #ifdef CONFIG_BALLOON
  1217. [I(BALLOON_INFLATE)] = "balloon_inflate",
  1218. [I(BALLOON_DEFLATE)] = "balloon_deflate",
  1219. #ifdef CONFIG_BALLOON_MIGRATION
  1220. [I(BALLOON_MIGRATE)] = "balloon_migrate",
  1221. #endif /* CONFIG_BALLOON_MIGRATION */
  1222. #endif /* CONFIG_BALLOON */
  1223. #ifdef CONFIG_DEBUG_TLBFLUSH
  1224. [I(NR_TLB_REMOTE_FLUSH)] = "nr_tlb_remote_flush",
  1225. [I(NR_TLB_REMOTE_FLUSH_RECEIVED)] = "nr_tlb_remote_flush_received",
  1226. [I(NR_TLB_LOCAL_FLUSH_ALL)] = "nr_tlb_local_flush_all",
  1227. [I(NR_TLB_LOCAL_FLUSH_ONE)] = "nr_tlb_local_flush_one",
  1228. #endif /* CONFIG_DEBUG_TLBFLUSH */
  1229. #ifdef CONFIG_SWAP
  1230. [I(SWAP_RA)] = "swap_ra",
  1231. [I(SWAP_RA_HIT)] = "swap_ra_hit",
  1232. [I(SWPIN_ZERO)] = "swpin_zero",
  1233. [I(SWPOUT_ZERO)] = "swpout_zero",
  1234. #ifdef CONFIG_KSM
  1235. [I(KSM_SWPIN_COPY)] = "ksm_swpin_copy",
  1236. #endif
  1237. #endif
  1238. #ifdef CONFIG_KSM
  1239. [I(COW_KSM)] = "cow_ksm",
  1240. #endif
  1241. #ifdef CONFIG_ZSWAP
  1242. [I(ZSWPIN)] = "zswpin",
  1243. [I(ZSWPOUT)] = "zswpout",
  1244. [I(ZSWPWB)] = "zswpwb",
  1245. #endif
  1246. #ifdef CONFIG_X86
  1247. [I(DIRECT_MAP_LEVEL2_SPLIT)] = "direct_map_level2_splits",
  1248. [I(DIRECT_MAP_LEVEL3_SPLIT)] = "direct_map_level3_splits",
  1249. [I(DIRECT_MAP_LEVEL2_COLLAPSE)] = "direct_map_level2_collapses",
  1250. [I(DIRECT_MAP_LEVEL3_COLLAPSE)] = "direct_map_level3_collapses",
  1251. #endif
  1252. #ifdef CONFIG_PER_VMA_LOCK_STATS
  1253. [I(VMA_LOCK_SUCCESS)] = "vma_lock_success",
  1254. [I(VMA_LOCK_ABORT)] = "vma_lock_abort",
  1255. [I(VMA_LOCK_RETRY)] = "vma_lock_retry",
  1256. [I(VMA_LOCK_MISS)] = "vma_lock_miss",
  1257. #endif
  1258. #ifdef CONFIG_DEBUG_STACK_USAGE
  1259. [I(KSTACK_1K)] = "kstack_1k",
  1260. #if THREAD_SIZE > 1024
  1261. [I(KSTACK_2K)] = "kstack_2k",
  1262. #endif
  1263. #if THREAD_SIZE > 2048
  1264. [I(KSTACK_4K)] = "kstack_4k",
  1265. #endif
  1266. #if THREAD_SIZE > 4096
  1267. [I(KSTACK_8K)] = "kstack_8k",
  1268. #endif
  1269. #if THREAD_SIZE > 8192
  1270. [I(KSTACK_16K)] = "kstack_16k",
  1271. #endif
  1272. #if THREAD_SIZE > 16384
  1273. [I(KSTACK_32K)] = "kstack_32k",
  1274. #endif
  1275. #if THREAD_SIZE > 32768
  1276. [I(KSTACK_64K)] = "kstack_64k",
  1277. #endif
  1278. #if THREAD_SIZE > 65536
  1279. [I(KSTACK_REST)] = "kstack_rest",
  1280. #endif
  1281. #endif
  1282. #undef I
  1283. #endif /* CONFIG_VM_EVENT_COUNTERS */
  1284. };
  1285. #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */
  1286. #if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \
  1287. defined(CONFIG_PROC_FS)
  1288. static void *frag_start(struct seq_file *m, loff_t *pos)
  1289. {
  1290. pg_data_t *pgdat;
  1291. loff_t node = *pos;
  1292. for (pgdat = first_online_pgdat();
  1293. pgdat && node;
  1294. pgdat = next_online_pgdat(pgdat))
  1295. --node;
  1296. return pgdat;
  1297. }
  1298. static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
  1299. {
  1300. pg_data_t *pgdat = (pg_data_t *)arg;
  1301. (*pos)++;
  1302. return next_online_pgdat(pgdat);
  1303. }
  /* seq_file ->stop: intentionally empty, there is nothing to do here. */
  static void frag_stop(struct seq_file *m, void *arg)
  {
  }
  1307. /*
  1308. * Walk zones in a node and print using a callback.
  1309. * If @assert_populated is true, only use callback for zones that are populated.
  1310. */
  1311. static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
  1312. bool assert_populated, bool nolock,
  1313. void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
  1314. {
  1315. struct zone *zone;
  1316. struct zone *node_zones = pgdat->node_zones;
  1317. unsigned long flags;
  1318. for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
  1319. if (assert_populated && !populated_zone(zone))
  1320. continue;
  1321. if (!nolock)
  1322. spin_lock_irqsave(&zone->lock, flags);
  1323. print(m, pgdat, zone);
  1324. if (!nolock)
  1325. spin_unlock_irqrestore(&zone->lock, flags);
  1326. }
  1327. }
  1328. #endif
  1329. #ifdef CONFIG_PROC_FS
  1330. static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
  1331. struct zone *zone)
  1332. {
  1333. int order;
  1334. seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
  1335. for (order = 0; order < NR_PAGE_ORDERS; ++order)
  1336. /*
  1337. * Access to nr_free is lockless as nr_free is used only for
  1338. * printing purposes. Use data_race to avoid KCSAN warning.
  1339. */
  1340. seq_printf(m, "%6lu ", data_race(zone->free_area[order].nr_free));
  1341. seq_putc(m, '\n');
  1342. }
  1343. /*
  1344. * This walks the free areas for each zone.
  1345. */
  1346. static int frag_show(struct seq_file *m, void *arg)
  1347. {
  1348. pg_data_t *pgdat = (pg_data_t *)arg;
  1349. walk_zones_in_node(m, pgdat, true, false, frag_show_print);
  1350. return 0;
  1351. }
  1352. static void pagetypeinfo_showfree_print(struct seq_file *m,
  1353. pg_data_t *pgdat, struct zone *zone)
  1354. {
  1355. int order, mtype;
  1356. for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) {
  1357. seq_printf(m, "Node %4d, zone %8s, type %12s ",
  1358. pgdat->node_id,
  1359. zone->name,
  1360. migratetype_names[mtype]);
  1361. for (order = 0; order < NR_PAGE_ORDERS; ++order) {
  1362. unsigned long freecount = 0;
  1363. struct free_area *area;
  1364. struct list_head *curr;
  1365. bool overflow = false;
  1366. area = &(zone->free_area[order]);
  1367. list_for_each(curr, &area->free_list[mtype]) {
  1368. /*
  1369. * Cap the free_list iteration because it might
  1370. * be really large and we are under a spinlock
  1371. * so a long time spent here could trigger a
  1372. * hard lockup detector. Anyway this is a
  1373. * debugging tool so knowing there is a handful
  1374. * of pages of this order should be more than
  1375. * sufficient.
  1376. */
  1377. if (++freecount >= 100000) {
  1378. overflow = true;
  1379. break;
  1380. }
  1381. }
  1382. seq_printf(m, "%s%6lu ", overflow ? ">" : "", freecount);
  1383. spin_unlock_irq(&zone->lock);
  1384. cond_resched();
  1385. spin_lock_irq(&zone->lock);
  1386. }
  1387. seq_putc(m, '\n');
  1388. }
  1389. }
  1390. /* Print out the free pages at each order for each migratetype */
  1391. static void pagetypeinfo_showfree(struct seq_file *m, void *arg)
  1392. {
  1393. int order;
  1394. pg_data_t *pgdat = (pg_data_t *)arg;
  1395. /* Print header */
  1396. seq_printf(m, "%-43s ", "Free pages count per migrate type at order");
  1397. for (order = 0; order < NR_PAGE_ORDERS; ++order)
  1398. seq_printf(m, "%6d ", order);
  1399. seq_putc(m, '\n');
  1400. walk_zones_in_node(m, pgdat, true, false, pagetypeinfo_showfree_print);
  1401. }
  1402. static void pagetypeinfo_showblockcount_print(struct seq_file *m,
  1403. pg_data_t *pgdat, struct zone *zone)
  1404. {
  1405. int mtype;
  1406. unsigned long pfn;
  1407. unsigned long start_pfn = zone->zone_start_pfn;
  1408. unsigned long end_pfn = zone_end_pfn(zone);
  1409. unsigned long count[MIGRATE_TYPES] = { 0, };
  1410. for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
  1411. struct page *page;
  1412. page = pfn_to_online_page(pfn);
  1413. if (!page)
  1414. continue;
  1415. if (page_zone(page) != zone)
  1416. continue;
  1417. mtype = get_pageblock_migratetype(page);
  1418. if (mtype < MIGRATE_TYPES)
  1419. count[mtype]++;
  1420. }
  1421. /* Print counts */
  1422. seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
  1423. for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
  1424. seq_printf(m, "%12lu ", count[mtype]);
  1425. seq_putc(m, '\n');
  1426. }
  1427. /* Print out the number of pageblocks for each migratetype */
  1428. static void pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
  1429. {
  1430. int mtype;
  1431. pg_data_t *pgdat = (pg_data_t *)arg;
  1432. seq_printf(m, "\n%-23s", "Number of blocks type ");
  1433. for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
  1434. seq_printf(m, "%12s ", migratetype_names[mtype]);
  1435. seq_putc(m, '\n');
  1436. walk_zones_in_node(m, pgdat, true, false,
  1437. pagetypeinfo_showblockcount_print);
  1438. }
  1439. /*
  1440. * Print out the number of pageblocks for each migratetype that contain pages
  1441. * of other types. This gives an indication of how well fallbacks are being
  1442. * contained by rmqueue_fallback(). It requires information from PAGE_OWNER
  1443. * to determine what is going on
  1444. */
  1445. static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat)
  1446. {
  1447. #ifdef CONFIG_PAGE_OWNER
  1448. int mtype;
  1449. if (!static_branch_unlikely(&page_owner_inited))
  1450. return;
  1451. drain_all_pages(NULL);
  1452. seq_printf(m, "\n%-23s", "Number of mixed blocks ");
  1453. for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
  1454. seq_printf(m, "%12s ", migratetype_names[mtype]);
  1455. seq_putc(m, '\n');
  1456. walk_zones_in_node(m, pgdat, true, true,
  1457. pagetypeinfo_showmixedcount_print);
  1458. #endif /* CONFIG_PAGE_OWNER */
  1459. }
  1460. /*
  1461. * This prints out statistics in relation to grouping pages by mobility.
  1462. * It is expensive to collect so do not constantly read the file.
  1463. */
  1464. static int pagetypeinfo_show(struct seq_file *m, void *arg)
  1465. {
  1466. pg_data_t *pgdat = (pg_data_t *)arg;
  1467. /* check memoryless node */
  1468. if (!node_state(pgdat->node_id, N_MEMORY))
  1469. return 0;
  1470. seq_printf(m, "Page block order: %d\n", pageblock_order);
  1471. seq_printf(m, "Pages per block: %lu\n", pageblock_nr_pages);
  1472. seq_putc(m, '\n');
  1473. pagetypeinfo_showfree(m, pgdat);
  1474. pagetypeinfo_showblockcount(m, pgdat);
  1475. pagetypeinfo_showmixedcount(m, pgdat);
  1476. return 0;
  1477. }
  1478. static const struct seq_operations fragmentation_op = {
  1479. .start = frag_start,
  1480. .next = frag_next,
  1481. .stop = frag_stop,
  1482. .show = frag_show,
  1483. };
  1484. static const struct seq_operations pagetypeinfo_op = {
  1485. .start = frag_start,
  1486. .next = frag_next,
  1487. .stop = frag_stop,
  1488. .show = pagetypeinfo_show,
  1489. };
  1490. static bool is_zone_first_populated(pg_data_t *pgdat, struct zone *zone)
  1491. {
  1492. int zid;
  1493. for (zid = 0; zid < MAX_NR_ZONES; zid++) {
  1494. struct zone *compare = &pgdat->node_zones[zid];
  1495. if (populated_zone(compare))
  1496. return zone == compare;
  1497. }
  1498. return false;
  1499. }
  1500. static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
  1501. struct zone *zone)
  1502. {
  1503. int i;
  1504. seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
  1505. if (is_zone_first_populated(pgdat, zone)) {
  1506. seq_printf(m, "\n per-node stats");
  1507. for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
  1508. unsigned long pages = node_page_state_pages(pgdat, i);
  1509. if (vmstat_item_print_in_thp(i))
  1510. pages /= HPAGE_PMD_NR;
  1511. seq_printf(m, "\n %-12s %lu", node_stat_name(i),
  1512. pages);
  1513. }
  1514. }
  1515. seq_printf(m,
  1516. "\n pages free %lu"
  1517. "\n boost %lu"
  1518. "\n min %lu"
  1519. "\n low %lu"
  1520. "\n high %lu"
  1521. "\n promo %lu"
  1522. "\n spanned %lu"
  1523. "\n present %lu"
  1524. "\n managed %lu"
  1525. "\n cma %lu",
  1526. zone_page_state(zone, NR_FREE_PAGES),
  1527. zone->watermark_boost,
  1528. min_wmark_pages(zone),
  1529. low_wmark_pages(zone),
  1530. high_wmark_pages(zone),
  1531. promo_wmark_pages(zone),
  1532. zone->spanned_pages,
  1533. zone->present_pages,
  1534. zone_managed_pages(zone),
  1535. zone_cma_pages(zone));
  1536. seq_printf(m,
  1537. "\n protection: (%ld",
  1538. zone->lowmem_reserve[0]);
  1539. for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
  1540. seq_printf(m, ", %ld", zone->lowmem_reserve[i]);
  1541. seq_putc(m, ')');
  1542. /* If unpopulated, no other information is useful */
  1543. if (!populated_zone(zone)) {
  1544. seq_putc(m, '\n');
  1545. return;
  1546. }
  1547. for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
  1548. seq_printf(m, "\n %-12s %lu", zone_stat_name(i),
  1549. zone_page_state(zone, i));
  1550. #ifdef CONFIG_NUMA
  1551. fold_vm_zone_numa_events(zone);
  1552. for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++)
  1553. seq_printf(m, "\n %-12s %lu", numa_stat_name(i),
  1554. zone_numa_event_state(zone, i));
  1555. #endif
  1556. seq_printf(m, "\n pagesets");
  1557. for_each_online_cpu(i) {
  1558. struct per_cpu_pages *pcp;
  1559. struct per_cpu_zonestat __maybe_unused *pzstats;
  1560. pcp = per_cpu_ptr(zone->per_cpu_pageset, i);
  1561. seq_printf(m,
  1562. "\n cpu: %i"
  1563. "\n count: %i"
  1564. "\n high: %i"
  1565. "\n batch: %i"
  1566. "\n high_min: %i"
  1567. "\n high_max: %i",
  1568. i,
  1569. pcp->count,
  1570. pcp->high,
  1571. pcp->batch,
  1572. pcp->high_min,
  1573. pcp->high_max);
  1574. #ifdef CONFIG_SMP
  1575. pzstats = per_cpu_ptr(zone->per_cpu_zonestats, i);
  1576. seq_printf(m, "\n vm stats threshold: %d",
  1577. pzstats->stat_threshold);
  1578. #endif
  1579. }
  1580. seq_printf(m,
  1581. "\n node_unreclaimable: %u"
  1582. "\n start_pfn: %lu"
  1583. "\n reserved_highatomic: %lu"
  1584. "\n free_highatomic: %lu",
  1585. kswapd_test_hopeless(pgdat),
  1586. zone->zone_start_pfn,
  1587. zone->nr_reserved_highatomic,
  1588. zone->nr_free_highatomic);
  1589. seq_putc(m, '\n');
  1590. }
  1591. /*
  1592. * Output information about zones in @pgdat. All zones are printed regardless
  1593. * of whether they are populated or not: lowmem_reserve_ratio operates on the
  1594. * set of all zones and userspace would not be aware of such zones if they are
  1595. * suppressed here (zoneinfo displays the effect of lowmem_reserve_ratio).
  1596. */
  1597. static int zoneinfo_show(struct seq_file *m, void *arg)
  1598. {
  1599. pg_data_t *pgdat = (pg_data_t *)arg;
  1600. walk_zones_in_node(m, pgdat, false, false, zoneinfo_show_print);
  1601. return 0;
  1602. }
  1603. static const struct seq_operations zoneinfo_op = {
  1604. .start = frag_start, /* iterate over all zones. The same as in
  1605. * fragmentation. */
  1606. .next = frag_next,
  1607. .stop = frag_stop,
  1608. .show = zoneinfo_show,
  1609. };
  /*
   * Number of values in the vmstat snapshot: zone stats, NUMA events, node
   * stats and the NR_VM_STAT_ITEMS extras, plus the VM event counters when
   * CONFIG_VM_EVENT_COUNTERS is enabled.
   */
  #define NR_VMSTAT_ITEMS \
      (NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS + \
       NR_VM_NODE_STAT_ITEMS + NR_VM_STAT_ITEMS + \
       (IS_ENABLED(CONFIG_VM_EVENT_COUNTERS) ? NR_VM_EVENT_ITEMS : 0))
  1616. static void *vmstat_start(struct seq_file *m, loff_t *pos)
  1617. {
  1618. unsigned long *v;
  1619. int i;
  1620. if (*pos >= NR_VMSTAT_ITEMS)
  1621. return NULL;
  1622. BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) != NR_VMSTAT_ITEMS);
  1623. fold_vm_numa_events();
  1624. v = kmalloc_array(NR_VMSTAT_ITEMS, sizeof(unsigned long), GFP_KERNEL);
  1625. m->private = v;
  1626. if (!v)
  1627. return ERR_PTR(-ENOMEM);
  1628. for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
  1629. v[i] = global_zone_page_state(i);
  1630. v += NR_VM_ZONE_STAT_ITEMS;
  1631. #ifdef CONFIG_NUMA
  1632. for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++)
  1633. v[i] = global_numa_event_state(i);
  1634. v += NR_VM_NUMA_EVENT_ITEMS;
  1635. #endif
  1636. for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
  1637. v[i] = global_node_page_state_pages(i);
  1638. if (vmstat_item_print_in_thp(i))
  1639. v[i] /= HPAGE_PMD_NR;
  1640. }
  1641. v += NR_VM_NODE_STAT_ITEMS;
  1642. global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
  1643. v + NR_DIRTY_THRESHOLD);
  1644. v[NR_MEMMAP_PAGES] = atomic_long_read(&nr_memmap_pages);
  1645. v[NR_MEMMAP_BOOT_PAGES] = atomic_long_read(&nr_memmap_boot_pages);
  1646. v += NR_VM_STAT_ITEMS;
  1647. #ifdef CONFIG_VM_EVENT_COUNTERS
  1648. all_vm_events(v);
  1649. v[PGPGIN] /= 2; /* sectors -> kbytes */
  1650. v[PGPGOUT] /= 2;
  1651. #endif
  1652. return (unsigned long *)m->private + *pos;
  1653. }
  1654. static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
  1655. {
  1656. (*pos)++;
  1657. if (*pos >= NR_VMSTAT_ITEMS)
  1658. return NULL;
  1659. return (unsigned long *)m->private + *pos;
  1660. }
  1661. static int vmstat_show(struct seq_file *m, void *arg)
  1662. {
  1663. unsigned long *l = arg;
  1664. unsigned long off = l - (unsigned long *)m->private;
  1665. seq_puts(m, vmstat_text[off]);
  1666. seq_put_decimal_ull(m, " ", *l);
  1667. seq_putc(m, '\n');
  1668. if (off == NR_VMSTAT_ITEMS - 1) {
  1669. /*
  1670. * We've come to the end - add any deprecated counters to avoid
  1671. * breaking userspace which might depend on them being present.
  1672. */
  1673. seq_puts(m, "nr_unstable 0\n");
  1674. }
  1675. return 0;
  1676. }
  1677. static void vmstat_stop(struct seq_file *m, void *arg)
  1678. {
  1679. kfree(m->private);
  1680. m->private = NULL;
  1681. }
  1682. static const struct seq_operations vmstat_op = {
  1683. .start = vmstat_start,
  1684. .next = vmstat_next,
  1685. .stop = vmstat_stop,
  1686. .show = vmstat_show,
  1687. };
  1688. #endif /* CONFIG_PROC_FS */
  1689. #ifdef CONFIG_SMP
  1690. static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
  1691. static int sysctl_stat_interval __read_mostly = HZ;
  1692. static int vmstat_late_init_done;
  1693. #ifdef CONFIG_PROC_FS
  1694. static void refresh_vm_stats(struct work_struct *work)
  1695. {
  1696. refresh_cpu_vm_stats(true);
  1697. }
  1698. static int vmstat_refresh(const struct ctl_table *table, int write,
  1699. void *buffer, size_t *lenp, loff_t *ppos)
  1700. {
  1701. long val;
  1702. int err;
  1703. int i;
  1704. /*
  1705. * The regular update, every sysctl_stat_interval, may come later
  1706. * than expected: leaving a significant amount in per_cpu buckets.
  1707. * This is particularly misleading when checking a quantity of HUGE
  1708. * pages, immediately after running a test. /proc/sys/vm/stat_refresh,
  1709. * which can equally be echo'ed to or cat'ted from (by root),
  1710. * can be used to update the stats just before reading them.
  1711. *
  1712. * Oh, and since global_zone_page_state() etc. are so careful to hide
  1713. * transiently negative values, report an error here if any of
  1714. * the stats is negative, so we know to go looking for imbalance.
  1715. */
  1716. err = schedule_on_each_cpu(refresh_vm_stats);
  1717. if (err)
  1718. return err;
  1719. for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
  1720. /*
  1721. * Skip checking stats known to go negative occasionally.
  1722. */
  1723. switch (i) {
  1724. case NR_ZONE_WRITE_PENDING:
  1725. case NR_FREE_CMA_PAGES:
  1726. continue;
  1727. }
  1728. val = atomic_long_read(&vm_zone_stat[i]);
  1729. if (val < 0) {
  1730. pr_warn("%s: %s %ld\n",
  1731. __func__, zone_stat_name(i), val);
  1732. }
  1733. }
  1734. for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
  1735. /*
  1736. * Skip checking stats known to go negative occasionally.
  1737. */
  1738. switch (i) {
  1739. case NR_WRITEBACK:
  1740. continue;
  1741. }
  1742. val = atomic_long_read(&vm_node_stat[i]);
  1743. if (val < 0) {
  1744. pr_warn("%s: %s %ld\n",
  1745. __func__, node_stat_name(i), val);
  1746. }
  1747. }
  1748. if (write)
  1749. *ppos += *lenp;
  1750. else
  1751. *lenp = 0;
  1752. return 0;
  1753. }
  1754. #endif /* CONFIG_PROC_FS */
  1755. static void vmstat_update(struct work_struct *w)
  1756. {
  1757. if (refresh_cpu_vm_stats(true)) {
  1758. /*
  1759. * Counters were updated so we expect more updates
  1760. * to occur in the future. Keep on running the
  1761. * update worker thread.
  1762. */
  1763. queue_delayed_work_on(smp_processor_id(), mm_percpu_wq,
  1764. this_cpu_ptr(&vmstat_work),
  1765. round_jiffies_relative(sysctl_stat_interval));
  1766. }
  1767. }
  1768. /*
  1769. * Check if the diffs for a certain cpu indicate that
  1770. * an update is needed.
  1771. */
  1772. static bool need_update(int cpu)
  1773. {
  1774. pg_data_t *last_pgdat = NULL;
  1775. struct zone *zone;
  1776. for_each_populated_zone(zone) {
  1777. struct per_cpu_zonestat *pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
  1778. struct per_cpu_nodestat *n;
  1779. /*
  1780. * The fast way of checking if there are any vmstat diffs.
  1781. */
  1782. if (memchr_inv(pzstats->vm_stat_diff, 0, sizeof(pzstats->vm_stat_diff)))
  1783. return true;
  1784. if (last_pgdat == zone->zone_pgdat)
  1785. continue;
  1786. last_pgdat = zone->zone_pgdat;
  1787. n = per_cpu_ptr(zone->zone_pgdat->per_cpu_nodestats, cpu);
  1788. if (memchr_inv(n->vm_node_stat_diff, 0, sizeof(n->vm_node_stat_diff)))
  1789. return true;
  1790. }
  1791. return false;
  1792. }
  1793. /*
  1794. * Switch off vmstat processing and then fold all the remaining differentials
  1795. * until the diffs stay at zero. The function is used by NOHZ and can only be
  1796. * invoked when tick processing is not active.
  1797. */
  1798. void quiet_vmstat(void)
  1799. {
  1800. if (system_state != SYSTEM_RUNNING)
  1801. return;
  1802. if (!delayed_work_pending(this_cpu_ptr(&vmstat_work)))
  1803. return;
  1804. if (!need_update(smp_processor_id()))
  1805. return;
  1806. /*
  1807. * Just refresh counters and do not care about the pending delayed
  1808. * vmstat_update. It doesn't fire that often to matter and canceling
  1809. * it would be too expensive from this path.
  1810. * vmstat_shepherd will take care about that for us.
  1811. */
  1812. refresh_cpu_vm_stats(false);
  1813. }
  /*
   * Shepherd worker: checks the differentials of processors whose worker
   * threads for vm statistics updates have been disabled because of
   * inactivity.
   */
  static void vmstat_shepherd(struct work_struct *w);

  static DECLARE_DEFERRABLE_WORK(shepherd, vmstat_shepherd);
  1822. void vmstat_flush_workqueue(void)
  1823. {
  1824. flush_workqueue(mm_percpu_wq);
  1825. }
  1826. static void vmstat_shepherd(struct work_struct *w)
  1827. {
  1828. int cpu;
  1829. cpus_read_lock();
  1830. /* Check processors whose vmstat worker threads have been disabled */
  1831. for_each_online_cpu(cpu) {
  1832. struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
  1833. /*
  1834. * In kernel users of vmstat counters either require the precise value and
  1835. * they are using zone_page_state_snapshot interface or they can live with
  1836. * an imprecision as the regular flushing can happen at arbitrary time and
  1837. * cumulative error can grow (see calculate_normal_threshold).
  1838. *
  1839. * From that POV the regular flushing can be postponed for CPUs that have
  1840. * been isolated from the kernel interference without critical
  1841. * infrastructure ever noticing. Skip regular flushing from vmstat_shepherd
  1842. * for all isolated CPUs to avoid interference with the isolated workload.
  1843. */
  1844. scoped_guard(rcu) {
  1845. if (cpu_is_isolated(cpu))
  1846. continue;
  1847. if (!delayed_work_pending(dw) && need_update(cpu))
  1848. queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
  1849. }
  1850. cond_resched();
  1851. }
  1852. cpus_read_unlock();
  1853. schedule_delayed_work(&shepherd,
  1854. round_jiffies_relative(sysctl_stat_interval));
  1855. }
  1856. static void __init start_shepherd_timer(void)
  1857. {
  1858. int cpu;
  1859. for_each_possible_cpu(cpu) {
  1860. INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
  1861. vmstat_update);
  1862. /*
  1863. * For secondary CPUs during CPU hotplug scenarios,
  1864. * vmstat_cpu_online() will enable the work.
  1865. * mm/vmstat:online enables and disables vmstat_work
  1866. * symmetrically during CPU hotplug events.
  1867. */
  1868. if (!cpu_online(cpu))
  1869. disable_delayed_work_sync(&per_cpu(vmstat_work, cpu));
  1870. }
  1871. schedule_delayed_work(&shepherd,
  1872. round_jiffies_relative(sysctl_stat_interval));
  1873. }
  1874. static void __init init_cpu_node_state(void)
  1875. {
  1876. int node;
  1877. for_each_online_node(node) {
  1878. if (!cpumask_empty(cpumask_of_node(node)))
  1879. node_set_state(node, N_CPU);
  1880. }
  1881. }
  1882. static int vmstat_cpu_online(unsigned int cpu)
  1883. {
  1884. if (vmstat_late_init_done)
  1885. refresh_zone_stat_thresholds();
  1886. if (!node_state(cpu_to_node(cpu), N_CPU)) {
  1887. node_set_state(cpu_to_node(cpu), N_CPU);
  1888. }
  1889. enable_delayed_work(&per_cpu(vmstat_work, cpu));
  1890. return 0;
  1891. }
  1892. static int vmstat_cpu_down_prep(unsigned int cpu)
  1893. {
  1894. disable_delayed_work_sync(&per_cpu(vmstat_work, cpu));
  1895. return 0;
  1896. }
  1897. static int vmstat_cpu_dead(unsigned int cpu)
  1898. {
  1899. const struct cpumask *node_cpus;
  1900. int node;
  1901. node = cpu_to_node(cpu);
  1902. refresh_zone_stat_thresholds();
  1903. node_cpus = cpumask_of_node(node);
  1904. if (!cpumask_empty(node_cpus))
  1905. return 0;
  1906. node_clear_state(node, N_CPU);
  1907. return 0;
  1908. }
  1909. static int __init vmstat_late_init(void)
  1910. {
  1911. refresh_zone_stat_thresholds();
  1912. vmstat_late_init_done = 1;
  1913. return 0;
  1914. }
  1915. late_initcall(vmstat_late_init);
  1916. #endif
  1917. #ifdef CONFIG_PROC_FS
  1918. static const struct ctl_table vmstat_table[] = {
  1919. #ifdef CONFIG_SMP
  1920. {
  1921. .procname = "stat_interval",
  1922. .data = &sysctl_stat_interval,
  1923. .maxlen = sizeof(sysctl_stat_interval),
  1924. .mode = 0644,
  1925. .proc_handler = proc_dointvec_jiffies,
  1926. },
  1927. {
  1928. .procname = "stat_refresh",
  1929. .data = NULL,
  1930. .maxlen = 0,
  1931. .mode = 0600,
  1932. .proc_handler = vmstat_refresh,
  1933. },
  1934. #endif
  1935. #ifdef CONFIG_NUMA
  1936. {
  1937. .procname = "numa_stat",
  1938. .data = &sysctl_vm_numa_stat,
  1939. .maxlen = sizeof(int),
  1940. .mode = 0644,
  1941. .proc_handler = sysctl_vm_numa_stat_handler,
  1942. .extra1 = SYSCTL_ZERO,
  1943. .extra2 = SYSCTL_ONE,
  1944. },
  1945. #endif
  1946. };
  1947. #endif
  1948. struct workqueue_struct *mm_percpu_wq;
  1949. void __init init_mm_internals(void)
  1950. {
  1951. int ret __maybe_unused;
  1952. mm_percpu_wq = alloc_workqueue("mm_percpu_wq",
  1953. WQ_MEM_RECLAIM | WQ_PERCPU, 0);
  1954. #ifdef CONFIG_SMP
  1955. ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead",
  1956. NULL, vmstat_cpu_dead);
  1957. if (ret < 0)
  1958. pr_err("vmstat: failed to register 'dead' hotplug state\n");
  1959. ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "mm/vmstat:online",
  1960. vmstat_cpu_online,
  1961. vmstat_cpu_down_prep);
  1962. if (ret < 0)
  1963. pr_err("vmstat: failed to register 'online' hotplug state\n");
  1964. cpus_read_lock();
  1965. init_cpu_node_state();
  1966. cpus_read_unlock();
  1967. start_shepherd_timer();
  1968. #endif
  1969. #ifdef CONFIG_PROC_FS
  1970. proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
  1971. proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);
  1972. proc_create_seq("vmstat", 0444, NULL, &vmstat_op);
  1973. proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op);
  1974. register_sysctl_init("vm", vmstat_table);
  1975. #endif
  1976. }
  1977. #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
  1978. /*
  1979. * Return an index indicating how much of the available free memory is
  1980. * unusable for an allocation of the requested size.
  1981. */
  1982. static int unusable_free_index(unsigned int order,
  1983. struct contig_page_info *info)
  1984. {
  1985. /* No free memory is interpreted as all free memory is unusable */
  1986. if (info->free_pages == 0)
  1987. return 1000;
  1988. /*
  1989. * Index should be a value between 0 and 1. Return a value to 3
  1990. * decimal places.
  1991. *
  1992. * 0 => no fragmentation
  1993. * 1 => high fragmentation
  1994. */
  1995. return div_u64((info->free_pages - (info->free_blocks_suitable << order)) * 1000ULL, info->free_pages);
  1996. }
  1997. static void unusable_show_print(struct seq_file *m,
  1998. pg_data_t *pgdat, struct zone *zone)
  1999. {
  2000. unsigned int order;
  2001. int index;
  2002. struct contig_page_info info;
  2003. seq_printf(m, "Node %d, zone %8s ",
  2004. pgdat->node_id,
  2005. zone->name);
  2006. for (order = 0; order < NR_PAGE_ORDERS; ++order) {
  2007. fill_contig_page_info(zone, order, &info);
  2008. index = unusable_free_index(order, &info);
  2009. seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
  2010. }
  2011. seq_putc(m, '\n');
  2012. }
  2013. /*
  2014. * Display unusable free space index
  2015. *
  2016. * The unusable free space index measures how much of the available free
  2017. * memory cannot be used to satisfy an allocation of a given size and is a
  2018. * value between 0 and 1. The higher the value, the more of free memory is
  2019. * unusable and by implication, the worse the external fragmentation is. This
  2020. * can be expressed as a percentage by multiplying by 100.
  2021. */
  2022. static int unusable_show(struct seq_file *m, void *arg)
  2023. {
  2024. pg_data_t *pgdat = (pg_data_t *)arg;
  2025. /* check memoryless node */
  2026. if (!node_state(pgdat->node_id, N_MEMORY))
  2027. return 0;
  2028. walk_zones_in_node(m, pgdat, true, false, unusable_show_print);
  2029. return 0;
  2030. }
  2031. static const struct seq_operations unusable_sops = {
  2032. .start = frag_start,
  2033. .next = frag_next,
  2034. .stop = frag_stop,
  2035. .show = unusable_show,
  2036. };
  2037. DEFINE_SEQ_ATTRIBUTE(unusable);
  2038. static void extfrag_show_print(struct seq_file *m,
  2039. pg_data_t *pgdat, struct zone *zone)
  2040. {
  2041. unsigned int order;
  2042. int index;
  2043. /* Alloc on stack as interrupts are disabled for zone walk */
  2044. struct contig_page_info info;
  2045. seq_printf(m, "Node %d, zone %8s ",
  2046. pgdat->node_id,
  2047. zone->name);
  2048. for (order = 0; order < NR_PAGE_ORDERS; ++order) {
  2049. fill_contig_page_info(zone, order, &info);
  2050. index = __fragmentation_index(order, &info);
  2051. seq_printf(m, "%2d.%03d ", index / 1000, index % 1000);
  2052. }
  2053. seq_putc(m, '\n');
  2054. }
  2055. /*
  2056. * Display fragmentation index for orders that allocations would fail for
  2057. */
  2058. static int extfrag_show(struct seq_file *m, void *arg)
  2059. {
  2060. pg_data_t *pgdat = (pg_data_t *)arg;
  2061. walk_zones_in_node(m, pgdat, true, false, extfrag_show_print);
  2062. return 0;
  2063. }
  2064. static const struct seq_operations extfrag_sops = {
  2065. .start = frag_start,
  2066. .next = frag_next,
  2067. .stop = frag_stop,
  2068. .show = extfrag_show,
  2069. };
  2070. DEFINE_SEQ_ATTRIBUTE(extfrag);
  2071. static int __init extfrag_debug_init(void)
  2072. {
  2073. struct dentry *extfrag_debug_root;
  2074. extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
  2075. debugfs_create_file("unusable_index", 0444, extfrag_debug_root, NULL,
  2076. &unusable_fops);
  2077. debugfs_create_file("extfrag_index", 0444, extfrag_debug_root, NULL,
  2078. &extfrag_fops);
  2079. return 0;
  2080. }
  2081. module_init(extfrag_debug_init);
  2082. #endif