slab_common.c 59 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Slab allocator functions that are independent of the allocator strategy
  4. *
  5. * (C) 2012 Christoph Lameter <cl@gentwo.org>
  6. */
  7. #include <linux/slab.h>
  8. #include <linux/mm.h>
  9. #include <linux/poison.h>
  10. #include <linux/interrupt.h>
  11. #include <linux/memory.h>
  12. #include <linux/cache.h>
  13. #include <linux/compiler.h>
  14. #include <linux/kfence.h>
  15. #include <linux/module.h>
  16. #include <linux/cpu.h>
  17. #include <linux/uaccess.h>
  18. #include <linux/seq_file.h>
  19. #include <linux/dma-mapping.h>
  20. #include <linux/swiotlb.h>
  21. #include <linux/proc_fs.h>
  22. #include <linux/debugfs.h>
  23. #include <linux/kmemleak.h>
  24. #include <linux/kasan.h>
  25. #include <asm/cacheflush.h>
  26. #include <asm/tlbflush.h>
  27. #include <asm/page.h>
  28. #include <linux/memcontrol.h>
  29. #include <linux/stackdepot.h>
  30. #include <trace/events/rcu.h>
  31. #include "../kernel/rcu/rcu.h"
  32. #include "internal.h"
  33. #include "slab.h"
  34. #define CREATE_TRACE_POINTS
  35. #include <trace/events/kmem.h>
/* Bring-up stage of the slab allocator; compared against UP and FULL below. */
enum slab_state slab_state;
/* List that caches are linked onto (through their ->list member) once created. */
LIST_HEAD(slab_caches);
/* Taken in this file around cache creation, aliasing and destruction. */
DEFINE_MUTEX(slab_mutex);
/* The cache that struct kmem_cache descriptors themselves are allocated from. */
struct kmem_cache *kmem_cache;
/*
 * Set of flags that will prevent slab merging.
 * Any flag that adds per-object metadata should be included,
 * since slab merging can update s->inuse that affects the metadata layout.
 */
#define SLAB_NEVER_MERGE (SLAB_DEBUG_FLAGS | SLAB_TYPESAFE_BY_RCU | \
		SLAB_NOLEAKTRACE | SLAB_FAILSLAB | SLAB_NO_MERGE | \
		SLAB_OBJ_EXT_IN_OBJ)

/*
 * Flags whose settings must be identical in the requested cache and in an
 * existing cache before find_mergeable() treats the two as compatible.
 */
#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
		SLAB_CACHE_DMA32 | SLAB_ACCOUNT)
  50. /*
  51. * Merge control. If this is set then no merging of slab caches will occur.
  52. */
  53. static bool slab_nomerge = !IS_ENABLED(CONFIG_SLAB_MERGE_DEFAULT);
  54. static int __init setup_slab_nomerge(char *str)
  55. {
  56. slab_nomerge = true;
  57. return 1;
  58. }
  59. static int __init setup_slab_merge(char *str)
  60. {
  61. slab_nomerge = false;
  62. return 1;
  63. }
  64. __setup_param("slub_nomerge", slub_nomerge, setup_slab_nomerge, 0);
  65. __setup_param("slub_merge", slub_merge, setup_slab_merge, 0);
  66. __setup("slab_nomerge", setup_slab_nomerge);
  67. __setup("slab_merge", setup_slab_merge);
  68. /*
  69. * Determine the size of a slab object
  70. */
  71. unsigned int kmem_cache_size(struct kmem_cache *s)
  72. {
  73. return s->object_size;
  74. }
  75. EXPORT_SYMBOL(kmem_cache_size);
  76. #ifdef CONFIG_DEBUG_VM
  77. static bool kmem_cache_is_duplicate_name(const char *name)
  78. {
  79. struct kmem_cache *s;
  80. list_for_each_entry(s, &slab_caches, list) {
  81. if (!strcmp(s->name, name))
  82. return true;
  83. }
  84. return false;
  85. }
  86. static int kmem_cache_sanity_check(const char *name, unsigned int size)
  87. {
  88. if (!name || in_interrupt() || size > KMALLOC_MAX_SIZE) {
  89. pr_err("kmem_cache_create(%s) integrity check failed\n", name);
  90. return -EINVAL;
  91. }
  92. /* Duplicate names will confuse slabtop, et al */
  93. WARN(kmem_cache_is_duplicate_name(name),
  94. "kmem_cache of name '%s' already exists\n", name);
  95. WARN_ON(strchr(name, ' ')); /* It confuses parsers */
  96. return 0;
  97. }
  98. #else
  99. static inline int kmem_cache_sanity_check(const char *name, unsigned int size)
  100. {
  101. return 0;
  102. }
  103. #endif
  104. /*
  105. * Figure out what the alignment of the objects will be given a set of
  106. * flags, a user specified alignment and the size of the objects.
  107. */
  108. static unsigned int calculate_alignment(slab_flags_t flags,
  109. unsigned int align, unsigned int size)
  110. {
  111. /*
  112. * If the user wants hardware cache aligned objects then follow that
  113. * suggestion if the object is sufficiently large.
  114. *
  115. * The hardware cache alignment cannot override the specified
  116. * alignment though. If that is greater then use it.
  117. */
  118. if (flags & SLAB_HWCACHE_ALIGN) {
  119. unsigned int ralign;
  120. ralign = cache_line_size();
  121. while (size <= ralign / 2)
  122. ralign /= 2;
  123. align = max(align, ralign);
  124. }
  125. align = max(align, arch_slab_minalign());
  126. return ALIGN(align, sizeof(void *));
  127. }
  128. /*
  129. * Find a mergeable slab cache
  130. */
  131. int slab_unmergeable(struct kmem_cache *s)
  132. {
  133. if (slab_nomerge || (s->flags & SLAB_NEVER_MERGE))
  134. return 1;
  135. if (s->ctor)
  136. return 1;
  137. #ifdef CONFIG_HARDENED_USERCOPY
  138. if (s->usersize)
  139. return 1;
  140. #endif
  141. /*
  142. * We may have set a slab to be unmergeable during bootstrap.
  143. */
  144. if (s->refcount < 0)
  145. return 1;
  146. return 0;
  147. }
  148. bool slab_args_unmergeable(struct kmem_cache_args *args, slab_flags_t flags)
  149. {
  150. if (slab_nomerge)
  151. return true;
  152. if (args->ctor)
  153. return true;
  154. if (IS_ENABLED(CONFIG_HARDENED_USERCOPY) && args->usersize)
  155. return true;
  156. if (flags & SLAB_NEVER_MERGE)
  157. return true;
  158. return false;
  159. }
  160. static struct kmem_cache *find_mergeable(unsigned int size, slab_flags_t flags,
  161. const char *name, struct kmem_cache_args *args)
  162. {
  163. struct kmem_cache *s;
  164. unsigned int align;
  165. flags = kmem_cache_flags(flags, name);
  166. if (slab_args_unmergeable(args, flags))
  167. return NULL;
  168. size = ALIGN(size, sizeof(void *));
  169. align = calculate_alignment(flags, args->align, size);
  170. size = ALIGN(size, align);
  171. list_for_each_entry_reverse(s, &slab_caches, list) {
  172. if (slab_unmergeable(s))
  173. continue;
  174. if (size > s->size)
  175. continue;
  176. if ((flags & SLAB_MERGE_SAME) != (s->flags & SLAB_MERGE_SAME))
  177. continue;
  178. /*
  179. * Check if alignment is compatible.
  180. * Courtesy of Adrian Drzewiecki
  181. */
  182. if ((s->size & ~(align - 1)) != s->size)
  183. continue;
  184. if (s->size - size >= sizeof(void *))
  185. continue;
  186. return s;
  187. }
  188. return NULL;
  189. }
  190. static struct kmem_cache *create_cache(const char *name,
  191. unsigned int object_size,
  192. struct kmem_cache_args *args,
  193. slab_flags_t flags)
  194. {
  195. struct kmem_cache *s;
  196. int err;
  197. /* If a custom freelist pointer is requested make sure it's sane. */
  198. err = -EINVAL;
  199. if (args->use_freeptr_offset &&
  200. (args->freeptr_offset >= object_size ||
  201. (!(flags & SLAB_TYPESAFE_BY_RCU) && !args->ctor) ||
  202. !IS_ALIGNED(args->freeptr_offset, __alignof__(freeptr_t))))
  203. goto out;
  204. err = -ENOMEM;
  205. s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
  206. if (!s)
  207. goto out;
  208. err = do_kmem_cache_create(s, name, object_size, args, flags);
  209. if (err)
  210. goto out_free_cache;
  211. s->refcount = 1;
  212. list_add(&s->list, &slab_caches);
  213. return s;
  214. out_free_cache:
  215. kmem_cache_free(kmem_cache, s);
  216. out:
  217. return ERR_PTR(err);
  218. }
  219. static struct kmem_cache *
  220. __kmem_cache_alias(const char *name, unsigned int size, slab_flags_t flags,
  221. struct kmem_cache_args *args)
  222. {
  223. struct kmem_cache *s;
  224. s = find_mergeable(size, flags, name, args);
  225. if (s) {
  226. if (sysfs_slab_alias(s, name))
  227. pr_err("SLUB: Unable to add cache alias %s to sysfs\n",
  228. name);
  229. s->refcount++;
  230. /*
  231. * Adjust the object sizes so that we clear
  232. * the complete object on kzalloc.
  233. */
  234. s->object_size = max(s->object_size, size);
  235. s->inuse = max(s->inuse, ALIGN(size, sizeof(void *)));
  236. }
  237. return s;
  238. }
/**
 * __kmem_cache_create_args - Create a kmem cache.
 * @name: A string which is used in /proc/slabinfo to identify this cache.
 * @object_size: The size of objects to be created in this cache.
 * @args: Additional arguments for the cache creation (see
 *        &struct kmem_cache_args). May be modified: the usercopy fields are
 *        zeroed when they are inconsistent and @args->align is replaced by
 *        the computed alignment when a new cache is created.
 * @flags: See the descriptions of individual flags. The common ones are listed
 *         in the description below.
 *
 * Not to be called directly, use the kmem_cache_create() wrapper with the same
 * parameters.
 *
 * Commonly used @flags:
 *
 * &SLAB_ACCOUNT - Account allocations to memcg.
 *
 * &SLAB_HWCACHE_ALIGN - Align objects on cache line boundaries.
 *
 * &SLAB_RECLAIM_ACCOUNT - Objects are reclaimable.
 *
 * &SLAB_TYPESAFE_BY_RCU - Slab page (not individual objects) freeing delayed
 * by a grace period - see the full description before using.
 *
 * Context: Cannot be called within an interrupt, but can be interrupted.
 *
 * Return: a pointer to the cache on success, NULL on failure. With
 * &SLAB_PANIC set in @flags a failure panics instead of returning.
 */
struct kmem_cache *__kmem_cache_create_args(const char *name,
					    unsigned int object_size,
					    struct kmem_cache_args *args,
					    slab_flags_t flags)
{
	struct kmem_cache *s = NULL;
	const char *cache_name;
	int err;

#ifdef CONFIG_SLUB_DEBUG
	/*
	 * If no slab_debug was enabled globally, the static key is not yet
	 * enabled by setup_slub_debug(). Enable it if the cache is being
	 * created with any of the debugging flags passed explicitly.
	 * It's also possible that this is the first cache created with
	 * SLAB_STORE_USER and we should init stack_depot for it.
	 */
	if (flags & SLAB_DEBUG_FLAGS)
		static_branch_enable(&slub_debug_enabled);
	if (flags & SLAB_STORE_USER)
		stack_depot_init();
#else
	/* Debug flags are silently dropped when debugging is not built in. */
	flags &= ~SLAB_DEBUG_FLAGS;
#endif

	/*
	 * Caches with specific capacity are special enough. It's simpler to
	 * make them unmergeable.
	 */
	if (args->sheaf_capacity)
		flags |= SLAB_NO_MERGE;

	mutex_lock(&slab_mutex);

	/* First assignment of err; it stays 0 unless a later step fails. */
	err = kmem_cache_sanity_check(name, object_size);
	if (err) {
		goto out_unlock;
	}

	if (flags & ~SLAB_FLAGS_PERMITTED) {
		err = -EINVAL;
		goto out_unlock;
	}

	/* Fail closed on bad usersize or useroffset values. */
	if (!IS_ENABLED(CONFIG_HARDENED_USERCOPY) ||
	    WARN_ON(!args->usersize && args->useroffset) ||
	    WARN_ON(object_size < args->usersize ||
		    object_size - args->usersize < args->useroffset))
		args->usersize = args->useroffset = 0;

	/* Prefer sharing an existing compatible cache; err is still 0 here. */
	s = __kmem_cache_alias(name, object_size, flags, args);
	if (s)
		goto out_unlock;

	/* The new cache keeps its own copy of the name. */
	cache_name = kstrdup_const(name, GFP_KERNEL);
	if (!cache_name) {
		err = -ENOMEM;
		goto out_unlock;
	}

	args->align = calculate_alignment(flags, args->align, object_size);
	s = create_cache(cache_name, object_size, args, flags);
	if (IS_ERR(s)) {
		err = PTR_ERR(s);
		/* The name copy was not taken over by a cache; drop it. */
		kfree_const(cache_name);
	}

out_unlock:
	mutex_unlock(&slab_mutex);

	if (err) {
		if (flags & SLAB_PANIC)
			panic("%s: Failed to create slab '%s'. Error %d\n",
			      __func__, name, err);
		else {
			pr_warn("%s(%s) failed with error %d\n",
				__func__, name, err);
			dump_stack();
		}
		/* s may hold an ERR_PTR() here; callers only ever see NULL. */
		return NULL;
	}
	return s;
}
EXPORT_SYMBOL(__kmem_cache_create_args);
/*
 * Cache that kmem_buckets arrays are allocated from. It is not assigned in
 * this part of the file; kmem_buckets_create() warns and fails while it is
 * still NULL.
 */
static struct kmem_cache *kmem_buckets_cache __ro_after_init;

/**
 * kmem_buckets_create - Create a set of caches that handle dynamic sized
 *			 allocations via kmem_buckets_alloc()
 * @name: A prefix string which is used in /proc/slabinfo to identify this
 *	  cache. The individual caches will have their sizes as the suffix.
 * @flags: SLAB flags (see kmem_cache_create() for details).
 * @useroffset: Starting offset within an allocation that may be copied
 *		to/from userspace.
 * @usersize: How many bytes, starting at @useroffset, may be copied
 *		to/from userspace.
 * @ctor: A constructor for the objects, run when new allocations are made.
 *
 * Cannot be called within an interrupt, but can be interrupted.
 *
 * Return: a pointer to the cache on success, NULL on failure. When
 * CONFIG_SLAB_BUCKETS is not enabled, ZERO_SIZE_PTR is returned, and
 * subsequent calls to kmem_buckets_alloc() will fall back to kmalloc().
 * (i.e. callers only need to check for NULL on failure.)
 */
kmem_buckets *kmem_buckets_create(const char *name, slab_flags_t flags,
				  unsigned int useroffset,
				  unsigned int usersize,
				  void (*ctor)(void *))
{
	/* One bit per bucket index for which this call created a cache. */
	unsigned long mask = 0;
	unsigned int idx;
	kmem_buckets *b;

	/* Every bucket index must be representable as a bit in mask. */
	BUILD_BUG_ON(ARRAY_SIZE(kmalloc_caches[KMALLOC_NORMAL]) > BITS_PER_LONG);

	/*
	 * When the separate buckets API is not built in, just return
	 * a non-NULL value for the kmem_buckets pointer, which will be
	 * unused when performing allocations.
	 */
	if (!IS_ENABLED(CONFIG_SLAB_BUCKETS))
		return ZERO_SIZE_PTR;

	if (WARN_ON(!kmem_buckets_cache))
		return NULL;

	/* Zeroed, so a NULL slot means "no cache created for it yet". */
	b = kmem_cache_alloc(kmem_buckets_cache, GFP_KERNEL|__GFP_ZERO);
	if (WARN_ON(!b))
		return NULL;

	/* The per-bucket caches are never merged with other caches. */
	flags |= SLAB_NO_MERGE;

	for (idx = 0; idx < ARRAY_SIZE(kmalloc_caches[KMALLOC_NORMAL]); idx++) {
		char *short_size, *cache_name;
		unsigned int cache_useroffset, cache_usersize;
		unsigned int size, aligned_idx;

		/* Mirror only the slots populated in the normal kmalloc set. */
		if (!kmalloc_caches[KMALLOC_NORMAL][idx])
			continue;

		size = kmalloc_caches[KMALLOC_NORMAL][idx]->object_size;
		if (!size)
			continue;

		/* The text after the first '-' of the name becomes the suffix. */
		short_size = strchr(kmalloc_caches[KMALLOC_NORMAL][idx]->name, '-');
		if (WARN_ON(!short_size))
			goto fail;

		/* Clamp the usercopy window so it fits inside this bucket. */
		if (useroffset >= size) {
			cache_useroffset = 0;
			cache_usersize = 0;
		} else {
			cache_useroffset = useroffset;
			cache_usersize = min(size - cache_useroffset, usersize);
		}

		/*
		 * Index derived from the object size. It can differ from idx,
		 * in which case both slots end up pointing at one cache.
		 */
		aligned_idx = __kmalloc_index(size, false);
		if (!(*b)[aligned_idx]) {
			cache_name = kasprintf(GFP_KERNEL, "%s-%s", name, short_size + 1);
			if (WARN_ON(!cache_name))
				goto fail;
			(*b)[aligned_idx] = kmem_cache_create_usercopy(cache_name, size,
					0, flags, cache_useroffset,
					cache_usersize, ctor);
			/*
			 * NOTE(review): freed unconditionally, so the created
			 * cache presumably holds its own copy of the name.
			 */
			kfree(cache_name);
			if (WARN_ON(!(*b)[aligned_idx]))
				goto fail;
			set_bit(aligned_idx, &mask);
		}
		if (idx != aligned_idx)
			(*b)[idx] = (*b)[aligned_idx];
	}

	return b;

fail:
	/* Destroy each created cache exactly once; aliased slots are not in mask. */
	for_each_set_bit(idx, &mask, ARRAY_SIZE(kmalloc_caches[KMALLOC_NORMAL]))
		kmem_cache_destroy((*b)[idx]);
	kmem_cache_free(kmem_buckets_cache, b);

	return NULL;
}
EXPORT_SYMBOL(kmem_buckets_create);
  425. /*
  426. * For a given kmem_cache, kmem_cache_destroy() should only be called
  427. * once or there will be a use-after-free problem. The actual deletion
  428. * and release of the kobject does not need slab_mutex or cpu_hotplug_lock
  429. * protection. So they are now done without holding those locks.
  430. */
  431. static void kmem_cache_release(struct kmem_cache *s)
  432. {
  433. kfence_shutdown_cache(s);
  434. if (__is_defined(SLAB_SUPPORTS_SYSFS) && slab_state >= FULL)
  435. sysfs_slab_release(s);
  436. else
  437. slab_kmem_cache_release(s);
  438. }
  439. void slab_kmem_cache_release(struct kmem_cache *s)
  440. {
  441. __kmem_cache_release(s);
  442. kfree_const(s->name);
  443. kmem_cache_free(kmem_cache, s);
  444. }
/*
 * kmem_cache_destroy - drop one reference to a cache and tear the cache
 * down when that was the last reference.
 * @s: cache to destroy. NULL is ignored, as is a pointer for which
 *     kasan_check_byte() fails.
 *
 * A cache shared through aliasing only has its refcount lowered. When the
 * shutdown reports an error (the warning text says objects are still
 * present) the cache is unlinked from the list, sysfs and debugfs but its
 * memory is not released.
 */
void kmem_cache_destroy(struct kmem_cache *s)
{
	int err;

	if (unlikely(!s) || !kasan_check_byte(s))
		return;

	/* in-flight kfree_rcu()'s may include objects from our cache */
	kvfree_rcu_barrier_on_cache(s);

	if (IS_ENABLED(CONFIG_SLUB_RCU_DEBUG) &&
	    (s->flags & SLAB_TYPESAFE_BY_RCU)) {
		/*
		 * Under CONFIG_SLUB_RCU_DEBUG, when objects in a
		 * SLAB_TYPESAFE_BY_RCU slab are freed, SLUB will internally
		 * defer their freeing with call_rcu().
		 * Wait for such call_rcu() invocations here before actually
		 * destroying the cache.
		 *
		 * It doesn't matter that we haven't looked at the slab refcount
		 * yet - slabs with SLAB_TYPESAFE_BY_RCU can't be merged, so
		 * the refcount should be 1 here.
		 */
		rcu_barrier();
	}

	/* Wait for deferred work from kmalloc/kfree_nolock() */
	defer_free_barrier();

	cpus_read_lock();
	mutex_lock(&slab_mutex);

	/* Other users (aliases) remain: only the reference is dropped. */
	s->refcount--;
	if (s->refcount) {
		mutex_unlock(&slab_mutex);
		cpus_read_unlock();
		return;
	}

	/* free asan quarantined objects */
	kasan_cache_shutdown(s);

	err = __kmem_cache_shutdown(s);
	if (!slab_in_kunit_test())
		WARN(err, "%s %s: Slab cache still has objects when called from %pS",
		     __func__, s->name, (void *)_RET_IP_);

	/* Unlinked from slab_caches even when the shutdown failed. */
	list_del(&s->list);

	mutex_unlock(&slab_mutex);
	cpus_read_unlock();

	/* From here on neither slab_mutex nor the CPU hotplug lock is held. */
	if (slab_state >= FULL)
		sysfs_slab_unlink(s);
	debugfs_slab_release(s);

	/* Shutdown failed: the cache memory is deliberately not released. */
	if (err)
		return;

	/*
	 * NOTE(review): presumably waits for RCU-deferred slab freeing to
	 * finish before the descriptor goes away - confirm.
	 */
	if (s->flags & SLAB_TYPESAFE_BY_RCU)
		rcu_barrier();

	kmem_cache_release(s);
}
EXPORT_SYMBOL(kmem_cache_destroy);
  496. /**
  497. * kmem_cache_shrink - Shrink a cache.
  498. * @cachep: The cache to shrink.
  499. *
  500. * Releases as many slabs as possible for a cache.
  501. * To help debugging, a zero exit status indicates all slabs were released.
  502. *
  503. * Return: %0 if all slabs were released, non-zero otherwise
  504. */
  505. int kmem_cache_shrink(struct kmem_cache *cachep)
  506. {
  507. kasan_cache_shrink(cachep);
  508. return __kmem_cache_shrink(cachep);
  509. }
  510. EXPORT_SYMBOL(kmem_cache_shrink);
  511. bool slab_is_available(void)
  512. {
  513. return slab_state >= UP;
  514. }
  515. #ifdef CONFIG_PRINTK
  516. static void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
  517. {
  518. if (__kfence_obj_info(kpp, object, slab))
  519. return;
  520. __kmem_obj_info(kpp, object, slab);
  521. }
  522. /**
  523. * kmem_dump_obj - Print available slab provenance information
  524. * @object: slab object for which to find provenance information.
  525. *
  526. * This function uses pr_cont(), so that the caller is expected to have
  527. * printed out whatever preamble is appropriate. The provenance information
  528. * depends on the type of object and on how much debugging is enabled.
  529. * For a slab-cache object, the fact that it is a slab object is printed,
  530. * and, if available, the slab name, return address, and stack trace from
  531. * the allocation and last free path of that object.
  532. *
  533. * Return: %true if the pointer is to a not-yet-freed object from
  534. * kmalloc() or kmem_cache_alloc(), either %true or %false if the pointer
  535. * is to an already-freed object, and %false otherwise.
  536. */
  537. bool kmem_dump_obj(void *object)
  538. {
  539. char *cp = IS_ENABLED(CONFIG_MMU) ? "" : "/vmalloc";
  540. int i;
  541. struct slab *slab;
  542. unsigned long ptroffset;
  543. struct kmem_obj_info kp = { };
  544. /* Some arches consider ZERO_SIZE_PTR to be a valid address. */
  545. if (object < (void *)PAGE_SIZE || !virt_addr_valid(object))
  546. return false;
  547. slab = virt_to_slab(object);
  548. if (!slab)
  549. return false;
  550. kmem_obj_info(&kp, object, slab);
  551. if (kp.kp_slab_cache)
  552. pr_cont(" slab%s %s", cp, kp.kp_slab_cache->name);
  553. else
  554. pr_cont(" slab%s", cp);
  555. if (is_kfence_address(object))
  556. pr_cont(" (kfence)");
  557. if (kp.kp_objp)
  558. pr_cont(" start %px", kp.kp_objp);
  559. if (kp.kp_data_offset)
  560. pr_cont(" data offset %lu", kp.kp_data_offset);
  561. if (kp.kp_objp) {
  562. ptroffset = ((char *)object - (char *)kp.kp_objp) - kp.kp_data_offset;
  563. pr_cont(" pointer offset %lu", ptroffset);
  564. }
  565. if (kp.kp_slab_cache && kp.kp_slab_cache->object_size)
  566. pr_cont(" size %u", kp.kp_slab_cache->object_size);
  567. if (kp.kp_ret)
  568. pr_cont(" allocated at %pS\n", kp.kp_ret);
  569. else
  570. pr_cont("\n");
  571. for (i = 0; i < ARRAY_SIZE(kp.kp_stack); i++) {
  572. if (!kp.kp_stack[i])
  573. break;
  574. pr_info(" %pS\n", kp.kp_stack[i]);
  575. }
  576. if (kp.kp_free_stack[0])
  577. pr_cont(" Free path:\n");
  578. for (i = 0; i < ARRAY_SIZE(kp.kp_free_stack); i++) {
  579. if (!kp.kp_free_stack[i])
  580. break;
  581. pr_info(" %pS\n", kp.kp_free_stack[i]);
  582. }
  583. return true;
  584. }
  585. EXPORT_SYMBOL_GPL(kmem_dump_obj);
  586. #endif
  587. /* Create a cache during boot when no slab services are available yet */
  588. void __init create_boot_cache(struct kmem_cache *s, const char *name,
  589. unsigned int size, slab_flags_t flags,
  590. unsigned int useroffset, unsigned int usersize)
  591. {
  592. int err;
  593. unsigned int align = ARCH_KMALLOC_MINALIGN;
  594. struct kmem_cache_args kmem_args = {};
  595. /*
  596. * kmalloc caches guarantee alignment of at least the largest
  597. * power-of-two divisor of the size. For power-of-two sizes,
  598. * it is the size itself.
  599. */
  600. if (flags & SLAB_KMALLOC)
  601. align = max(align, 1U << (ffs(size) - 1));
  602. kmem_args.align = calculate_alignment(flags, align, size);
  603. #ifdef CONFIG_HARDENED_USERCOPY
  604. kmem_args.useroffset = useroffset;
  605. kmem_args.usersize = usersize;
  606. #endif
  607. err = do_kmem_cache_create(s, name, size, &kmem_args, flags);
  608. if (err)
  609. panic("Creation of kmalloc slab %s size=%u failed. Reason %d\n",
  610. name, size, err);
  611. s->refcount = -1; /* Exempt from merging for now */
  612. }
  613. static struct kmem_cache *__init create_kmalloc_cache(const char *name,
  614. unsigned int size,
  615. slab_flags_t flags)
  616. {
  617. struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
  618. if (!s)
  619. panic("Out of memory when creating slab %s\n", name);
  620. create_boot_cache(s, name, size, flags | SLAB_KMALLOC, 0, size);
  621. list_add(&s->list, &slab_caches);
  622. s->refcount = 1;
  623. return s;
  624. }
/*
 * The kmalloc caches: one kmem_buckets array per kmalloc type. Individual
 * slots may be NULL (kmem_buckets_create() skips such slots).
 */
kmem_buckets kmalloc_caches[NR_KMALLOC_TYPES] __ro_after_init =
{ /* initialization for https://llvm.org/pr42570 */ };
EXPORT_SYMBOL(kmalloc_caches);

#ifdef CONFIG_RANDOM_KMALLOC_CACHES
/*
 * NOTE(review): presumably the seed used to pick among the "kmalloc-rnd-NN"
 * caches; its users are not in this part of the file - confirm.
 */
unsigned long random_kmalloc_seed __ro_after_init;
EXPORT_SYMBOL(random_kmalloc_seed);
#endif
/*
 * Conversion table for small slabs sizes / 8 to the index in the
 * kmalloc array. This is necessary for slabs <= 192 since we have non power
 * of two cache sizes there. The size of larger slabs can be determined using
 * fls.
 *
 * Each entry's comment gives the largest size served by that entry; the
 * value is the kmalloc array index used for it.
 */
u8 kmalloc_size_index[24] __ro_after_init = {
	3,	/* 8 */
	4,	/* 16 */
	5,	/* 24 */
	5,	/* 32 */
	6,	/* 40 */
	6,	/* 48 */
	6,	/* 56 */
	6,	/* 64 */
	1,	/* 72 */
	1,	/* 80 */
	1,	/* 88 */
	1,	/* 96 */
	7,	/* 104 */
	7,	/* 112 */
	7,	/* 120 */
	7,	/* 128 */
	2,	/* 136 */
	2,	/* 144 */
	2,	/* 152 */
	2,	/* 160 */
	2,	/* 168 */
	2,	/* 176 */
	2,	/* 184 */
	2	/* 192 */
};
  664. size_t kmalloc_size_roundup(size_t size)
  665. {
  666. if (size && size <= KMALLOC_MAX_CACHE_SIZE) {
  667. /*
  668. * The flags don't matter since size_index is common to all.
  669. * Neither does the caller for just getting ->object_size.
  670. */
  671. return kmalloc_slab(size, NULL, GFP_KERNEL, 0)->object_size;
  672. }
  673. /* Above the smaller buckets, size is a multiple of page size. */
  674. if (size && size <= KMALLOC_MAX_SIZE)
  675. return PAGE_SIZE << get_order(size);
  676. /*
  677. * Return 'size' for 0 - kmalloc() returns ZERO_SIZE_PTR
  678. * and very large size - kmalloc() may fail.
  679. */
  680. return size;
  681. }
  682. EXPORT_SYMBOL(kmalloc_size_roundup);
/*
 * Per-type cache name initializers used by INIT_KMALLOC_INFO(). Each macro
 * expands to a designated initializer for one slot of the .name[] array
 * (trailing comma included), or to nothing when the corresponding cache
 * type is configured out.
 */
#ifdef CONFIG_ZONE_DMA
#define KMALLOC_DMA_NAME(sz)	.name[KMALLOC_DMA] = "dma-kmalloc-" #sz,
#else
#define KMALLOC_DMA_NAME(sz)
#endif
#ifdef CONFIG_MEMCG
#define KMALLOC_CGROUP_NAME(sz)	.name[KMALLOC_CGROUP] = "kmalloc-cg-" #sz,
#else
#define KMALLOC_CGROUP_NAME(sz)
#endif
#ifndef CONFIG_SLUB_TINY
#define KMALLOC_RCL_NAME(sz)	.name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #sz,
#else
#define KMALLOC_RCL_NAME(sz)
#endif
/*
 * KMALLOC_RANDOM_NAME(N, sz) expands to the .name[] initializers for the
 * random caches 1..N of size sz: KMA_RAND_<N>() first chains to
 * KMA_RAND_<N-1>() and then emits its own entry. The extra
 * __KMALLOC_RANDOM_CONCAT() level lets N be macro-expanded before it is
 * pasted onto KMA_RAND_. Without CONFIG_RANDOM_KMALLOC_CACHES the macro
 * expands to nothing.
 */
#ifdef CONFIG_RANDOM_KMALLOC_CACHES
#define __KMALLOC_RANDOM_CONCAT(a, b) a ## b
#define KMALLOC_RANDOM_NAME(N, sz) __KMALLOC_RANDOM_CONCAT(KMA_RAND_, N)(sz)
#define KMA_RAND_1(sz)                  .name[KMALLOC_RANDOM_START +  1] = "kmalloc-rnd-01-" #sz,
#define KMA_RAND_2(sz)  KMA_RAND_1(sz)  .name[KMALLOC_RANDOM_START +  2] = "kmalloc-rnd-02-" #sz,
#define KMA_RAND_3(sz)  KMA_RAND_2(sz)  .name[KMALLOC_RANDOM_START +  3] = "kmalloc-rnd-03-" #sz,
#define KMA_RAND_4(sz)  KMA_RAND_3(sz)  .name[KMALLOC_RANDOM_START +  4] = "kmalloc-rnd-04-" #sz,
#define KMA_RAND_5(sz)  KMA_RAND_4(sz)  .name[KMALLOC_RANDOM_START +  5] = "kmalloc-rnd-05-" #sz,
#define KMA_RAND_6(sz)  KMA_RAND_5(sz)  .name[KMALLOC_RANDOM_START +  6] = "kmalloc-rnd-06-" #sz,
#define KMA_RAND_7(sz)  KMA_RAND_6(sz)  .name[KMALLOC_RANDOM_START +  7] = "kmalloc-rnd-07-" #sz,
#define KMA_RAND_8(sz)  KMA_RAND_7(sz)  .name[KMALLOC_RANDOM_START +  8] = "kmalloc-rnd-08-" #sz,
#define KMA_RAND_9(sz)  KMA_RAND_8(sz)  .name[KMALLOC_RANDOM_START +  9] = "kmalloc-rnd-09-" #sz,
#define KMA_RAND_10(sz) KMA_RAND_9(sz)  .name[KMALLOC_RANDOM_START + 10] = "kmalloc-rnd-10-" #sz,
#define KMA_RAND_11(sz) KMA_RAND_10(sz) .name[KMALLOC_RANDOM_START + 11] = "kmalloc-rnd-11-" #sz,
#define KMA_RAND_12(sz) KMA_RAND_11(sz) .name[KMALLOC_RANDOM_START + 12] = "kmalloc-rnd-12-" #sz,
#define KMA_RAND_13(sz) KMA_RAND_12(sz) .name[KMALLOC_RANDOM_START + 13] = "kmalloc-rnd-13-" #sz,
#define KMA_RAND_14(sz) KMA_RAND_13(sz) .name[KMALLOC_RANDOM_START + 14] = "kmalloc-rnd-14-" #sz,
#define KMA_RAND_15(sz) KMA_RAND_14(sz) .name[KMALLOC_RANDOM_START + 15] = "kmalloc-rnd-15-" #sz,
#else // CONFIG_RANDOM_KMALLOC_CACHES
#define KMALLOC_RANDOM_NAME(N, sz)
#endif
/*
 * Build one kmalloc_info[] entry: the cache name for every configured
 * cache type (derived from __short_size, e.g. "kmalloc-1k") plus the
 * object size in bytes (__size).
 */
#define INIT_KMALLOC_INFO(__size, __short_size)			\
{								\
	.name[KMALLOC_NORMAL]  = "kmalloc-" #__short_size,	\
	KMALLOC_RCL_NAME(__short_size)				\
	KMALLOC_CGROUP_NAME(__short_size)			\
	KMALLOC_DMA_NAME(__short_size)				\
	KMALLOC_RANDOM_NAME(RANDOM_KMALLOC_CACHES_NR, __short_size)	\
	.size = __size,						\
}
/*
 * kmalloc_info[] is to make slab_debug=,kmalloc-xx option work at boot time.
 * kmalloc_index() supports up to 2^21=2MB, so the final entry of the table is
 * kmalloc-2M.
 *
 * The table is indexed by kmalloc size index: entry 0 is the zero-size
 * placeholder, entries 1 and 2 are the non-power-of-two 96 and 192 byte
 * caches, and every entry i >= 3 describes the cache of size 2^i.
 */
const struct kmalloc_info_struct kmalloc_info[] __initconst = {
	INIT_KMALLOC_INFO(0, 0),
	INIT_KMALLOC_INFO(96, 96),
	INIT_KMALLOC_INFO(192, 192),
	INIT_KMALLOC_INFO(8, 8),
	INIT_KMALLOC_INFO(16, 16),
	INIT_KMALLOC_INFO(32, 32),
	INIT_KMALLOC_INFO(64, 64),
	INIT_KMALLOC_INFO(128, 128),
	INIT_KMALLOC_INFO(256, 256),
	INIT_KMALLOC_INFO(512, 512),
	INIT_KMALLOC_INFO(1024, 1k),
	INIT_KMALLOC_INFO(2048, 2k),
	INIT_KMALLOC_INFO(4096, 4k),
	INIT_KMALLOC_INFO(8192, 8k),
	INIT_KMALLOC_INFO(16384, 16k),
	INIT_KMALLOC_INFO(32768, 32k),
	INIT_KMALLOC_INFO(65536, 64k),
	INIT_KMALLOC_INFO(131072, 128k),
	INIT_KMALLOC_INFO(262144, 256k),
	INIT_KMALLOC_INFO(524288, 512k),
	INIT_KMALLOC_INFO(1048576, 1M),
	INIT_KMALLOC_INFO(2097152, 2M)
};
  757. /*
  758. * Patch up the size_index table if we have strange large alignment
  759. * requirements for the kmalloc array. This is only the case for
  760. * MIPS it seems. The standard arches will not generate any code here.
  761. *
  762. * Largest permitted alignment is 256 bytes due to the way we
  763. * handle the index determination for the smaller caches.
  764. *
  765. * Make sure that nothing crazy happens if someone starts tinkering
  766. * around with ARCH_KMALLOC_MINALIGN
  767. */
  768. void __init setup_kmalloc_cache_index_table(void)
  769. {
  770. unsigned int i;
  771. BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
  772. !is_power_of_2(KMALLOC_MIN_SIZE));
  773. for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) {
  774. unsigned int elem = size_index_elem(i);
  775. if (elem >= ARRAY_SIZE(kmalloc_size_index))
  776. break;
  777. kmalloc_size_index[elem] = KMALLOC_SHIFT_LOW;
  778. }
  779. if (KMALLOC_MIN_SIZE >= 64) {
  780. /*
  781. * The 96 byte sized cache is not used if the alignment
  782. * is 64 byte.
  783. */
  784. for (i = 64 + 8; i <= 96; i += 8)
  785. kmalloc_size_index[size_index_elem(i)] = 7;
  786. }
  787. if (KMALLOC_MIN_SIZE >= 128) {
  788. /*
  789. * The 192 byte sized cache is not used if the alignment
  790. * is 128 byte. Redirect kmalloc to use the 256 byte cache
  791. * instead.
  792. */
  793. for (i = 128 + 8; i <= 192; i += 8)
  794. kmalloc_size_index[size_index_elem(i)] = 8;
  795. }
  796. }
  797. static unsigned int __kmalloc_minalign(void)
  798. {
  799. unsigned int minalign = dma_get_cache_alignment();
  800. if (IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC) &&
  801. is_swiotlb_allocated())
  802. minalign = ARCH_KMALLOC_MINALIGN;
  803. return max(minalign, arch_slab_minalign());
  804. }
  805. static void __init
  806. new_kmalloc_cache(int idx, enum kmalloc_cache_type type)
  807. {
  808. slab_flags_t flags = 0;
  809. unsigned int minalign = __kmalloc_minalign();
  810. unsigned int aligned_size = kmalloc_info[idx].size;
  811. int aligned_idx = idx;
  812. if ((KMALLOC_RECLAIM != KMALLOC_NORMAL) && (type == KMALLOC_RECLAIM)) {
  813. flags |= SLAB_RECLAIM_ACCOUNT;
  814. } else if (IS_ENABLED(CONFIG_MEMCG) && (type == KMALLOC_CGROUP)) {
  815. if (mem_cgroup_kmem_disabled()) {
  816. kmalloc_caches[type][idx] = kmalloc_caches[KMALLOC_NORMAL][idx];
  817. return;
  818. }
  819. flags |= SLAB_ACCOUNT;
  820. } else if (IS_ENABLED(CONFIG_ZONE_DMA) && (type == KMALLOC_DMA)) {
  821. flags |= SLAB_CACHE_DMA;
  822. }
  823. #ifdef CONFIG_RANDOM_KMALLOC_CACHES
  824. if (type >= KMALLOC_RANDOM_START && type <= KMALLOC_RANDOM_END)
  825. flags |= SLAB_NO_MERGE;
  826. #endif
  827. /*
  828. * If CONFIG_MEMCG is enabled, disable cache merging for
  829. * KMALLOC_NORMAL caches.
  830. */
  831. if (IS_ENABLED(CONFIG_MEMCG) && (type == KMALLOC_NORMAL))
  832. flags |= SLAB_NO_MERGE;
  833. if (minalign > ARCH_KMALLOC_MINALIGN) {
  834. aligned_size = ALIGN(aligned_size, minalign);
  835. aligned_idx = __kmalloc_index(aligned_size, false);
  836. }
  837. if (!kmalloc_caches[type][aligned_idx])
  838. kmalloc_caches[type][aligned_idx] = create_kmalloc_cache(
  839. kmalloc_info[aligned_idx].name[type],
  840. aligned_size, flags);
  841. if (idx != aligned_idx)
  842. kmalloc_caches[type][idx] = kmalloc_caches[type][aligned_idx];
  843. }
  844. /*
  845. * Create the kmalloc array. Some of the regular kmalloc arrays
  846. * may already have been created because they were needed to
  847. * enable allocations for slab creation.
  848. */
  849. void __init create_kmalloc_caches(void)
  850. {
  851. int i;
  852. enum kmalloc_cache_type type;
  853. /*
  854. * Including KMALLOC_CGROUP if CONFIG_MEMCG defined
  855. */
  856. for (type = KMALLOC_NORMAL; type < NR_KMALLOC_TYPES; type++) {
  857. /* Caches that are NOT of the two-to-the-power-of size. */
  858. if (KMALLOC_MIN_SIZE <= 32)
  859. new_kmalloc_cache(1, type);
  860. if (KMALLOC_MIN_SIZE <= 64)
  861. new_kmalloc_cache(2, type);
  862. /* Caches that are of the two-to-the-power-of size. */
  863. for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++)
  864. new_kmalloc_cache(i, type);
  865. }
  866. #ifdef CONFIG_RANDOM_KMALLOC_CACHES
  867. random_kmalloc_seed = get_random_u64();
  868. #endif
  869. /* Kmalloc array is now usable */
  870. slab_state = UP;
  871. if (IS_ENABLED(CONFIG_SLAB_BUCKETS))
  872. kmem_buckets_cache = kmem_cache_create("kmalloc_buckets",
  873. sizeof(kmem_buckets),
  874. 0, SLAB_NO_MERGE, NULL);
  875. }
  876. gfp_t kmalloc_fix_flags(gfp_t flags)
  877. {
  878. gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;
  879. flags &= ~GFP_SLAB_BUG_MASK;
  880. pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n",
  881. invalid_mask, &invalid_mask, flags, &flags);
  882. dump_stack();
  883. return flags;
  884. }
  885. #ifdef CONFIG_SLAB_FREELIST_RANDOM
  886. /* Randomize a generic freelist */
  887. static void freelist_randomize(unsigned int *list,
  888. unsigned int count)
  889. {
  890. unsigned int rand;
  891. unsigned int i;
  892. for (i = 0; i < count; i++)
  893. list[i] = i;
  894. /* Fisher-Yates shuffle */
  895. for (i = count - 1; i > 0; i--) {
  896. rand = get_random_u32_below(i + 1);
  897. swap(list[i], list[rand]);
  898. }
  899. }
  900. /* Create a random sequence per cache */
  901. int cache_random_seq_create(struct kmem_cache *cachep, unsigned int count,
  902. gfp_t gfp)
  903. {
  904. if (count < 2 || cachep->random_seq)
  905. return 0;
  906. cachep->random_seq = kcalloc(count, sizeof(unsigned int), gfp);
  907. if (!cachep->random_seq)
  908. return -ENOMEM;
  909. freelist_randomize(cachep->random_seq, count);
  910. return 0;
  911. }
  912. /* Destroy the per-cache random freelist sequence */
  913. void cache_random_seq_destroy(struct kmem_cache *cachep)
  914. {
  915. kfree(cachep->random_seq);
  916. cachep->random_seq = NULL;
  917. }
  918. #endif /* CONFIG_SLAB_FREELIST_RANDOM */
  919. #ifdef CONFIG_SLUB_DEBUG
/* File mode passed to proc_create() for /proc/slabinfo: owner read-only. */
#define SLABINFO_RIGHTS (0400)
  921. static void print_slabinfo_header(struct seq_file *m)
  922. {
  923. /*
  924. * Output format version, so at least we can change it
  925. * without _too_ many complaints.
  926. */
  927. seq_puts(m, "slabinfo - version: 2.1\n");
  928. seq_puts(m, "# name <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>");
  929. seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
  930. seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
  931. seq_putc(m, '\n');
  932. }
  933. static void *slab_start(struct seq_file *m, loff_t *pos)
  934. {
  935. mutex_lock(&slab_mutex);
  936. return seq_list_start(&slab_caches, *pos);
  937. }
  938. static void *slab_next(struct seq_file *m, void *p, loff_t *pos)
  939. {
  940. return seq_list_next(p, &slab_caches, pos);
  941. }
  942. static void slab_stop(struct seq_file *m, void *p)
  943. {
  944. mutex_unlock(&slab_mutex);
  945. }
  946. static void cache_show(struct kmem_cache *s, struct seq_file *m)
  947. {
  948. struct slabinfo sinfo;
  949. memset(&sinfo, 0, sizeof(sinfo));
  950. get_slabinfo(s, &sinfo);
  951. seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
  952. s->name, sinfo.active_objs, sinfo.num_objs, s->size,
  953. sinfo.objects_per_slab, (1 << sinfo.cache_order));
  954. seq_printf(m, " : tunables %4u %4u %4u",
  955. sinfo.limit, sinfo.batchcount, sinfo.shared);
  956. seq_printf(m, " : slabdata %6lu %6lu %6lu",
  957. sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail);
  958. seq_putc(m, '\n');
  959. }
  960. static int slab_show(struct seq_file *m, void *p)
  961. {
  962. struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
  963. if (p == slab_caches.next)
  964. print_slabinfo_header(m);
  965. cache_show(s, m);
  966. return 0;
  967. }
  968. void dump_unreclaimable_slab(void)
  969. {
  970. struct kmem_cache *s;
  971. struct slabinfo sinfo;
  972. /*
  973. * Here acquiring slab_mutex is risky since we don't prefer to get
  974. * sleep in oom path. But, without mutex hold, it may introduce a
  975. * risk of crash.
  976. * Use mutex_trylock to protect the list traverse, dump nothing
  977. * without acquiring the mutex.
  978. */
  979. if (!mutex_trylock(&slab_mutex)) {
  980. pr_warn("excessive unreclaimable slab but cannot dump stats\n");
  981. return;
  982. }
  983. pr_info("Unreclaimable slab info:\n");
  984. pr_info("Name Used Total\n");
  985. list_for_each_entry(s, &slab_caches, list) {
  986. if (s->flags & SLAB_RECLAIM_ACCOUNT)
  987. continue;
  988. get_slabinfo(s, &sinfo);
  989. if (sinfo.num_objs > 0)
  990. pr_info("%-17s %10luKB %10luKB\n", s->name,
  991. (sinfo.active_objs * s->size) / 1024,
  992. (sinfo.num_objs * s->size) / 1024);
  993. }
  994. mutex_unlock(&slab_mutex);
  995. }
/*
 * slabinfo_op - iterator that generates /proc/slabinfo
 *
 * Each cache is printed on one line by cache_show(), in this order:
 *	cache-name
 *	num-active-objs
 *	total-objs
 *	object size
 *	objects-per-slab
 *	pages-per-slab
 *	tunables: limit, batchcount, shared
 *	slabdata: num-active-slabs, total-slabs, shared-avail
 *
 * slab_mutex is held from ->start to ->stop.
 */
static const struct seq_operations slabinfo_op = {
	.start = slab_start,
	.next = slab_next,
	.stop = slab_stop,
	.show = slab_show,
};
  1015. static int slabinfo_open(struct inode *inode, struct file *file)
  1016. {
  1017. return seq_open(file, &slabinfo_op);
  1018. }
/*
 * File operations of /proc/slabinfo: opened through slabinfo_open(),
 * everything else is handled by the generic seq_file helpers.
 */
static const struct proc_ops slabinfo_proc_ops = {
	.proc_flags	= PROC_ENTRY_PERMANENT,
	.proc_open	= slabinfo_open,
	.proc_read	= seq_read,
	.proc_lseek	= seq_lseek,
	.proc_release	= seq_release,
};
  1026. static int __init slab_proc_init(void)
  1027. {
  1028. proc_create("slabinfo", SLABINFO_RIGHTS, NULL, &slabinfo_proc_ops);
  1029. return 0;
  1030. }
  1031. module_init(slab_proc_init);
  1032. #endif /* CONFIG_SLUB_DEBUG */
  1033. /**
  1034. * kfree_sensitive - Clear sensitive information in memory before freeing
  1035. * @p: object to free memory of
  1036. *
  1037. * The memory of the object @p points to is zeroed before freed.
  1038. * If @p is %NULL, kfree_sensitive() does nothing.
  1039. *
  1040. * Note: this function zeroes the whole allocated buffer which can be a good
  1041. * deal bigger than the requested buffer size passed to kmalloc(). So be
  1042. * careful when using this function in performance sensitive code.
  1043. */
  1044. void kfree_sensitive(const void *p)
  1045. {
  1046. size_t ks;
  1047. void *mem = (void *)p;
  1048. ks = ksize(mem);
  1049. if (ks) {
  1050. kasan_unpoison_range(mem, ks);
  1051. memzero_explicit(mem, ks);
  1052. }
  1053. kfree(mem);
  1054. }
  1055. EXPORT_SYMBOL(kfree_sensitive);
  1056. #ifdef CONFIG_BPF_SYSCALL
  1057. #include <linux/btf.h>
  1058. __bpf_kfunc_start_defs();
  1059. __bpf_kfunc struct kmem_cache *bpf_get_kmem_cache(u64 addr)
  1060. {
  1061. struct slab *slab;
  1062. if (!virt_addr_valid((void *)(long)addr))
  1063. return NULL;
  1064. slab = virt_to_slab((void *)(long)addr);
  1065. return slab ? slab->slab_cache : NULL;
  1066. }
  1067. __bpf_kfunc_end_defs();
  1068. #endif /* CONFIG_BPF_SYSCALL */
  1069. /* Tracepoints definitions. */
  1070. EXPORT_TRACEPOINT_SYMBOL(kmalloc);
  1071. EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
  1072. EXPORT_TRACEPOINT_SYMBOL(kfree);
  1073. EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
  1074. #ifndef CONFIG_KVFREE_RCU_BATCHED
  1075. void kvfree_call_rcu(struct rcu_head *head, void *ptr)
  1076. {
  1077. if (head) {
  1078. kasan_record_aux_stack(ptr);
  1079. call_rcu(head, kvfree_rcu_cb);
  1080. return;
  1081. }
  1082. // kvfree_rcu(one_arg) call.
  1083. might_sleep();
  1084. synchronize_rcu();
  1085. kvfree(ptr);
  1086. }
  1087. EXPORT_SYMBOL_GPL(kvfree_call_rcu);
/* Nothing to set up when kvfree_rcu() batching is not configured. */
void __init kvfree_rcu_init(void)
{
}
  1091. #else /* CONFIG_KVFREE_RCU_BATCHED */
/*
 * This rcu parameter is read-only at runtime (mode 0444). It is the
 * limit on the number of objects cached per CPU for reuse as bulk
 * blocks; each object is one page in size. The value can be changed
 * at boot time.
 */
static int rcu_min_cached_objs = 5;
module_param(rcu_min_cached_objs, int, 0444);
// A page shrinker can ask for pages to be freed to make them
// available for other parts of the system. This usually happens
// under low memory conditions, and in that case we should also
// defer page-cache filling for a short time period.
//
// The default value is 5 seconds, which is long enough to reduce
// interference with the shrinker while it asks other systems to
// drain their caches. The parameter is read-only at runtime (0444).
static int rcu_delay_page_cache_fill_msec = 5000;
module_param(rcu_delay_page_cache_fill_msec, int, 0444);
/* Workqueue on which the monitor work and the per-batch RCU work run. */
static struct workqueue_struct *rcu_reclaim_wq;
/* Maximum number of jiffies to wait before draining a batch. */
#define KFREE_DRAIN_JIFFIES (5 * HZ)
/* Number of kfree_rcu_cpu_work batches per CPU (size of ->krw_arr). */
#define KFREE_N_BATCHES 2
/* Number of bulk channels; index 0 is freed with kfree_bulk(), 1 with vfree(). */
#define FREE_N_CHANNELS 2
/**
 * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers
 * @list: List node. All blocks of a channel are linked to each other
 * @gp_snap: Snapshot of RCU state for objects placed in this block
 * @nr_records: Number of active pointers in the array
 * @records: Array of the kvfree_rcu() pointers
 */
struct kvfree_rcu_bulk_data {
	struct list_head list;
	struct rcu_gp_oldstate gp_snap;
	unsigned long nr_records;
	void *records[] __counted_by(nr_records);
};
/*
 * This macro defines how many entries the "records" array
 * will contain. It is chosen so that a kvfree_rcu_bulk_data
 * structure, header plus records, fits exactly in one page.
 */
#define KVFREE_BULK_MAX_ENTR \
	((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *))
/**
 * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
 * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
 * @head_free: List of kfree_rcu() objects waiting for a grace period
 * @head_free_gp_snap: Grace-period snapshot to check for attempted premature frees.
 * @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period,
 *	one list per channel
 * @krcp: Pointer to the owning @kfree_rcu_cpu structure
 */
struct kfree_rcu_cpu_work {
	struct rcu_work rcu_work;
	struct rcu_head *head_free;
	struct rcu_gp_oldstate head_free_gp_snap;
	struct list_head bulk_head_free[FREE_N_CHANNELS];
	struct kfree_rcu_cpu *krcp;
};
/**
 * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
 * @head: List of kfree_rcu() objects not yet waiting for a grace period
 * @head_gp_snap: Snapshot of RCU state for objects placed to "@head"
 * @bulk_head: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
 * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
 * @lock: Synchronize access to this structure
 * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
 * @initialized: The @rcu_work fields have been initialized
 * @head_count: Number of objects in rcu_head singular list
 * @bulk_count: Number of objects in bulk-list, per channel
 * @bkvcache:
 *	A simple cache list that contains objects for reuse purpose.
 *	In order to save some per-cpu space the list is singly linked.
 *	Even though the list type is lockless, access has to be protected
 *	by the per-cpu lock.
 * @page_cache_work: A work to refill the cache when it is empty
 * @backoff_page_cache_fill: Delay cache refills
 * @work_in_progress: Indicates that page_cache_work is running
 * @hrtimer: A hrtimer for scheduling a page_cache_work
 * @nr_bkv_objs: number of allocated objects at @bkvcache.
 *
 * This is a per-CPU structure. The reason that it is not included in
 * the rcu_data structure is to permit this code to be extracted from
 * the RCU files. Such extraction could allow further optimization of
 * the interactions with the slab allocators.
 */
struct kfree_rcu_cpu {
	// Objects queued on a linked list
	// through their rcu_head structures.
	struct rcu_head *head;
	unsigned long head_gp_snap;
	atomic_t head_count;
	// Objects queued on a bulk-list.
	struct list_head bulk_head[FREE_N_CHANNELS];
	atomic_t bulk_count[FREE_N_CHANNELS];
	struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
	raw_spinlock_t lock;
	struct delayed_work monitor_work;
	bool initialized;
	struct delayed_work page_cache_work;
	atomic_t backoff_page_cache_fill;
	atomic_t work_in_progress;
	struct hrtimer hrtimer;
	struct llist_head bkvcache;
	int nr_bkv_objs;
};
/* The per-CPU kvfree_rcu() state; only the lock needs a static initializer. */
static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = {
	.lock = __RAW_SPIN_LOCK_UNLOCKED(krc.lock),
};
  1200. static __always_inline void
  1201. debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead)
  1202. {
  1203. #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
  1204. int i;
  1205. for (i = 0; i < bhead->nr_records; i++)
  1206. debug_rcu_head_unqueue((struct rcu_head *)(bhead->records[i]));
  1207. #endif
  1208. }
  1209. static inline struct kfree_rcu_cpu *
  1210. krc_this_cpu_lock(unsigned long *flags)
  1211. {
  1212. struct kfree_rcu_cpu *krcp;
  1213. local_irq_save(*flags); // For safely calling this_cpu_ptr().
  1214. krcp = this_cpu_ptr(&krc);
  1215. raw_spin_lock(&krcp->lock);
  1216. return krcp;
  1217. }
  1218. static inline void
  1219. krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags)
  1220. {
  1221. raw_spin_unlock_irqrestore(&krcp->lock, flags);
  1222. }
  1223. static inline struct kvfree_rcu_bulk_data *
  1224. get_cached_bnode(struct kfree_rcu_cpu *krcp)
  1225. {
  1226. if (!krcp->nr_bkv_objs)
  1227. return NULL;
  1228. WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs - 1);
  1229. return (struct kvfree_rcu_bulk_data *)
  1230. llist_del_first(&krcp->bkvcache);
  1231. }
  1232. static inline bool
  1233. put_cached_bnode(struct kfree_rcu_cpu *krcp,
  1234. struct kvfree_rcu_bulk_data *bnode)
  1235. {
  1236. // Check the limit.
  1237. if (krcp->nr_bkv_objs >= rcu_min_cached_objs)
  1238. return false;
  1239. llist_add((struct llist_node *) bnode, &krcp->bkvcache);
  1240. WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs + 1);
  1241. return true;
  1242. }
  1243. static int
  1244. drain_page_cache(struct kfree_rcu_cpu *krcp)
  1245. {
  1246. unsigned long flags;
  1247. struct llist_node *page_list, *pos, *n;
  1248. int freed = 0;
  1249. if (!rcu_min_cached_objs)
  1250. return 0;
  1251. raw_spin_lock_irqsave(&krcp->lock, flags);
  1252. page_list = llist_del_all(&krcp->bkvcache);
  1253. WRITE_ONCE(krcp->nr_bkv_objs, 0);
  1254. raw_spin_unlock_irqrestore(&krcp->lock, flags);
  1255. llist_for_each_safe(pos, n, page_list) {
  1256. free_page((unsigned long)pos);
  1257. freed++;
  1258. }
  1259. return freed;
  1260. }
  1261. static void
  1262. kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp,
  1263. struct kvfree_rcu_bulk_data *bnode, int idx)
  1264. {
  1265. unsigned long flags;
  1266. int i;
  1267. if (!WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&bnode->gp_snap))) {
  1268. debug_rcu_bhead_unqueue(bnode);
  1269. rcu_lock_acquire(&rcu_callback_map);
  1270. if (idx == 0) { // kmalloc() / kfree().
  1271. trace_rcu_invoke_kfree_bulk_callback(
  1272. "slab", bnode->nr_records,
  1273. bnode->records);
  1274. kfree_bulk(bnode->nr_records, bnode->records);
  1275. } else { // vmalloc() / vfree().
  1276. for (i = 0; i < bnode->nr_records; i++) {
  1277. trace_rcu_invoke_kvfree_callback(
  1278. "slab", bnode->records[i], 0);
  1279. vfree(bnode->records[i]);
  1280. }
  1281. }
  1282. rcu_lock_release(&rcu_callback_map);
  1283. }
  1284. raw_spin_lock_irqsave(&krcp->lock, flags);
  1285. if (put_cached_bnode(krcp, bnode))
  1286. bnode = NULL;
  1287. raw_spin_unlock_irqrestore(&krcp->lock, flags);
  1288. if (bnode)
  1289. free_page((unsigned long) bnode);
  1290. cond_resched_tasks_rcu_qs();
  1291. }
  1292. static void
  1293. kvfree_rcu_list(struct rcu_head *head)
  1294. {
  1295. struct rcu_head *next;
  1296. for (; head; head = next) {
  1297. void *ptr = (void *) head->func;
  1298. unsigned long offset = (void *) head - ptr;
  1299. next = head->next;
  1300. debug_rcu_head_unqueue((struct rcu_head *)ptr);
  1301. rcu_lock_acquire(&rcu_callback_map);
  1302. trace_rcu_invoke_kvfree_callback("slab", head, offset);
  1303. kvfree(ptr);
  1304. rcu_lock_release(&rcu_callback_map);
  1305. cond_resched_tasks_rcu_qs();
  1306. }
  1307. }
  1308. /*
  1309. * This function is invoked in workqueue context after a grace period.
  1310. * It frees all the objects queued on ->bulk_head_free or ->head_free.
  1311. */
  1312. static void kfree_rcu_work(struct work_struct *work)
  1313. {
  1314. unsigned long flags;
  1315. struct kvfree_rcu_bulk_data *bnode, *n;
  1316. struct list_head bulk_head[FREE_N_CHANNELS];
  1317. struct rcu_head *head;
  1318. struct kfree_rcu_cpu *krcp;
  1319. struct kfree_rcu_cpu_work *krwp;
  1320. struct rcu_gp_oldstate head_gp_snap;
  1321. int i;
  1322. krwp = container_of(to_rcu_work(work),
  1323. struct kfree_rcu_cpu_work, rcu_work);
  1324. krcp = krwp->krcp;
  1325. raw_spin_lock_irqsave(&krcp->lock, flags);
  1326. // Channels 1 and 2.
  1327. for (i = 0; i < FREE_N_CHANNELS; i++)
  1328. list_replace_init(&krwp->bulk_head_free[i], &bulk_head[i]);
  1329. // Channel 3.
  1330. head = krwp->head_free;
  1331. krwp->head_free = NULL;
  1332. head_gp_snap = krwp->head_free_gp_snap;
  1333. raw_spin_unlock_irqrestore(&krcp->lock, flags);
  1334. // Handle the first two channels.
  1335. for (i = 0; i < FREE_N_CHANNELS; i++) {
  1336. // Start from the tail page, so a GP is likely passed for it.
  1337. list_for_each_entry_safe(bnode, n, &bulk_head[i], list)
  1338. kvfree_rcu_bulk(krcp, bnode, i);
  1339. }
  1340. /*
  1341. * This is used when the "bulk" path can not be used for the
  1342. * double-argument of kvfree_rcu(). This happens when the
  1343. * page-cache is empty, which means that objects are instead
  1344. * queued on a linked list through their rcu_head structures.
  1345. * This list is named "Channel 3".
  1346. */
  1347. if (head && !WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&head_gp_snap)))
  1348. kvfree_rcu_list(head);
  1349. }
  1350. static bool kfree_rcu_sheaf(void *obj)
  1351. {
  1352. struct kmem_cache *s;
  1353. struct slab *slab;
  1354. if (is_vmalloc_addr(obj))
  1355. return false;
  1356. slab = virt_to_slab(obj);
  1357. if (unlikely(!slab))
  1358. return false;
  1359. s = slab->slab_cache;
  1360. if (likely(!IS_ENABLED(CONFIG_NUMA) || slab_nid(slab) == numa_mem_id()))
  1361. return __kfree_rcu_sheaf(s, obj);
  1362. return false;
  1363. }
  1364. static bool
  1365. need_offload_krc(struct kfree_rcu_cpu *krcp)
  1366. {
  1367. int i;
  1368. for (i = 0; i < FREE_N_CHANNELS; i++)
  1369. if (!list_empty(&krcp->bulk_head[i]))
  1370. return true;
  1371. return !!READ_ONCE(krcp->head);
  1372. }
  1373. static bool
  1374. need_wait_for_krwp_work(struct kfree_rcu_cpu_work *krwp)
  1375. {
  1376. int i;
  1377. for (i = 0; i < FREE_N_CHANNELS; i++)
  1378. if (!list_empty(&krwp->bulk_head_free[i]))
  1379. return true;
  1380. return !!krwp->head_free;
  1381. }
  1382. static int krc_count(struct kfree_rcu_cpu *krcp)
  1383. {
  1384. int sum = atomic_read(&krcp->head_count);
  1385. int i;
  1386. for (i = 0; i < FREE_N_CHANNELS; i++)
  1387. sum += atomic_read(&krcp->bulk_count[i]);
  1388. return sum;
  1389. }
  1390. static void
  1391. __schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
  1392. {
  1393. long delay, delay_left;
  1394. delay = krc_count(krcp) >= KVFREE_BULK_MAX_ENTR ? 1:KFREE_DRAIN_JIFFIES;
  1395. if (delayed_work_pending(&krcp->monitor_work)) {
  1396. delay_left = krcp->monitor_work.timer.expires - jiffies;
  1397. if (delay < delay_left)
  1398. mod_delayed_work(rcu_reclaim_wq, &krcp->monitor_work, delay);
  1399. return;
  1400. }
  1401. queue_delayed_work(rcu_reclaim_wq, &krcp->monitor_work, delay);
  1402. }
  1403. static void
  1404. schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
  1405. {
  1406. unsigned long flags;
  1407. raw_spin_lock_irqsave(&krcp->lock, flags);
  1408. __schedule_delayed_monitor_work(krcp);
  1409. raw_spin_unlock_irqrestore(&krcp->lock, flags);
  1410. }
  1411. static void
  1412. kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp)
  1413. {
  1414. struct list_head bulk_ready[FREE_N_CHANNELS];
  1415. struct kvfree_rcu_bulk_data *bnode, *n;
  1416. struct rcu_head *head_ready = NULL;
  1417. unsigned long flags;
  1418. int i;
  1419. raw_spin_lock_irqsave(&krcp->lock, flags);
  1420. for (i = 0; i < FREE_N_CHANNELS; i++) {
  1421. INIT_LIST_HEAD(&bulk_ready[i]);
  1422. list_for_each_entry_safe_reverse(bnode, n, &krcp->bulk_head[i], list) {
  1423. if (!poll_state_synchronize_rcu_full(&bnode->gp_snap))
  1424. break;
  1425. atomic_sub(bnode->nr_records, &krcp->bulk_count[i]);
  1426. list_move(&bnode->list, &bulk_ready[i]);
  1427. }
  1428. }
  1429. if (krcp->head && poll_state_synchronize_rcu(krcp->head_gp_snap)) {
  1430. head_ready = krcp->head;
  1431. atomic_set(&krcp->head_count, 0);
  1432. WRITE_ONCE(krcp->head, NULL);
  1433. }
  1434. raw_spin_unlock_irqrestore(&krcp->lock, flags);
  1435. for (i = 0; i < FREE_N_CHANNELS; i++) {
  1436. list_for_each_entry_safe(bnode, n, &bulk_ready[i], list)
  1437. kvfree_rcu_bulk(krcp, bnode, i);
  1438. }
  1439. if (head_ready)
  1440. kvfree_rcu_list(head_ready);
  1441. }
  1442. /*
  1443. * Return: %true if a work is queued, %false otherwise.
  1444. */
  1445. static bool
  1446. kvfree_rcu_queue_batch(struct kfree_rcu_cpu *krcp)
  1447. {
  1448. unsigned long flags;
  1449. bool queued = false;
  1450. int i, j;
  1451. raw_spin_lock_irqsave(&krcp->lock, flags);
  1452. // Attempt to start a new batch.
  1453. for (i = 0; i < KFREE_N_BATCHES; i++) {
  1454. struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]);
  1455. // Try to detach bulk_head or head and attach it, only when
  1456. // all channels are free. Any channel is not free means at krwp
  1457. // there is on-going rcu work to handle krwp's free business.
  1458. if (need_wait_for_krwp_work(krwp))
  1459. continue;
  1460. // kvfree_rcu_drain_ready() might handle this krcp, if so give up.
  1461. if (need_offload_krc(krcp)) {
  1462. // Channel 1 corresponds to the SLAB-pointer bulk path.
  1463. // Channel 2 corresponds to vmalloc-pointer bulk path.
  1464. for (j = 0; j < FREE_N_CHANNELS; j++) {
  1465. if (list_empty(&krwp->bulk_head_free[j])) {
  1466. atomic_set(&krcp->bulk_count[j], 0);
  1467. list_replace_init(&krcp->bulk_head[j],
  1468. &krwp->bulk_head_free[j]);
  1469. }
  1470. }
  1471. // Channel 3 corresponds to both SLAB and vmalloc
  1472. // objects queued on the linked list.
  1473. if (!krwp->head_free) {
  1474. krwp->head_free = krcp->head;
  1475. get_state_synchronize_rcu_full(&krwp->head_free_gp_snap);
  1476. atomic_set(&krcp->head_count, 0);
  1477. WRITE_ONCE(krcp->head, NULL);
  1478. }
  1479. // One work is per one batch, so there are three
  1480. // "free channels", the batch can handle. Break
  1481. // the loop since it is done with this CPU thus
  1482. // queuing an RCU work is _always_ success here.
  1483. queued = queue_rcu_work(rcu_reclaim_wq, &krwp->rcu_work);
  1484. WARN_ON_ONCE(!queued);
  1485. break;
  1486. }
  1487. }
  1488. raw_spin_unlock_irqrestore(&krcp->lock, flags);
  1489. return queued;
  1490. }
  1491. /*
  1492. * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
  1493. */
  1494. static void kfree_rcu_monitor(struct work_struct *work)
  1495. {
  1496. struct kfree_rcu_cpu *krcp = container_of(work,
  1497. struct kfree_rcu_cpu, monitor_work.work);
  1498. // Drain ready for reclaim.
  1499. kvfree_rcu_drain_ready(krcp);
  1500. // Queue a batch for a rest.
  1501. kvfree_rcu_queue_batch(krcp);
  1502. // If there is nothing to detach, it means that our job is
  1503. // successfully done here. In case of having at least one
  1504. // of the channels that is still busy we should rearm the
  1505. // work to repeat an attempt. Because previous batches are
  1506. // still in progress.
  1507. if (need_offload_krc(krcp))
  1508. schedule_delayed_monitor_work(krcp);
  1509. }
  1510. static void fill_page_cache_func(struct work_struct *work)
  1511. {
  1512. struct kvfree_rcu_bulk_data *bnode;
  1513. struct kfree_rcu_cpu *krcp =
  1514. container_of(work, struct kfree_rcu_cpu,
  1515. page_cache_work.work);
  1516. unsigned long flags;
  1517. int nr_pages;
  1518. bool pushed;
  1519. int i;
  1520. nr_pages = atomic_read(&krcp->backoff_page_cache_fill) ?
  1521. 1 : rcu_min_cached_objs;
  1522. for (i = READ_ONCE(krcp->nr_bkv_objs); i < nr_pages; i++) {
  1523. bnode = (struct kvfree_rcu_bulk_data *)
  1524. __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
  1525. if (!bnode)
  1526. break;
  1527. raw_spin_lock_irqsave(&krcp->lock, flags);
  1528. pushed = put_cached_bnode(krcp, bnode);
  1529. raw_spin_unlock_irqrestore(&krcp->lock, flags);
  1530. if (!pushed) {
  1531. free_page((unsigned long) bnode);
  1532. break;
  1533. }
  1534. }
  1535. atomic_set(&krcp->work_in_progress, 0);
  1536. atomic_set(&krcp->backoff_page_cache_fill, 0);
  1537. }
  1538. // Record ptr in a page managed by krcp, with the pre-krc_this_cpu_lock()
  1539. // state specified by flags. If can_alloc is true, the caller must
  1540. // be schedulable and not be holding any locks or mutexes that might be
  1541. // acquired by the memory allocator or anything that it might invoke.
  1542. // Returns true if ptr was successfully recorded, else the caller must
  1543. // use a fallback.
  1544. static inline bool
  1545. add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
  1546. unsigned long *flags, void *ptr, bool can_alloc)
  1547. {
  1548. struct kvfree_rcu_bulk_data *bnode;
  1549. int idx;
  1550. *krcp = krc_this_cpu_lock(flags);
  1551. if (unlikely(!(*krcp)->initialized))
  1552. return false;
  1553. idx = !!is_vmalloc_addr(ptr);
  1554. bnode = list_first_entry_or_null(&(*krcp)->bulk_head[idx],
  1555. struct kvfree_rcu_bulk_data, list);
  1556. /* Check if a new block is required. */
  1557. if (!bnode || bnode->nr_records == KVFREE_BULK_MAX_ENTR) {
  1558. bnode = get_cached_bnode(*krcp);
  1559. if (!bnode && can_alloc) {
  1560. krc_this_cpu_unlock(*krcp, *flags);
  1561. // __GFP_NORETRY - allows a light-weight direct reclaim
  1562. // what is OK from minimizing of fallback hitting point of
  1563. // view. Apart of that it forbids any OOM invoking what is
  1564. // also beneficial since we are about to release memory soon.
  1565. //
  1566. // __GFP_NOMEMALLOC - prevents from consuming of all the
  1567. // memory reserves. Please note we have a fallback path.
  1568. //
  1569. // __GFP_NOWARN - it is supposed that an allocation can
  1570. // be failed under low memory or high memory pressure
  1571. // scenarios.
  1572. bnode = (struct kvfree_rcu_bulk_data *)
  1573. __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
  1574. raw_spin_lock_irqsave(&(*krcp)->lock, *flags);
  1575. }
  1576. if (!bnode)
  1577. return false;
  1578. // Initialize the new block and attach it.
  1579. bnode->nr_records = 0;
  1580. list_add(&bnode->list, &(*krcp)->bulk_head[idx]);
  1581. }
  1582. // Finally insert and update the GP for this page.
  1583. bnode->nr_records++;
  1584. bnode->records[bnode->nr_records - 1] = ptr;
  1585. get_state_synchronize_rcu_full(&bnode->gp_snap);
  1586. atomic_inc(&(*krcp)->bulk_count[idx]);
  1587. return true;
  1588. }
  1589. static enum hrtimer_restart
  1590. schedule_page_work_fn(struct hrtimer *t)
  1591. {
  1592. struct kfree_rcu_cpu *krcp =
  1593. container_of(t, struct kfree_rcu_cpu, hrtimer);
  1594. queue_delayed_work(system_highpri_wq, &krcp->page_cache_work, 0);
  1595. return HRTIMER_NORESTART;
  1596. }
  1597. static void
  1598. run_page_cache_worker(struct kfree_rcu_cpu *krcp)
  1599. {
  1600. // If cache disabled, bail out.
  1601. if (!rcu_min_cached_objs)
  1602. return;
  1603. if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
  1604. !atomic_xchg(&krcp->work_in_progress, 1)) {
  1605. if (atomic_read(&krcp->backoff_page_cache_fill)) {
  1606. queue_delayed_work(rcu_reclaim_wq,
  1607. &krcp->page_cache_work,
  1608. msecs_to_jiffies(rcu_delay_page_cache_fill_msec));
  1609. } else {
  1610. hrtimer_setup(&krcp->hrtimer, schedule_page_work_fn, CLOCK_MONOTONIC,
  1611. HRTIMER_MODE_REL);
  1612. hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
  1613. }
  1614. }
  1615. }
  1616. void __init kfree_rcu_scheduler_running(void)
  1617. {
  1618. int cpu;
  1619. for_each_possible_cpu(cpu) {
  1620. struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
  1621. if (need_offload_krc(krcp))
  1622. schedule_delayed_monitor_work(krcp);
  1623. }
  1624. }
  1625. /*
  1626. * Queue a request for lazy invocation of the appropriate free routine
  1627. * after a grace period. Please note that three paths are maintained,
  1628. * two for the common case using arrays of pointers and a third one that
  1629. * is used only when the main paths cannot be used, for example, due to
  1630. * memory pressure.
  1631. *
  1632. * Each kvfree_call_rcu() request is added to a batch. The batch will be drained
  1633. * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will
  1634. * be free'd in workqueue context. This allows us to: batch requests together to
  1635. * reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load.
  1636. */
  1637. void kvfree_call_rcu(struct rcu_head *head, void *ptr)
  1638. {
  1639. unsigned long flags;
  1640. struct kfree_rcu_cpu *krcp;
  1641. bool success;
  1642. /*
  1643. * Please note there is a limitation for the head-less
  1644. * variant, that is why there is a clear rule for such
  1645. * objects: it can be used from might_sleep() context
  1646. * only. For other places please embed an rcu_head to
  1647. * your data.
  1648. */
  1649. if (!head)
  1650. might_sleep();
  1651. if (!IS_ENABLED(CONFIG_PREEMPT_RT) && kfree_rcu_sheaf(ptr))
  1652. return;
  1653. // Queue the object but don't yet schedule the batch.
  1654. if (debug_rcu_head_queue(ptr)) {
  1655. // Probable double kfree_rcu(), just leak.
  1656. WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n",
  1657. __func__, head);
  1658. // Mark as success and leave.
  1659. return;
  1660. }
  1661. kasan_record_aux_stack(ptr);
  1662. success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head);
  1663. if (!success) {
  1664. run_page_cache_worker(krcp);
  1665. if (head == NULL)
  1666. // Inline if kvfree_rcu(one_arg) call.
  1667. goto unlock_return;
  1668. head->func = ptr;
  1669. head->next = krcp->head;
  1670. WRITE_ONCE(krcp->head, head);
  1671. atomic_inc(&krcp->head_count);
  1672. // Take a snapshot for this krcp.
  1673. krcp->head_gp_snap = get_state_synchronize_rcu();
  1674. success = true;
  1675. }
  1676. /*
  1677. * The kvfree_rcu() caller considers the pointer freed at this point
  1678. * and likely removes any references to it. Since the actual slab
  1679. * freeing (and kmemleak_free()) is deferred, tell kmemleak to ignore
  1680. * this object (no scanning or false positives reporting).
  1681. */
  1682. kmemleak_ignore(ptr);
  1683. // Set timer to drain after KFREE_DRAIN_JIFFIES.
  1684. if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING)
  1685. __schedule_delayed_monitor_work(krcp);
  1686. unlock_return:
  1687. krc_this_cpu_unlock(krcp, flags);
  1688. /*
  1689. * Inline kvfree() after synchronize_rcu(). We can do
  1690. * it from might_sleep() context only, so the current
  1691. * CPU can pass the QS state.
  1692. */
  1693. if (!success) {
  1694. debug_rcu_head_unqueue((struct rcu_head *) ptr);
  1695. synchronize_rcu();
  1696. kvfree(ptr);
  1697. }
  1698. }
  1699. EXPORT_SYMBOL_GPL(kvfree_call_rcu);
  1700. static inline void __kvfree_rcu_barrier(void)
  1701. {
  1702. struct kfree_rcu_cpu_work *krwp;
  1703. struct kfree_rcu_cpu *krcp;
  1704. bool queued;
  1705. int i, cpu;
  1706. /*
  1707. * Firstly we detach objects and queue them over an RCU-batch
  1708. * for all CPUs. Finally queued works are flushed for each CPU.
  1709. *
  1710. * Please note. If there are outstanding batches for a particular
  1711. * CPU, those have to be finished first following by queuing a new.
  1712. */
  1713. for_each_possible_cpu(cpu) {
  1714. krcp = per_cpu_ptr(&krc, cpu);
  1715. /*
  1716. * Check if this CPU has any objects which have been queued for a
  1717. * new GP completion. If not(means nothing to detach), we are done
  1718. * with it. If any batch is pending/running for this "krcp", below
  1719. * per-cpu flush_rcu_work() waits its completion(see last step).
  1720. */
  1721. if (!need_offload_krc(krcp))
  1722. continue;
  1723. while (1) {
  1724. /*
  1725. * If we are not able to queue a new RCU work it means:
  1726. * - batches for this CPU are still in flight which should
  1727. * be flushed first and then repeat;
  1728. * - no objects to detach, because of concurrency.
  1729. */
  1730. queued = kvfree_rcu_queue_batch(krcp);
  1731. /*
  1732. * Bail out, if there is no need to offload this "krcp"
  1733. * anymore. As noted earlier it can run concurrently.
  1734. */
  1735. if (queued || !need_offload_krc(krcp))
  1736. break;
  1737. /* There are ongoing batches. */
  1738. for (i = 0; i < KFREE_N_BATCHES; i++) {
  1739. krwp = &(krcp->krw_arr[i]);
  1740. flush_rcu_work(&krwp->rcu_work);
  1741. }
  1742. }
  1743. }
  1744. /*
  1745. * Now we guarantee that all objects are flushed.
  1746. */
  1747. for_each_possible_cpu(cpu) {
  1748. krcp = per_cpu_ptr(&krc, cpu);
  1749. /*
  1750. * A monitor work can drain ready to reclaim objects
  1751. * directly. Wait its completion if running or pending.
  1752. */
  1753. cancel_delayed_work_sync(&krcp->monitor_work);
  1754. for (i = 0; i < KFREE_N_BATCHES; i++) {
  1755. krwp = &(krcp->krw_arr[i]);
  1756. flush_rcu_work(&krwp->rcu_work);
  1757. }
  1758. }
  1759. }
  /**
   * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete.
   *
   * The single-argument form of kvfree_rcu() has a slow path which calls
   * synchronize_rcu() and then frees the pointer, all before the call
   * returns. Consequently, for any single-argument call that ends up as a
   * kfree() into a cache which is about to be destroyed during module exit,
   * the developer is responsible for making sure every such call has
   * returned before kmem_cache_destroy() is invoked.
   */
  void kvfree_rcu_barrier(void)
  {
      /* Flush the RCU sheaves first, then the batched kvfree_rcu() state. */
      flush_all_rcu_sheaves();
      __kvfree_rcu_barrier();
  }
  EXPORT_SYMBOL_GPL(kvfree_rcu_barrier);
  /**
   * kvfree_rcu_barrier_on_cache - Wait for in-flight kvfree_rcu() calls on a
   * specific slab cache.
   * @s: slab cache to wait for
   *
   * The notes in the description of kvfree_rcu_barrier() apply here as well.
   */
  void kvfree_rcu_barrier_on_cache(struct kmem_cache *s)
  {
      /* Caches with sheaves: flush this cache's RCU sheaves, then wait. */
      if (cache_has_sheaves(s)) {
          flush_rcu_sheaves_on_cache(s);
          rcu_barrier();
      }

      /*
       * TODO: Add a per-cache variant of __kvfree_rcu_barrier(). Until
       * then the global barrier is used for every cache.
       */
      __kvfree_rcu_barrier();
  }
  EXPORT_SYMBOL_GPL(kvfree_rcu_barrier_on_cache);
  1796. static unsigned long
  1797. kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
  1798. {
  1799. int cpu;
  1800. unsigned long count = 0;
  1801. /* Snapshot count of all CPUs */
  1802. for_each_possible_cpu(cpu) {
  1803. struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
  1804. count += krc_count(krcp);
  1805. count += READ_ONCE(krcp->nr_bkv_objs);
  1806. atomic_set(&krcp->backoff_page_cache_fill, 1);
  1807. }
  1808. return count == 0 ? SHRINK_EMPTY : count;
  1809. }
  1810. static unsigned long
  1811. kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
  1812. {
  1813. int cpu, freed = 0;
  1814. for_each_possible_cpu(cpu) {
  1815. int count;
  1816. struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
  1817. count = krc_count(krcp);
  1818. count += drain_page_cache(krcp);
  1819. kfree_rcu_monitor(&krcp->monitor_work.work);
  1820. sc->nr_to_scan -= count;
  1821. freed += count;
  1822. if (sc->nr_to_scan <= 0)
  1823. break;
  1824. }
  1825. return freed == 0 ? SHRINK_STOP : freed;
  1826. }
  1827. void __init kvfree_rcu_init(void)
  1828. {
  1829. int cpu;
  1830. int i, j;
  1831. struct shrinker *kfree_rcu_shrinker;
  1832. rcu_reclaim_wq = alloc_workqueue("kvfree_rcu_reclaim",
  1833. WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
  1834. WARN_ON(!rcu_reclaim_wq);
  1835. /* Clamp it to [0:100] seconds interval. */
  1836. if (rcu_delay_page_cache_fill_msec < 0 ||
  1837. rcu_delay_page_cache_fill_msec > 100 * MSEC_PER_SEC) {
  1838. rcu_delay_page_cache_fill_msec =
  1839. clamp(rcu_delay_page_cache_fill_msec, 0,
  1840. (int) (100 * MSEC_PER_SEC));
  1841. pr_info("Adjusting rcutree.rcu_delay_page_cache_fill_msec to %d ms.\n",
  1842. rcu_delay_page_cache_fill_msec);
  1843. }
  1844. for_each_possible_cpu(cpu) {
  1845. struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
  1846. for (i = 0; i < KFREE_N_BATCHES; i++) {
  1847. INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
  1848. krcp->krw_arr[i].krcp = krcp;
  1849. for (j = 0; j < FREE_N_CHANNELS; j++)
  1850. INIT_LIST_HEAD(&krcp->krw_arr[i].bulk_head_free[j]);
  1851. }
  1852. for (i = 0; i < FREE_N_CHANNELS; i++)
  1853. INIT_LIST_HEAD(&krcp->bulk_head[i]);
  1854. INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
  1855. INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func);
  1856. krcp->initialized = true;
  1857. }
  1858. kfree_rcu_shrinker = shrinker_alloc(0, "slab-kvfree-rcu");
  1859. if (!kfree_rcu_shrinker) {
  1860. pr_err("Failed to allocate kfree_rcu() shrinker!\n");
  1861. return;
  1862. }
  1863. kfree_rcu_shrinker->count_objects = kfree_rcu_shrink_count;
  1864. kfree_rcu_shrinker->scan_objects = kfree_rcu_shrink_scan;
  1865. shrinker_register(kfree_rcu_shrinker);
  1866. }
  1867. #endif /* CONFIG_KVFREE_RCU_BATCHED */