zsmalloc.c 55 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258
  1. // SPDX-License-Identifier: GPL-2.0-or-later
  2. /*
  3. * zsmalloc memory allocator
  4. *
  5. * Copyright (C) 2011 Nitin Gupta
  6. * Copyright (C) 2012, 2013 Minchan Kim
  7. *
  8. * This code is released using a dual license strategy: BSD/GPL
  9. * You can choose the license that better fits your requirements.
  10. *
  11. * Released under the terms of 3-clause BSD License
  12. * Released under the terms of GNU General Public License Version 2.0
  13. */
  14. #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  15. /*
  16. * lock ordering:
  17. * page_lock
  18. * pool->lock
  19. * class->lock
  20. * zspage->lock
  21. */
  22. #include <linux/module.h>
  23. #include <linux/kernel.h>
  24. #include <linux/sched.h>
  25. #include <linux/errno.h>
  26. #include <linux/highmem.h>
  27. #include <linux/string.h>
  28. #include <linux/slab.h>
  29. #include <linux/scatterlist.h>
  30. #include <linux/spinlock.h>
  31. #include <linux/sprintf.h>
  32. #include <linux/shrinker.h>
  33. #include <linux/types.h>
  34. #include <linux/debugfs.h>
  35. #include <linux/zsmalloc.h>
  36. #include <linux/fs.h>
  37. #include <linux/workqueue.h>
  38. #include "zpdesc.h"
  39. #define ZSPAGE_MAGIC 0x58
  40. /*
  41. * This must be power of 2 and greater than or equal to sizeof(link_free).
  42. * These two conditions ensure that any 'struct link_free' itself doesn't
  43. * span more than 1 page which avoids complex case of mapping 2 pages simply
  44. * to restore link_free pointer values.
  45. */
  46. #define ZS_ALIGN 8
  47. #define ZS_HANDLE_SIZE (sizeof(unsigned long))
  48. /*
  49. * Object location (<PFN>, <obj_idx>) is encoded as
  50. * a single (unsigned long) handle value.
  51. *
  52. * Note that object index <obj_idx> starts from 0.
  53. *
  54. * This is made more complicated by various memory models and PAE.
  55. */
  56. #ifndef MAX_POSSIBLE_PHYSMEM_BITS
  57. #ifdef MAX_PHYSMEM_BITS
  58. #define MAX_POSSIBLE_PHYSMEM_BITS MAX_PHYSMEM_BITS
  59. #else
  60. /*
  61. * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just
  62. * be PAGE_SHIFT
  63. */
  64. #define MAX_POSSIBLE_PHYSMEM_BITS BITS_PER_LONG
  65. #endif
  66. #endif
  67. #define _PFN_BITS (MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT)
  68. /*
  69. * Head in allocated object should have OBJ_ALLOCATED_TAG
  70. * to identify the object was allocated or not.
  71. * It's okay to add the status bit in the least bit because
  72. * header keeps handle which is 4byte-aligned address so we
  73. * have room for two bit at least.
  74. */
  75. #define OBJ_ALLOCATED_TAG 1
  76. #define OBJ_TAG_BITS 1
  77. #define OBJ_TAG_MASK OBJ_ALLOCATED_TAG
  78. #define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS)
  79. #define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
  80. #define HUGE_BITS 1
  81. #define FULLNESS_BITS 4
  82. #define CLASS_BITS 8
  83. #define MAGIC_VAL_BITS 8
  84. #define ZS_MAX_PAGES_PER_ZSPAGE (_AC(CONFIG_ZSMALLOC_CHAIN_SIZE, UL))
  85. /* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */
  86. #define ZS_MIN_ALLOC_SIZE \
  87. MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
  88. /* each chunk includes extra space to keep handle */
  89. #define ZS_MAX_ALLOC_SIZE PAGE_SIZE
  90. /*
  91. * On systems with 4K page size, this gives 255 size classes! There is a
  92. * trade-off here:
  93. * - Large number of size classes is potentially wasteful as free pages are
  94. * spread across these classes
  95. * - Small number of size classes causes large internal fragmentation
  96. * - Probably it's better to use specific size classes (empirically
  97. * determined). NOTE: all those class sizes must be set as multiple of
  98. * ZS_ALIGN to make sure link_free itself never has to span 2 pages.
  99. *
  100. * ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiple of ZS_ALIGN
  101. * (reason above)
  102. */
  103. #define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> CLASS_BITS)
  104. #define ZS_SIZE_CLASSES (DIV_ROUND_UP(ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE, \
  105. ZS_SIZE_CLASS_DELTA) + 1)
  106. /*
  107. * Pages are distinguished by the ratio of used memory (that is the ratio
  108. * of ->inuse objects to all objects that page can store). For example,
  109. * INUSE_RATIO_10 means that the ratio of used objects is > 0% and <= 10%.
  110. *
  111. * The number of fullness groups is not random. It allows us to keep
  112. * difference between the least busy page in the group (minimum permitted
  113. * number of ->inuse objects) and the most busy page (maximum permitted
  114. * number of ->inuse objects) at a reasonable value.
  115. */
  /*
   * Fullness buckets a zspage can be filed under. Values are spelled out
   * explicitly; 2..9 are the eight intermediate groups that are not named.
   */
  enum fullness_group {
      ZS_INUSE_RATIO_0 = 0,
      ZS_INUSE_RATIO_10 = 1,
      /* NOTE: 8 more fullness groups here (values 2 through 9) */
      ZS_INUSE_RATIO_99 = 10,
      ZS_INUSE_RATIO_100 = 11,
      NR_FULLNESS_GROUPS = 12,
  };
  124. enum class_stat_type {
  125. /* NOTE: stats for 12 fullness groups here: from inuse 0 to 100 */
  126. ZS_OBJS_ALLOCATED = NR_FULLNESS_GROUPS,
  127. ZS_OBJS_INUSE,
  128. NR_CLASS_STAT_TYPES,
  129. };
  130. struct zs_size_stat {
  131. unsigned long objs[NR_CLASS_STAT_TYPES];
  132. };
  133. #ifdef CONFIG_ZSMALLOC_STAT
  134. static struct dentry *zs_stat_root;
  135. #endif
  136. static size_t huge_class_size;
  137. struct size_class {
  138. spinlock_t lock;
  139. struct list_head fullness_list[NR_FULLNESS_GROUPS];
  140. /*
  141. * Size of objects stored in this class. Must be multiple
  142. * of ZS_ALIGN.
  143. */
  144. int size;
  145. int objs_per_zspage;
  146. /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
  147. int pages_per_zspage;
  148. unsigned int index;
  149. struct zs_size_stat stats;
  150. };
  151. /*
  152. * Placed within free objects to form a singly linked list.
  153. * For every zspage, zspage->freeobj gives head of this list.
  154. *
  155. * This must be power of 2 and less than or equal to ZS_ALIGN
  156. */
  /*
   * Overlay stored in an object slot: one word that is read either as the
   * free-list link or as the owning handle, depending on the slot's state.
   */
  struct link_free {
      union {
          /* Index of the next free object; valid for non-allocated objects. */
          unsigned long next;
          /* Handle of the allocated object. */
          unsigned long handle;
      };
  };
  170. static struct kmem_cache *handle_cachep;
  171. static struct kmem_cache *zspage_cachep;
  172. struct zs_pool {
  173. const char *name;
  174. struct size_class *size_class[ZS_SIZE_CLASSES];
  175. atomic_long_t pages_allocated;
  176. struct zs_pool_stats stats;
  177. /* Compact classes */
  178. struct shrinker *shrinker;
  179. #ifdef CONFIG_ZSMALLOC_STAT
  180. struct dentry *stat_dentry;
  181. #endif
  182. #ifdef CONFIG_COMPACTION
  183. struct work_struct free_work;
  184. #endif
  185. /* protect zspage migration/compaction */
  186. rwlock_t lock;
  187. atomic_t compaction_in_progress;
  188. };
  /* Set PagePrivate on the page backing @zpdesc (tested by is_first_zpdesc()). */
  static inline void zpdesc_set_first(struct zpdesc *zpdesc)
  {
      struct page *page = zpdesc_page(zpdesc);

      SetPagePrivate(page);
  }
  193. static inline void zpdesc_inc_zone_page_state(struct zpdesc *zpdesc)
  194. {
  195. inc_zone_page_state(zpdesc_page(zpdesc), NR_ZSPAGES);
  196. }
  197. static inline void zpdesc_dec_zone_page_state(struct zpdesc *zpdesc)
  198. {
  199. dec_zone_page_state(zpdesc_page(zpdesc), NR_ZSPAGES);
  200. }
  201. static inline struct zpdesc *alloc_zpdesc(gfp_t gfp, const int nid)
  202. {
  203. struct page *page = alloc_pages_node(nid, gfp, 0);
  204. return page_zpdesc(page);
  205. }
  /* Release the page backing @zpdesc with __free_page(). */
  static inline void free_zpdesc(struct zpdesc *zpdesc)
  {
      /* PageZsmalloc is sticky until the page is freed to the buddy. */
      __free_page(zpdesc_page(zpdesc));
  }
  212. #define ZS_PAGE_UNLOCKED 0
  213. #define ZS_PAGE_WRLOCKED -1
  214. struct zspage_lock {
  215. spinlock_t lock;
  216. int cnt;
  217. struct lockdep_map dep_map;
  218. };
  219. struct zspage {
  220. struct {
  221. unsigned int huge:HUGE_BITS;
  222. unsigned int fullness:FULLNESS_BITS;
  223. unsigned int class:CLASS_BITS + 1;
  224. unsigned int magic:MAGIC_VAL_BITS;
  225. };
  226. unsigned int inuse;
  227. unsigned int freeobj;
  228. struct zpdesc *first_zpdesc;
  229. struct list_head list; /* fullness list */
  230. struct zs_pool *pool;
  231. struct zspage_lock zsl;
  232. };
  233. static void zspage_lock_init(struct zspage *zspage)
  234. {
  235. static struct lock_class_key __key;
  236. struct zspage_lock *zsl = &zspage->zsl;
  237. lockdep_init_map(&zsl->dep_map, "zspage->lock", &__key, 0);
  238. spin_lock_init(&zsl->lock);
  239. zsl->cnt = ZS_PAGE_UNLOCKED;
  240. }
  241. /*
  242. * The zspage lock can be held from atomic contexts, but it needs to remain
  243. * preemptible when held for reading because it remains held outside of those
  244. * atomic contexts, otherwise we unnecessarily lose preemptibility.
  245. *
  246. * To achieve this, the following rules are enforced on readers and writers:
  247. *
  248. * - Writers are blocked by both writers and readers, while readers are only
  249. * blocked by writers (i.e. normal rwlock semantics).
  250. *
  251. * - Writers are always atomic (to allow readers to spin waiting for them).
  252. *
  253. * - Writers always use trylock (as the lock may be held by sleeping readers).
  254. *
  255. * - Readers may spin on the lock (as they can only wait for atomic writers).
  256. *
  257. * - Readers may sleep while holding the lock (as writes only use trylock).
  258. */
  259. static void zspage_read_lock(struct zspage *zspage)
  260. {
  261. struct zspage_lock *zsl = &zspage->zsl;
  262. rwsem_acquire_read(&zsl->dep_map, 0, 0, _RET_IP_);
  263. spin_lock(&zsl->lock);
  264. zsl->cnt++;
  265. spin_unlock(&zsl->lock);
  266. lock_acquired(&zsl->dep_map, _RET_IP_);
  267. }
  268. static void zspage_read_unlock(struct zspage *zspage)
  269. {
  270. struct zspage_lock *zsl = &zspage->zsl;
  271. rwsem_release(&zsl->dep_map, _RET_IP_);
  272. spin_lock(&zsl->lock);
  273. zsl->cnt--;
  274. spin_unlock(&zsl->lock);
  275. }
  276. static __must_check bool zspage_write_trylock(struct zspage *zspage)
  277. {
  278. struct zspage_lock *zsl = &zspage->zsl;
  279. spin_lock(&zsl->lock);
  280. if (zsl->cnt == ZS_PAGE_UNLOCKED) {
  281. zsl->cnt = ZS_PAGE_WRLOCKED;
  282. rwsem_acquire(&zsl->dep_map, 0, 1, _RET_IP_);
  283. lock_acquired(&zsl->dep_map, _RET_IP_);
  284. return true;
  285. }
  286. spin_unlock(&zsl->lock);
  287. return false;
  288. }
  289. static void zspage_write_unlock(struct zspage *zspage)
  290. {
  291. struct zspage_lock *zsl = &zspage->zsl;
  292. rwsem_release(&zsl->dep_map, _RET_IP_);
  293. zsl->cnt = ZS_PAGE_UNLOCKED;
  294. spin_unlock(&zsl->lock);
  295. }
  296. /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
  297. static void SetZsHugePage(struct zspage *zspage)
  298. {
  299. zspage->huge = 1;
  300. }
  301. static bool ZsHugePage(struct zspage *zspage)
  302. {
  303. return zspage->huge;
  304. }
  #ifdef CONFIG_COMPACTION
  /* Declarations only; the definitions are not part of this section. */
  static void kick_deferred_free(struct zs_pool *pool);
  static void init_deferred_free(struct zs_pool *pool);
  static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage);
  #else
  /* Without CONFIG_COMPACTION these hooks do nothing. */
  static void kick_deferred_free(struct zs_pool *pool)
  {
  }

  static void init_deferred_free(struct zs_pool *pool)
  {
  }

  static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage)
  {
  }
  #endif
  314. static unsigned long cache_alloc_handle(gfp_t gfp)
  315. {
  316. gfp = gfp & ~(__GFP_HIGHMEM | __GFP_MOVABLE);
  317. return (unsigned long)kmem_cache_alloc(handle_cachep, gfp);
  318. }
  319. static void cache_free_handle(unsigned long handle)
  320. {
  321. kmem_cache_free(handle_cachep, (void *)handle);
  322. }
  323. static struct zspage *cache_alloc_zspage(gfp_t gfp)
  324. {
  325. gfp = gfp & ~(__GFP_HIGHMEM | __GFP_MOVABLE);
  326. return kmem_cache_zalloc(zspage_cachep, gfp);
  327. }
  328. static void cache_free_zspage(struct zspage *zspage)
  329. {
  330. kmem_cache_free(zspage_cachep, zspage);
  331. }
  /*
   * Store @obj into the word that @handle points at.
   * Races are synchronized by class->lock (which owns the handle).
   */
  static void record_obj(unsigned long handle, unsigned long obj)
  {
      unsigned long *slot = (unsigned long *)handle;

      *slot = obj;
  }
  337. static inline bool __maybe_unused is_first_zpdesc(struct zpdesc *zpdesc)
  338. {
  339. return PagePrivate(zpdesc_page(zpdesc));
  340. }
  341. /* Protected by class->lock */
  342. static inline int get_zspage_inuse(struct zspage *zspage)
  343. {
  344. return zspage->inuse;
  345. }
  346. static inline void mod_zspage_inuse(struct zspage *zspage, int val)
  347. {
  348. zspage->inuse += val;
  349. }
  350. static struct zpdesc *get_first_zpdesc(struct zspage *zspage)
  351. {
  352. struct zpdesc *first_zpdesc = zspage->first_zpdesc;
  353. VM_BUG_ON_PAGE(!is_first_zpdesc(first_zpdesc), zpdesc_page(first_zpdesc));
  354. return first_zpdesc;
  355. }
  356. #define FIRST_OBJ_PAGE_TYPE_MASK 0xffffff
  357. static inline unsigned int get_first_obj_offset(struct zpdesc *zpdesc)
  358. {
  359. VM_WARN_ON_ONCE(!PageZsmalloc(zpdesc_page(zpdesc)));
  360. return zpdesc->first_obj_offset & FIRST_OBJ_PAGE_TYPE_MASK;
  361. }
  362. static inline void set_first_obj_offset(struct zpdesc *zpdesc, unsigned int offset)
  363. {
  364. /* With 24 bits available, we can support offsets into 16 MiB pages. */
  365. BUILD_BUG_ON(PAGE_SIZE > SZ_16M);
  366. VM_WARN_ON_ONCE(!PageZsmalloc(zpdesc_page(zpdesc)));
  367. VM_WARN_ON_ONCE(offset & ~FIRST_OBJ_PAGE_TYPE_MASK);
  368. zpdesc->first_obj_offset &= ~FIRST_OBJ_PAGE_TYPE_MASK;
  369. zpdesc->first_obj_offset |= offset & FIRST_OBJ_PAGE_TYPE_MASK;
  370. }
  371. static inline unsigned int get_freeobj(struct zspage *zspage)
  372. {
  373. return zspage->freeobj;
  374. }
  375. static inline void set_freeobj(struct zspage *zspage, unsigned int obj)
  376. {
  377. zspage->freeobj = obj;
  378. }
  379. static struct size_class *zspage_class(struct zs_pool *pool,
  380. struct zspage *zspage)
  381. {
  382. return pool->size_class[zspage->class];
  383. }
  384. /*
  385. * zsmalloc divides the pool into various size classes where each
  386. * class maintains a list of zspages where each zspage is divided
  387. * into equal sized chunks. Each allocation falls into one of these
  388. * classes depending on its size. This function returns index of the
  389. * size class which has chunk size big enough to hold the given size.
  390. */
  391. static int get_size_class_index(int size)
  392. {
  393. int idx = 0;
  394. if (likely(size > ZS_MIN_ALLOC_SIZE))
  395. idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE,
  396. ZS_SIZE_CLASS_DELTA);
  397. return min_t(int, ZS_SIZE_CLASSES - 1, idx);
  398. }
  399. static inline void class_stat_add(struct size_class *class, int type,
  400. unsigned long cnt)
  401. {
  402. class->stats.objs[type] += cnt;
  403. }
  404. static inline void class_stat_sub(struct size_class *class, int type,
  405. unsigned long cnt)
  406. {
  407. class->stats.objs[type] -= cnt;
  408. }
  409. static inline unsigned long class_stat_read(struct size_class *class, int type)
  410. {
  411. return class->stats.objs[type];
  412. }
  413. #ifdef CONFIG_ZSMALLOC_STAT
  414. static void __init zs_stat_init(void)
  415. {
  416. if (!debugfs_initialized()) {
  417. pr_warn("debugfs not available, stat dir not created\n");
  418. return;
  419. }
  420. zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
  421. }
  422. static void __exit zs_stat_exit(void)
  423. {
  424. debugfs_remove_recursive(zs_stat_root);
  425. }
  426. static unsigned long zs_can_compact(struct size_class *class);
  427. static int zs_stats_size_show(struct seq_file *s, void *v)
  428. {
  429. int i, fg;
  430. struct zs_pool *pool = s->private;
  431. struct size_class *class;
  432. int objs_per_zspage;
  433. unsigned long obj_allocated, obj_used, pages_used, freeable;
  434. unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
  435. unsigned long total_freeable = 0;
  436. unsigned long inuse_totals[NR_FULLNESS_GROUPS] = {0, };
  437. seq_printf(s, " %5s %5s %9s %9s %9s %9s %9s %9s %9s %9s %9s %9s %9s %13s %10s %10s %16s %8s\n",
  438. "class", "size", "10%", "20%", "30%", "40%",
  439. "50%", "60%", "70%", "80%", "90%", "99%", "100%",
  440. "obj_allocated", "obj_used", "pages_used",
  441. "pages_per_zspage", "freeable");
  442. for (i = 0; i < ZS_SIZE_CLASSES; i++) {
  443. class = pool->size_class[i];
  444. if (class->index != i)
  445. continue;
  446. spin_lock(&class->lock);
  447. seq_printf(s, " %5u %5u ", i, class->size);
  448. for (fg = ZS_INUSE_RATIO_10; fg < NR_FULLNESS_GROUPS; fg++) {
  449. inuse_totals[fg] += class_stat_read(class, fg);
  450. seq_printf(s, "%9lu ", class_stat_read(class, fg));
  451. }
  452. obj_allocated = class_stat_read(class, ZS_OBJS_ALLOCATED);
  453. obj_used = class_stat_read(class, ZS_OBJS_INUSE);
  454. freeable = zs_can_compact(class);
  455. spin_unlock(&class->lock);
  456. objs_per_zspage = class->objs_per_zspage;
  457. pages_used = obj_allocated / objs_per_zspage *
  458. class->pages_per_zspage;
  459. seq_printf(s, "%13lu %10lu %10lu %16d %8lu\n",
  460. obj_allocated, obj_used, pages_used,
  461. class->pages_per_zspage, freeable);
  462. total_objs += obj_allocated;
  463. total_used_objs += obj_used;
  464. total_pages += pages_used;
  465. total_freeable += freeable;
  466. }
  467. seq_puts(s, "\n");
  468. seq_printf(s, " %5s %5s ", "Total", "");
  469. for (fg = ZS_INUSE_RATIO_10; fg < NR_FULLNESS_GROUPS; fg++)
  470. seq_printf(s, "%9lu ", inuse_totals[fg]);
  471. seq_printf(s, "%13lu %10lu %10lu %16s %8lu\n",
  472. total_objs, total_used_objs, total_pages, "",
  473. total_freeable);
  474. return 0;
  475. }
  476. DEFINE_SHOW_ATTRIBUTE(zs_stats_size);
  477. static void zs_pool_stat_create(struct zs_pool *pool, const char *name)
  478. {
  479. if (!zs_stat_root) {
  480. pr_warn("no root stat dir, not creating <%s> stat dir\n", name);
  481. return;
  482. }
  483. pool->stat_dentry = debugfs_create_dir(name, zs_stat_root);
  484. debugfs_create_file("classes", S_IFREG | 0444, pool->stat_dentry, pool,
  485. &zs_stats_size_fops);
  486. }
  487. static void zs_pool_stat_destroy(struct zs_pool *pool)
  488. {
  489. debugfs_remove_recursive(pool->stat_dentry);
  490. }
  491. #else /* CONFIG_ZSMALLOC_STAT */
  492. static void __init zs_stat_init(void)
  493. {
  494. }
  495. static void __exit zs_stat_exit(void)
  496. {
  497. }
  498. static inline void zs_pool_stat_create(struct zs_pool *pool, const char *name)
  499. {
  500. }
  501. static inline void zs_pool_stat_destroy(struct zs_pool *pool)
  502. {
  503. }
  504. #endif
  505. /*
  506. * For each size class, zspages are divided into different groups
  507. * depending on their usage ratio. This function returns fullness
  508. * status of the given page.
  509. */
  510. static int get_fullness_group(struct size_class *class, struct zspage *zspage)
  511. {
  512. int inuse, objs_per_zspage, ratio;
  513. inuse = get_zspage_inuse(zspage);
  514. objs_per_zspage = class->objs_per_zspage;
  515. if (inuse == 0)
  516. return ZS_INUSE_RATIO_0;
  517. if (inuse == objs_per_zspage)
  518. return ZS_INUSE_RATIO_100;
  519. ratio = 100 * inuse / objs_per_zspage;
  520. /*
  521. * Take integer division into consideration: a page with one inuse
  522. * object out of 127 possible, will end up having 0 usage ratio,
  523. * which is wrong as it belongs in ZS_INUSE_RATIO_10 fullness group.
  524. */
  525. return ratio / 10 + 1;
  526. }
  527. /*
  528. * Each size class maintains various freelists and zspages are assigned
  529. * to one of these freelists based on the number of live objects they
  530. * have. This functions inserts the given zspage into the freelist
  531. * identified by <class, fullness_group>.
  532. */
  533. static void insert_zspage(struct size_class *class,
  534. struct zspage *zspage,
  535. int fullness)
  536. {
  537. class_stat_add(class, fullness, 1);
  538. list_add(&zspage->list, &class->fullness_list[fullness]);
  539. zspage->fullness = fullness;
  540. }
  541. /*
  542. * This function removes the given zspage from the freelist identified
  543. * by <class, fullness_group>.
  544. */
  545. static void remove_zspage(struct size_class *class, struct zspage *zspage)
  546. {
  547. int fullness = zspage->fullness;
  548. VM_BUG_ON(list_empty(&class->fullness_list[fullness]));
  549. list_del_init(&zspage->list);
  550. class_stat_sub(class, fullness, 1);
  551. }
  552. /*
  553. * Each size class maintains zspages in different fullness groups depending
  554. * on the number of live objects they contain. When allocating or freeing
  555. * objects, the fullness status of the page can change, for instance, from
  556. * INUSE_RATIO_80 to INUSE_RATIO_70 when freeing an object. This function
  557. * checks if such a status change has occurred for the given page and
  558. * accordingly moves the page from the list of the old fullness group to that
  559. * of the new fullness group.
  560. */
  561. static int fix_fullness_group(struct size_class *class, struct zspage *zspage)
  562. {
  563. int newfg;
  564. newfg = get_fullness_group(class, zspage);
  565. if (newfg == zspage->fullness)
  566. goto out;
  567. remove_zspage(class, zspage);
  568. insert_zspage(class, zspage, newfg);
  569. out:
  570. return newfg;
  571. }
  572. static struct zspage *get_zspage(struct zpdesc *zpdesc)
  573. {
  574. struct zspage *zspage = zpdesc->zspage;
  575. BUG_ON(zspage->magic != ZSPAGE_MAGIC);
  576. return zspage;
  577. }
  578. static struct zpdesc *get_next_zpdesc(struct zpdesc *zpdesc)
  579. {
  580. struct zspage *zspage = get_zspage(zpdesc);
  581. if (unlikely(ZsHugePage(zspage)))
  582. return NULL;
  583. return zpdesc->next;
  584. }
  585. /**
  586. * obj_to_location - get (<zpdesc>, <obj_idx>) from encoded object value
  587. * @obj: the encoded object value
  588. * @zpdesc: zpdesc object resides in zspage
  589. * @obj_idx: object index
  590. */
  591. static void obj_to_location(unsigned long obj, struct zpdesc **zpdesc,
  592. unsigned int *obj_idx)
  593. {
  594. *zpdesc = pfn_zpdesc(obj >> OBJ_INDEX_BITS);
  595. *obj_idx = (obj & OBJ_INDEX_MASK);
  596. }
  597. static void obj_to_zpdesc(unsigned long obj, struct zpdesc **zpdesc)
  598. {
  599. *zpdesc = pfn_zpdesc(obj >> OBJ_INDEX_BITS);
  600. }
  601. /**
  602. * location_to_obj - get obj value encoded from (<zpdesc>, <obj_idx>)
  603. * @zpdesc: zpdesc object resides in zspage
  604. * @obj_idx: object index
  605. */
  606. static unsigned long location_to_obj(struct zpdesc *zpdesc, unsigned int obj_idx)
  607. {
  608. unsigned long obj;
  609. obj = zpdesc_pfn(zpdesc) << OBJ_INDEX_BITS;
  610. obj |= obj_idx & OBJ_INDEX_MASK;
  611. return obj;
  612. }
  /* Load the object value stored in the word that @handle points at. */
  static unsigned long handle_to_obj(unsigned long handle)
  {
      const unsigned long *slot = (unsigned long *)handle;

      return *slot;
  }
  617. static inline bool obj_allocated(struct zpdesc *zpdesc, void *obj,
  618. unsigned long *phandle)
  619. {
  620. unsigned long handle;
  621. struct zspage *zspage = get_zspage(zpdesc);
  622. if (unlikely(ZsHugePage(zspage))) {
  623. VM_BUG_ON_PAGE(!is_first_zpdesc(zpdesc), zpdesc_page(zpdesc));
  624. handle = zpdesc->handle;
  625. } else
  626. handle = *(unsigned long *)obj;
  627. if (!(handle & OBJ_ALLOCATED_TAG))
  628. return false;
  629. /* Clear all tags before returning the handle */
  630. *phandle = handle & ~OBJ_TAG_MASK;
  631. return true;
  632. }
  633. static void reset_zpdesc(struct zpdesc *zpdesc)
  634. {
  635. struct page *page = zpdesc_page(zpdesc);
  636. ClearPagePrivate(page);
  637. zpdesc->zspage = NULL;
  638. zpdesc->next = NULL;
  639. /* PageZsmalloc is sticky until the page is freed to the buddy. */
  640. }
  /*
   * Try to lock every zpdesc of @zspage, in chain order.
   *
   * Returns 1 with all of them locked. If any trylock fails, the ones
   * already taken are unlocked again and 0 is returned.
   */
  static int trylock_zspage(struct zspage *zspage)
  {
      struct zpdesc *cur, *undo;

      for (cur = get_first_zpdesc(zspage); cur != NULL;
           cur = get_next_zpdesc(cur)) {
          if (zpdesc_trylock(cur))
              continue;

          /* Roll back: release everything locked before the failure. */
          for (undo = get_first_zpdesc(zspage); undo != cur;
               undo = get_next_zpdesc(undo))
              zpdesc_unlock(undo);

          return 0;
      }

      return 1;
  }
  658. static void __free_zspage(struct zs_pool *pool, struct size_class *class,
  659. struct zspage *zspage)
  660. {
  661. struct zpdesc *zpdesc, *next;
  662. assert_spin_locked(&class->lock);
  663. VM_BUG_ON(get_zspage_inuse(zspage));
  664. VM_BUG_ON(zspage->fullness != ZS_INUSE_RATIO_0);
  665. next = zpdesc = get_first_zpdesc(zspage);
  666. do {
  667. VM_BUG_ON_PAGE(!zpdesc_is_locked(zpdesc), zpdesc_page(zpdesc));
  668. next = get_next_zpdesc(zpdesc);
  669. reset_zpdesc(zpdesc);
  670. zpdesc_unlock(zpdesc);
  671. zpdesc_dec_zone_page_state(zpdesc);
  672. zpdesc_put(zpdesc);
  673. zpdesc = next;
  674. } while (zpdesc != NULL);
  675. cache_free_zspage(zspage);
  676. class_stat_sub(class, ZS_OBJS_ALLOCATED, class->objs_per_zspage);
  677. atomic_long_sub(class->pages_per_zspage, &pool->pages_allocated);
  678. }
  679. static void free_zspage(struct zs_pool *pool, struct size_class *class,
  680. struct zspage *zspage)
  681. {
  682. VM_BUG_ON(get_zspage_inuse(zspage));
  683. VM_BUG_ON(list_empty(&zspage->list));
  684. /*
  685. * Since zs_free couldn't be sleepable, this function cannot call
  686. * lock_page. The page locks trylock_zspage got will be released
  687. * by __free_zspage.
  688. */
  689. if (!trylock_zspage(zspage)) {
  690. kick_deferred_free(pool);
  691. return;
  692. }
  693. remove_zspage(class, zspage);
  694. __free_zspage(pool, class, zspage);
  695. }
  696. /* Initialize a newly allocated zspage */
  697. static void init_zspage(struct size_class *class, struct zspage *zspage)
  698. {
  699. unsigned int freeobj = 1;
  700. unsigned long off = 0;
  701. struct zpdesc *zpdesc = get_first_zpdesc(zspage);
  702. while (zpdesc) {
  703. struct zpdesc *next_zpdesc;
  704. struct link_free *link;
  705. void *vaddr;
  706. set_first_obj_offset(zpdesc, off);
  707. vaddr = kmap_local_zpdesc(zpdesc);
  708. link = (struct link_free *)vaddr + off / sizeof(*link);
  709. while ((off += class->size) < PAGE_SIZE) {
  710. link->next = freeobj++ << OBJ_TAG_BITS;
  711. link += class->size / sizeof(*link);
  712. }
  713. /*
  714. * We now come to the last (full or partial) object on this
  715. * page, which must point to the first object on the next
  716. * page (if present)
  717. */
  718. next_zpdesc = get_next_zpdesc(zpdesc);
  719. if (next_zpdesc) {
  720. link->next = freeobj++ << OBJ_TAG_BITS;
  721. } else {
  722. /*
  723. * Reset OBJ_TAG_BITS bit to last link to tell
  724. * whether it's allocated object or not.
  725. */
  726. link->next = -1UL << OBJ_TAG_BITS;
  727. }
  728. kunmap_local(vaddr);
  729. zpdesc = next_zpdesc;
  730. off %= PAGE_SIZE;
  731. }
  732. set_freeobj(zspage, 0);
  733. }
  734. static void create_page_chain(struct size_class *class, struct zspage *zspage,
  735. struct zpdesc *zpdescs[])
  736. {
  737. int i;
  738. struct zpdesc *zpdesc;
  739. struct zpdesc *prev_zpdesc = NULL;
  740. int nr_zpdescs = class->pages_per_zspage;
  741. /*
  742. * Allocate individual pages and link them together as:
  743. * 1. all pages are linked together using zpdesc->next
  744. * 2. each sub-page point to zspage using zpdesc->zspage
  745. *
  746. * we set PG_private to identify the first zpdesc (i.e. no other zpdesc
  747. * has this flag set).
  748. */
  749. for (i = 0; i < nr_zpdescs; i++) {
  750. zpdesc = zpdescs[i];
  751. zpdesc->zspage = zspage;
  752. zpdesc->next = NULL;
  753. if (i == 0) {
  754. zspage->first_zpdesc = zpdesc;
  755. zpdesc_set_first(zpdesc);
  756. if (unlikely(class->objs_per_zspage == 1 &&
  757. class->pages_per_zspage == 1))
  758. SetZsHugePage(zspage);
  759. } else {
  760. prev_zpdesc->next = zpdesc;
  761. }
  762. prev_zpdesc = zpdesc;
  763. }
  764. }
  765. /*
  766. * Allocate a zspage for the given size class
  767. */
  768. static struct zspage *alloc_zspage(struct zs_pool *pool,
  769. struct size_class *class,
  770. gfp_t gfp, const int nid)
  771. {
  772. int i;
  773. struct zpdesc *zpdescs[ZS_MAX_PAGES_PER_ZSPAGE];
  774. struct zspage *zspage = cache_alloc_zspage(gfp);
  775. if (!zspage)
  776. return NULL;
  777. if (!IS_ENABLED(CONFIG_COMPACTION))
  778. gfp &= ~__GFP_MOVABLE;
  779. zspage->magic = ZSPAGE_MAGIC;
  780. zspage->pool = pool;
  781. zspage->class = class->index;
  782. zspage_lock_init(zspage);
  783. for (i = 0; i < class->pages_per_zspage; i++) {
  784. struct zpdesc *zpdesc;
  785. zpdesc = alloc_zpdesc(gfp, nid);
  786. if (!zpdesc) {
  787. while (--i >= 0) {
  788. zpdesc_dec_zone_page_state(zpdescs[i]);
  789. free_zpdesc(zpdescs[i]);
  790. }
  791. cache_free_zspage(zspage);
  792. return NULL;
  793. }
  794. __zpdesc_set_zsmalloc(zpdesc);
  795. zpdesc_inc_zone_page_state(zpdesc);
  796. zpdescs[i] = zpdesc;
  797. }
  798. create_page_chain(class, zspage, zpdescs);
  799. init_zspage(class, zspage);
  800. return zspage;
  801. }
  802. static struct zspage *find_get_zspage(struct size_class *class)
  803. {
  804. int i;
  805. struct zspage *zspage;
  806. for (i = ZS_INUSE_RATIO_99; i >= ZS_INUSE_RATIO_0; i--) {
  807. zspage = list_first_entry_or_null(&class->fullness_list[i],
  808. struct zspage, list);
  809. if (zspage)
  810. break;
  811. }
  812. return zspage;
  813. }
  814. static bool can_merge(struct size_class *prev, int pages_per_zspage,
  815. int objs_per_zspage)
  816. {
  817. if (prev->pages_per_zspage == pages_per_zspage &&
  818. prev->objs_per_zspage == objs_per_zspage)
  819. return true;
  820. return false;
  821. }
  822. static bool zspage_full(struct size_class *class, struct zspage *zspage)
  823. {
  824. return get_zspage_inuse(zspage) == class->objs_per_zspage;
  825. }
  826. static bool zspage_empty(struct zspage *zspage)
  827. {
  828. return get_zspage_inuse(zspage) == 0;
  829. }
  830. /**
  831. * zs_lookup_class_index() - Returns index of the zsmalloc &size_class
  832. * that hold objects of the provided size.
  833. * @pool: zsmalloc pool to use
  834. * @size: object size
  835. *
  836. * Context: Any context.
  837. *
  838. * Return: the index of the zsmalloc &size_class that hold objects of the
  839. * provided size.
  840. */
  841. unsigned int zs_lookup_class_index(struct zs_pool *pool, unsigned int size)
  842. {
  843. struct size_class *class;
  844. class = pool->size_class[get_size_class_index(size)];
  845. return class->index;
  846. }
  847. EXPORT_SYMBOL_GPL(zs_lookup_class_index);
  848. unsigned long zs_get_total_pages(struct zs_pool *pool)
  849. {
  850. return atomic_long_read(&pool->pages_allocated);
  851. }
  852. EXPORT_SYMBOL_GPL(zs_get_total_pages);
  853. void *zs_obj_read_begin(struct zs_pool *pool, unsigned long handle,
  854. size_t mem_len, void *local_copy)
  855. {
  856. struct zspage *zspage;
  857. struct zpdesc *zpdesc;
  858. unsigned long obj, off;
  859. unsigned int obj_idx;
  860. struct size_class *class;
  861. void *addr;
  862. /* Guarantee we can get zspage from handle safely */
  863. read_lock(&pool->lock);
  864. obj = handle_to_obj(handle);
  865. obj_to_location(obj, &zpdesc, &obj_idx);
  866. zspage = get_zspage(zpdesc);
  867. /* Make sure migration doesn't move any pages in this zspage */
  868. zspage_read_lock(zspage);
  869. read_unlock(&pool->lock);
  870. class = zspage_class(pool, zspage);
  871. off = offset_in_page(class->size * obj_idx);
  872. if (!ZsHugePage(zspage))
  873. off += ZS_HANDLE_SIZE;
  874. if (off + mem_len <= PAGE_SIZE) {
  875. /* this object is contained entirely within a page */
  876. addr = kmap_local_zpdesc(zpdesc);
  877. addr += off;
  878. } else {
  879. size_t sizes[2];
  880. /* this object spans two pages */
  881. sizes[0] = PAGE_SIZE - off;
  882. sizes[1] = mem_len - sizes[0];
  883. addr = local_copy;
  884. memcpy_from_page(addr, zpdesc_page(zpdesc),
  885. off, sizes[0]);
  886. zpdesc = get_next_zpdesc(zpdesc);
  887. memcpy_from_page(addr + sizes[0],
  888. zpdesc_page(zpdesc),
  889. 0, sizes[1]);
  890. }
  891. return addr;
  892. }
  893. EXPORT_SYMBOL_GPL(zs_obj_read_begin);
  894. void zs_obj_read_end(struct zs_pool *pool, unsigned long handle,
  895. size_t mem_len, void *handle_mem)
  896. {
  897. struct zspage *zspage;
  898. struct zpdesc *zpdesc;
  899. unsigned long obj, off;
  900. unsigned int obj_idx;
  901. struct size_class *class;
  902. obj = handle_to_obj(handle);
  903. obj_to_location(obj, &zpdesc, &obj_idx);
  904. zspage = get_zspage(zpdesc);
  905. class = zspage_class(pool, zspage);
  906. off = offset_in_page(class->size * obj_idx);
  907. if (!ZsHugePage(zspage))
  908. off += ZS_HANDLE_SIZE;
  909. if (off + mem_len <= PAGE_SIZE) {
  910. handle_mem -= off;
  911. kunmap_local(handle_mem);
  912. }
  913. zspage_read_unlock(zspage);
  914. }
  915. EXPORT_SYMBOL_GPL(zs_obj_read_end);
  916. void zs_obj_read_sg_begin(struct zs_pool *pool, unsigned long handle,
  917. struct scatterlist *sg, size_t mem_len)
  918. {
  919. struct zspage *zspage;
  920. struct zpdesc *zpdesc;
  921. unsigned long obj, off;
  922. unsigned int obj_idx;
  923. struct size_class *class;
  924. /* Guarantee we can get zspage from handle safely */
  925. read_lock(&pool->lock);
  926. obj = handle_to_obj(handle);
  927. obj_to_location(obj, &zpdesc, &obj_idx);
  928. zspage = get_zspage(zpdesc);
  929. /* Make sure migration doesn't move any pages in this zspage */
  930. zspage_read_lock(zspage);
  931. read_unlock(&pool->lock);
  932. class = zspage_class(pool, zspage);
  933. off = offset_in_page(class->size * obj_idx);
  934. if (!ZsHugePage(zspage))
  935. off += ZS_HANDLE_SIZE;
  936. if (off + mem_len <= PAGE_SIZE) {
  937. /* this object is contained entirely within a page */
  938. sg_init_table(sg, 1);
  939. sg_set_page(sg, zpdesc_page(zpdesc), mem_len, off);
  940. } else {
  941. size_t sizes[2];
  942. /* this object spans two pages */
  943. sizes[0] = PAGE_SIZE - off;
  944. sizes[1] = mem_len - sizes[0];
  945. sg_init_table(sg, 2);
  946. sg_set_page(sg, zpdesc_page(zpdesc), sizes[0], off);
  947. zpdesc = get_next_zpdesc(zpdesc);
  948. sg = sg_next(sg);
  949. sg_set_page(sg, zpdesc_page(zpdesc), sizes[1], 0);
  950. }
  951. }
  952. EXPORT_SYMBOL_GPL(zs_obj_read_sg_begin);
  953. void zs_obj_read_sg_end(struct zs_pool *pool, unsigned long handle)
  954. {
  955. struct zspage *zspage;
  956. struct zpdesc *zpdesc;
  957. unsigned long obj;
  958. unsigned int obj_idx;
  959. obj = handle_to_obj(handle);
  960. obj_to_location(obj, &zpdesc, &obj_idx);
  961. zspage = get_zspage(zpdesc);
  962. zspage_read_unlock(zspage);
  963. }
  964. EXPORT_SYMBOL_GPL(zs_obj_read_sg_end);
  965. void zs_obj_write(struct zs_pool *pool, unsigned long handle,
  966. void *handle_mem, size_t mem_len)
  967. {
  968. struct zspage *zspage;
  969. struct zpdesc *zpdesc;
  970. unsigned long obj, off;
  971. unsigned int obj_idx;
  972. struct size_class *class;
  973. /* Guarantee we can get zspage from handle safely */
  974. read_lock(&pool->lock);
  975. obj = handle_to_obj(handle);
  976. obj_to_location(obj, &zpdesc, &obj_idx);
  977. zspage = get_zspage(zpdesc);
  978. /* Make sure migration doesn't move any pages in this zspage */
  979. zspage_read_lock(zspage);
  980. read_unlock(&pool->lock);
  981. class = zspage_class(pool, zspage);
  982. off = offset_in_page(class->size * obj_idx);
  983. if (!ZsHugePage(zspage))
  984. off += ZS_HANDLE_SIZE;
  985. if (off + mem_len <= PAGE_SIZE) {
  986. /* this object is contained entirely within a page */
  987. void *dst = kmap_local_zpdesc(zpdesc);
  988. memcpy(dst + off, handle_mem, mem_len);
  989. kunmap_local(dst);
  990. } else {
  991. /* this object spans two pages */
  992. size_t sizes[2];
  993. sizes[0] = PAGE_SIZE - off;
  994. sizes[1] = mem_len - sizes[0];
  995. memcpy_to_page(zpdesc_page(zpdesc), off,
  996. handle_mem, sizes[0]);
  997. zpdesc = get_next_zpdesc(zpdesc);
  998. memcpy_to_page(zpdesc_page(zpdesc), 0,
  999. handle_mem + sizes[0], sizes[1]);
  1000. }
  1001. zspage_read_unlock(zspage);
  1002. }
  1003. EXPORT_SYMBOL_GPL(zs_obj_write);
  1004. /**
  1005. * zs_huge_class_size() - Returns the size (in bytes) of the first huge
  1006. * zsmalloc &size_class.
  1007. * @pool: zsmalloc pool to use
  1008. *
  1009. * The function returns the size of the first huge class - any object of equal
  1010. * or bigger size will be stored in zspage consisting of a single physical
  1011. * page.
  1012. *
  1013. * Context: Any context.
  1014. *
  1015. * Return: the size (in bytes) of the first huge zsmalloc &size_class.
  1016. */
  1017. size_t zs_huge_class_size(struct zs_pool *pool)
  1018. {
  1019. return huge_class_size;
  1020. }
  1021. EXPORT_SYMBOL_GPL(zs_huge_class_size);
  1022. static unsigned long obj_malloc(struct zs_pool *pool,
  1023. struct zspage *zspage, unsigned long handle)
  1024. {
  1025. int i, nr_zpdesc, offset;
  1026. unsigned long obj;
  1027. struct link_free *link;
  1028. struct size_class *class;
  1029. struct zpdesc *m_zpdesc;
  1030. unsigned long m_offset;
  1031. void *vaddr;
  1032. class = pool->size_class[zspage->class];
  1033. obj = get_freeobj(zspage);
  1034. offset = obj * class->size;
  1035. nr_zpdesc = offset >> PAGE_SHIFT;
  1036. m_offset = offset_in_page(offset);
  1037. m_zpdesc = get_first_zpdesc(zspage);
  1038. for (i = 0; i < nr_zpdesc; i++)
  1039. m_zpdesc = get_next_zpdesc(m_zpdesc);
  1040. vaddr = kmap_local_zpdesc(m_zpdesc);
  1041. link = (struct link_free *)vaddr + m_offset / sizeof(*link);
  1042. set_freeobj(zspage, link->next >> OBJ_TAG_BITS);
  1043. if (likely(!ZsHugePage(zspage)))
  1044. /* record handle in the header of allocated chunk */
  1045. link->handle = handle | OBJ_ALLOCATED_TAG;
  1046. else
  1047. zspage->first_zpdesc->handle = handle | OBJ_ALLOCATED_TAG;
  1048. kunmap_local(vaddr);
  1049. mod_zspage_inuse(zspage, 1);
  1050. obj = location_to_obj(m_zpdesc, obj);
  1051. record_obj(handle, obj);
  1052. return obj;
  1053. }
  1054. /**
  1055. * zs_malloc - Allocate block of given size from pool.
  1056. * @pool: pool to allocate from
  1057. * @size: size of block to allocate
  1058. * @gfp: gfp flags when allocating object
  1059. * @nid: The preferred node id to allocate new zspage (if needed)
  1060. *
  1061. * On success, handle to the allocated object is returned,
  1062. * otherwise an ERR_PTR().
  1063. * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail.
  1064. */
  1065. unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp,
  1066. const int nid)
  1067. {
  1068. unsigned long handle;
  1069. struct size_class *class;
  1070. int newfg;
  1071. struct zspage *zspage;
  1072. if (unlikely(!size))
  1073. return (unsigned long)ERR_PTR(-EINVAL);
  1074. if (unlikely(size > ZS_MAX_ALLOC_SIZE))
  1075. return (unsigned long)ERR_PTR(-ENOSPC);
  1076. handle = cache_alloc_handle(gfp);
  1077. if (!handle)
  1078. return (unsigned long)ERR_PTR(-ENOMEM);
  1079. /* extra space in chunk to keep the handle */
  1080. size += ZS_HANDLE_SIZE;
  1081. class = pool->size_class[get_size_class_index(size)];
  1082. /* class->lock effectively protects the zpage migration */
  1083. spin_lock(&class->lock);
  1084. zspage = find_get_zspage(class);
  1085. if (likely(zspage)) {
  1086. obj_malloc(pool, zspage, handle);
  1087. /* Now move the zspage to another fullness group, if required */
  1088. fix_fullness_group(class, zspage);
  1089. class_stat_add(class, ZS_OBJS_INUSE, 1);
  1090. goto out;
  1091. }
  1092. spin_unlock(&class->lock);
  1093. zspage = alloc_zspage(pool, class, gfp, nid);
  1094. if (!zspage) {
  1095. cache_free_handle(handle);
  1096. return (unsigned long)ERR_PTR(-ENOMEM);
  1097. }
  1098. spin_lock(&class->lock);
  1099. obj_malloc(pool, zspage, handle);
  1100. newfg = get_fullness_group(class, zspage);
  1101. insert_zspage(class, zspage, newfg);
  1102. atomic_long_add(class->pages_per_zspage, &pool->pages_allocated);
  1103. class_stat_add(class, ZS_OBJS_ALLOCATED, class->objs_per_zspage);
  1104. class_stat_add(class, ZS_OBJS_INUSE, 1);
  1105. /* We completely set up zspage so mark them as movable */
  1106. SetZsPageMovable(pool, zspage);
  1107. out:
  1108. spin_unlock(&class->lock);
  1109. return handle;
  1110. }
  1111. EXPORT_SYMBOL_GPL(zs_malloc);
  1112. static void obj_free(int class_size, unsigned long obj)
  1113. {
  1114. struct link_free *link;
  1115. struct zspage *zspage;
  1116. struct zpdesc *f_zpdesc;
  1117. unsigned long f_offset;
  1118. unsigned int f_objidx;
  1119. void *vaddr;
  1120. obj_to_location(obj, &f_zpdesc, &f_objidx);
  1121. f_offset = offset_in_page(class_size * f_objidx);
  1122. zspage = get_zspage(f_zpdesc);
  1123. vaddr = kmap_local_zpdesc(f_zpdesc);
  1124. link = (struct link_free *)(vaddr + f_offset);
  1125. /* Insert this object in containing zspage's freelist */
  1126. if (likely(!ZsHugePage(zspage)))
  1127. link->next = get_freeobj(zspage) << OBJ_TAG_BITS;
  1128. else
  1129. f_zpdesc->handle = 0;
  1130. set_freeobj(zspage, f_objidx);
  1131. kunmap_local(vaddr);
  1132. mod_zspage_inuse(zspage, -1);
  1133. }
  1134. void zs_free(struct zs_pool *pool, unsigned long handle)
  1135. {
  1136. struct zspage *zspage;
  1137. struct zpdesc *f_zpdesc;
  1138. unsigned long obj;
  1139. struct size_class *class;
  1140. int fullness;
  1141. if (IS_ERR_OR_NULL((void *)handle))
  1142. return;
  1143. /*
  1144. * The pool->lock protects the race with zpage's migration
  1145. * so it's safe to get the page from handle.
  1146. */
  1147. read_lock(&pool->lock);
  1148. obj = handle_to_obj(handle);
  1149. obj_to_zpdesc(obj, &f_zpdesc);
  1150. zspage = get_zspage(f_zpdesc);
  1151. class = zspage_class(pool, zspage);
  1152. spin_lock(&class->lock);
  1153. read_unlock(&pool->lock);
  1154. class_stat_sub(class, ZS_OBJS_INUSE, 1);
  1155. obj_free(class->size, obj);
  1156. fullness = fix_fullness_group(class, zspage);
  1157. if (fullness == ZS_INUSE_RATIO_0)
  1158. free_zspage(pool, class, zspage);
  1159. spin_unlock(&class->lock);
  1160. cache_free_handle(handle);
  1161. }
  1162. EXPORT_SYMBOL_GPL(zs_free);
  1163. static void zs_object_copy(struct size_class *class, unsigned long dst,
  1164. unsigned long src)
  1165. {
  1166. struct zpdesc *s_zpdesc, *d_zpdesc;
  1167. unsigned int s_objidx, d_objidx;
  1168. unsigned long s_off, d_off;
  1169. void *s_addr, *d_addr;
  1170. int s_size, d_size, size;
  1171. int written = 0;
  1172. s_size = d_size = class->size;
  1173. obj_to_location(src, &s_zpdesc, &s_objidx);
  1174. obj_to_location(dst, &d_zpdesc, &d_objidx);
  1175. s_off = offset_in_page(class->size * s_objidx);
  1176. d_off = offset_in_page(class->size * d_objidx);
  1177. if (s_off + class->size > PAGE_SIZE)
  1178. s_size = PAGE_SIZE - s_off;
  1179. if (d_off + class->size > PAGE_SIZE)
  1180. d_size = PAGE_SIZE - d_off;
  1181. s_addr = kmap_local_zpdesc(s_zpdesc);
  1182. d_addr = kmap_local_zpdesc(d_zpdesc);
  1183. while (1) {
  1184. size = min(s_size, d_size);
  1185. memcpy(d_addr + d_off, s_addr + s_off, size);
  1186. written += size;
  1187. if (written == class->size)
  1188. break;
  1189. s_off += size;
  1190. s_size -= size;
  1191. d_off += size;
  1192. d_size -= size;
  1193. /*
  1194. * Calling kunmap_local(d_addr) is necessary. kunmap_local()
  1195. * calls must occurs in reverse order of calls to kmap_local_page().
  1196. * So, to call kunmap_local(s_addr) we should first call
  1197. * kunmap_local(d_addr). For more details see
  1198. * Documentation/mm/highmem.rst.
  1199. */
  1200. if (s_off >= PAGE_SIZE) {
  1201. kunmap_local(d_addr);
  1202. kunmap_local(s_addr);
  1203. s_zpdesc = get_next_zpdesc(s_zpdesc);
  1204. s_addr = kmap_local_zpdesc(s_zpdesc);
  1205. d_addr = kmap_local_zpdesc(d_zpdesc);
  1206. s_size = class->size - written;
  1207. s_off = 0;
  1208. }
  1209. if (d_off >= PAGE_SIZE) {
  1210. kunmap_local(d_addr);
  1211. d_zpdesc = get_next_zpdesc(d_zpdesc);
  1212. d_addr = kmap_local_zpdesc(d_zpdesc);
  1213. d_size = class->size - written;
  1214. d_off = 0;
  1215. }
  1216. }
  1217. kunmap_local(d_addr);
  1218. kunmap_local(s_addr);
  1219. }
  1220. /*
  1221. * Find alloced object in zspage from index object and
  1222. * return handle.
  1223. */
  1224. static unsigned long find_alloced_obj(struct size_class *class,
  1225. struct zpdesc *zpdesc, int *obj_idx)
  1226. {
  1227. unsigned int offset;
  1228. int index = *obj_idx;
  1229. unsigned long handle = 0;
  1230. void *addr = kmap_local_zpdesc(zpdesc);
  1231. offset = get_first_obj_offset(zpdesc);
  1232. offset += class->size * index;
  1233. while (offset < PAGE_SIZE) {
  1234. if (obj_allocated(zpdesc, addr + offset, &handle))
  1235. break;
  1236. offset += class->size;
  1237. index++;
  1238. }
  1239. kunmap_local(addr);
  1240. *obj_idx = index;
  1241. return handle;
  1242. }
  1243. static void migrate_zspage(struct zs_pool *pool, struct zspage *src_zspage,
  1244. struct zspage *dst_zspage)
  1245. {
  1246. unsigned long used_obj, free_obj;
  1247. unsigned long handle;
  1248. int obj_idx = 0;
  1249. struct zpdesc *s_zpdesc = get_first_zpdesc(src_zspage);
  1250. struct size_class *class = pool->size_class[src_zspage->class];
  1251. while (1) {
  1252. handle = find_alloced_obj(class, s_zpdesc, &obj_idx);
  1253. if (!handle) {
  1254. s_zpdesc = get_next_zpdesc(s_zpdesc);
  1255. if (!s_zpdesc)
  1256. break;
  1257. obj_idx = 0;
  1258. continue;
  1259. }
  1260. used_obj = handle_to_obj(handle);
  1261. free_obj = obj_malloc(pool, dst_zspage, handle);
  1262. zs_object_copy(class, free_obj, used_obj);
  1263. obj_idx++;
  1264. obj_free(class->size, used_obj);
  1265. /* Stop if there is no more space */
  1266. if (zspage_full(class, dst_zspage))
  1267. break;
  1268. /* Stop if there are no more objects to migrate */
  1269. if (zspage_empty(src_zspage))
  1270. break;
  1271. }
  1272. }
  1273. static struct zspage *isolate_src_zspage(struct size_class *class)
  1274. {
  1275. struct zspage *zspage;
  1276. int fg;
  1277. for (fg = ZS_INUSE_RATIO_10; fg <= ZS_INUSE_RATIO_99; fg++) {
  1278. zspage = list_first_entry_or_null(&class->fullness_list[fg],
  1279. struct zspage, list);
  1280. if (zspage) {
  1281. remove_zspage(class, zspage);
  1282. return zspage;
  1283. }
  1284. }
  1285. return zspage;
  1286. }
  1287. static struct zspage *isolate_dst_zspage(struct size_class *class)
  1288. {
  1289. struct zspage *zspage;
  1290. int fg;
  1291. for (fg = ZS_INUSE_RATIO_99; fg >= ZS_INUSE_RATIO_10; fg--) {
  1292. zspage = list_first_entry_or_null(&class->fullness_list[fg],
  1293. struct zspage, list);
  1294. if (zspage) {
  1295. remove_zspage(class, zspage);
  1296. return zspage;
  1297. }
  1298. }
  1299. return zspage;
  1300. }
  1301. /*
  1302. * putback_zspage - add @zspage into right class's fullness list
  1303. * @class: destination class
  1304. * @zspage: target page
  1305. *
  1306. * Return @zspage's fullness status
  1307. */
  1308. static int putback_zspage(struct size_class *class, struct zspage *zspage)
  1309. {
  1310. int fullness;
  1311. fullness = get_fullness_group(class, zspage);
  1312. insert_zspage(class, zspage, fullness);
  1313. return fullness;
  1314. }
  1315. #ifdef CONFIG_COMPACTION
  1316. /*
  1317. * To prevent zspage destroy during migration, zspage freeing should
  1318. * hold locks of all pages in the zspage.
  1319. */
  1320. static void lock_zspage(struct zspage *zspage)
  1321. {
  1322. struct zpdesc *curr_zpdesc, *zpdesc;
  1323. /*
  1324. * Pages we haven't locked yet can be migrated off the list while we're
  1325. * trying to lock them, so we need to be careful and only attempt to
  1326. * lock each page under zspage_read_lock(). Otherwise, the page we lock
  1327. * may no longer belong to the zspage. This means that we may wait for
  1328. * the wrong page to unlock, so we must take a reference to the page
  1329. * prior to waiting for it to unlock outside zspage_read_lock().
  1330. */
  1331. while (1) {
  1332. zspage_read_lock(zspage);
  1333. zpdesc = get_first_zpdesc(zspage);
  1334. if (zpdesc_trylock(zpdesc))
  1335. break;
  1336. zpdesc_get(zpdesc);
  1337. zspage_read_unlock(zspage);
  1338. zpdesc_wait_locked(zpdesc);
  1339. zpdesc_put(zpdesc);
  1340. }
  1341. curr_zpdesc = zpdesc;
  1342. while ((zpdesc = get_next_zpdesc(curr_zpdesc))) {
  1343. if (zpdesc_trylock(zpdesc)) {
  1344. curr_zpdesc = zpdesc;
  1345. } else {
  1346. zpdesc_get(zpdesc);
  1347. zspage_read_unlock(zspage);
  1348. zpdesc_wait_locked(zpdesc);
  1349. zpdesc_put(zpdesc);
  1350. zspage_read_lock(zspage);
  1351. }
  1352. }
  1353. zspage_read_unlock(zspage);
  1354. }
  1355. #endif /* CONFIG_COMPACTION */
  1356. #ifdef CONFIG_COMPACTION
  1357. static void replace_sub_page(struct size_class *class, struct zspage *zspage,
  1358. struct zpdesc *newzpdesc, struct zpdesc *oldzpdesc)
  1359. {
  1360. struct zpdesc *zpdesc;
  1361. struct zpdesc *zpdescs[ZS_MAX_PAGES_PER_ZSPAGE] = {NULL, };
  1362. unsigned int first_obj_offset;
  1363. int idx = 0;
  1364. zpdesc = get_first_zpdesc(zspage);
  1365. do {
  1366. if (zpdesc == oldzpdesc)
  1367. zpdescs[idx] = newzpdesc;
  1368. else
  1369. zpdescs[idx] = zpdesc;
  1370. idx++;
  1371. } while ((zpdesc = get_next_zpdesc(zpdesc)) != NULL);
  1372. create_page_chain(class, zspage, zpdescs);
  1373. first_obj_offset = get_first_obj_offset(oldzpdesc);
  1374. set_first_obj_offset(newzpdesc, first_obj_offset);
  1375. if (unlikely(ZsHugePage(zspage)))
  1376. newzpdesc->handle = oldzpdesc->handle;
  1377. __zpdesc_set_movable(newzpdesc);
  1378. }
  1379. static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
  1380. {
  1381. /*
  1382. * Page is locked so zspage can't be destroyed concurrently
  1383. * (see free_zspage()). But if the page was already destroyed
  1384. * (see reset_zpdesc()), refuse isolation here.
  1385. */
  1386. return page_zpdesc(page)->zspage;
  1387. }
  1388. static int zs_page_migrate(struct page *newpage, struct page *page,
  1389. enum migrate_mode mode)
  1390. {
  1391. struct zs_pool *pool;
  1392. struct size_class *class;
  1393. struct zspage *zspage;
  1394. struct zpdesc *dummy;
  1395. struct zpdesc *newzpdesc = page_zpdesc(newpage);
  1396. struct zpdesc *zpdesc = page_zpdesc(page);
  1397. void *s_addr, *d_addr, *addr;
  1398. unsigned int offset;
  1399. unsigned long handle;
  1400. unsigned long old_obj, new_obj;
  1401. unsigned int obj_idx;
  1402. /*
  1403. * TODO: nothing prevents a zspage from getting destroyed while
  1404. * it is isolated for migration, as the page lock is temporarily
  1405. * dropped after zs_page_isolate() succeeded: we should rework that
  1406. * and defer destroying such pages once they are un-isolated (putback)
  1407. * instead.
  1408. */
  1409. if (!zpdesc->zspage)
  1410. return 0;
  1411. /* The page is locked, so this pointer must remain valid */
  1412. zspage = get_zspage(zpdesc);
  1413. pool = zspage->pool;
  1414. /*
  1415. * The pool migrate_lock protects the race between zpage migration
  1416. * and zs_free.
  1417. */
  1418. write_lock(&pool->lock);
  1419. class = zspage_class(pool, zspage);
  1420. /*
  1421. * the class lock protects zpage alloc/free in the zspage.
  1422. */
  1423. spin_lock(&class->lock);
  1424. /* the zspage write_lock protects zpage access via zs_obj_read/write() */
  1425. if (!zspage_write_trylock(zspage)) {
  1426. spin_unlock(&class->lock);
  1427. write_unlock(&pool->lock);
  1428. return -EINVAL;
  1429. }
  1430. /* We're committed, tell the world that this is a Zsmalloc page. */
  1431. __zpdesc_set_zsmalloc(newzpdesc);
  1432. offset = get_first_obj_offset(zpdesc);
  1433. s_addr = kmap_local_zpdesc(zpdesc);
  1434. /*
  1435. * Here, any user cannot access all objects in the zspage so let's move.
  1436. */
  1437. d_addr = kmap_local_zpdesc(newzpdesc);
  1438. copy_page(d_addr, s_addr);
  1439. kunmap_local(d_addr);
  1440. for (addr = s_addr + offset; addr < s_addr + PAGE_SIZE;
  1441. addr += class->size) {
  1442. if (obj_allocated(zpdesc, addr, &handle)) {
  1443. old_obj = handle_to_obj(handle);
  1444. obj_to_location(old_obj, &dummy, &obj_idx);
  1445. new_obj = (unsigned long)location_to_obj(newzpdesc, obj_idx);
  1446. record_obj(handle, new_obj);
  1447. }
  1448. }
  1449. kunmap_local(s_addr);
  1450. replace_sub_page(class, zspage, newzpdesc, zpdesc);
  1451. /*
  1452. * Since we complete the data copy and set up new zspage structure,
  1453. * it's okay to release migration_lock.
  1454. */
  1455. write_unlock(&pool->lock);
  1456. spin_unlock(&class->lock);
  1457. zspage_write_unlock(zspage);
  1458. zpdesc_get(newzpdesc);
  1459. if (zpdesc_zone(newzpdesc) != zpdesc_zone(zpdesc)) {
  1460. zpdesc_dec_zone_page_state(zpdesc);
  1461. zpdesc_inc_zone_page_state(newzpdesc);
  1462. }
  1463. reset_zpdesc(zpdesc);
  1464. zpdesc_put(zpdesc);
  1465. return 0;
  1466. }
  1467. static void zs_page_putback(struct page *page)
  1468. {
  1469. }
  1470. const struct movable_operations zsmalloc_mops = {
  1471. .isolate_page = zs_page_isolate,
  1472. .migrate_page = zs_page_migrate,
  1473. .putback_page = zs_page_putback,
  1474. };
  1475. /*
  1476. * Caller should hold page_lock of all pages in the zspage
  1477. * In here, we cannot use zspage meta data.
  1478. */
  1479. static void async_free_zspage(struct work_struct *work)
  1480. {
  1481. int i;
  1482. struct size_class *class;
  1483. struct zspage *zspage, *tmp;
  1484. LIST_HEAD(free_pages);
  1485. struct zs_pool *pool = container_of(work, struct zs_pool,
  1486. free_work);
  1487. for (i = 0; i < ZS_SIZE_CLASSES; i++) {
  1488. class = pool->size_class[i];
  1489. if (class->index != i)
  1490. continue;
  1491. spin_lock(&class->lock);
  1492. list_splice_init(&class->fullness_list[ZS_INUSE_RATIO_0],
  1493. &free_pages);
  1494. spin_unlock(&class->lock);
  1495. }
  1496. list_for_each_entry_safe(zspage, tmp, &free_pages, list) {
  1497. list_del(&zspage->list);
  1498. lock_zspage(zspage);
  1499. class = zspage_class(pool, zspage);
  1500. spin_lock(&class->lock);
  1501. class_stat_sub(class, ZS_INUSE_RATIO_0, 1);
  1502. __free_zspage(pool, class, zspage);
  1503. spin_unlock(&class->lock);
  1504. }
  1505. };
  1506. static void kick_deferred_free(struct zs_pool *pool)
  1507. {
  1508. schedule_work(&pool->free_work);
  1509. }
  1510. static void zs_flush_migration(struct zs_pool *pool)
  1511. {
  1512. flush_work(&pool->free_work);
  1513. }
  1514. static void init_deferred_free(struct zs_pool *pool)
  1515. {
  1516. INIT_WORK(&pool->free_work, async_free_zspage);
  1517. }
  1518. static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage)
  1519. {
  1520. struct zpdesc *zpdesc = get_first_zpdesc(zspage);
  1521. do {
  1522. WARN_ON(!zpdesc_trylock(zpdesc));
  1523. __zpdesc_set_movable(zpdesc);
  1524. zpdesc_unlock(zpdesc);
  1525. } while ((zpdesc = get_next_zpdesc(zpdesc)) != NULL);
  1526. }
  1527. #else
  1528. static inline void zs_flush_migration(struct zs_pool *pool) { }
  1529. #endif
  1530. /*
  1531. *
  1532. * Based on the number of unused allocated objects calculate
  1533. * and return the number of pages that we can free.
  1534. */
  1535. static unsigned long zs_can_compact(struct size_class *class)
  1536. {
  1537. unsigned long obj_wasted;
  1538. unsigned long obj_allocated = class_stat_read(class, ZS_OBJS_ALLOCATED);
  1539. unsigned long obj_used = class_stat_read(class, ZS_OBJS_INUSE);
  1540. if (obj_allocated <= obj_used)
  1541. return 0;
  1542. obj_wasted = obj_allocated - obj_used;
  1543. obj_wasted /= class->objs_per_zspage;
  1544. return obj_wasted * class->pages_per_zspage;
  1545. }
  1546. static unsigned long __zs_compact(struct zs_pool *pool,
  1547. struct size_class *class)
  1548. {
  1549. struct zspage *src_zspage = NULL;
  1550. struct zspage *dst_zspage = NULL;
  1551. unsigned long pages_freed = 0;
  1552. /*
  1553. * protect the race between zpage migration and zs_free
  1554. * as well as zpage allocation/free
  1555. */
  1556. write_lock(&pool->lock);
  1557. spin_lock(&class->lock);
  1558. while (zs_can_compact(class)) {
  1559. int fg;
  1560. if (!dst_zspage) {
  1561. dst_zspage = isolate_dst_zspage(class);
  1562. if (!dst_zspage)
  1563. break;
  1564. }
  1565. src_zspage = isolate_src_zspage(class);
  1566. if (!src_zspage)
  1567. break;
  1568. if (!zspage_write_trylock(src_zspage))
  1569. break;
  1570. migrate_zspage(pool, src_zspage, dst_zspage);
  1571. zspage_write_unlock(src_zspage);
  1572. fg = putback_zspage(class, src_zspage);
  1573. if (fg == ZS_INUSE_RATIO_0) {
  1574. free_zspage(pool, class, src_zspage);
  1575. pages_freed += class->pages_per_zspage;
  1576. }
  1577. src_zspage = NULL;
  1578. if (get_fullness_group(class, dst_zspage) == ZS_INUSE_RATIO_100
  1579. || rwlock_is_contended(&pool->lock)) {
  1580. putback_zspage(class, dst_zspage);
  1581. dst_zspage = NULL;
  1582. spin_unlock(&class->lock);
  1583. write_unlock(&pool->lock);
  1584. cond_resched();
  1585. write_lock(&pool->lock);
  1586. spin_lock(&class->lock);
  1587. }
  1588. }
  1589. if (src_zspage)
  1590. putback_zspage(class, src_zspage);
  1591. if (dst_zspage)
  1592. putback_zspage(class, dst_zspage);
  1593. spin_unlock(&class->lock);
  1594. write_unlock(&pool->lock);
  1595. return pages_freed;
  1596. }
  1597. unsigned long zs_compact(struct zs_pool *pool)
  1598. {
  1599. int i;
  1600. struct size_class *class;
  1601. unsigned long pages_freed = 0;
  1602. /*
  1603. * Pool compaction is performed under pool->lock so it is basically
  1604. * single-threaded. Having more than one thread in __zs_compact()
  1605. * will increase pool->lock contention, which will impact other
  1606. * zsmalloc operations that need pool->lock.
  1607. */
  1608. if (atomic_xchg(&pool->compaction_in_progress, 1))
  1609. return 0;
  1610. for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) {
  1611. class = pool->size_class[i];
  1612. if (class->index != i)
  1613. continue;
  1614. pages_freed += __zs_compact(pool, class);
  1615. }
  1616. atomic_long_add(pages_freed, &pool->stats.pages_compacted);
  1617. atomic_set(&pool->compaction_in_progress, 0);
  1618. return pages_freed;
  1619. }
  1620. EXPORT_SYMBOL_GPL(zs_compact);
  1621. void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats)
  1622. {
  1623. memcpy(stats, &pool->stats, sizeof(struct zs_pool_stats));
  1624. }
  1625. EXPORT_SYMBOL_GPL(zs_pool_stats);
  1626. static unsigned long zs_shrinker_scan(struct shrinker *shrinker,
  1627. struct shrink_control *sc)
  1628. {
  1629. unsigned long pages_freed;
  1630. struct zs_pool *pool = shrinker->private_data;
  1631. /*
  1632. * Compact classes and calculate compaction delta.
  1633. * Can run concurrently with a manually triggered
  1634. * (by user) compaction.
  1635. */
  1636. pages_freed = zs_compact(pool);
  1637. return pages_freed ? pages_freed : SHRINK_STOP;
  1638. }
  1639. static unsigned long zs_shrinker_count(struct shrinker *shrinker,
  1640. struct shrink_control *sc)
  1641. {
  1642. int i;
  1643. struct size_class *class;
  1644. unsigned long pages_to_free = 0;
  1645. struct zs_pool *pool = shrinker->private_data;
  1646. for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) {
  1647. class = pool->size_class[i];
  1648. if (class->index != i)
  1649. continue;
  1650. pages_to_free += zs_can_compact(class);
  1651. }
  1652. return pages_to_free;
  1653. }
  1654. static void zs_unregister_shrinker(struct zs_pool *pool)
  1655. {
  1656. shrinker_free(pool->shrinker);
  1657. }
  1658. static int zs_register_shrinker(struct zs_pool *pool)
  1659. {
  1660. pool->shrinker = shrinker_alloc(0, "mm-zspool:%s", pool->name);
  1661. if (!pool->shrinker)
  1662. return -ENOMEM;
  1663. pool->shrinker->scan_objects = zs_shrinker_scan;
  1664. pool->shrinker->count_objects = zs_shrinker_count;
  1665. pool->shrinker->batch = 0;
  1666. pool->shrinker->private_data = pool;
  1667. shrinker_register(pool->shrinker);
  1668. return 0;
  1669. }
  1670. static int calculate_zspage_chain_size(int class_size)
  1671. {
  1672. int i, min_waste = INT_MAX;
  1673. int chain_size = 1;
  1674. if (is_power_of_2(class_size))
  1675. return chain_size;
  1676. for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) {
  1677. int waste;
  1678. waste = (i * PAGE_SIZE) % class_size;
  1679. if (waste < min_waste) {
  1680. min_waste = waste;
  1681. chain_size = i;
  1682. }
  1683. }
  1684. return chain_size;
  1685. }
  1686. /**
  1687. * zs_create_pool - Creates an allocation pool to work from.
  1688. * @name: pool name to be created
  1689. *
  1690. * This function must be called before anything when using
  1691. * the zsmalloc allocator.
  1692. *
  1693. * On success, a pointer to the newly created pool is returned,
  1694. * otherwise NULL.
  1695. */
  1696. struct zs_pool *zs_create_pool(const char *name)
  1697. {
  1698. int i;
  1699. struct zs_pool *pool;
  1700. struct size_class *prev_class = NULL;
  1701. pool = kzalloc_obj(*pool);
  1702. if (!pool)
  1703. return NULL;
  1704. init_deferred_free(pool);
  1705. rwlock_init(&pool->lock);
  1706. atomic_set(&pool->compaction_in_progress, 0);
  1707. pool->name = kstrdup(name, GFP_KERNEL);
  1708. if (!pool->name)
  1709. goto err;
  1710. /*
  1711. * Iterate reversely, because, size of size_class that we want to use
  1712. * for merging should be larger or equal to current size.
  1713. */
  1714. for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) {
  1715. int size;
  1716. int pages_per_zspage;
  1717. int objs_per_zspage;
  1718. struct size_class *class;
  1719. int fullness;
  1720. size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA;
  1721. if (size > ZS_MAX_ALLOC_SIZE)
  1722. size = ZS_MAX_ALLOC_SIZE;
  1723. pages_per_zspage = calculate_zspage_chain_size(size);
  1724. objs_per_zspage = pages_per_zspage * PAGE_SIZE / size;
  1725. /*
  1726. * We iterate from biggest down to smallest classes,
  1727. * so huge_class_size holds the size of the first huge
  1728. * class. Any object bigger than or equal to that will
  1729. * endup in the huge class.
  1730. */
  1731. if (pages_per_zspage != 1 && objs_per_zspage != 1 &&
  1732. !huge_class_size) {
  1733. huge_class_size = size;
  1734. /*
  1735. * The object uses ZS_HANDLE_SIZE bytes to store the
  1736. * handle. We need to subtract it, because zs_malloc()
  1737. * unconditionally adds handle size before it performs
  1738. * size class search - so object may be smaller than
  1739. * huge class size, yet it still can end up in the huge
  1740. * class because it grows by ZS_HANDLE_SIZE extra bytes
  1741. * right before class lookup.
  1742. */
  1743. huge_class_size -= (ZS_HANDLE_SIZE - 1);
  1744. }
  1745. /*
  1746. * size_class is used for normal zsmalloc operation such
  1747. * as alloc/free for that size. Although it is natural that we
  1748. * have one size_class for each size, there is a chance that we
  1749. * can get more memory utilization if we use one size_class for
  1750. * many different sizes whose size_class have same
  1751. * characteristics. So, we makes size_class point to
  1752. * previous size_class if possible.
  1753. */
  1754. if (prev_class) {
  1755. if (can_merge(prev_class, pages_per_zspage, objs_per_zspage)) {
  1756. pool->size_class[i] = prev_class;
  1757. continue;
  1758. }
  1759. }
  1760. class = kzalloc_obj(struct size_class);
  1761. if (!class)
  1762. goto err;
  1763. class->size = size;
  1764. class->index = i;
  1765. class->pages_per_zspage = pages_per_zspage;
  1766. class->objs_per_zspage = objs_per_zspage;
  1767. spin_lock_init(&class->lock);
  1768. pool->size_class[i] = class;
  1769. fullness = ZS_INUSE_RATIO_0;
  1770. while (fullness < NR_FULLNESS_GROUPS) {
  1771. INIT_LIST_HEAD(&class->fullness_list[fullness]);
  1772. fullness++;
  1773. }
  1774. prev_class = class;
  1775. }
  1776. /* debug only, don't abort if it fails */
  1777. zs_pool_stat_create(pool, name);
  1778. /*
  1779. * Not critical since shrinker is only used to trigger internal
  1780. * defragmentation of the pool which is pretty optional thing. If
  1781. * registration fails we still can use the pool normally and user can
  1782. * trigger compaction manually. Thus, ignore return code.
  1783. */
  1784. zs_register_shrinker(pool);
  1785. return pool;
  1786. err:
  1787. zs_destroy_pool(pool);
  1788. return NULL;
  1789. }
  1790. EXPORT_SYMBOL_GPL(zs_create_pool);
  1791. void zs_destroy_pool(struct zs_pool *pool)
  1792. {
  1793. int i;
  1794. zs_unregister_shrinker(pool);
  1795. zs_flush_migration(pool);
  1796. zs_pool_stat_destroy(pool);
  1797. for (i = 0; i < ZS_SIZE_CLASSES; i++) {
  1798. int fg;
  1799. struct size_class *class = pool->size_class[i];
  1800. if (!class)
  1801. continue;
  1802. if (class->index != i)
  1803. continue;
  1804. for (fg = ZS_INUSE_RATIO_0; fg < NR_FULLNESS_GROUPS; fg++) {
  1805. if (list_empty(&class->fullness_list[fg]))
  1806. continue;
  1807. pr_err("Class-%d fullness group %d is not empty\n",
  1808. class->size, fg);
  1809. }
  1810. kfree(class);
  1811. }
  1812. kfree(pool->name);
  1813. kfree(pool);
  1814. }
  1815. EXPORT_SYMBOL_GPL(zs_destroy_pool);
  1816. static void zs_destroy_caches(void)
  1817. {
  1818. kmem_cache_destroy(handle_cachep);
  1819. handle_cachep = NULL;
  1820. kmem_cache_destroy(zspage_cachep);
  1821. zspage_cachep = NULL;
  1822. }
  1823. static int __init zs_init_caches(void)
  1824. {
  1825. handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
  1826. 0, 0, NULL);
  1827. zspage_cachep = kmem_cache_create("zspage", sizeof(struct zspage),
  1828. 0, 0, NULL);
  1829. if (!handle_cachep || !zspage_cachep) {
  1830. zs_destroy_caches();
  1831. return -ENOMEM;
  1832. }
  1833. return 0;
  1834. }
  1835. static int __init zs_init(void)
  1836. {
  1837. int rc;
  1838. rc = zs_init_caches();
  1839. if (rc)
  1840. return rc;
  1841. #ifdef CONFIG_COMPACTION
  1842. rc = set_movable_ops(&zsmalloc_mops, PGTY_zsmalloc);
  1843. if (rc) {
  1844. zs_destroy_caches();
  1845. return rc;
  1846. }
  1847. #endif
  1848. zs_stat_init();
  1849. return 0;
  1850. }
  1851. static void __exit zs_exit(void)
  1852. {
  1853. #ifdef CONFIG_COMPACTION
  1854. set_movable_ops(NULL, PGTY_zsmalloc);
  1855. #endif
  1856. zs_stat_exit();
  1857. zs_destroy_caches();
  1858. }
  1859. module_init(zs_init);
  1860. module_exit(zs_exit);
  1861. MODULE_LICENSE("Dual BSD/GPL");
  1862. MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
  1863. MODULE_DESCRIPTION("zsmalloc memory allocator");