md-llbitmap.c 44 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628
  1. // SPDX-License-Identifier: GPL-2.0-or-later
  2. #include <linux/blkdev.h>
  3. #include <linux/module.h>
  4. #include <linux/errno.h>
  5. #include <linux/slab.h>
  6. #include <linux/init.h>
  7. #include <linux/timer.h>
  8. #include <linux/sched.h>
  9. #include <linux/list.h>
  10. #include <linux/file.h>
  11. #include <linux/seq_file.h>
  12. #include <trace/events/block.h>
  13. #include "md.h"
  14. #include "md-bitmap.h"
  15. /*
  16. * #### Background
  17. *
  18. * Redundant data is used to enhance data fault tolerance, and the storage
  19. * methods for redundant data vary depending on the RAID levels. And it's
  20. * important to maintain the consistency of redundant data.
  21. *
  22. * Bitmap is used to record which data blocks have been synchronized and which
  23. * ones need to be resynchronized or recovered. Each bit in the bitmap
  24. * represents a segment of data in the array. When a bit is set, it indicates
  25. * that the multiple redundant copies of that data segment may not be
  26. * consistent. Data synchronization can be performed based on the bitmap after
  27. * power failure or readding a disk. If there is no bitmap, a full disk
  28. * synchronization is required.
  29. *
  30. * #### Key Features
  31. *
  32. * - IO fastpath is lockless, if user issues lots of write IO to the same
  33. * bitmap bit in a short time, only the first write has additional overhead
  34. * to update bitmap bit, no additional overhead for the following writes;
  35. * - support only resync or recover written data, means in the case creating
  36. * new array or replacing with a new disk, there is no need to do a full disk
  37. * resync/recovery;
  38. *
  39. * #### Key Concept
  40. *
  41. * ##### State Machine
  42. *
  43. * Each bit is one byte, contain 6 different states, see llbitmap_state. And
  44. * there are total 8 different actions, see llbitmap_action, can change state:
  45. *
  46. * llbitmap state machine: transitions between states
  47. *
  48. * | | Startwrite | Startsync | Endsync | Abortsync|
  49. * | --------- | ---------- | --------- | ------- | ------- |
  50. * | Unwritten | Dirty | x | x | x |
  51. * | Clean | Dirty | x | x | x |
  52. * | Dirty | x | x | x | x |
  53. * | NeedSync | x | Syncing | x | x |
  54. * | Syncing | x | Syncing | Dirty | NeedSync |
  55. *
  56. * | | Reload | Daemon | Discard | Stale |
  57. * | --------- | -------- | ------ | --------- | --------- |
  58. * | Unwritten | x | x | x | x |
  59. * | Clean | x | x | Unwritten | NeedSync |
  60. * | Dirty | NeedSync | Clean | Unwritten | NeedSync |
  61. * | NeedSync | x | x | Unwritten | x |
  62. * | Syncing | NeedSync | x | Unwritten | NeedSync |
  63. *
  64. * Typical scenarios:
  65. *
  66. * 1) Create new array
  67. * All bits will be set to Unwritten by default, if --assume-clean is set,
  68. * all bits will be set to Clean instead.
  69. *
  70. * 2) write data, raid1/raid10 have full copy of data, while raid456 doesn't and
  71. * rely on xor data
  72. *
  73. * 2.1) write new data to raid1/raid10:
  74. * Unwritten --StartWrite--> Dirty
  75. *
  76. * 2.2) write new data to raid456:
  77. * Unwritten --StartWrite--> NeedSync
  78. *
  79. * Because the initial recover for raid456 is skipped, the xor data is not built
  80. * yet, the bit must be set to NeedSync first and after lazy initial recover is
  81. * finished, the bit will finally set to Dirty(see 5.1 and 5.4);
  82. *
  83. * 2.3) cover write
  84. * Clean --StartWrite--> Dirty
  85. *
  86. * 3) daemon, if the array is not degraded:
  87. * Dirty --Daemon--> Clean
  88. *
  89. * 4) discard
  90. * {Clean, Dirty, NeedSync, Syncing} --Discard--> Unwritten
  91. *
  92. * 5) resync and recover
  93. *
  94. * 5.1) common process
  95. * NeedSync --Startsync--> Syncing --Endsync--> Dirty --Daemon--> Clean
  96. *
  97. * 5.2) resync after power failure
  98. * Dirty --Reload--> NeedSync
  99. *
  100. * 5.3) recover while replacing with a new disk
  101. * By default, the old bitmap framework will recover all data, and llbitmap
  102. * implements this by a new helper, see llbitmap_skip_sync_blocks:
  103. *
  104. * skip recover for bits other than dirty or clean;
  105. *
  106. * 5.4) lazy initial recover for raid5:
  107. * By default, the old bitmap framework will only allow new recover when there
  108. * are spares(new disk), a new recovery flag MD_RECOVERY_LAZY_RECOVER is added
  109. * to perform raid456 lazy recover for set bits(from 2.2).
  110. *
  111. * 6. special handling for degraded array:
  112. *
  113. * - Dirty bits will never be cleared, daemon will just do nothing, so that if
  114. * a disk is readded, Clean bits can be skipped with recovery;
 * - Dirty bits will be converted to Syncing by Startsync, so that data
 * recovery is done for newly added disks;
  117. * - New write will convert bits to NeedSync directly;
  118. *
  119. * ##### Bitmap IO
  120. *
  121. * ##### Chunksize
  122. *
 * The default bitmap size is 128k, including the 1k bitmap super block, and
 * the default size of the data segment tracked by each bit (chunksize) is 64k.
 * The chunksize is doubled each time as long as the total number of bits is
 * not less than 127k (see llbitmap_init).
  127. *
  128. * ##### READ
  129. *
  130. * While creating bitmap, all pages will be allocated and read for llbitmap,
  131. * there won't be read afterwards
  132. *
  133. * ##### WRITE
  134. *
  135. * WRITE IO is divided into logical_block_size of the array, the dirty state
  136. * of each block is tracked independently, for example:
  137. *
  138. * each page is 4k, contain 8 blocks; each block is 512 bytes contain 512 bit;
  139. *
  140. * | page0 | page1 | ... | page 31 |
  141. * | |
  142. * | \-----------------------\
  143. * | |
 * | block0 | block1 | ... | block 7|
  145. * | |
  146. * | \-----------------\
  147. * | |
  148. * | bit0 | bit1 | ... | bit511 |
  149. *
  150. * From IO path, if one bit is changed to Dirty or NeedSync, the corresponding
  151. * subpage will be marked dirty, such block must write first before the IO is
  152. * issued. This behaviour will affect IO performance, to reduce the impact, if
  153. * multiple bits are changed in the same block in a short time, all bits in this
  154. * block will be changed to Dirty/NeedSync, so that there won't be any overhead
  155. * until daemon clears dirty bits.
  156. *
  157. * ##### Dirty Bits synchronization
  158. *
  159. * IO fast path will set bits to dirty, and those dirty bits will be cleared
  160. * by daemon after IO is done. llbitmap_page_ctl is used to synchronize between
  161. * IO path and daemon;
  162. *
  163. * IO path:
  164. * 1) try to grab a reference, if succeed, set expire time after 5s and return;
  165. * 2) if failed to grab a reference, wait for daemon to finish clearing dirty
  166. * bits;
  167. *
  168. * Daemon (Daemon will be woken up every daemon_sleep seconds):
  169. * For each page:
  170. * 1) check if page expired, if not skip this page; for expired page:
  171. * 2) suspend the page and wait for inflight write IO to be done;
  172. * 3) change dirty page to clean;
  173. * 4) resume the page;
  174. */
/*
 * Byte offset of the first bit state inside the cached bitmap;
 * llbitmap_read() and llbitmap_write() add it to every bit position.
 */
#define BITMAP_DATA_OFFSET 1024
/*
 * Initial (smallest) chunk size used by llbitmap_init(), in sectors:
 * 64k is the max IO size of sync IO for raid1/raid10.
 */
#define MIN_CHUNK_SIZE (64 * 2)
/* By default, daemon will be woken up every 30s (value is in seconds) */
#define DEFAULT_DAEMON_SLEEP 30
/*
 * Dirtied bits that have not been accessed for more than 5s will be cleared
 * by daemon. The value is in seconds and is multiplied by HZ when a page's
 * expire time is armed in llbitmap_raise_barrier().
 */
#define DEFAULT_BARRIER_IDLE 5
/*
 * Per-bit state. Each bit of the bitmap is stored as one byte holding one of
 * these values; the values up to BitStateCount index the first dimension of
 * state_machine[].
 */
enum llbitmap_state {
	/* No valid data, init state after assemble the array */
	BitUnwritten = 0,
	/* data is consistent */
	BitClean,
	/* data will be consistent after IO is done, set directly for writes */
	BitDirty,
	/*
	 * data need to be resynchronized:
	 * 1) set directly for writes if array is degraded, prevent full disk
	 * synchronization after readding a disk;
	 * 2) reassemble the array after power failure, and dirty bits are
	 * found after reloading the bitmap;
	 * 3) set for first write for raid5, to build initial xor data lazily
	 */
	BitNeedSync,
	/* data is synchronizing */
	BitSyncing,
	/* number of real states above; sizes state_machine[] */
	BitStateCount,
	/*
	 * Not a stored state: used in state_machine[] to mean "this action
	 * leaves the bit unchanged"; llbitmap_state_machine() skips the write
	 * when the resulting state is BitNone.
	 */
	BitNone = 0xff,
};
/*
 * Actions that can be applied to a range of bits through
 * llbitmap_state_machine(). The values below BitmapActionCount index the
 * second dimension of state_machine[].
 */
enum llbitmap_action {
	/* User write new data, this is the only action from IO fast path */
	BitmapActionStartwrite = 0,
	/* Start recovery */
	BitmapActionStartsync,
	/* Finish recovery */
	BitmapActionEndsync,
	/* Failed recovery */
	BitmapActionAbortsync,
	/* Reassemble the array */
	BitmapActionReload,
	/* Daemon thread is trying to clear dirty bits */
	BitmapActionDaemon,
	/* Data is deleted */
	BitmapActionDiscard,
	/*
	 * Bitmap is stale, mark all bits in addition to BitUnwritten to
	 * BitNeedSync.
	 */
	BitmapActionStale,
	/* number of table-driven actions above; sizes state_machine[] */
	BitmapActionCount,
	/*
	 * Init state is BitUnwritten. This action is not a column of
	 * state_machine[]; llbitmap_state_machine() handles it separately by
	 * calling llbitmap_init_state().
	 */
	BitmapActionInit,
};
/* Bit numbers used in llbitmap_page_ctl.flags. */
enum llbitmap_page_state {
	/* NOTE(review): users are outside this view — presumably a flush request; confirm */
	LLPageFlush = 0,
	/* set by llbitmap_set_page_dirty() when any bit of the page is dirtied */
	LLPageDirty,
};
/*
 * Per-page control structure, one for each cached bitmap page. It is used to
 * synchronize the IO path (llbitmap_raise_barrier/llbitmap_release_barrier)
 * with suspending and resuming the page.
 */
struct llbitmap_page_ctl {
	/* page_address() of @page: one state byte per bit */
	char *state;
	/* the cached bitmap page itself */
	struct page *page;
	/*
	 * jiffies value set to "now + barrier_idle" each time the IO path
	 * grabs a reference; reset to LONG_MAX by llbitmap_resume()
	 */
	unsigned long expire;
	/* enum llbitmap_page_state bits */
	unsigned long flags;
	/* woken when @active is released and when the page is resumed */
	wait_queue_head_t wait;
	/* reference held by the IO path between raise and release barrier */
	struct percpu_ref active;
	/* Per block size dirty state, maximum 64k page / 1 sector = 128 */
	unsigned long dirty[];
};
/* In-memory state of one lockless bitmap, attached to one md array. */
struct llbitmap {
	/* the array this bitmap belongs to */
	struct mddev *mddev;
	/* array of @nr_pages per-page control pointers, see llbitmap_cache_pages() */
	struct llbitmap_page_ctl **pctl;
	/* number of cached bitmap pages */
	unsigned int nr_pages;
	/* size in bytes of one bitmap block, the unit written by llbitmap_write_page() */
	unsigned int io_size;
	/* number of @io_size blocks in one page */
	unsigned int blocks_per_page;
	/* shift of one chunk */
	unsigned long chunkshift;
	/* size of one chunk in sector */
	unsigned long chunksize;
	/* total number of chunks */
	unsigned long chunks;
	/* NOTE(review): users are outside this view — presumably a timestamp of the last end-sync; confirm */
	unsigned long last_end_sync;
	/*
	 * time in seconds that dirty bits will be cleared if the page is not
	 * accessed.
	 */
	unsigned long barrier_idle;
	/* fires on first BitDirty state */
	struct timer_list pending_timer;
	/* NOTE(review): users are outside this view — presumably queued by pending_timer; confirm */
	struct work_struct daemon_work;
	/* BITMAP_* flag bits, loaded from the super block state field */
	unsigned long flags;
	__u64 events_cleared;
	/* for slow disks */
	atomic_t behind_writes;
	wait_queue_head_t behind_wait;
};
/*
 * Work item carrying a bitmap and a completion.
 * NOTE(review): the code that queues and runs it is outside this view —
 * presumably @done is completed once the unplug work has finished; confirm.
 */
struct llbitmap_unplug_work {
	struct work_struct work;
	struct llbitmap *llbitmap;
	struct completion *done;
};
/*
 * File-wide workqueues.
 * NOTE(review): allocation and users are outside this view — presumably one
 * queue runs bitmap IO/daemon work and the other runs llbitmap_unplug_work
 * items; confirm against the module init code.
 */
static struct workqueue_struct *md_llbitmap_io_wq;
static struct workqueue_struct *md_llbitmap_unplug_wq;
/*
 * Transition table, indexed as state_machine[current state][action]. Each
 * entry is the next state, or BitNone when the action leaves a bit in that
 * state unchanged. llbitmap_state_machine() applies additional overrides for
 * degraded arrays and for raid456 on top of this table.
 *
 * NOTE(review): BitNone (0xff) is stored in a plain char and later compared
 * against the enum constant, which only matches if char is unsigned in this
 * build — confirm the compiler flags.
 */
static char state_machine[BitStateCount][BitmapActionCount] = {
	[BitUnwritten] = {
		[BitmapActionStartwrite] = BitDirty,
		[BitmapActionStartsync] = BitNone,
		[BitmapActionEndsync] = BitNone,
		[BitmapActionAbortsync] = BitNone,
		[BitmapActionReload] = BitNone,
		[BitmapActionDaemon] = BitNone,
		[BitmapActionDiscard] = BitNone,
		[BitmapActionStale] = BitNone,
	},
	[BitClean] = {
		[BitmapActionStartwrite] = BitDirty,
		[BitmapActionStartsync] = BitNone,
		[BitmapActionEndsync] = BitNone,
		[BitmapActionAbortsync] = BitNone,
		[BitmapActionReload] = BitNone,
		[BitmapActionDaemon] = BitNone,
		[BitmapActionDiscard] = BitUnwritten,
		[BitmapActionStale] = BitNeedSync,
	},
	[BitDirty] = {
		[BitmapActionStartwrite] = BitNone,
		[BitmapActionStartsync] = BitNone,
		[BitmapActionEndsync] = BitNone,
		[BitmapActionAbortsync] = BitNone,
		[BitmapActionReload] = BitNeedSync,
		[BitmapActionDaemon] = BitClean,
		[BitmapActionDiscard] = BitUnwritten,
		[BitmapActionStale] = BitNeedSync,
	},
	[BitNeedSync] = {
		[BitmapActionStartwrite] = BitNone,
		[BitmapActionStartsync] = BitSyncing,
		[BitmapActionEndsync] = BitNone,
		[BitmapActionAbortsync] = BitNone,
		[BitmapActionReload] = BitNone,
		[BitmapActionDaemon] = BitNone,
		[BitmapActionDiscard] = BitUnwritten,
		[BitmapActionStale] = BitNone,
	},
	[BitSyncing] = {
		[BitmapActionStartwrite] = BitNone,
		[BitmapActionStartsync] = BitSyncing,
		[BitmapActionEndsync] = BitDirty,
		[BitmapActionAbortsync] = BitNeedSync,
		[BitmapActionReload] = BitNeedSync,
		[BitmapActionDaemon] = BitNone,
		[BitmapActionDiscard] = BitUnwritten,
		[BitmapActionStale] = BitNeedSync,
	},
};
/* Forward declaration: llbitmap_init() calls this before it is defined. */
static void __llbitmap_flush(struct mddev *mddev);
  331. static enum llbitmap_state llbitmap_read(struct llbitmap *llbitmap, loff_t pos)
  332. {
  333. unsigned int idx;
  334. unsigned int offset;
  335. pos += BITMAP_DATA_OFFSET;
  336. idx = pos >> PAGE_SHIFT;
  337. offset = offset_in_page(pos);
  338. return llbitmap->pctl[idx]->state[offset];
  339. }
  340. /* set all the bits in the subpage as dirty */
  341. static void llbitmap_infect_dirty_bits(struct llbitmap *llbitmap,
  342. struct llbitmap_page_ctl *pctl,
  343. unsigned int block)
  344. {
  345. bool level_456 = raid_is_456(llbitmap->mddev);
  346. unsigned int io_size = llbitmap->io_size;
  347. int pos;
  348. for (pos = block * io_size; pos < (block + 1) * io_size; pos++) {
  349. switch (pctl->state[pos]) {
  350. case BitUnwritten:
  351. pctl->state[pos] = level_456 ? BitNeedSync : BitDirty;
  352. break;
  353. case BitClean:
  354. pctl->state[pos] = BitDirty;
  355. break;
  356. }
  357. }
  358. }
  359. static void llbitmap_set_page_dirty(struct llbitmap *llbitmap, int idx,
  360. int offset)
  361. {
  362. struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx];
  363. unsigned int io_size = llbitmap->io_size;
  364. int block = offset / io_size;
  365. int pos;
  366. if (!test_bit(LLPageDirty, &pctl->flags))
  367. set_bit(LLPageDirty, &pctl->flags);
  368. /*
  369. * For degraded array, dirty bits will never be cleared, and we must
  370. * resync all the dirty bits, hence skip infect new dirty bits to
  371. * prevent resync unnecessary data.
  372. */
  373. if (llbitmap->mddev->degraded) {
  374. set_bit(block, pctl->dirty);
  375. return;
  376. }
  377. /*
  378. * The subpage usually contains a total of 512 bits. If any single bit
  379. * within the subpage is marked as dirty, the entire sector will be
  380. * written. To avoid impacting write performance, when multiple bits
  381. * within the same sector are modified within llbitmap->barrier_idle,
  382. * all bits in the sector will be collectively marked as dirty at once.
  383. */
  384. if (test_and_set_bit(block, pctl->dirty)) {
  385. llbitmap_infect_dirty_bits(llbitmap, pctl, block);
  386. return;
  387. }
  388. for (pos = block * io_size; pos < (block + 1) * io_size; pos++) {
  389. if (pos == offset)
  390. continue;
  391. if (pctl->state[pos] == BitDirty ||
  392. pctl->state[pos] == BitNeedSync) {
  393. llbitmap_infect_dirty_bits(llbitmap, pctl, block);
  394. return;
  395. }
  396. }
  397. }
  398. static void llbitmap_write(struct llbitmap *llbitmap, enum llbitmap_state state,
  399. loff_t pos)
  400. {
  401. unsigned int idx;
  402. unsigned int bit;
  403. pos += BITMAP_DATA_OFFSET;
  404. idx = pos >> PAGE_SHIFT;
  405. bit = offset_in_page(pos);
  406. llbitmap->pctl[idx]->state[bit] = state;
  407. if (state == BitDirty || state == BitNeedSync)
  408. llbitmap_set_page_dirty(llbitmap, idx, bit);
  409. }
  410. static struct page *llbitmap_read_page(struct llbitmap *llbitmap, int idx)
  411. {
  412. struct mddev *mddev = llbitmap->mddev;
  413. struct page *page = NULL;
  414. struct md_rdev *rdev;
  415. if (llbitmap->pctl && llbitmap->pctl[idx])
  416. page = llbitmap->pctl[idx]->page;
  417. if (page)
  418. return page;
  419. page = alloc_page(GFP_KERNEL | __GFP_ZERO);
  420. if (!page)
  421. return ERR_PTR(-ENOMEM);
  422. rdev_for_each(rdev, mddev) {
  423. sector_t sector;
  424. if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
  425. continue;
  426. sector = mddev->bitmap_info.offset +
  427. (idx << PAGE_SECTORS_SHIFT);
  428. if (sync_page_io(rdev, sector, PAGE_SIZE, page, REQ_OP_READ,
  429. true))
  430. return page;
  431. md_error(mddev, rdev);
  432. }
  433. __free_page(page);
  434. return ERR_PTR(-EIO);
  435. }
  436. static void llbitmap_write_page(struct llbitmap *llbitmap, int idx)
  437. {
  438. struct page *page = llbitmap->pctl[idx]->page;
  439. struct mddev *mddev = llbitmap->mddev;
  440. struct md_rdev *rdev;
  441. int block;
  442. for (block = 0; block < llbitmap->blocks_per_page; block++) {
  443. struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx];
  444. if (!test_and_clear_bit(block, pctl->dirty))
  445. continue;
  446. rdev_for_each(rdev, mddev) {
  447. sector_t sector;
  448. sector_t bit_sector = llbitmap->io_size >> SECTOR_SHIFT;
  449. if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
  450. continue;
  451. sector = mddev->bitmap_info.offset + rdev->sb_start +
  452. (idx << PAGE_SECTORS_SHIFT) +
  453. block * bit_sector;
  454. md_write_metadata(mddev, rdev, sector,
  455. llbitmap->io_size, page,
  456. block * llbitmap->io_size);
  457. }
  458. }
  459. }
  460. static void active_release(struct percpu_ref *ref)
  461. {
  462. struct llbitmap_page_ctl *pctl =
  463. container_of(ref, struct llbitmap_page_ctl, active);
  464. wake_up(&pctl->wait);
  465. }
  466. static void llbitmap_free_pages(struct llbitmap *llbitmap)
  467. {
  468. int i;
  469. if (!llbitmap->pctl)
  470. return;
  471. for (i = 0; i < llbitmap->nr_pages; i++) {
  472. struct llbitmap_page_ctl *pctl = llbitmap->pctl[i];
  473. if (!pctl || !pctl->page)
  474. break;
  475. __free_page(pctl->page);
  476. percpu_ref_exit(&pctl->active);
  477. }
  478. kfree(llbitmap->pctl[0]);
  479. kfree(llbitmap->pctl);
  480. llbitmap->pctl = NULL;
  481. }
  482. static int llbitmap_cache_pages(struct llbitmap *llbitmap)
  483. {
  484. struct llbitmap_page_ctl *pctl;
  485. unsigned int nr_pages = DIV_ROUND_UP(llbitmap->chunks +
  486. BITMAP_DATA_OFFSET, PAGE_SIZE);
  487. unsigned int size = struct_size(pctl, dirty, BITS_TO_LONGS(
  488. llbitmap->blocks_per_page));
  489. int i;
  490. llbitmap->pctl = kmalloc_array(nr_pages, sizeof(void *),
  491. GFP_KERNEL | __GFP_ZERO);
  492. if (!llbitmap->pctl)
  493. return -ENOMEM;
  494. size = round_up(size, cache_line_size());
  495. pctl = kmalloc_array(nr_pages, size, GFP_KERNEL | __GFP_ZERO);
  496. if (!pctl) {
  497. kfree(llbitmap->pctl);
  498. return -ENOMEM;
  499. }
  500. llbitmap->nr_pages = nr_pages;
  501. for (i = 0; i < nr_pages; i++, pctl = (void *)pctl + size) {
  502. struct page *page = llbitmap_read_page(llbitmap, i);
  503. llbitmap->pctl[i] = pctl;
  504. if (IS_ERR(page)) {
  505. llbitmap_free_pages(llbitmap);
  506. return PTR_ERR(page);
  507. }
  508. if (percpu_ref_init(&pctl->active, active_release,
  509. PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
  510. __free_page(page);
  511. llbitmap_free_pages(llbitmap);
  512. return -ENOMEM;
  513. }
  514. pctl->page = page;
  515. pctl->state = page_address(page);
  516. init_waitqueue_head(&pctl->wait);
  517. }
  518. return 0;
  519. }
  520. static void llbitmap_init_state(struct llbitmap *llbitmap)
  521. {
  522. enum llbitmap_state state = BitUnwritten;
  523. unsigned long i;
  524. if (test_and_clear_bit(BITMAP_CLEAN, &llbitmap->flags))
  525. state = BitClean;
  526. for (i = 0; i < llbitmap->chunks; i++)
  527. llbitmap_write(llbitmap, state, i);
  528. }
/*
 * Apply @action to every bit in the inclusive range [@start, @end].
 *
 * For each bit the next state is looked up in state_machine[] and then
 * adjusted for degraded arrays and for raid456; bits whose resulting state is
 * BitNone are left untouched. If any bit is, or becomes, BitNeedSync on a
 * non-degraded array, a resync (or a lazy recovery for raid456) is requested
 * before returning.
 *
 * Nothing is done and BitNone is returned when BITMAP_WRITE_ERROR is set.
 * BitmapActionInit ignores the range and initialises all bits instead.
 *
 * Returns the state computed for the last bit in the range.
 * The return value is only used from resync, where @start == @end.
 */
static enum llbitmap_state llbitmap_state_machine(struct llbitmap *llbitmap,
						  unsigned long start,
						  unsigned long end,
						  enum llbitmap_action action)
{
	struct mddev *mddev = llbitmap->mddev;
	enum llbitmap_state state = BitNone;
	bool level_456 = raid_is_456(llbitmap->mddev);
	bool need_resync = false;
	bool need_recovery = false;

	if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags))
		return BitNone;

	/* Init is not part of the transition table */
	if (action == BitmapActionInit) {
		llbitmap_init_state(llbitmap);
		return BitNone;
	}

	while (start <= end) {
		enum llbitmap_state c = llbitmap_read(llbitmap, start);

		/* A byte outside the known states: report it and force a resync */
		if (c < 0 || c >= BitStateCount) {
			pr_err("%s: invalid bit %lu state %d action %d, forcing resync\n",
			       __func__, start, c, action);
			state = BitNeedSync;
			goto write_bitmap;
		}

		/* An already pending NeedSync bit also triggers the resync below */
		if (c == BitNeedSync)
			need_resync = !mddev->degraded;

		state = state_machine[c][action];

write_bitmap:
		if (unlikely(mddev->degraded)) {
			/* For degraded array, mark new data as need sync. */
			if (state == BitDirty &&
			    action == BitmapActionStartwrite)
				state = BitNeedSync;
			/*
			 * For degraded array, resync dirty data as well, noted
			 * if array is still degraded after resync is done, all
			 * new data will still be dirty until array is clean.
			 */
			else if (c == BitDirty &&
				action == BitmapActionStartsync)
				state = BitSyncing;
		} else if (c == BitUnwritten && state == BitDirty &&
			   action == BitmapActionStartwrite && level_456) {
			/* Delay raid456 initial recovery to first write. */
			state = BitNeedSync;
		}

		/* No transition for this bit: leave it as is */
		if (state == BitNone) {
			start++;
			continue;
		}

		llbitmap_write(llbitmap, state, start);

		if (state == BitNeedSync)
			need_resync = !mddev->degraded;
		/* Arm the pending timer on the first bit that becomes Dirty */
		else if (state == BitDirty &&
			 !timer_pending(&llbitmap->pending_timer))
			mod_timer(&llbitmap->pending_timer,
				  jiffies + mddev->bitmap_info.daemon_sleep * HZ);

		start++;
	}

	/* raid456 rebuilds the bits through lazy recovery instead of resync */
	if (need_resync && level_456)
		need_recovery = true;

	if (need_recovery) {
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		set_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
	} else if (need_resync) {
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
	}

	return state;
}
  602. static void llbitmap_raise_barrier(struct llbitmap *llbitmap, int page_idx)
  603. {
  604. struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx];
  605. retry:
  606. if (likely(percpu_ref_tryget_live(&pctl->active))) {
  607. WRITE_ONCE(pctl->expire, jiffies + llbitmap->barrier_idle * HZ);
  608. return;
  609. }
  610. wait_event(pctl->wait, !percpu_ref_is_dying(&pctl->active));
  611. goto retry;
  612. }
  613. static void llbitmap_release_barrier(struct llbitmap *llbitmap, int page_idx)
  614. {
  615. struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx];
  616. percpu_ref_put(&pctl->active);
  617. }
  618. static int llbitmap_suspend_timeout(struct llbitmap *llbitmap, int page_idx)
  619. {
  620. struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx];
  621. percpu_ref_kill(&pctl->active);
  622. if (!wait_event_timeout(pctl->wait, percpu_ref_is_zero(&pctl->active),
  623. llbitmap->mddev->bitmap_info.daemon_sleep * HZ)) {
  624. percpu_ref_resurrect(&pctl->active);
  625. return -ETIMEDOUT;
  626. }
  627. return 0;
  628. }
  629. static void llbitmap_resume(struct llbitmap *llbitmap, int page_idx)
  630. {
  631. struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx];
  632. pctl->expire = LONG_MAX;
  633. percpu_ref_resurrect(&pctl->active);
  634. wake_up(&pctl->wait);
  635. }
  636. static int llbitmap_check_support(struct mddev *mddev)
  637. {
  638. if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
  639. pr_notice("md/llbitmap: %s: array with journal cannot have bitmap\n",
  640. mdname(mddev));
  641. return -EBUSY;
  642. }
  643. if (mddev->bitmap_info.space == 0) {
  644. if (mddev->bitmap_info.default_space == 0) {
  645. pr_notice("md/llbitmap: %s: no space for bitmap\n",
  646. mdname(mddev));
  647. return -ENOSPC;
  648. }
  649. }
  650. if (!mddev->persistent) {
  651. pr_notice("md/llbitmap: %s: array must be persistent\n",
  652. mdname(mddev));
  653. return -EOPNOTSUPP;
  654. }
  655. if (mddev->bitmap_info.file) {
  656. pr_notice("md/llbitmap: %s: doesn't support bitmap file\n",
  657. mdname(mddev));
  658. return -EOPNOTSUPP;
  659. }
  660. if (mddev->bitmap_info.external) {
  661. pr_notice("md/llbitmap: %s: doesn't support external metadata\n",
  662. mdname(mddev));
  663. return -EOPNOTSUPP;
  664. }
  665. if (mddev_is_dm(mddev)) {
  666. pr_notice("md/llbitmap: %s: doesn't support dm-raid\n",
  667. mdname(mddev));
  668. return -EOPNOTSUPP;
  669. }
  670. return 0;
  671. }
  672. static int llbitmap_init(struct llbitmap *llbitmap)
  673. {
  674. struct mddev *mddev = llbitmap->mddev;
  675. sector_t blocks = mddev->resync_max_sectors;
  676. unsigned long chunksize = MIN_CHUNK_SIZE;
  677. unsigned long chunks = DIV_ROUND_UP(blocks, chunksize);
  678. unsigned long space = mddev->bitmap_info.space << SECTOR_SHIFT;
  679. int ret;
  680. while (chunks > space) {
  681. chunksize = chunksize << 1;
  682. chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize);
  683. }
  684. llbitmap->barrier_idle = DEFAULT_BARRIER_IDLE;
  685. llbitmap->chunkshift = ffz(~chunksize);
  686. llbitmap->chunksize = chunksize;
  687. llbitmap->chunks = chunks;
  688. mddev->bitmap_info.daemon_sleep = DEFAULT_DAEMON_SLEEP;
  689. ret = llbitmap_cache_pages(llbitmap);
  690. if (ret)
  691. return ret;
  692. llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1,
  693. BitmapActionInit);
  694. /* flush initial llbitmap to disk */
  695. __llbitmap_flush(mddev);
  696. return 0;
  697. }
  698. static int llbitmap_read_sb(struct llbitmap *llbitmap)
  699. {
  700. struct mddev *mddev = llbitmap->mddev;
  701. unsigned long daemon_sleep;
  702. unsigned long chunksize;
  703. unsigned long events;
  704. struct page *sb_page;
  705. bitmap_super_t *sb;
  706. int ret = -EINVAL;
  707. if (!mddev->bitmap_info.offset) {
  708. pr_err("md/llbitmap: %s: no super block found", mdname(mddev));
  709. return -EINVAL;
  710. }
  711. sb_page = llbitmap_read_page(llbitmap, 0);
  712. if (IS_ERR(sb_page)) {
  713. pr_err("md/llbitmap: %s: read super block failed",
  714. mdname(mddev));
  715. return -EIO;
  716. }
  717. sb = kmap_local_page(sb_page);
  718. if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) {
  719. pr_err("md/llbitmap: %s: invalid super block magic number",
  720. mdname(mddev));
  721. goto out_put_page;
  722. }
  723. if (sb->version != cpu_to_le32(BITMAP_MAJOR_LOCKLESS)) {
  724. pr_err("md/llbitmap: %s: invalid super block version",
  725. mdname(mddev));
  726. goto out_put_page;
  727. }
  728. if (memcmp(sb->uuid, mddev->uuid, 16)) {
  729. pr_err("md/llbitmap: %s: bitmap superblock UUID mismatch\n",
  730. mdname(mddev));
  731. goto out_put_page;
  732. }
  733. if (mddev->bitmap_info.space == 0) {
  734. int room = le32_to_cpu(sb->sectors_reserved);
  735. if (room)
  736. mddev->bitmap_info.space = room;
  737. else
  738. mddev->bitmap_info.space = mddev->bitmap_info.default_space;
  739. }
  740. llbitmap->flags = le32_to_cpu(sb->state);
  741. if (test_and_clear_bit(BITMAP_FIRST_USE, &llbitmap->flags)) {
  742. ret = llbitmap_init(llbitmap);
  743. goto out_put_page;
  744. }
  745. chunksize = le32_to_cpu(sb->chunksize);
  746. if (!is_power_of_2(chunksize)) {
  747. pr_err("md/llbitmap: %s: chunksize not a power of 2",
  748. mdname(mddev));
  749. goto out_put_page;
  750. }
  751. if (chunksize < DIV_ROUND_UP_SECTOR_T(mddev->resync_max_sectors,
  752. mddev->bitmap_info.space << SECTOR_SHIFT)) {
  753. pr_err("md/llbitmap: %s: chunksize too small %lu < %llu / %lu",
  754. mdname(mddev), chunksize, mddev->resync_max_sectors,
  755. mddev->bitmap_info.space);
  756. goto out_put_page;
  757. }
  758. daemon_sleep = le32_to_cpu(sb->daemon_sleep);
  759. if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT / HZ) {
  760. pr_err("md/llbitmap: %s: daemon sleep %lu period out of range",
  761. mdname(mddev), daemon_sleep);
  762. goto out_put_page;
  763. }
  764. events = le64_to_cpu(sb->events);
  765. if (events < mddev->events) {
  766. pr_warn("md/llbitmap :%s: bitmap file is out of date (%lu < %llu) -- forcing full recovery",
  767. mdname(mddev), events, mddev->events);
  768. set_bit(BITMAP_STALE, &llbitmap->flags);
  769. }
  770. sb->sync_size = cpu_to_le64(mddev->resync_max_sectors);
  771. mddev->bitmap_info.chunksize = chunksize;
  772. mddev->bitmap_info.daemon_sleep = daemon_sleep;
  773. llbitmap->barrier_idle = DEFAULT_BARRIER_IDLE;
  774. llbitmap->chunksize = chunksize;
  775. llbitmap->chunks = DIV_ROUND_UP_SECTOR_T(mddev->resync_max_sectors, chunksize);
  776. llbitmap->chunkshift = ffz(~chunksize);
  777. ret = llbitmap_cache_pages(llbitmap);
  778. out_put_page:
  779. __free_page(sb_page);
  780. kunmap_local(sb);
  781. return ret;
  782. }
  783. static void llbitmap_pending_timer_fn(struct timer_list *pending_timer)
  784. {
  785. struct llbitmap *llbitmap =
  786. container_of(pending_timer, struct llbitmap, pending_timer);
  787. if (work_busy(&llbitmap->daemon_work)) {
  788. pr_warn("md/llbitmap: %s daemon_work not finished in %lu seconds\n",
  789. mdname(llbitmap->mddev),
  790. llbitmap->mddev->bitmap_info.daemon_sleep);
  791. set_bit(BITMAP_DAEMON_BUSY, &llbitmap->flags);
  792. return;
  793. }
  794. queue_work(md_llbitmap_io_wq, &llbitmap->daemon_work);
  795. }
  796. static void md_llbitmap_daemon_fn(struct work_struct *work)
  797. {
  798. struct llbitmap *llbitmap =
  799. container_of(work, struct llbitmap, daemon_work);
  800. unsigned long start;
  801. unsigned long end;
  802. bool restart;
  803. int idx;
  804. if (llbitmap->mddev->degraded)
  805. return;
  806. retry:
  807. start = 0;
  808. end = min(llbitmap->chunks, PAGE_SIZE - BITMAP_DATA_OFFSET) - 1;
  809. restart = false;
  810. for (idx = 0; idx < llbitmap->nr_pages; idx++) {
  811. struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx];
  812. if (idx > 0) {
  813. start = end + 1;
  814. end = min(end + PAGE_SIZE, llbitmap->chunks - 1);
  815. }
  816. if (!test_bit(LLPageFlush, &pctl->flags) &&
  817. time_before(jiffies, pctl->expire)) {
  818. restart = true;
  819. continue;
  820. }
  821. if (llbitmap_suspend_timeout(llbitmap, idx) < 0) {
  822. pr_warn("md/llbitmap: %s: %s waiting for page %d timeout\n",
  823. mdname(llbitmap->mddev), __func__, idx);
  824. continue;
  825. }
  826. llbitmap_state_machine(llbitmap, start, end, BitmapActionDaemon);
  827. llbitmap_resume(llbitmap, idx);
  828. }
  829. /*
  830. * If the daemon took a long time to finish, retry to prevent missing
  831. * clearing dirty bits.
  832. */
  833. if (test_and_clear_bit(BITMAP_DAEMON_BUSY, &llbitmap->flags))
  834. goto retry;
  835. /* If some page is dirty but not expired, setup timer again */
  836. if (restart)
  837. mod_timer(&llbitmap->pending_timer,
  838. jiffies + llbitmap->mddev->bitmap_info.daemon_sleep * HZ);
  839. }
  840. static int llbitmap_create(struct mddev *mddev)
  841. {
  842. struct llbitmap *llbitmap;
  843. int ret;
  844. ret = llbitmap_check_support(mddev);
  845. if (ret)
  846. return ret;
  847. llbitmap = kzalloc_obj(*llbitmap);
  848. if (!llbitmap)
  849. return -ENOMEM;
  850. llbitmap->mddev = mddev;
  851. llbitmap->io_size = bdev_logical_block_size(mddev->gendisk->part0);
  852. llbitmap->blocks_per_page = PAGE_SIZE / llbitmap->io_size;
  853. timer_setup(&llbitmap->pending_timer, llbitmap_pending_timer_fn, 0);
  854. INIT_WORK(&llbitmap->daemon_work, md_llbitmap_daemon_fn);
  855. atomic_set(&llbitmap->behind_writes, 0);
  856. init_waitqueue_head(&llbitmap->behind_wait);
  857. mutex_lock(&mddev->bitmap_info.mutex);
  858. mddev->bitmap = llbitmap;
  859. ret = llbitmap_read_sb(llbitmap);
  860. mutex_unlock(&mddev->bitmap_info.mutex);
  861. if (ret) {
  862. kfree(llbitmap);
  863. mddev->bitmap = NULL;
  864. }
  865. return ret;
  866. }
  867. static int llbitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize)
  868. {
  869. struct llbitmap *llbitmap = mddev->bitmap;
  870. unsigned long chunks;
  871. if (chunksize == 0)
  872. chunksize = llbitmap->chunksize;
  873. /* If there is enough space, leave the chunksize unchanged. */
  874. chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize);
  875. while (chunks > mddev->bitmap_info.space << SECTOR_SHIFT) {
  876. chunksize = chunksize << 1;
  877. chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize);
  878. }
  879. llbitmap->chunkshift = ffz(~chunksize);
  880. llbitmap->chunksize = chunksize;
  881. llbitmap->chunks = chunks;
  882. return 0;
  883. }
  884. static int llbitmap_load(struct mddev *mddev)
  885. {
  886. enum llbitmap_action action = BitmapActionReload;
  887. struct llbitmap *llbitmap = mddev->bitmap;
  888. if (test_and_clear_bit(BITMAP_STALE, &llbitmap->flags))
  889. action = BitmapActionStale;
  890. llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1, action);
  891. return 0;
  892. }
  893. static void llbitmap_destroy(struct mddev *mddev)
  894. {
  895. struct llbitmap *llbitmap = mddev->bitmap;
  896. if (!llbitmap)
  897. return;
  898. mutex_lock(&mddev->bitmap_info.mutex);
  899. timer_delete_sync(&llbitmap->pending_timer);
  900. flush_workqueue(md_llbitmap_io_wq);
  901. flush_workqueue(md_llbitmap_unplug_wq);
  902. mddev->bitmap = NULL;
  903. llbitmap_free_pages(llbitmap);
  904. kfree(llbitmap);
  905. mutex_unlock(&mddev->bitmap_info.mutex);
  906. }
  907. static void llbitmap_start_write(struct mddev *mddev, sector_t offset,
  908. unsigned long sectors)
  909. {
  910. struct llbitmap *llbitmap = mddev->bitmap;
  911. unsigned long start = offset >> llbitmap->chunkshift;
  912. unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift;
  913. int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
  914. int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
  915. llbitmap_state_machine(llbitmap, start, end, BitmapActionStartwrite);
  916. while (page_start <= page_end) {
  917. llbitmap_raise_barrier(llbitmap, page_start);
  918. page_start++;
  919. }
  920. }
  921. static void llbitmap_end_write(struct mddev *mddev, sector_t offset,
  922. unsigned long sectors)
  923. {
  924. struct llbitmap *llbitmap = mddev->bitmap;
  925. unsigned long start = offset >> llbitmap->chunkshift;
  926. unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift;
  927. int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
  928. int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
  929. while (page_start <= page_end) {
  930. llbitmap_release_barrier(llbitmap, page_start);
  931. page_start++;
  932. }
  933. }
  934. static void llbitmap_start_discard(struct mddev *mddev, sector_t offset,
  935. unsigned long sectors)
  936. {
  937. struct llbitmap *llbitmap = mddev->bitmap;
  938. unsigned long start = DIV_ROUND_UP_SECTOR_T(offset, llbitmap->chunksize);
  939. unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift;
  940. int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
  941. int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
  942. llbitmap_state_machine(llbitmap, start, end, BitmapActionDiscard);
  943. while (page_start <= page_end) {
  944. llbitmap_raise_barrier(llbitmap, page_start);
  945. page_start++;
  946. }
  947. }
  948. static void llbitmap_end_discard(struct mddev *mddev, sector_t offset,
  949. unsigned long sectors)
  950. {
  951. struct llbitmap *llbitmap = mddev->bitmap;
  952. unsigned long start = DIV_ROUND_UP_SECTOR_T(offset, llbitmap->chunksize);
  953. unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift;
  954. int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
  955. int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
  956. while (page_start <= page_end) {
  957. llbitmap_release_barrier(llbitmap, page_start);
  958. page_start++;
  959. }
  960. }
  961. static void llbitmap_unplug_fn(struct work_struct *work)
  962. {
  963. struct llbitmap_unplug_work *unplug_work =
  964. container_of(work, struct llbitmap_unplug_work, work);
  965. struct llbitmap *llbitmap = unplug_work->llbitmap;
  966. struct blk_plug plug;
  967. int i;
  968. blk_start_plug(&plug);
  969. for (i = 0; i < llbitmap->nr_pages; i++) {
  970. if (!test_bit(LLPageDirty, &llbitmap->pctl[i]->flags) ||
  971. !test_and_clear_bit(LLPageDirty, &llbitmap->pctl[i]->flags))
  972. continue;
  973. llbitmap_write_page(llbitmap, i);
  974. }
  975. blk_finish_plug(&plug);
  976. md_super_wait(llbitmap->mddev);
  977. complete(unplug_work->done);
  978. }
  979. static bool llbitmap_dirty(struct llbitmap *llbitmap)
  980. {
  981. int i;
  982. for (i = 0; i < llbitmap->nr_pages; i++)
  983. if (test_bit(LLPageDirty, &llbitmap->pctl[i]->flags))
  984. return true;
  985. return false;
  986. }
  987. static void llbitmap_unplug(struct mddev *mddev, bool sync)
  988. {
  989. DECLARE_COMPLETION_ONSTACK(done);
  990. struct llbitmap *llbitmap = mddev->bitmap;
  991. struct llbitmap_unplug_work unplug_work = {
  992. .llbitmap = llbitmap,
  993. .done = &done,
  994. };
  995. if (!llbitmap_dirty(llbitmap))
  996. return;
  997. /*
  998. * Issue new bitmap IO under submit_bio() context will deadlock:
  999. * - the bio will wait for bitmap bio to be done, before it can be
  1000. * issued;
  1001. * - bitmap bio will be added to current->bio_list and wait for this
  1002. * bio to be issued;
  1003. */
  1004. INIT_WORK_ONSTACK(&unplug_work.work, llbitmap_unplug_fn);
  1005. queue_work(md_llbitmap_unplug_wq, &unplug_work.work);
  1006. wait_for_completion(&done);
  1007. destroy_work_on_stack(&unplug_work.work);
  1008. }
  1009. /*
  1010. * Force to write all bitmap pages to disk, called when stopping the array, or
  1011. * every daemon_sleep seconds when sync_thread is running.
  1012. */
  1013. static void __llbitmap_flush(struct mddev *mddev)
  1014. {
  1015. struct llbitmap *llbitmap = mddev->bitmap;
  1016. struct blk_plug plug;
  1017. int i;
  1018. blk_start_plug(&plug);
  1019. for (i = 0; i < llbitmap->nr_pages; i++) {
  1020. struct llbitmap_page_ctl *pctl = llbitmap->pctl[i];
  1021. /* mark all blocks as dirty */
  1022. set_bit(LLPageDirty, &pctl->flags);
  1023. bitmap_fill(pctl->dirty, llbitmap->blocks_per_page);
  1024. llbitmap_write_page(llbitmap, i);
  1025. }
  1026. blk_finish_plug(&plug);
  1027. md_super_wait(llbitmap->mddev);
  1028. }
  1029. static void llbitmap_flush(struct mddev *mddev)
  1030. {
  1031. struct llbitmap *llbitmap = mddev->bitmap;
  1032. int i;
  1033. for (i = 0; i < llbitmap->nr_pages; i++)
  1034. set_bit(LLPageFlush, &llbitmap->pctl[i]->flags);
  1035. timer_delete_sync(&llbitmap->pending_timer);
  1036. queue_work(md_llbitmap_io_wq, &llbitmap->daemon_work);
  1037. flush_work(&llbitmap->daemon_work);
  1038. __llbitmap_flush(mddev);
  1039. }
  1040. /* This is used for raid5 lazy initial recovery */
  1041. static bool llbitmap_blocks_synced(struct mddev *mddev, sector_t offset)
  1042. {
  1043. struct llbitmap *llbitmap = mddev->bitmap;
  1044. unsigned long p = offset >> llbitmap->chunkshift;
  1045. enum llbitmap_state c = llbitmap_read(llbitmap, p);
  1046. return c == BitClean || c == BitDirty;
  1047. }
  1048. static sector_t llbitmap_skip_sync_blocks(struct mddev *mddev, sector_t offset)
  1049. {
  1050. struct llbitmap *llbitmap = mddev->bitmap;
  1051. unsigned long p = offset >> llbitmap->chunkshift;
  1052. int blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1));
  1053. enum llbitmap_state c = llbitmap_read(llbitmap, p);
  1054. /* always skip unwritten blocks */
  1055. if (c == BitUnwritten)
  1056. return blocks;
  1057. /* For degraded array, don't skip */
  1058. if (mddev->degraded)
  1059. return 0;
  1060. /* For resync also skip clean/dirty blocks */
  1061. if ((c == BitClean || c == BitDirty) &&
  1062. test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
  1063. !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
  1064. return blocks;
  1065. return 0;
  1066. }
  1067. static bool llbitmap_start_sync(struct mddev *mddev, sector_t offset,
  1068. sector_t *blocks, bool degraded)
  1069. {
  1070. struct llbitmap *llbitmap = mddev->bitmap;
  1071. unsigned long p = offset >> llbitmap->chunkshift;
  1072. /*
  1073. * Handle one bit at a time, this is much simpler. And it doesn't matter
  1074. * if md_do_sync() loop more times.
  1075. */
  1076. *blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1));
  1077. return llbitmap_state_machine(llbitmap, p, p,
  1078. BitmapActionStartsync) == BitSyncing;
  1079. }
  1080. /* Something is wrong, sync_thread stop at @offset */
  1081. static void llbitmap_end_sync(struct mddev *mddev, sector_t offset,
  1082. sector_t *blocks)
  1083. {
  1084. struct llbitmap *llbitmap = mddev->bitmap;
  1085. unsigned long p = offset >> llbitmap->chunkshift;
  1086. *blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1));
  1087. llbitmap_state_machine(llbitmap, p, llbitmap->chunks - 1,
  1088. BitmapActionAbortsync);
  1089. }
  1090. /* A full sync_thread is finished */
  1091. static void llbitmap_close_sync(struct mddev *mddev)
  1092. {
  1093. struct llbitmap *llbitmap = mddev->bitmap;
  1094. int i;
  1095. for (i = 0; i < llbitmap->nr_pages; i++) {
  1096. struct llbitmap_page_ctl *pctl = llbitmap->pctl[i];
  1097. /* let daemon_fn clear dirty bits immediately */
  1098. WRITE_ONCE(pctl->expire, jiffies);
  1099. }
  1100. llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1,
  1101. BitmapActionEndsync);
  1102. }
  1103. /*
  1104. * sync_thread have reached @sector, update metadata every daemon_sleep seconds,
  1105. * just in case sync_thread have to restart after power failure.
  1106. */
  1107. static void llbitmap_cond_end_sync(struct mddev *mddev, sector_t sector,
  1108. bool force)
  1109. {
  1110. struct llbitmap *llbitmap = mddev->bitmap;
  1111. if (sector == 0) {
  1112. llbitmap->last_end_sync = jiffies;
  1113. return;
  1114. }
  1115. if (time_before(jiffies, llbitmap->last_end_sync +
  1116. HZ * mddev->bitmap_info.daemon_sleep))
  1117. return;
  1118. wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
  1119. mddev->curr_resync_completed = sector;
  1120. set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
  1121. llbitmap_state_machine(llbitmap, 0, sector >> llbitmap->chunkshift,
  1122. BitmapActionEndsync);
  1123. __llbitmap_flush(mddev);
  1124. llbitmap->last_end_sync = jiffies;
  1125. sysfs_notify_dirent_safe(mddev->sysfs_completed);
  1126. }
  1127. static bool llbitmap_enabled(void *data, bool flush)
  1128. {
  1129. struct llbitmap *llbitmap = data;
  1130. return llbitmap && !test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags);
  1131. }
  1132. static void llbitmap_dirty_bits(struct mddev *mddev, unsigned long s,
  1133. unsigned long e)
  1134. {
  1135. llbitmap_state_machine(mddev->bitmap, s, e, BitmapActionStartwrite);
  1136. }
  1137. static void llbitmap_write_sb(struct llbitmap *llbitmap)
  1138. {
  1139. int nr_blocks = DIV_ROUND_UP(BITMAP_DATA_OFFSET, llbitmap->io_size);
  1140. bitmap_fill(llbitmap->pctl[0]->dirty, nr_blocks);
  1141. llbitmap_write_page(llbitmap, 0);
  1142. md_super_wait(llbitmap->mddev);
  1143. }
  1144. static void llbitmap_update_sb(void *data)
  1145. {
  1146. struct llbitmap *llbitmap = data;
  1147. struct mddev *mddev = llbitmap->mddev;
  1148. struct page *sb_page;
  1149. bitmap_super_t *sb;
  1150. if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags))
  1151. return;
  1152. sb_page = llbitmap_read_page(llbitmap, 0);
  1153. if (IS_ERR(sb_page)) {
  1154. pr_err("%s: %s: read super block failed", __func__,
  1155. mdname(mddev));
  1156. set_bit(BITMAP_WRITE_ERROR, &llbitmap->flags);
  1157. return;
  1158. }
  1159. if (mddev->events < llbitmap->events_cleared)
  1160. llbitmap->events_cleared = mddev->events;
  1161. sb = kmap_local_page(sb_page);
  1162. sb->events = cpu_to_le64(mddev->events);
  1163. sb->state = cpu_to_le32(llbitmap->flags);
  1164. sb->chunksize = cpu_to_le32(llbitmap->chunksize);
  1165. sb->sync_size = cpu_to_le64(mddev->resync_max_sectors);
  1166. sb->events_cleared = cpu_to_le64(llbitmap->events_cleared);
  1167. sb->sectors_reserved = cpu_to_le32(mddev->bitmap_info.space);
  1168. sb->daemon_sleep = cpu_to_le32(mddev->bitmap_info.daemon_sleep);
  1169. kunmap_local(sb);
  1170. llbitmap_write_sb(llbitmap);
  1171. }
  1172. static int llbitmap_get_stats(void *data, struct md_bitmap_stats *stats)
  1173. {
  1174. struct llbitmap *llbitmap = data;
  1175. memset(stats, 0, sizeof(*stats));
  1176. stats->missing_pages = 0;
  1177. stats->pages = llbitmap->nr_pages;
  1178. stats->file_pages = llbitmap->nr_pages;
  1179. stats->behind_writes = atomic_read(&llbitmap->behind_writes);
  1180. stats->behind_wait = wq_has_sleeper(&llbitmap->behind_wait);
  1181. stats->events_cleared = llbitmap->events_cleared;
  1182. return 0;
  1183. }
  1184. /* just flag all pages as needing to be written */
  1185. static void llbitmap_write_all(struct mddev *mddev)
  1186. {
  1187. int i;
  1188. struct llbitmap *llbitmap = mddev->bitmap;
  1189. for (i = 0; i < llbitmap->nr_pages; i++) {
  1190. struct llbitmap_page_ctl *pctl = llbitmap->pctl[i];
  1191. set_bit(LLPageDirty, &pctl->flags);
  1192. bitmap_fill(pctl->dirty, llbitmap->blocks_per_page);
  1193. }
  1194. }
  1195. static void llbitmap_start_behind_write(struct mddev *mddev)
  1196. {
  1197. struct llbitmap *llbitmap = mddev->bitmap;
  1198. atomic_inc(&llbitmap->behind_writes);
  1199. }
  1200. static void llbitmap_end_behind_write(struct mddev *mddev)
  1201. {
  1202. struct llbitmap *llbitmap = mddev->bitmap;
  1203. if (atomic_dec_and_test(&llbitmap->behind_writes))
  1204. wake_up(&llbitmap->behind_wait);
  1205. }
  1206. static void llbitmap_wait_behind_writes(struct mddev *mddev)
  1207. {
  1208. struct llbitmap *llbitmap = mddev->bitmap;
  1209. if (!llbitmap)
  1210. return;
  1211. wait_event(llbitmap->behind_wait,
  1212. atomic_read(&llbitmap->behind_writes) == 0);
  1213. }
  1214. static ssize_t bits_show(struct mddev *mddev, char *page)
  1215. {
  1216. struct llbitmap *llbitmap;
  1217. int bits[BitStateCount] = {0};
  1218. loff_t start = 0;
  1219. mutex_lock(&mddev->bitmap_info.mutex);
  1220. llbitmap = mddev->bitmap;
  1221. if (!llbitmap || !llbitmap->pctl) {
  1222. mutex_unlock(&mddev->bitmap_info.mutex);
  1223. return sprintf(page, "no bitmap\n");
  1224. }
  1225. if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags)) {
  1226. mutex_unlock(&mddev->bitmap_info.mutex);
  1227. return sprintf(page, "bitmap io error\n");
  1228. }
  1229. while (start < llbitmap->chunks) {
  1230. enum llbitmap_state c = llbitmap_read(llbitmap, start);
  1231. if (c < 0 || c >= BitStateCount)
  1232. pr_err("%s: invalid bit %llu state %d\n",
  1233. __func__, start, c);
  1234. else
  1235. bits[c]++;
  1236. start++;
  1237. }
  1238. mutex_unlock(&mddev->bitmap_info.mutex);
  1239. return sprintf(page, "unwritten %d\nclean %d\ndirty %d\nneed sync %d\nsyncing %d\n",
  1240. bits[BitUnwritten], bits[BitClean], bits[BitDirty],
  1241. bits[BitNeedSync], bits[BitSyncing]);
  1242. }
  1243. static struct md_sysfs_entry llbitmap_bits = __ATTR_RO(bits);
  1244. static ssize_t metadata_show(struct mddev *mddev, char *page)
  1245. {
  1246. struct llbitmap *llbitmap;
  1247. ssize_t ret;
  1248. mutex_lock(&mddev->bitmap_info.mutex);
  1249. llbitmap = mddev->bitmap;
  1250. if (!llbitmap) {
  1251. mutex_unlock(&mddev->bitmap_info.mutex);
  1252. return sprintf(page, "no bitmap\n");
  1253. }
  1254. ret = sprintf(page, "chunksize %lu\nchunkshift %lu\nchunks %lu\noffset %llu\ndaemon_sleep %lu\n",
  1255. llbitmap->chunksize, llbitmap->chunkshift,
  1256. llbitmap->chunks, mddev->bitmap_info.offset,
  1257. llbitmap->mddev->bitmap_info.daemon_sleep);
  1258. mutex_unlock(&mddev->bitmap_info.mutex);
  1259. return ret;
  1260. }
  1261. static struct md_sysfs_entry llbitmap_metadata = __ATTR_RO(metadata);
  1262. static ssize_t
  1263. daemon_sleep_show(struct mddev *mddev, char *page)
  1264. {
  1265. return sprintf(page, "%lu\n", mddev->bitmap_info.daemon_sleep);
  1266. }
  1267. static ssize_t
  1268. daemon_sleep_store(struct mddev *mddev, const char *buf, size_t len)
  1269. {
  1270. unsigned long timeout;
  1271. int rv = kstrtoul(buf, 10, &timeout);
  1272. if (rv)
  1273. return rv;
  1274. mddev->bitmap_info.daemon_sleep = timeout;
  1275. return len;
  1276. }
  1277. static struct md_sysfs_entry llbitmap_daemon_sleep = __ATTR_RW(daemon_sleep);
  1278. static ssize_t
  1279. barrier_idle_show(struct mddev *mddev, char *page)
  1280. {
  1281. struct llbitmap *llbitmap = mddev->bitmap;
  1282. return sprintf(page, "%lu\n", llbitmap->barrier_idle);
  1283. }
  1284. static ssize_t
  1285. barrier_idle_store(struct mddev *mddev, const char *buf, size_t len)
  1286. {
  1287. struct llbitmap *llbitmap = mddev->bitmap;
  1288. unsigned long timeout;
  1289. int rv = kstrtoul(buf, 10, &timeout);
  1290. if (rv)
  1291. return rv;
  1292. llbitmap->barrier_idle = timeout;
  1293. return len;
  1294. }
  1295. static struct md_sysfs_entry llbitmap_barrier_idle = __ATTR_RW(barrier_idle);
  1296. static struct attribute *md_llbitmap_attrs[] = {
  1297. &llbitmap_bits.attr,
  1298. &llbitmap_metadata.attr,
  1299. &llbitmap_daemon_sleep.attr,
  1300. &llbitmap_barrier_idle.attr,
  1301. NULL
  1302. };
  1303. static struct attribute_group md_llbitmap_group = {
  1304. .name = "llbitmap",
  1305. .attrs = md_llbitmap_attrs,
  1306. };
  1307. static struct bitmap_operations llbitmap_ops = {
  1308. .head = {
  1309. .type = MD_BITMAP,
  1310. .id = ID_LLBITMAP,
  1311. .name = "llbitmap",
  1312. },
  1313. .enabled = llbitmap_enabled,
  1314. .create = llbitmap_create,
  1315. .resize = llbitmap_resize,
  1316. .load = llbitmap_load,
  1317. .destroy = llbitmap_destroy,
  1318. .start_write = llbitmap_start_write,
  1319. .end_write = llbitmap_end_write,
  1320. .start_discard = llbitmap_start_discard,
  1321. .end_discard = llbitmap_end_discard,
  1322. .unplug = llbitmap_unplug,
  1323. .flush = llbitmap_flush,
  1324. .start_behind_write = llbitmap_start_behind_write,
  1325. .end_behind_write = llbitmap_end_behind_write,
  1326. .wait_behind_writes = llbitmap_wait_behind_writes,
  1327. .blocks_synced = llbitmap_blocks_synced,
  1328. .skip_sync_blocks = llbitmap_skip_sync_blocks,
  1329. .start_sync = llbitmap_start_sync,
  1330. .end_sync = llbitmap_end_sync,
  1331. .close_sync = llbitmap_close_sync,
  1332. .cond_end_sync = llbitmap_cond_end_sync,
  1333. .update_sb = llbitmap_update_sb,
  1334. .get_stats = llbitmap_get_stats,
  1335. .dirty_bits = llbitmap_dirty_bits,
  1336. .write_all = llbitmap_write_all,
  1337. .group = &md_llbitmap_group,
  1338. };
  1339. int md_llbitmap_init(void)
  1340. {
  1341. md_llbitmap_io_wq = alloc_workqueue("md_llbitmap_io",
  1342. WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
  1343. if (!md_llbitmap_io_wq)
  1344. return -ENOMEM;
  1345. md_llbitmap_unplug_wq = alloc_workqueue("md_llbitmap_unplug",
  1346. WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
  1347. if (!md_llbitmap_unplug_wq) {
  1348. destroy_workqueue(md_llbitmap_io_wq);
  1349. md_llbitmap_io_wq = NULL;
  1350. return -ENOMEM;
  1351. }
  1352. return register_md_submodule(&llbitmap_ops.head);
  1353. }
  1354. void md_llbitmap_exit(void)
  1355. {
  1356. destroy_workqueue(md_llbitmap_io_wq);
  1357. md_llbitmap_io_wq = NULL;
  1358. destroy_workqueue(md_llbitmap_unplug_wq);
  1359. md_llbitmap_unplug_wq = NULL;
  1360. unregister_md_submodule(&llbitmap_ops.head);
  1361. }