checkpoint.c 55 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * fs/f2fs/checkpoint.c
  4. *
  5. * Copyright (c) 2012 Samsung Electronics Co., Ltd.
  6. * http://www.samsung.com/
  7. */
  8. #include <linux/fs.h>
  9. #include <linux/bio.h>
  10. #include <linux/mpage.h>
  11. #include <linux/writeback.h>
  12. #include <linux/blkdev.h>
  13. #include <linux/f2fs_fs.h>
  14. #include <linux/pagevec.h>
  15. #include <linux/swap.h>
  16. #include <linux/kthread.h>
  17. #include <linux/delayacct.h>
  18. #include <linux/ioprio.h>
  19. #include <linux/math64.h>
  20. #include "f2fs.h"
  21. #include "node.h"
  22. #include "segment.h"
  23. #include "iostat.h"
  24. #include <trace/events/f2fs.h>
  25. static inline void get_lock_elapsed_time(struct f2fs_time_stat *ts)
  26. {
  27. ts->total_time = ktime_get();
  28. #ifdef CONFIG_64BIT
  29. ts->running_time = current->se.sum_exec_runtime;
  30. #endif
  31. #if defined(CONFIG_SCHED_INFO) && defined(CONFIG_SCHEDSTATS)
  32. ts->runnable_time = current->sched_info.run_delay;
  33. #endif
  34. #ifdef CONFIG_TASK_DELAY_ACCT
  35. if (current->delays)
  36. ts->io_sleep_time = current->delays->blkio_delay;
  37. #endif
  38. }
  39. static inline void trace_lock_elapsed_time_start(struct f2fs_rwsem *sem,
  40. struct f2fs_lock_context *lc)
  41. {
  42. lc->lock_trace = trace_f2fs_lock_elapsed_time_enabled();
  43. if (!lc->lock_trace)
  44. return;
  45. get_lock_elapsed_time(&lc->ts);
  46. }
  47. static inline void trace_lock_elapsed_time_end(struct f2fs_rwsem *sem,
  48. struct f2fs_lock_context *lc, bool is_write)
  49. {
  50. struct f2fs_time_stat tts;
  51. unsigned long long total_time;
  52. unsigned long long running_time = 0;
  53. unsigned long long runnable_time = 0;
  54. unsigned long long io_sleep_time = 0;
  55. unsigned long long other_time = 0;
  56. unsigned npm = NSEC_PER_MSEC;
  57. if (!lc->lock_trace)
  58. return;
  59. if (time_to_inject(sem->sbi, FAULT_LOCK_TIMEOUT))
  60. f2fs_schedule_timeout_killable(DEFAULT_FAULT_TIMEOUT, true);
  61. get_lock_elapsed_time(&tts);
  62. total_time = div_u64(tts.total_time - lc->ts.total_time, npm);
  63. if (total_time <= sem->sbi->max_lock_elapsed_time)
  64. return;
  65. #ifdef CONFIG_64BIT
  66. running_time = div_u64(tts.running_time - lc->ts.running_time, npm);
  67. #endif
  68. #if defined(CONFIG_SCHED_INFO) && defined(CONFIG_SCHEDSTATS)
  69. runnable_time = div_u64(tts.runnable_time - lc->ts.runnable_time, npm);
  70. #endif
  71. #ifdef CONFIG_TASK_DELAY_ACCT
  72. io_sleep_time = div_u64(tts.io_sleep_time - lc->ts.io_sleep_time, npm);
  73. #endif
  74. if (total_time > running_time + io_sleep_time + runnable_time)
  75. other_time = total_time - running_time -
  76. io_sleep_time - runnable_time;
  77. trace_f2fs_lock_elapsed_time(sem->sbi, sem->name, is_write, current,
  78. get_current_ioprio(), total_time, running_time,
  79. runnable_time, io_sleep_time, other_time);
  80. }
  81. static bool need_uplift_priority(struct f2fs_rwsem *sem, bool is_write)
  82. {
  83. if (!(sem->sbi->adjust_lock_priority & BIT(sem->name - 1)))
  84. return false;
  85. switch (sem->name) {
  86. /*
  87. * writer is checkpoint which has high priority, let's just uplift
  88. * priority for reader
  89. */
  90. case LOCK_NAME_CP_RWSEM:
  91. case LOCK_NAME_NODE_CHANGE:
  92. case LOCK_NAME_NODE_WRITE:
  93. return !is_write;
  94. case LOCK_NAME_GC_LOCK:
  95. case LOCK_NAME_CP_GLOBAL:
  96. case LOCK_NAME_IO_RWSEM:
  97. return true;
  98. default:
  99. f2fs_bug_on(sem->sbi, 1);
  100. }
  101. return false;
  102. }
  103. static void uplift_priority(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc,
  104. bool is_write)
  105. {
  106. lc->need_restore = false;
  107. if (!sem->sbi->adjust_lock_priority)
  108. return;
  109. if (rt_task(current))
  110. return;
  111. if (!need_uplift_priority(sem, is_write))
  112. return;
  113. lc->orig_nice = task_nice(current);
  114. lc->new_nice = PRIO_TO_NICE(sem->sbi->lock_duration_priority);
  115. if (lc->orig_nice <= lc->new_nice)
  116. return;
  117. set_user_nice(current, lc->new_nice);
  118. lc->need_restore = true;
  119. trace_f2fs_priority_uplift(sem->sbi, sem->name, is_write, current,
  120. NICE_TO_PRIO(lc->orig_nice), NICE_TO_PRIO(lc->new_nice));
  121. }
  122. static void restore_priority(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc,
  123. bool is_write)
  124. {
  125. if (!lc->need_restore)
  126. return;
  127. /* someone has updated the priority */
  128. if (task_nice(current) != lc->new_nice)
  129. return;
  130. set_user_nice(current, lc->orig_nice);
  131. trace_f2fs_priority_restore(sem->sbi, sem->name, is_write, current,
  132. NICE_TO_PRIO(lc->orig_nice), NICE_TO_PRIO(lc->new_nice));
  133. }
  134. void f2fs_down_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
  135. {
  136. uplift_priority(sem, lc, false);
  137. f2fs_down_read(sem);
  138. trace_lock_elapsed_time_start(sem, lc);
  139. }
  140. int f2fs_down_read_trylock_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
  141. {
  142. uplift_priority(sem, lc, false);
  143. if (!f2fs_down_read_trylock(sem)) {
  144. restore_priority(sem, lc, false);
  145. return 0;
  146. }
  147. trace_lock_elapsed_time_start(sem, lc);
  148. return 1;
  149. }
  150. void f2fs_up_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
  151. {
  152. f2fs_up_read(sem);
  153. restore_priority(sem, lc, false);
  154. trace_lock_elapsed_time_end(sem, lc, false);
  155. }
  156. void f2fs_down_write_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
  157. {
  158. uplift_priority(sem, lc, true);
  159. f2fs_down_write(sem);
  160. trace_lock_elapsed_time_start(sem, lc);
  161. }
  162. int f2fs_down_write_trylock_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
  163. {
  164. uplift_priority(sem, lc, true);
  165. if (!f2fs_down_write_trylock(sem)) {
  166. restore_priority(sem, lc, true);
  167. return 0;
  168. }
  169. trace_lock_elapsed_time_start(sem, lc);
  170. return 1;
  171. }
  172. void f2fs_up_write_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
  173. {
  174. f2fs_up_write(sem);
  175. restore_priority(sem, lc, true);
  176. trace_lock_elapsed_time_end(sem, lc, true);
  177. }
  178. void f2fs_lock_op(struct f2fs_sb_info *sbi, struct f2fs_lock_context *lc)
  179. {
  180. f2fs_down_read_trace(&sbi->cp_rwsem, lc);
  181. }
  182. int f2fs_trylock_op(struct f2fs_sb_info *sbi, struct f2fs_lock_context *lc)
  183. {
  184. if (time_to_inject(sbi, FAULT_LOCK_OP))
  185. return 0;
  186. return f2fs_down_read_trylock_trace(&sbi->cp_rwsem, lc);
  187. }
  188. void f2fs_unlock_op(struct f2fs_sb_info *sbi, struct f2fs_lock_context *lc)
  189. {
  190. f2fs_up_read_trace(&sbi->cp_rwsem, lc);
  191. }
  192. static inline void f2fs_lock_all(struct f2fs_sb_info *sbi)
  193. {
  194. f2fs_down_write(&sbi->cp_rwsem);
  195. }
  196. static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi)
  197. {
  198. f2fs_up_write(&sbi->cp_rwsem);
  199. }
  200. #define DEFAULT_CHECKPOINT_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 3))
  201. static struct kmem_cache *ino_entry_slab;
  202. struct kmem_cache *f2fs_inode_entry_slab;
  203. void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io,
  204. unsigned char reason)
  205. {
  206. f2fs_build_fault_attr(sbi, 0, 0, FAULT_ALL);
  207. if (!end_io)
  208. f2fs_flush_merged_writes(sbi);
  209. f2fs_handle_critical_error(sbi, reason);
  210. }
  211. /*
  212. * We guarantee no failure on the returned page.
  213. */
  214. struct folio *f2fs_grab_meta_folio(struct f2fs_sb_info *sbi, pgoff_t index)
  215. {
  216. struct address_space *mapping = META_MAPPING(sbi);
  217. struct folio *folio;
  218. repeat:
  219. folio = f2fs_grab_cache_folio(mapping, index, false);
  220. if (IS_ERR(folio)) {
  221. cond_resched();
  222. goto repeat;
  223. }
  224. f2fs_folio_wait_writeback(folio, META, true, true);
  225. if (!folio_test_uptodate(folio))
  226. folio_mark_uptodate(folio);
  227. return folio;
  228. }
  229. static struct folio *__get_meta_folio(struct f2fs_sb_info *sbi, pgoff_t index,
  230. bool is_meta)
  231. {
  232. struct address_space *mapping = META_MAPPING(sbi);
  233. struct folio *folio;
  234. struct f2fs_io_info fio = {
  235. .sbi = sbi,
  236. .type = META,
  237. .op = REQ_OP_READ,
  238. .op_flags = REQ_META | REQ_PRIO,
  239. .old_blkaddr = index,
  240. .new_blkaddr = index,
  241. .encrypted_page = NULL,
  242. .is_por = !is_meta ? 1 : 0,
  243. };
  244. int err;
  245. if (unlikely(!is_meta))
  246. fio.op_flags &= ~REQ_META;
  247. repeat:
  248. folio = f2fs_grab_cache_folio(mapping, index, false);
  249. if (IS_ERR(folio)) {
  250. cond_resched();
  251. goto repeat;
  252. }
  253. if (folio_test_uptodate(folio))
  254. goto out;
  255. fio.folio = folio;
  256. err = f2fs_submit_page_bio(&fio);
  257. if (err) {
  258. f2fs_folio_put(folio, true);
  259. return ERR_PTR(err);
  260. }
  261. f2fs_update_iostat(sbi, NULL, FS_META_READ_IO, F2FS_BLKSIZE);
  262. folio_lock(folio);
  263. if (unlikely(!is_meta_folio(folio))) {
  264. f2fs_folio_put(folio, true);
  265. goto repeat;
  266. }
  267. if (unlikely(!folio_test_uptodate(folio))) {
  268. f2fs_handle_page_eio(sbi, folio, META);
  269. f2fs_folio_put(folio, true);
  270. return ERR_PTR(-EIO);
  271. }
  272. out:
  273. return folio;
  274. }
  275. struct folio *f2fs_get_meta_folio(struct f2fs_sb_info *sbi, pgoff_t index)
  276. {
  277. return __get_meta_folio(sbi, index, true);
  278. }
  279. struct folio *f2fs_get_meta_folio_retry(struct f2fs_sb_info *sbi, pgoff_t index)
  280. {
  281. struct folio *folio;
  282. int count = 0;
  283. retry:
  284. folio = __get_meta_folio(sbi, index, true);
  285. if (IS_ERR(folio)) {
  286. if (PTR_ERR(folio) == -EIO &&
  287. ++count <= DEFAULT_RETRY_IO_COUNT)
  288. goto retry;
  289. f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_META_PAGE);
  290. }
  291. return folio;
  292. }
  293. /* for POR only */
  294. struct folio *f2fs_get_tmp_folio(struct f2fs_sb_info *sbi, pgoff_t index)
  295. {
  296. return __get_meta_folio(sbi, index, false);
  297. }
  298. static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr,
  299. int type)
  300. {
  301. struct seg_entry *se;
  302. unsigned int segno, offset;
  303. bool exist;
  304. if (type == DATA_GENERIC)
  305. return true;
  306. segno = GET_SEGNO(sbi, blkaddr);
  307. offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);
  308. se = get_seg_entry(sbi, segno);
  309. exist = f2fs_test_bit(offset, se->cur_valid_map);
  310. /* skip data, if we already have an error in checkpoint. */
  311. if (unlikely(f2fs_cp_error(sbi)))
  312. return exist;
  313. if ((exist && type == DATA_GENERIC_ENHANCE_UPDATE) ||
  314. (!exist && type == DATA_GENERIC_ENHANCE))
  315. goto out_err;
  316. if (!exist && type != DATA_GENERIC_ENHANCE_UPDATE)
  317. goto out_handle;
  318. return exist;
  319. out_err:
  320. f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d",
  321. blkaddr, exist);
  322. set_sbi_flag(sbi, SBI_NEED_FSCK);
  323. dump_stack();
  324. out_handle:
  325. f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
  326. return exist;
  327. }
  328. static bool __f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
  329. block_t blkaddr, int type)
  330. {
  331. switch (type) {
  332. case META_NAT:
  333. break;
  334. case META_SIT:
  335. if (unlikely(blkaddr >= SIT_BLK_CNT(sbi)))
  336. goto check_only;
  337. break;
  338. case META_SSA:
  339. if (unlikely(blkaddr >= MAIN_BLKADDR(sbi) ||
  340. blkaddr < SM_I(sbi)->ssa_blkaddr))
  341. goto check_only;
  342. break;
  343. case META_CP:
  344. if (unlikely(blkaddr >= SIT_I(sbi)->sit_base_addr ||
  345. blkaddr < __start_cp_addr(sbi)))
  346. goto check_only;
  347. break;
  348. case META_POR:
  349. if (unlikely(blkaddr >= MAX_BLKADDR(sbi) ||
  350. blkaddr < MAIN_BLKADDR(sbi)))
  351. goto check_only;
  352. break;
  353. case DATA_GENERIC:
  354. case DATA_GENERIC_ENHANCE:
  355. case DATA_GENERIC_ENHANCE_READ:
  356. case DATA_GENERIC_ENHANCE_UPDATE:
  357. if (unlikely(blkaddr >= MAX_BLKADDR(sbi) ||
  358. blkaddr < MAIN_BLKADDR(sbi))) {
  359. /* Skip to emit an error message. */
  360. if (unlikely(f2fs_cp_error(sbi)))
  361. return false;
  362. f2fs_warn(sbi, "access invalid blkaddr:%u",
  363. blkaddr);
  364. set_sbi_flag(sbi, SBI_NEED_FSCK);
  365. dump_stack();
  366. goto err;
  367. } else {
  368. return __is_bitmap_valid(sbi, blkaddr, type);
  369. }
  370. break;
  371. case META_GENERIC:
  372. if (unlikely(blkaddr < SEG0_BLKADDR(sbi) ||
  373. blkaddr >= MAIN_BLKADDR(sbi)))
  374. goto err;
  375. break;
  376. default:
  377. BUG();
  378. }
  379. return true;
  380. err:
  381. f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
  382. check_only:
  383. return false;
  384. }
  385. bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
  386. block_t blkaddr, int type)
  387. {
  388. if (time_to_inject(sbi, FAULT_BLKADDR_VALIDITY))
  389. return false;
  390. return __f2fs_is_valid_blkaddr(sbi, blkaddr, type);
  391. }
  392. bool f2fs_is_valid_blkaddr_raw(struct f2fs_sb_info *sbi,
  393. block_t blkaddr, int type)
  394. {
  395. return __f2fs_is_valid_blkaddr(sbi, blkaddr, type);
  396. }
  397. /*
  398. * Readahead CP/NAT/SIT/SSA/POR pages
  399. */
  400. int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
  401. int type, bool sync)
  402. {
  403. block_t blkno = start;
  404. struct f2fs_io_info fio = {
  405. .sbi = sbi,
  406. .type = META,
  407. .op = REQ_OP_READ,
  408. .op_flags = sync ? (REQ_META | REQ_PRIO) : REQ_RAHEAD,
  409. .encrypted_page = NULL,
  410. .in_list = 0,
  411. .is_por = (type == META_POR) ? 1 : 0,
  412. };
  413. struct blk_plug plug;
  414. int err;
  415. if (unlikely(type == META_POR))
  416. fio.op_flags &= ~REQ_META;
  417. blk_start_plug(&plug);
  418. for (; nrpages-- > 0; blkno++) {
  419. struct folio *folio;
  420. if (!f2fs_is_valid_blkaddr(sbi, blkno, type))
  421. goto out;
  422. switch (type) {
  423. case META_NAT:
  424. if (unlikely(blkno >=
  425. NAT_BLOCK_OFFSET(NM_I(sbi)->max_nid)))
  426. blkno = 0;
  427. /* get nat block addr */
  428. fio.new_blkaddr = current_nat_addr(sbi,
  429. blkno * NAT_ENTRY_PER_BLOCK);
  430. break;
  431. case META_SIT:
  432. if (unlikely(blkno >= TOTAL_SEGS(sbi)))
  433. goto out;
  434. /* get sit block addr */
  435. fio.new_blkaddr = current_sit_addr(sbi,
  436. blkno * SIT_ENTRY_PER_BLOCK);
  437. break;
  438. case META_SSA:
  439. case META_CP:
  440. case META_POR:
  441. fio.new_blkaddr = blkno;
  442. break;
  443. default:
  444. BUG();
  445. }
  446. folio = f2fs_grab_cache_folio(META_MAPPING(sbi),
  447. fio.new_blkaddr, false);
  448. if (IS_ERR(folio))
  449. continue;
  450. if (folio_test_uptodate(folio)) {
  451. f2fs_folio_put(folio, true);
  452. continue;
  453. }
  454. fio.folio = folio;
  455. err = f2fs_submit_page_bio(&fio);
  456. f2fs_folio_put(folio, err ? true : false);
  457. if (!err)
  458. f2fs_update_iostat(sbi, NULL, FS_META_READ_IO,
  459. F2FS_BLKSIZE);
  460. }
  461. out:
  462. blk_finish_plug(&plug);
  463. return blkno - start;
  464. }
  465. void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index,
  466. unsigned int ra_blocks)
  467. {
  468. struct folio *folio;
  469. bool readahead = false;
  470. if (ra_blocks == RECOVERY_MIN_RA_BLOCKS)
  471. return;
  472. folio = filemap_get_folio(META_MAPPING(sbi), index);
  473. if (IS_ERR(folio) || !folio_test_uptodate(folio))
  474. readahead = true;
  475. f2fs_folio_put(folio, false);
  476. if (readahead)
  477. f2fs_ra_meta_pages(sbi, index, ra_blocks, META_POR, true);
  478. }
  479. static bool __f2fs_write_meta_folio(struct folio *folio,
  480. struct writeback_control *wbc,
  481. enum iostat_type io_type)
  482. {
  483. struct f2fs_sb_info *sbi = F2FS_F_SB(folio);
  484. trace_f2fs_writepage(folio, META);
  485. if (unlikely(f2fs_cp_error(sbi))) {
  486. if (is_sbi_flag_set(sbi, SBI_IS_CLOSE)) {
  487. folio_clear_uptodate(folio);
  488. dec_page_count(sbi, F2FS_DIRTY_META);
  489. folio_unlock(folio);
  490. return true;
  491. }
  492. goto redirty_out;
  493. }
  494. if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
  495. goto redirty_out;
  496. f2fs_do_write_meta_page(sbi, folio, io_type);
  497. dec_page_count(sbi, F2FS_DIRTY_META);
  498. folio_unlock(folio);
  499. if (unlikely(f2fs_cp_error(sbi)))
  500. f2fs_submit_merged_write(sbi, META);
  501. return true;
  502. redirty_out:
  503. folio_redirty_for_writepage(wbc, folio);
  504. return false;
  505. }
  506. static int f2fs_write_meta_pages(struct address_space *mapping,
  507. struct writeback_control *wbc)
  508. {
  509. struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
  510. struct f2fs_lock_context lc;
  511. long diff, written;
  512. if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
  513. goto skip_write;
  514. /* collect a number of dirty meta pages and write together */
  515. if (wbc->sync_mode != WB_SYNC_ALL &&
  516. get_pages(sbi, F2FS_DIRTY_META) <
  517. nr_pages_to_skip(sbi, META))
  518. goto skip_write;
  519. /* if locked failed, cp will flush dirty pages instead */
  520. if (!f2fs_down_write_trylock_trace(&sbi->cp_global_sem, &lc))
  521. goto skip_write;
  522. trace_f2fs_writepages(mapping->host, wbc, META);
  523. diff = nr_pages_to_write(sbi, META, wbc);
  524. written = f2fs_sync_meta_pages(sbi, wbc->nr_to_write, FS_META_IO);
  525. f2fs_up_write_trace(&sbi->cp_global_sem, &lc);
  526. wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff);
  527. return 0;
  528. skip_write:
  529. wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_META);
  530. trace_f2fs_writepages(mapping->host, wbc, META);
  531. return 0;
  532. }
  533. long f2fs_sync_meta_pages(struct f2fs_sb_info *sbi, long nr_to_write,
  534. enum iostat_type io_type)
  535. {
  536. struct address_space *mapping = META_MAPPING(sbi);
  537. pgoff_t index = 0, prev = ULONG_MAX;
  538. struct folio_batch fbatch;
  539. long nwritten = 0;
  540. int nr_folios;
  541. struct writeback_control wbc = {};
  542. struct blk_plug plug;
  543. folio_batch_init(&fbatch);
  544. blk_start_plug(&plug);
  545. while ((nr_folios = filemap_get_folios_tag(mapping, &index,
  546. (pgoff_t)-1,
  547. PAGECACHE_TAG_DIRTY, &fbatch))) {
  548. int i;
  549. for (i = 0; i < nr_folios; i++) {
  550. struct folio *folio = fbatch.folios[i];
  551. if (nr_to_write != LONG_MAX && i != 0 &&
  552. folio->index != prev +
  553. folio_nr_pages(fbatch.folios[i-1])) {
  554. folio_batch_release(&fbatch);
  555. goto stop;
  556. }
  557. folio_lock(folio);
  558. if (unlikely(!is_meta_folio(folio))) {
  559. continue_unlock:
  560. folio_unlock(folio);
  561. continue;
  562. }
  563. if (!folio_test_dirty(folio)) {
  564. /* someone wrote it for us */
  565. goto continue_unlock;
  566. }
  567. f2fs_folio_wait_writeback(folio, META, true, true);
  568. if (!folio_clear_dirty_for_io(folio))
  569. goto continue_unlock;
  570. if (!__f2fs_write_meta_folio(folio, &wbc,
  571. io_type)) {
  572. folio_unlock(folio);
  573. break;
  574. }
  575. nwritten += folio_nr_pages(folio);
  576. prev = folio->index;
  577. if (unlikely(nwritten >= nr_to_write))
  578. break;
  579. }
  580. folio_batch_release(&fbatch);
  581. cond_resched();
  582. }
  583. stop:
  584. if (nwritten)
  585. f2fs_submit_merged_write(sbi, META);
  586. blk_finish_plug(&plug);
  587. return nwritten;
  588. }
  589. static bool f2fs_dirty_meta_folio(struct address_space *mapping,
  590. struct folio *folio)
  591. {
  592. trace_f2fs_set_page_dirty(folio, META);
  593. if (!folio_test_uptodate(folio))
  594. folio_mark_uptodate(folio);
  595. if (filemap_dirty_folio(mapping, folio)) {
  596. inc_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_META);
  597. folio_set_f2fs_reference(folio);
  598. return true;
  599. }
  600. return false;
  601. }
  602. const struct address_space_operations f2fs_meta_aops = {
  603. .writepages = f2fs_write_meta_pages,
  604. .dirty_folio = f2fs_dirty_meta_folio,
  605. .invalidate_folio = f2fs_invalidate_folio,
  606. .release_folio = f2fs_release_folio,
  607. .migrate_folio = filemap_migrate_folio,
  608. };
  609. static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino,
  610. unsigned int devidx, int type)
  611. {
  612. struct inode_management *im = &sbi->im[type];
  613. struct ino_entry *e = NULL, *new = NULL;
  614. int ret;
  615. if (type == FLUSH_INO) {
  616. rcu_read_lock();
  617. e = radix_tree_lookup(&im->ino_root, ino);
  618. rcu_read_unlock();
  619. }
  620. retry:
  621. if (!e)
  622. new = f2fs_kmem_cache_alloc(ino_entry_slab,
  623. GFP_NOFS, true, NULL);
  624. ret = radix_tree_preload(GFP_NOFS | __GFP_NOFAIL);
  625. f2fs_bug_on(sbi, ret);
  626. spin_lock(&im->ino_lock);
  627. e = radix_tree_lookup(&im->ino_root, ino);
  628. if (!e) {
  629. if (!new) {
  630. spin_unlock(&im->ino_lock);
  631. radix_tree_preload_end();
  632. goto retry;
  633. }
  634. e = new;
  635. if (unlikely(radix_tree_insert(&im->ino_root, ino, e)))
  636. f2fs_bug_on(sbi, 1);
  637. memset(e, 0, sizeof(struct ino_entry));
  638. e->ino = ino;
  639. list_add_tail(&e->list, &im->ino_list);
  640. if (type != ORPHAN_INO)
  641. im->ino_num++;
  642. }
  643. if (type == FLUSH_INO)
  644. f2fs_set_bit(devidx, (char *)&e->dirty_device);
  645. spin_unlock(&im->ino_lock);
  646. radix_tree_preload_end();
  647. if (new && e != new)
  648. kmem_cache_free(ino_entry_slab, new);
  649. }
  650. static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
  651. {
  652. struct inode_management *im = &sbi->im[type];
  653. struct ino_entry *e;
  654. spin_lock(&im->ino_lock);
  655. e = radix_tree_lookup(&im->ino_root, ino);
  656. if (e) {
  657. list_del(&e->list);
  658. radix_tree_delete(&im->ino_root, ino);
  659. im->ino_num--;
  660. spin_unlock(&im->ino_lock);
  661. kmem_cache_free(ino_entry_slab, e);
  662. return;
  663. }
  664. spin_unlock(&im->ino_lock);
  665. }
  666. void f2fs_add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
  667. {
  668. /* add new dirty ino entry into list */
  669. __add_ino_entry(sbi, ino, 0, type);
  670. }
  671. void f2fs_remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
  672. {
  673. /* remove dirty ino entry from list */
  674. __remove_ino_entry(sbi, ino, type);
  675. }
  676. /* mode should be APPEND_INO, UPDATE_INO or TRANS_DIR_INO */
  677. bool f2fs_exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode)
  678. {
  679. struct inode_management *im = &sbi->im[mode];
  680. struct ino_entry *e;
  681. spin_lock(&im->ino_lock);
  682. e = radix_tree_lookup(&im->ino_root, ino);
  683. spin_unlock(&im->ino_lock);
  684. return e ? true : false;
  685. }
  686. void f2fs_release_ino_entry(struct f2fs_sb_info *sbi, bool all)
  687. {
  688. struct ino_entry *e, *tmp;
  689. int i;
  690. for (i = all ? ORPHAN_INO : APPEND_INO; i < MAX_INO_ENTRY; i++) {
  691. struct inode_management *im = &sbi->im[i];
  692. spin_lock(&im->ino_lock);
  693. list_for_each_entry_safe(e, tmp, &im->ino_list, list) {
  694. list_del(&e->list);
  695. radix_tree_delete(&im->ino_root, e->ino);
  696. kmem_cache_free(ino_entry_slab, e);
  697. im->ino_num--;
  698. }
  699. spin_unlock(&im->ino_lock);
  700. }
  701. }
  702. void f2fs_set_dirty_device(struct f2fs_sb_info *sbi, nid_t ino,
  703. unsigned int devidx, int type)
  704. {
  705. __add_ino_entry(sbi, ino, devidx, type);
  706. }
  707. bool f2fs_is_dirty_device(struct f2fs_sb_info *sbi, nid_t ino,
  708. unsigned int devidx, int type)
  709. {
  710. struct inode_management *im = &sbi->im[type];
  711. struct ino_entry *e;
  712. bool is_dirty = false;
  713. spin_lock(&im->ino_lock);
  714. e = radix_tree_lookup(&im->ino_root, ino);
  715. if (e && f2fs_test_bit(devidx, (char *)&e->dirty_device))
  716. is_dirty = true;
  717. spin_unlock(&im->ino_lock);
  718. return is_dirty;
  719. }
  720. int f2fs_acquire_orphan_inode(struct f2fs_sb_info *sbi)
  721. {
  722. struct inode_management *im = &sbi->im[ORPHAN_INO];
  723. int err = 0;
  724. spin_lock(&im->ino_lock);
  725. if (time_to_inject(sbi, FAULT_ORPHAN)) {
  726. spin_unlock(&im->ino_lock);
  727. return -ENOSPC;
  728. }
  729. if (unlikely(im->ino_num >= sbi->max_orphans))
  730. err = -ENOSPC;
  731. else
  732. im->ino_num++;
  733. spin_unlock(&im->ino_lock);
  734. return err;
  735. }
  736. void f2fs_release_orphan_inode(struct f2fs_sb_info *sbi)
  737. {
  738. struct inode_management *im = &sbi->im[ORPHAN_INO];
  739. spin_lock(&im->ino_lock);
  740. f2fs_bug_on(sbi, im->ino_num == 0);
  741. im->ino_num--;
  742. spin_unlock(&im->ino_lock);
  743. }
  744. void f2fs_add_orphan_inode(struct inode *inode)
  745. {
  746. /* add new orphan ino entry into list */
  747. __add_ino_entry(F2FS_I_SB(inode), inode->i_ino, 0, ORPHAN_INO);
  748. f2fs_update_inode_page(inode);
  749. }
  750. void f2fs_remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
  751. {
  752. /* remove orphan entry from orphan list */
  753. __remove_ino_entry(sbi, ino, ORPHAN_INO);
  754. }
  755. static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
  756. {
  757. struct inode *inode;
  758. struct node_info ni;
  759. int err;
  760. inode = f2fs_iget_retry(sbi->sb, ino);
  761. if (IS_ERR(inode)) {
  762. /*
  763. * there should be a bug that we can't find the entry
  764. * to orphan inode.
  765. */
  766. f2fs_bug_on(sbi, PTR_ERR(inode) == -ENOENT);
  767. return PTR_ERR(inode);
  768. }
  769. err = f2fs_dquot_initialize(inode);
  770. if (err) {
  771. iput(inode);
  772. goto err_out;
  773. }
  774. clear_nlink(inode);
  775. /* truncate all the data during iput */
  776. iput(inode);
  777. err = f2fs_get_node_info(sbi, ino, &ni, false);
  778. if (err)
  779. goto err_out;
  780. /* ENOMEM was fully retried in f2fs_evict_inode. */
  781. if (ni.blk_addr != NULL_ADDR) {
  782. err = -EIO;
  783. goto err_out;
  784. }
  785. return 0;
  786. err_out:
  787. set_sbi_flag(sbi, SBI_NEED_FSCK);
  788. f2fs_warn(sbi, "%s: orphan failed (ino=%x), run fsck to fix.",
  789. __func__, ino);
  790. return err;
  791. }
  792. int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi)
  793. {
  794. block_t start_blk, orphan_blocks, i, j;
  795. int err = 0;
  796. if (!is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG))
  797. return 0;
  798. if (f2fs_hw_is_readonly(sbi)) {
  799. f2fs_info(sbi, "write access unavailable, skipping orphan cleanup");
  800. return 0;
  801. }
  802. if (is_sbi_flag_set(sbi, SBI_IS_WRITABLE))
  803. f2fs_info(sbi, "orphan cleanup on readonly fs");
  804. start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi);
  805. orphan_blocks = __start_sum_addr(sbi) - 1 - __cp_payload(sbi);
  806. f2fs_ra_meta_pages(sbi, start_blk, orphan_blocks, META_CP, true);
  807. for (i = 0; i < orphan_blocks; i++) {
  808. struct folio *folio;
  809. struct f2fs_orphan_block *orphan_blk;
  810. folio = f2fs_get_meta_folio(sbi, start_blk + i);
  811. if (IS_ERR(folio)) {
  812. err = PTR_ERR(folio);
  813. goto out;
  814. }
  815. orphan_blk = folio_address(folio);
  816. for (j = 0; j < le32_to_cpu(orphan_blk->entry_count); j++) {
  817. nid_t ino = le32_to_cpu(orphan_blk->ino[j]);
  818. err = recover_orphan_inode(sbi, ino);
  819. if (err) {
  820. f2fs_folio_put(folio, true);
  821. goto out;
  822. }
  823. }
  824. f2fs_folio_put(folio, true);
  825. }
  826. /* clear Orphan Flag */
  827. clear_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG);
  828. out:
  829. set_sbi_flag(sbi, SBI_IS_RECOVERED);
  830. return err;
  831. }
  832. static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
  833. {
  834. struct list_head *head;
  835. struct f2fs_orphan_block *orphan_blk = NULL;
  836. unsigned int nentries = 0;
  837. unsigned short index = 1;
  838. unsigned short orphan_blocks;
  839. struct folio *folio = NULL;
  840. struct ino_entry *orphan = NULL;
  841. struct inode_management *im = &sbi->im[ORPHAN_INO];
  842. orphan_blocks = GET_ORPHAN_BLOCKS(im->ino_num);
  843. /*
  844. * we don't need to do spin_lock(&im->ino_lock) here, since all the
  845. * orphan inode operations are covered under f2fs_lock_op().
  846. * And, spin_lock should be avoided due to page operations below.
  847. */
  848. head = &im->ino_list;
  849. /* loop for each orphan inode entry and write them in journal block */
  850. list_for_each_entry(orphan, head, list) {
  851. if (!folio) {
  852. folio = f2fs_grab_meta_folio(sbi, start_blk++);
  853. orphan_blk = folio_address(folio);
  854. memset(orphan_blk, 0, sizeof(*orphan_blk));
  855. }
  856. orphan_blk->ino[nentries++] = cpu_to_le32(orphan->ino);
  857. if (nentries == F2FS_ORPHANS_PER_BLOCK) {
  858. /*
  859. * an orphan block is full of 1020 entries,
  860. * then we need to flush current orphan blocks
  861. * and bring another one in memory
  862. */
  863. orphan_blk->blk_addr = cpu_to_le16(index);
  864. orphan_blk->blk_count = cpu_to_le16(orphan_blocks);
  865. orphan_blk->entry_count = cpu_to_le32(nentries);
  866. folio_mark_dirty(folio);
  867. f2fs_folio_put(folio, true);
  868. index++;
  869. nentries = 0;
  870. folio = NULL;
  871. }
  872. }
  873. if (folio) {
  874. orphan_blk->blk_addr = cpu_to_le16(index);
  875. orphan_blk->blk_count = cpu_to_le16(orphan_blocks);
  876. orphan_blk->entry_count = cpu_to_le32(nentries);
  877. folio_mark_dirty(folio);
  878. f2fs_folio_put(folio, true);
  879. }
  880. }
  881. static __u32 f2fs_checkpoint_chksum(struct f2fs_checkpoint *ckpt)
  882. {
  883. unsigned int chksum_ofs = le32_to_cpu(ckpt->checksum_offset);
  884. __u32 chksum;
  885. chksum = f2fs_crc32(ckpt, chksum_ofs);
  886. if (chksum_ofs < CP_CHKSUM_OFFSET) {
  887. chksum_ofs += sizeof(chksum);
  888. chksum = f2fs_chksum(chksum, (__u8 *)ckpt + chksum_ofs,
  889. F2FS_BLKSIZE - chksum_ofs);
  890. }
  891. return chksum;
  892. }
  893. static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr,
  894. struct f2fs_checkpoint **cp_block, struct folio **cp_folio,
  895. unsigned long long *version)
  896. {
  897. size_t crc_offset = 0;
  898. __u32 crc;
  899. *cp_folio = f2fs_get_meta_folio(sbi, cp_addr);
  900. if (IS_ERR(*cp_folio))
  901. return PTR_ERR(*cp_folio);
  902. *cp_block = folio_address(*cp_folio);
  903. crc_offset = le32_to_cpu((*cp_block)->checksum_offset);
  904. if (crc_offset < CP_MIN_CHKSUM_OFFSET ||
  905. crc_offset > CP_CHKSUM_OFFSET) {
  906. f2fs_folio_put(*cp_folio, true);
  907. f2fs_warn(sbi, "invalid crc_offset: %zu", crc_offset);
  908. return -EINVAL;
  909. }
  910. crc = f2fs_checkpoint_chksum(*cp_block);
  911. if (crc != cur_cp_crc(*cp_block)) {
  912. f2fs_folio_put(*cp_folio, true);
  913. f2fs_warn(sbi, "invalid crc value");
  914. return -EINVAL;
  915. }
  916. *version = cur_cp_version(*cp_block);
  917. return 0;
  918. }
  919. static struct folio *validate_checkpoint(struct f2fs_sb_info *sbi,
  920. block_t cp_addr, unsigned long long *version)
  921. {
  922. struct folio *cp_folio_1 = NULL, *cp_folio_2 = NULL;
  923. struct f2fs_checkpoint *cp_block = NULL;
  924. unsigned long long cur_version = 0, pre_version = 0;
  925. unsigned int cp_blocks;
  926. int err;
  927. err = get_checkpoint_version(sbi, cp_addr, &cp_block,
  928. &cp_folio_1, version);
  929. if (err)
  930. return NULL;
  931. cp_blocks = le32_to_cpu(cp_block->cp_pack_total_block_count);
  932. if (cp_blocks > BLKS_PER_SEG(sbi) || cp_blocks <= F2FS_CP_PACKS) {
  933. f2fs_warn(sbi, "invalid cp_pack_total_block_count:%u",
  934. le32_to_cpu(cp_block->cp_pack_total_block_count));
  935. goto invalid_cp;
  936. }
  937. pre_version = *version;
  938. cp_addr += cp_blocks - 1;
  939. err = get_checkpoint_version(sbi, cp_addr, &cp_block,
  940. &cp_folio_2, version);
  941. if (err)
  942. goto invalid_cp;
  943. cur_version = *version;
  944. if (cur_version == pre_version) {
  945. *version = cur_version;
  946. f2fs_folio_put(cp_folio_2, true);
  947. return cp_folio_1;
  948. }
  949. f2fs_folio_put(cp_folio_2, true);
  950. invalid_cp:
  951. f2fs_folio_put(cp_folio_1, true);
  952. return NULL;
  953. }
  954. int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi)
  955. {
  956. struct f2fs_checkpoint *cp_block;
  957. struct f2fs_super_block *fsb = sbi->raw_super;
  958. struct folio *cp1, *cp2, *cur_folio;
  959. unsigned long blk_size = sbi->blocksize;
  960. unsigned long long cp1_version = 0, cp2_version = 0;
  961. unsigned long long cp_start_blk_no;
  962. unsigned int cp_blks = 1 + __cp_payload(sbi);
  963. block_t cp_blk_no;
  964. int i;
  965. int err;
  966. sbi->ckpt = f2fs_kvzalloc(sbi, array_size(blk_size, cp_blks),
  967. GFP_KERNEL);
  968. if (!sbi->ckpt)
  969. return -ENOMEM;
  970. /*
  971. * Finding out valid cp block involves read both
  972. * sets( cp pack 1 and cp pack 2)
  973. */
  974. cp_start_blk_no = le32_to_cpu(fsb->cp_blkaddr);
  975. cp1 = validate_checkpoint(sbi, cp_start_blk_no, &cp1_version);
  976. /* The second checkpoint pack should start at the next segment */
  977. cp_start_blk_no += ((unsigned long long)1) <<
  978. le32_to_cpu(fsb->log_blocks_per_seg);
  979. cp2 = validate_checkpoint(sbi, cp_start_blk_no, &cp2_version);
  980. if (cp1 && cp2) {
  981. if (ver_after(cp2_version, cp1_version))
  982. cur_folio = cp2;
  983. else
  984. cur_folio = cp1;
  985. } else if (cp1) {
  986. cur_folio = cp1;
  987. } else if (cp2) {
  988. cur_folio = cp2;
  989. } else {
  990. err = -EFSCORRUPTED;
  991. goto fail_no_cp;
  992. }
  993. cp_block = folio_address(cur_folio);
  994. memcpy(sbi->ckpt, cp_block, blk_size);
  995. if (cur_folio == cp1)
  996. sbi->cur_cp_pack = 1;
  997. else
  998. sbi->cur_cp_pack = 2;
  999. /* Sanity checking of checkpoint */
  1000. if (f2fs_sanity_check_ckpt(sbi)) {
  1001. err = -EFSCORRUPTED;
  1002. goto free_fail_no_cp;
  1003. }
  1004. if (cp_blks <= 1)
  1005. goto done;
  1006. cp_blk_no = le32_to_cpu(fsb->cp_blkaddr);
  1007. if (cur_folio == cp2)
  1008. cp_blk_no += BIT(le32_to_cpu(fsb->log_blocks_per_seg));
  1009. for (i = 1; i < cp_blks; i++) {
  1010. void *sit_bitmap_ptr;
  1011. unsigned char *ckpt = (unsigned char *)sbi->ckpt;
  1012. cur_folio = f2fs_get_meta_folio(sbi, cp_blk_no + i);
  1013. if (IS_ERR(cur_folio)) {
  1014. err = PTR_ERR(cur_folio);
  1015. goto free_fail_no_cp;
  1016. }
  1017. sit_bitmap_ptr = folio_address(cur_folio);
  1018. memcpy(ckpt + i * blk_size, sit_bitmap_ptr, blk_size);
  1019. f2fs_folio_put(cur_folio, true);
  1020. }
  1021. done:
  1022. f2fs_folio_put(cp1, true);
  1023. f2fs_folio_put(cp2, true);
  1024. return 0;
  1025. free_fail_no_cp:
  1026. f2fs_folio_put(cp1, true);
  1027. f2fs_folio_put(cp2, true);
  1028. fail_no_cp:
  1029. kvfree(sbi->ckpt);
  1030. return err;
  1031. }
  1032. static void __add_dirty_inode(struct inode *inode, enum inode_type type)
  1033. {
  1034. struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
  1035. int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE;
  1036. if (is_inode_flag_set(inode, flag))
  1037. return;
  1038. set_inode_flag(inode, flag);
  1039. list_add_tail(&F2FS_I(inode)->dirty_list, &sbi->inode_list[type]);
  1040. stat_inc_dirty_inode(sbi, type);
  1041. }
  1042. static void __remove_dirty_inode(struct inode *inode, enum inode_type type)
  1043. {
  1044. int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE;
  1045. if (get_dirty_pages(inode) || !is_inode_flag_set(inode, flag))
  1046. return;
  1047. list_del_init(&F2FS_I(inode)->dirty_list);
  1048. clear_inode_flag(inode, flag);
  1049. stat_dec_dirty_inode(F2FS_I_SB(inode), type);
  1050. }
  1051. void f2fs_update_dirty_folio(struct inode *inode, struct folio *folio)
  1052. {
  1053. struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
  1054. enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE;
  1055. if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) &&
  1056. !S_ISLNK(inode->i_mode))
  1057. return;
  1058. spin_lock(&sbi->inode_lock[type]);
  1059. if (type != FILE_INODE || test_opt(sbi, DATA_FLUSH))
  1060. __add_dirty_inode(inode, type);
  1061. inode_inc_dirty_pages(inode);
  1062. spin_unlock(&sbi->inode_lock[type]);
  1063. folio_set_f2fs_reference(folio);
  1064. }
  1065. void f2fs_remove_dirty_inode(struct inode *inode)
  1066. {
  1067. struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
  1068. enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE;
  1069. if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) &&
  1070. !S_ISLNK(inode->i_mode))
  1071. return;
  1072. if (type == FILE_INODE && !test_opt(sbi, DATA_FLUSH))
  1073. return;
  1074. spin_lock(&sbi->inode_lock[type]);
  1075. __remove_dirty_inode(inode, type);
  1076. spin_unlock(&sbi->inode_lock[type]);
  1077. }
  1078. int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type,
  1079. bool from_cp)
  1080. {
  1081. struct list_head *head;
  1082. struct inode *inode;
  1083. struct f2fs_inode_info *fi;
  1084. bool is_dir = (type == DIR_INODE);
  1085. unsigned long ino = 0;
  1086. trace_f2fs_sync_dirty_inodes_enter(sbi->sb, is_dir,
  1087. get_pages(sbi, is_dir ?
  1088. F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA));
  1089. retry:
  1090. if (unlikely(f2fs_cp_error(sbi))) {
  1091. trace_f2fs_sync_dirty_inodes_exit(sbi->sb, is_dir,
  1092. get_pages(sbi, is_dir ?
  1093. F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA));
  1094. return -EIO;
  1095. }
  1096. spin_lock(&sbi->inode_lock[type]);
  1097. head = &sbi->inode_list[type];
  1098. if (list_empty(head)) {
  1099. spin_unlock(&sbi->inode_lock[type]);
  1100. trace_f2fs_sync_dirty_inodes_exit(sbi->sb, is_dir,
  1101. get_pages(sbi, is_dir ?
  1102. F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA));
  1103. return 0;
  1104. }
  1105. fi = list_first_entry(head, struct f2fs_inode_info, dirty_list);
  1106. inode = igrab(&fi->vfs_inode);
  1107. spin_unlock(&sbi->inode_lock[type]);
  1108. if (inode) {
  1109. unsigned long cur_ino = inode->i_ino;
  1110. if (from_cp)
  1111. F2FS_I(inode)->cp_task = current;
  1112. F2FS_I(inode)->wb_task = current;
  1113. filemap_fdatawrite(inode->i_mapping);
  1114. F2FS_I(inode)->wb_task = NULL;
  1115. if (from_cp)
  1116. F2FS_I(inode)->cp_task = NULL;
  1117. iput(inode);
  1118. /* We need to give cpu to another writers. */
  1119. if (ino == cur_ino)
  1120. cond_resched();
  1121. else
  1122. ino = cur_ino;
  1123. } else {
  1124. /*
  1125. * We should submit bio, since it exists several
  1126. * writebacking dentry pages in the freeing inode.
  1127. */
  1128. f2fs_submit_merged_write(sbi, DATA);
  1129. cond_resched();
  1130. }
  1131. goto retry;
  1132. }
  1133. static int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi)
  1134. {
  1135. struct list_head *head = &sbi->inode_list[DIRTY_META];
  1136. struct inode *inode;
  1137. struct f2fs_inode_info *fi;
  1138. s64 total = get_pages(sbi, F2FS_DIRTY_IMETA);
  1139. while (total--) {
  1140. if (unlikely(f2fs_cp_error(sbi)))
  1141. return -EIO;
  1142. spin_lock(&sbi->inode_lock[DIRTY_META]);
  1143. if (list_empty(head)) {
  1144. spin_unlock(&sbi->inode_lock[DIRTY_META]);
  1145. return 0;
  1146. }
  1147. fi = list_first_entry(head, struct f2fs_inode_info,
  1148. gdirty_list);
  1149. inode = igrab(&fi->vfs_inode);
  1150. spin_unlock(&sbi->inode_lock[DIRTY_META]);
  1151. if (inode) {
  1152. sync_inode_metadata(inode, 0);
  1153. /* it's on eviction */
  1154. if (is_inode_flag_set(inode, FI_DIRTY_INODE))
  1155. f2fs_update_inode_page(inode);
  1156. iput(inode);
  1157. }
  1158. }
  1159. return 0;
  1160. }
  1161. static void __prepare_cp_block(struct f2fs_sb_info *sbi)
  1162. {
  1163. struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
  1164. struct f2fs_nm_info *nm_i = NM_I(sbi);
  1165. nid_t last_nid = nm_i->next_scan_nid;
  1166. next_free_nid(sbi, &last_nid);
  1167. ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi));
  1168. ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi));
  1169. ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi));
  1170. ckpt->next_free_nid = cpu_to_le32(last_nid);
  1171. /* update user_block_counts */
  1172. sbi->last_valid_block_count = sbi->total_valid_block_count;
  1173. percpu_counter_set(&sbi->alloc_valid_block_count, 0);
  1174. percpu_counter_set(&sbi->rf_node_block_count, 0);
  1175. }
  1176. static bool __need_flush_quota(struct f2fs_sb_info *sbi)
  1177. {
  1178. bool ret = false;
  1179. if (!is_journalled_quota(sbi))
  1180. return false;
  1181. if (!f2fs_down_write_trylock(&sbi->quota_sem))
  1182. return true;
  1183. if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH)) {
  1184. ret = false;
  1185. } else if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_REPAIR)) {
  1186. ret = false;
  1187. } else if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_FLUSH)) {
  1188. clear_sbi_flag(sbi, SBI_QUOTA_NEED_FLUSH);
  1189. ret = true;
  1190. } else if (get_pages(sbi, F2FS_DIRTY_QDATA)) {
  1191. ret = true;
  1192. }
  1193. f2fs_up_write(&sbi->quota_sem);
  1194. return ret;
  1195. }
/*
 * Freeze all the FS-operations for checkpoint.
 *
 * Repeatedly takes f2fs_lock_all() and checks, in order, for pending quota
 * data, dirty dentry pages, dirty inode metadata and dirty node pages.
 * Whenever one of the first three is found, the locks taken so far are
 * dropped, the data is flushed, and the whole sequence restarts from
 * retry_flush_quotas.  Dirty node pages are flushed in their own loop
 * (retry_flush_nodes) with node_change still held.
 *
 * On success (return 0) the function returns with f2fs_lock_all() and
 * sbi->node_write held and sbi->node_change released; unblock_operations()
 * in this file releases exactly those two.  On every error return all locks
 * taken here have already been released.
 */
static int block_operations(struct f2fs_sb_info *sbi)
{
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_ALL,
		.nr_to_write = LONG_MAX,
	};
	int err = 0, cnt = 0;

	/*
	 * Let's flush inline_data in dirty node pages.
	 */
	f2fs_flush_inline_data(sbi);

retry_flush_quotas:
	f2fs_lock_all(sbi);
	if (__need_flush_quota(sbi)) {
		bool need_lock = sbi->umount_lock_holder != current;

		/*
		 * Too many quota flush attempts: stop retrying, remember the
		 * skipped flush in the sbi flags and carry on with the lock
		 * still held.
		 */
		if (++cnt > DEFAULT_RETRY_QUOTA_FLUSH_COUNT) {
			set_sbi_flag(sbi, SBI_QUOTA_SKIP_FLUSH);
			set_sbi_flag(sbi, SBI_QUOTA_NEED_FLUSH);
			goto retry_flush_dents;
		}
		f2fs_unlock_all(sbi);

		/* don't grab s_umount lock during mount/umount/remount/freeze/quotactl */
		if (!need_lock) {
			f2fs_do_quota_sync(sbi->sb, -1);
		} else if (down_read_trylock(&sbi->sb->s_umount)) {
			f2fs_do_quota_sync(sbi->sb, -1);
			up_read(&sbi->sb->s_umount);
		}
		cond_resched();
		goto retry_flush_quotas;
	}

retry_flush_dents:
	/* write all the dirty dentry pages */
	if (get_pages(sbi, F2FS_DIRTY_DENTS)) {
		f2fs_unlock_all(sbi);
		err = f2fs_sync_dirty_inodes(sbi, DIR_INODE, true);
		if (err)
			return err;
		cond_resched();
		goto retry_flush_quotas;
	}

	/*
	 * POR: we should ensure that there are no dirty node pages
	 * until finishing nat/sit flush. inode->i_blocks can be updated.
	 */
	f2fs_down_write(&sbi->node_change);

	if (get_pages(sbi, F2FS_DIRTY_IMETA)) {
		f2fs_up_write(&sbi->node_change);
		f2fs_unlock_all(sbi);
		err = f2fs_sync_inode_meta(sbi);
		if (err)
			return err;
		cond_resched();
		goto retry_flush_quotas;
	}

retry_flush_nodes:
	f2fs_down_write(&sbi->node_write);

	if (get_pages(sbi, F2FS_DIRTY_NODES)) {
		/* node_write is dropped while the node pages are synced */
		f2fs_up_write(&sbi->node_write);
		atomic_inc(&sbi->wb_sync_req[NODE]);
		err = f2fs_sync_node_pages(sbi, &wbc, false, FS_CP_NODE_IO);
		atomic_dec(&sbi->wb_sync_req[NODE]);
		if (err) {
			f2fs_up_write(&sbi->node_change);
			f2fs_unlock_all(sbi);
			return err;
		}
		cond_resched();
		goto retry_flush_nodes;
	}

	/*
	 * sbi->node_change is used only for AIO write_begin path which produces
	 * dirty node blocks and some checkpoint values by block allocation.
	 */
	__prepare_cp_block(sbi);
	f2fs_up_write(&sbi->node_change);
	return err;
}
  1277. static void unblock_operations(struct f2fs_sb_info *sbi)
  1278. {
  1279. f2fs_up_write(&sbi->node_write);
  1280. f2fs_unlock_all(sbi);
  1281. }
  1282. void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type)
  1283. {
  1284. DEFINE_WAIT(wait);
  1285. for (;;) {
  1286. if (!get_pages(sbi, type))
  1287. break;
  1288. if (unlikely(f2fs_cp_error(sbi) &&
  1289. !is_sbi_flag_set(sbi, SBI_IS_CLOSE)))
  1290. break;
  1291. if (type == F2FS_DIRTY_META)
  1292. f2fs_sync_meta_pages(sbi, LONG_MAX, FS_CP_META_IO);
  1293. else if (type == F2FS_WB_CP_DATA)
  1294. f2fs_submit_merged_write(sbi, DATA);
  1295. prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE);
  1296. io_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT);
  1297. }
  1298. finish_wait(&sbi->cp_wait, &wait);
  1299. }
  1300. static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc)
  1301. {
  1302. unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num;
  1303. struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
  1304. unsigned long flags;
  1305. spin_lock_irqsave(&sbi->cp_lock, flags);
  1306. if ((cpc->reason & CP_UMOUNT) &&
  1307. le32_to_cpu(ckpt->cp_pack_total_block_count) >
  1308. sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks)
  1309. disable_nat_bits(sbi, false);
  1310. if (cpc->reason & CP_TRIMMED)
  1311. __set_ckpt_flags(ckpt, CP_TRIMMED_FLAG);
  1312. else
  1313. __clear_ckpt_flags(ckpt, CP_TRIMMED_FLAG);
  1314. if (cpc->reason & CP_UMOUNT)
  1315. __set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
  1316. else
  1317. __clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
  1318. if (cpc->reason & CP_FASTBOOT)
  1319. __set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG);
  1320. else
  1321. __clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG);
  1322. if (orphan_num)
  1323. __set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
  1324. else
  1325. __clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
  1326. if (is_sbi_flag_set(sbi, SBI_NEED_FSCK))
  1327. __set_ckpt_flags(ckpt, CP_FSCK_FLAG);
  1328. if (is_sbi_flag_set(sbi, SBI_IS_RESIZEFS))
  1329. __set_ckpt_flags(ckpt, CP_RESIZEFS_FLAG);
  1330. else
  1331. __clear_ckpt_flags(ckpt, CP_RESIZEFS_FLAG);
  1332. if (is_sbi_flag_set(sbi, SBI_CP_DISABLED))
  1333. __set_ckpt_flags(ckpt, CP_DISABLED_FLAG);
  1334. else
  1335. __clear_ckpt_flags(ckpt, CP_DISABLED_FLAG);
  1336. if (is_sbi_flag_set(sbi, SBI_CP_DISABLED_QUICK))
  1337. __set_ckpt_flags(ckpt, CP_DISABLED_QUICK_FLAG);
  1338. else
  1339. __clear_ckpt_flags(ckpt, CP_DISABLED_QUICK_FLAG);
  1340. if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH))
  1341. __set_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG);
  1342. else
  1343. __clear_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG);
  1344. if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_REPAIR))
  1345. __set_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG);
  1346. /* set this flag to activate crc|cp_ver for recovery */
  1347. __set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG);
  1348. __clear_ckpt_flags(ckpt, CP_NOCRC_RECOVERY_FLAG);
  1349. spin_unlock_irqrestore(&sbi->cp_lock, flags);
  1350. }
  1351. static void commit_checkpoint(struct f2fs_sb_info *sbi,
  1352. void *src, block_t blk_addr)
  1353. {
  1354. struct writeback_control wbc = {};
  1355. /*
  1356. * filemap_get_folios_tag and folio_lock again will take
  1357. * some extra time. Therefore, f2fs_update_meta_pages and
  1358. * f2fs_sync_meta_pages are combined in this function.
  1359. */
  1360. struct folio *folio = f2fs_grab_meta_folio(sbi, blk_addr);
  1361. memcpy(folio_address(folio), src, PAGE_SIZE);
  1362. folio_mark_dirty(folio);
  1363. if (unlikely(!folio_clear_dirty_for_io(folio)))
  1364. f2fs_bug_on(sbi, 1);
  1365. /* writeout cp pack 2 page */
  1366. if (unlikely(!__f2fs_write_meta_folio(folio, &wbc, FS_CP_META_IO))) {
  1367. if (f2fs_cp_error(sbi)) {
  1368. f2fs_folio_put(folio, true);
  1369. return;
  1370. }
  1371. f2fs_bug_on(sbi, true);
  1372. }
  1373. f2fs_folio_put(folio, false);
  1374. /* submit checkpoint (with barrier if NOBARRIER is not set) */
  1375. f2fs_submit_merged_write(sbi, META_FLUSH);
  1376. }
  1377. static inline u64 get_sectors_written(struct block_device *bdev)
  1378. {
  1379. return (u64)part_stat_read(bdev, sectors[STAT_WRITE]);
  1380. }
  1381. u64 f2fs_get_sectors_written(struct f2fs_sb_info *sbi)
  1382. {
  1383. if (f2fs_is_multi_device(sbi)) {
  1384. u64 sectors = 0;
  1385. int i;
  1386. for (i = 0; i < sbi->s_ndevs; i++)
  1387. sectors += get_sectors_written(FDEV(i).bdev);
  1388. return sectors;
  1389. }
  1390. return get_sectors_written(sbi->sb->s_bdev);
  1391. }
  1392. static inline void stat_cp_time(struct cp_control *cpc, enum cp_time type)
  1393. {
  1394. cpc->stats.times[type] = ktime_get();
  1395. }
  1396. static inline void check_cp_time(struct f2fs_sb_info *sbi, struct cp_control *cpc)
  1397. {
  1398. unsigned long long sb_diff, cur_diff;
  1399. enum cp_time ct;
  1400. sb_diff = (u64)ktime_ms_delta(sbi->cp_stats.times[CP_TIME_END],
  1401. sbi->cp_stats.times[CP_TIME_START]);
  1402. cur_diff = (u64)ktime_ms_delta(cpc->stats.times[CP_TIME_END],
  1403. cpc->stats.times[CP_TIME_START]);
  1404. if (cur_diff > sb_diff) {
  1405. sbi->cp_stats = cpc->stats;
  1406. if (cur_diff < CP_LONG_LATENCY_THRESHOLD)
  1407. return;
  1408. f2fs_warn(sbi, "checkpoint was blocked for %llu ms", cur_diff);
  1409. for (ct = CP_TIME_START; ct < CP_TIME_MAX - 1; ct++)
  1410. f2fs_warn(sbi, "Step#%d: %llu ms", ct,
  1411. (u64)ktime_ms_delta(cpc->stats.times[ct + 1],
  1412. cpc->stats.times[ct]));
  1413. }
  1414. }
/*
 * do_checkpoint - write one checkpoint pack to disk.
 * @sbi: filesystem instance
 * @cpc: checkpoint control; cpc->reason selects optional parts and the
 *       timing slots in cpc->stats are stamped along the way
 *
 * Sequence, as implemented below:
 *  1. flush dirty meta pages;
 *  2. fill the in-memory checkpoint block (current segments, block counts,
 *     flags, SIT/NAT bitmaps) and store its checksum;
 *  3. queue nat bits (when enabled), the checkpoint block with its payload,
 *     the orphan blocks, the data summaries and, depending on the reason,
 *     the node summaries;
 *  4. sync and wait for all of that, then flush the device cache;
 *  5. write the closing checkpoint block through commit_checkpoint() and
 *     wait for it;
 *  6. reset per-checkpoint state and switch to the next pack.
 *
 * Returns the error from f2fs_flush_device_cache() if it fails, -EIO if a
 * checkpoint error is pending at the end, and 0 otherwise.
 */
static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
	struct f2fs_nm_info *nm_i = NM_I(sbi);
	unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num, flags;
	block_t start_blk;
	unsigned int data_sum_blocks, orphan_blocks;
	__u32 crc32 = 0;
	int i;
	int cp_payload_blks = __cp_payload(sbi);
	struct curseg_info *seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE);
	u64 kbytes_written;
	int err;

	/* Flush all the NAT/SIT pages */
	f2fs_sync_meta_pages(sbi, LONG_MAX, FS_CP_META_IO);

	stat_cp_time(cpc, CP_TIME_SYNC_META);

	/* start to update checkpoint, cp ver is already updated previously */
	ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi, true));
	ckpt->free_segment_count = cpu_to_le32(free_segments(sbi));
	/* record segment number, next block offset and alloc type per log */
	for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) {
		struct curseg_info *curseg = CURSEG_I(sbi, i + CURSEG_HOT_NODE);

		ckpt->cur_node_segno[i] = cpu_to_le32(curseg->segno);
		ckpt->cur_node_blkoff[i] = cpu_to_le16(curseg->next_blkoff);
		ckpt->alloc_type[i + CURSEG_HOT_NODE] = curseg->alloc_type;
	}
	for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) {
		struct curseg_info *curseg = CURSEG_I(sbi, i + CURSEG_HOT_DATA);

		ckpt->cur_data_segno[i] = cpu_to_le32(curseg->segno);
		ckpt->cur_data_blkoff[i] = cpu_to_le16(curseg->next_blkoff);
		ckpt->alloc_type[i + CURSEG_HOT_DATA] = curseg->alloc_type;
	}

	/* 2 cp + n data seg summary + orphan inode blocks */
	data_sum_blocks = f2fs_npages_for_summary_flush(sbi, false);
	spin_lock_irqsave(&sbi->cp_lock, flags);
	if (data_sum_blocks < NR_CURSEG_DATA_TYPE)
		__set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
	else
		__clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
	spin_unlock_irqrestore(&sbi->cp_lock, flags);

	orphan_blocks = GET_ORPHAN_BLOCKS(orphan_num);
	ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks +
			orphan_blocks);

	/* node summaries add NR_CURSEG_NODE_TYPE blocks to the pack */
	if (__remain_node_summaries(cpc->reason))
		ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS +
				cp_payload_blks + data_sum_blocks +
				orphan_blocks + NR_CURSEG_NODE_TYPE);
	else
		ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS +
				cp_payload_blks + data_sum_blocks +
				orphan_blocks);

	/* update ckpt flag for checkpoint */
	update_ckpt_flags(sbi, cpc);

	/* update SIT/NAT bitmap */
	get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP));
	get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP));

	/* store the checksum at checksum_offset inside the block itself */
	crc32 = f2fs_checkpoint_chksum(ckpt);
	*((__le32 *)((unsigned char *)ckpt +
				le32_to_cpu(ckpt->checksum_offset)))
				= cpu_to_le32(crc32);

	start_blk = __start_cp_next_addr(sbi);

	/* write nat bits */
	if (enabled_nat_bits(sbi, cpc)) {
		__u64 cp_ver = cur_cp_version(ckpt);
		block_t blk;

		/* the nat bits area starts with cp version | crc32 << 32 */
		cp_ver |= ((__u64)crc32 << 32);
		*(__le64 *)nm_i->nat_bits = cpu_to_le64(cp_ver);

		/* nat bits occupy the last nat_bits_blocks of the segment */
		blk = start_blk + BLKS_PER_SEG(sbi) - nm_i->nat_bits_blocks;
		for (i = 0; i < nm_i->nat_bits_blocks; i++)
			f2fs_update_meta_page(sbi, nm_i->nat_bits +
					F2FS_BLK_TO_BYTES(i), blk + i);
	}

	/* write out checkpoint buffer at block 0 */
	f2fs_update_meta_page(sbi, ckpt, start_blk++);

	/* payload blocks follow the checkpoint block */
	for (i = 1; i < 1 + cp_payload_blks; i++)
		f2fs_update_meta_page(sbi, (char *)ckpt + i * F2FS_BLKSIZE,
							start_blk++);

	if (orphan_num) {
		write_orphan_inodes(sbi, start_blk);
		start_blk += orphan_blocks;
	}

	f2fs_write_data_summaries(sbi, start_blk);
	start_blk += data_sum_blocks;

	/* Record write statistics in the hot node summary */
	kbytes_written = sbi->kbytes_written;
	kbytes_written += (f2fs_get_sectors_written(sbi) -
				sbi->sectors_written_start) >> 1;
	seg_i->journal->info.kbytes_written = cpu_to_le64(kbytes_written);

	if (__remain_node_summaries(cpc->reason)) {
		f2fs_write_node_summaries(sbi, start_blk);
		start_blk += NR_CURSEG_NODE_TYPE;
	}

	/* Here, we have one bio having CP pack except cp pack 2 page */
	f2fs_sync_meta_pages(sbi, LONG_MAX, FS_CP_META_IO);
	stat_cp_time(cpc, CP_TIME_SYNC_CP_META);

	/* Wait for all dirty meta pages to be submitted for IO */
	f2fs_wait_on_all_pages(sbi, F2FS_DIRTY_META);
	stat_cp_time(cpc, CP_TIME_WAIT_DIRTY_META);

	/* wait for previous submitted meta pages writeback */
	f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA);
	stat_cp_time(cpc, CP_TIME_WAIT_CP_DATA);

	/* flush all device cache */
	err = f2fs_flush_device_cache(sbi);
	if (err)
		return err;
	stat_cp_time(cpc, CP_TIME_FLUSH_DEVICE);

	/* barrier and flush checkpoint cp pack 2 page if it can */
	commit_checkpoint(sbi, ckpt, start_blk);
	f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA);
	stat_cp_time(cpc, CP_TIME_WAIT_LAST_CP);

	/*
	 * invalidate intermediate page cache borrowed from meta inode which are
	 * used for migration of encrypted, verity or compressed inode's blocks.
	 */
	if (f2fs_sb_has_encrypt(sbi) || f2fs_sb_has_verity(sbi) ||
		f2fs_sb_has_compression(sbi))
		f2fs_bug_on(sbi,
			invalidate_inode_pages2_range(META_MAPPING(sbi),
				MAIN_BLKADDR(sbi), MAX_BLKADDR(sbi) - 1));

	/* 'false': see f2fs_release_ino_entry() for which sets are released */
	f2fs_release_ino_entry(sbi, false);

	f2fs_reset_fsync_node_info(sbi);

	clear_sbi_flag(sbi, SBI_IS_DIRTY);
	clear_sbi_flag(sbi, SBI_NEED_CP);
	clear_sbi_flag(sbi, SBI_QUOTA_SKIP_FLUSH);

	spin_lock(&sbi->stat_lock);
	sbi->unusable_block_count = 0;
	spin_unlock(&sbi->stat_lock);

	__set_cp_next_pack(sbi);

	/*
	 * redirty superblock if metadata like node page or inode cache is
	 * updated during writing checkpoint.
	 */
	if (get_pages(sbi, F2FS_DIRTY_NODES) ||
			get_pages(sbi, F2FS_DIRTY_IMETA))
		set_sbi_flag(sbi, SBI_IS_DIRTY);

	f2fs_bug_on(sbi, get_pages(sbi, F2FS_DIRTY_DENTS));

	return unlikely(f2fs_cp_error(sbi)) ? -EIO : 0;
}
  1552. int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
  1553. {
  1554. struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
  1555. struct f2fs_lock_context lc;
  1556. unsigned long long ckpt_ver;
  1557. int err = 0;
  1558. stat_cp_time(cpc, CP_TIME_START);
  1559. if (f2fs_readonly(sbi->sb) || f2fs_hw_is_readonly(sbi))
  1560. return -EROFS;
  1561. if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) {
  1562. if (cpc->reason != CP_PAUSE)
  1563. return 0;
  1564. f2fs_warn(sbi, "Start checkpoint disabled!");
  1565. }
  1566. if (cpc->reason != CP_RESIZE)
  1567. f2fs_down_write_trace(&sbi->cp_global_sem, &lc);
  1568. stat_cp_time(cpc, CP_TIME_LOCK);
  1569. if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) &&
  1570. ((cpc->reason & CP_FASTBOOT) || (cpc->reason & CP_SYNC) ||
  1571. ((cpc->reason & CP_DISCARD) && !sbi->discard_blks)))
  1572. goto out;
  1573. if (unlikely(f2fs_cp_error(sbi))) {
  1574. err = -EIO;
  1575. goto out;
  1576. }
  1577. trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, CP_PHASE_START_BLOCK_OPS);
  1578. err = block_operations(sbi);
  1579. if (err)
  1580. goto out;
  1581. stat_cp_time(cpc, CP_TIME_OP_LOCK);
  1582. trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, CP_PHASE_FINISH_BLOCK_OPS);
  1583. f2fs_flush_merged_writes(sbi);
  1584. /* this is the case of multiple fstrims without any changes */
  1585. if (cpc->reason & CP_DISCARD) {
  1586. if (!f2fs_exist_trim_candidates(sbi, cpc)) {
  1587. unblock_operations(sbi);
  1588. goto out;
  1589. }
  1590. if (NM_I(sbi)->nat_cnt[DIRTY_NAT] == 0 &&
  1591. SIT_I(sbi)->dirty_sentries == 0 &&
  1592. prefree_segments(sbi) == 0) {
  1593. f2fs_flush_sit_entries(sbi, cpc);
  1594. f2fs_clear_prefree_segments(sbi, cpc);
  1595. unblock_operations(sbi);
  1596. goto out;
  1597. }
  1598. }
  1599. stat_cp_time(cpc, CP_TIME_MERGE_WRITE);
  1600. /*
  1601. * update checkpoint pack index
  1602. * Increase the version number so that
  1603. * SIT entries and seg summaries are written at correct place
  1604. */
  1605. ckpt_ver = cur_cp_version(ckpt);
  1606. ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver);
  1607. /* write cached NAT/SIT entries to NAT/SIT area */
  1608. err = f2fs_flush_nat_entries(sbi, cpc);
  1609. if (err) {
  1610. f2fs_err(sbi, "f2fs_flush_nat_entries failed err:%d, stop checkpoint", err);
  1611. f2fs_bug_on(sbi, !f2fs_cp_error(sbi));
  1612. goto stop;
  1613. }
  1614. stat_cp_time(cpc, CP_TIME_FLUSH_NAT);
  1615. f2fs_flush_sit_entries(sbi, cpc);
  1616. stat_cp_time(cpc, CP_TIME_FLUSH_SIT);
  1617. /* save inmem log status */
  1618. f2fs_save_inmem_curseg(sbi);
  1619. err = do_checkpoint(sbi, cpc);
  1620. if (err) {
  1621. f2fs_err(sbi, "do_checkpoint failed err:%d, stop checkpoint", err);
  1622. f2fs_bug_on(sbi, !f2fs_cp_error(sbi));
  1623. f2fs_release_discard_addrs(sbi);
  1624. } else {
  1625. f2fs_clear_prefree_segments(sbi, cpc);
  1626. }
  1627. f2fs_restore_inmem_curseg(sbi);
  1628. f2fs_reinit_atgc_curseg(sbi);
  1629. stat_inc_cp_count(sbi);
  1630. stop:
  1631. unblock_operations(sbi);
  1632. stat_cp_time(cpc, CP_TIME_END);
  1633. check_cp_time(sbi, cpc);
  1634. if (cpc->reason & CP_RECOVERY)
  1635. f2fs_notice(sbi, "checkpoint: version = %llx", ckpt_ver);
  1636. /* update CP_TIME to trigger checkpoint periodically */
  1637. f2fs_update_time(sbi, CP_TIME);
  1638. trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, CP_PHASE_FINISH_CHECKPOINT);
  1639. out:
  1640. if (cpc->reason != CP_RESIZE)
  1641. f2fs_up_write_trace(&sbi->cp_global_sem, &lc);
  1642. return err;
  1643. }
  1644. void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi)
  1645. {
  1646. int i;
  1647. for (i = 0; i < MAX_INO_ENTRY; i++) {
  1648. struct inode_management *im = &sbi->im[i];
  1649. INIT_RADIX_TREE(&im->ino_root, GFP_ATOMIC);
  1650. spin_lock_init(&im->ino_lock);
  1651. INIT_LIST_HEAD(&im->ino_list);
  1652. im->ino_num = 0;
  1653. }
  1654. sbi->max_orphans = (BLKS_PER_SEG(sbi) - F2FS_CP_PACKS -
  1655. NR_CURSEG_PERSIST_TYPE - __cp_payload(sbi)) *
  1656. F2FS_ORPHANS_PER_BLOCK;
  1657. }
  1658. int __init f2fs_create_checkpoint_caches(void)
  1659. {
  1660. ino_entry_slab = f2fs_kmem_cache_create("f2fs_ino_entry",
  1661. sizeof(struct ino_entry));
  1662. if (!ino_entry_slab)
  1663. return -ENOMEM;
  1664. f2fs_inode_entry_slab = f2fs_kmem_cache_create("f2fs_inode_entry",
  1665. sizeof(struct inode_entry));
  1666. if (!f2fs_inode_entry_slab) {
  1667. kmem_cache_destroy(ino_entry_slab);
  1668. return -ENOMEM;
  1669. }
  1670. return 0;
  1671. }
  1672. void f2fs_destroy_checkpoint_caches(void)
  1673. {
  1674. kmem_cache_destroy(ino_entry_slab);
  1675. kmem_cache_destroy(f2fs_inode_entry_slab);
  1676. }
  1677. static int __write_checkpoint_sync(struct f2fs_sb_info *sbi)
  1678. {
  1679. struct cp_control cpc = { .reason = CP_SYNC, };
  1680. struct f2fs_lock_context lc;
  1681. int err;
  1682. f2fs_down_write_trace(&sbi->gc_lock, &lc);
  1683. err = f2fs_write_checkpoint(sbi, &cpc);
  1684. f2fs_up_write_trace(&sbi->gc_lock, &lc);
  1685. return err;
  1686. }
  1687. static void __checkpoint_and_complete_reqs(struct f2fs_sb_info *sbi)
  1688. {
  1689. struct ckpt_req_control *cprc = &sbi->cprc_info;
  1690. struct ckpt_req *req, *next;
  1691. struct llist_node *dispatch_list;
  1692. u64 sum_diff = 0, diff, count = 0;
  1693. int ret;
  1694. dispatch_list = llist_del_all(&cprc->issue_list);
  1695. if (!dispatch_list)
  1696. return;
  1697. dispatch_list = llist_reverse_order(dispatch_list);
  1698. ret = __write_checkpoint_sync(sbi);
  1699. atomic_inc(&cprc->issued_ckpt);
  1700. llist_for_each_entry_safe(req, next, dispatch_list, llnode) {
  1701. diff = (u64)ktime_ms_delta(ktime_get(), req->queue_time);
  1702. req->ret = ret;
  1703. req->delta_time = diff;
  1704. complete(&req->wait);
  1705. sum_diff += diff;
  1706. count++;
  1707. }
  1708. atomic_sub(count, &cprc->queued_ckpt);
  1709. atomic_add(count, &cprc->total_ckpt);
  1710. spin_lock(&cprc->stat_lock);
  1711. cprc->cur_time = (unsigned int)div64_u64(sum_diff, count);
  1712. if (cprc->peak_time < cprc->cur_time)
  1713. cprc->peak_time = cprc->cur_time;
  1714. spin_unlock(&cprc->stat_lock);
  1715. }
  1716. static int issue_checkpoint_thread(void *data)
  1717. {
  1718. struct f2fs_sb_info *sbi = data;
  1719. struct ckpt_req_control *cprc = &sbi->cprc_info;
  1720. wait_queue_head_t *q = &cprc->ckpt_wait_queue;
  1721. repeat:
  1722. if (kthread_should_stop())
  1723. return 0;
  1724. if (!llist_empty(&cprc->issue_list))
  1725. __checkpoint_and_complete_reqs(sbi);
  1726. wait_event_interruptible(*q,
  1727. kthread_should_stop() || !llist_empty(&cprc->issue_list));
  1728. goto repeat;
  1729. }
  1730. static void flush_remained_ckpt_reqs(struct f2fs_sb_info *sbi,
  1731. struct ckpt_req *wait_req)
  1732. {
  1733. struct ckpt_req_control *cprc = &sbi->cprc_info;
  1734. if (!llist_empty(&cprc->issue_list)) {
  1735. __checkpoint_and_complete_reqs(sbi);
  1736. } else {
  1737. /* already dispatched by issue_checkpoint_thread */
  1738. if (wait_req)
  1739. wait_for_completion(&wait_req->wait);
  1740. }
  1741. }
  1742. static void init_ckpt_req(struct ckpt_req *req)
  1743. {
  1744. memset(req, 0, sizeof(struct ckpt_req));
  1745. init_completion(&req->wait);
  1746. req->queue_time = ktime_get();
  1747. }
  1748. int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi)
  1749. {
  1750. struct ckpt_req_control *cprc = &sbi->cprc_info;
  1751. struct ckpt_req req;
  1752. struct cp_control cpc;
  1753. cpc.reason = __get_cp_reason(sbi);
  1754. if (!test_opt(sbi, MERGE_CHECKPOINT) || cpc.reason != CP_SYNC ||
  1755. sbi->umount_lock_holder == current) {
  1756. struct f2fs_lock_context lc;
  1757. int ret;
  1758. f2fs_down_write_trace(&sbi->gc_lock, &lc);
  1759. ret = f2fs_write_checkpoint(sbi, &cpc);
  1760. f2fs_up_write_trace(&sbi->gc_lock, &lc);
  1761. return ret;
  1762. }
  1763. if (!cprc->f2fs_issue_ckpt)
  1764. return __write_checkpoint_sync(sbi);
  1765. init_ckpt_req(&req);
  1766. llist_add(&req.llnode, &cprc->issue_list);
  1767. atomic_inc(&cprc->queued_ckpt);
  1768. /*
  1769. * update issue_list before we wake up issue_checkpoint thread,
  1770. * this smp_mb() pairs with another barrier in ___wait_event(),
  1771. * see more details in comments of waitqueue_active().
  1772. */
  1773. smp_mb();
  1774. if (waitqueue_active(&cprc->ckpt_wait_queue))
  1775. wake_up(&cprc->ckpt_wait_queue);
  1776. if (cprc->f2fs_issue_ckpt)
  1777. wait_for_completion(&req.wait);
  1778. else
  1779. flush_remained_ckpt_reqs(sbi, &req);
  1780. if (unlikely(req.delta_time >= CP_LONG_LATENCY_THRESHOLD)) {
  1781. f2fs_warn_ratelimited(sbi,
  1782. "blocked on checkpoint for %u ms", cprc->peak_time);
  1783. dump_stack();
  1784. }
  1785. return req.ret;
  1786. }
  1787. int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi)
  1788. {
  1789. dev_t dev = sbi->sb->s_bdev->bd_dev;
  1790. struct ckpt_req_control *cprc = &sbi->cprc_info;
  1791. if (cprc->f2fs_issue_ckpt)
  1792. return 0;
  1793. cprc->f2fs_issue_ckpt = kthread_run(issue_checkpoint_thread, sbi,
  1794. "f2fs_ckpt-%u:%u", MAJOR(dev), MINOR(dev));
  1795. if (IS_ERR(cprc->f2fs_issue_ckpt)) {
  1796. int err = PTR_ERR(cprc->f2fs_issue_ckpt);
  1797. cprc->f2fs_issue_ckpt = NULL;
  1798. return err;
  1799. }
  1800. set_task_ioprio(cprc->f2fs_issue_ckpt, cprc->ckpt_thread_ioprio);
  1801. set_user_nice(cprc->f2fs_issue_ckpt,
  1802. PRIO_TO_NICE(sbi->critical_task_priority));
  1803. return 0;
  1804. }
  1805. void f2fs_stop_ckpt_thread(struct f2fs_sb_info *sbi)
  1806. {
  1807. struct ckpt_req_control *cprc = &sbi->cprc_info;
  1808. struct task_struct *ckpt_task;
  1809. if (!cprc->f2fs_issue_ckpt)
  1810. return;
  1811. ckpt_task = cprc->f2fs_issue_ckpt;
  1812. cprc->f2fs_issue_ckpt = NULL;
  1813. kthread_stop(ckpt_task);
  1814. f2fs_flush_ckpt_thread(sbi);
  1815. }
  1816. void f2fs_flush_ckpt_thread(struct f2fs_sb_info *sbi)
  1817. {
  1818. struct ckpt_req_control *cprc = &sbi->cprc_info;
  1819. flush_remained_ckpt_reqs(sbi, NULL);
  1820. /* Let's wait for the previous dispatched checkpoint. */
  1821. while (atomic_read(&cprc->queued_ckpt))
  1822. io_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT);
  1823. }
  1824. void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi)
  1825. {
  1826. struct ckpt_req_control *cprc = &sbi->cprc_info;
  1827. atomic_set(&cprc->issued_ckpt, 0);
  1828. atomic_set(&cprc->total_ckpt, 0);
  1829. atomic_set(&cprc->queued_ckpt, 0);
  1830. cprc->ckpt_thread_ioprio = DEFAULT_CHECKPOINT_IOPRIO;
  1831. init_waitqueue_head(&cprc->ckpt_wait_queue);
  1832. init_llist_head(&cprc->issue_list);
  1833. spin_lock_init(&cprc->stat_lock);
  1834. }