buffer.c 83 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863286428652866286728682869287028712872287328742875287628772878287928802881288228832884288528862887288828892890289128922893289428952896289728982899290029012902290329042905290629072908290929102911291229132914291529162917291829192920292129222923292429252926292729282929293029312932293329342935293629372938293929402941294229432944294529462947294829492950295129522953295429552956295729582959296029612962296329642965296629672968296929702971297229732974297529762977297829792980298129822983298429852986298729882989299029912992299329942995299629972998299930003001300230033004300530063007300830093010301130123013301430153016301730183019302030213022302330243025302630273028302930303031303230333034303530363037303830393040304130423043304430453046304730483049305030513052305330543055305630573058305930603061306230633064306530663067306830693070307130723073307430753076307730783079308030813082308330843085308630873088308930903091309230933094309530963097309830993100310131023103310431053106310731083109311031113112311331143115311631173118311931203121312231233124312531263127312831293130313131323133313431353136313731383139314031413142314331443145314631473148314931503151315231533154315531563157315831593160316131623163
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * linux/fs/buffer.c
  4. *
  5. * Copyright (C) 1991, 1992, 2002 Linus Torvalds
  6. */
  7. /*
  8. * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
  9. *
  10. * Removed a lot of unnecessary code and simplified things now that
  11. * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
  12. *
  13. * Speed up hash, lru, and free list operations. Use gfp() for allocating
  14. * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
  15. *
  16. * Added 32k buffer block sizes - these are required by older ARM systems. - RMK
  17. *
  18. * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
  19. */
  20. #include <linux/kernel.h>
  21. #include <linux/sched/signal.h>
  22. #include <linux/syscalls.h>
  23. #include <linux/fs.h>
  24. #include <linux/iomap.h>
  25. #include <linux/mm.h>
  26. #include <linux/percpu.h>
  27. #include <linux/slab.h>
  28. #include <linux/capability.h>
  29. #include <linux/blkdev.h>
  30. #include <linux/blk-crypto.h>
  31. #include <linux/file.h>
  32. #include <linux/quotaops.h>
  33. #include <linux/highmem.h>
  34. #include <linux/export.h>
  35. #include <linux/backing-dev.h>
  36. #include <linux/writeback.h>
  37. #include <linux/hash.h>
  38. #include <linux/suspend.h>
  39. #include <linux/buffer_head.h>
  40. #include <linux/task_io_accounting_ops.h>
  41. #include <linux/bio.h>
  42. #include <linux/cpu.h>
  43. #include <linux/bitops.h>
  44. #include <linux/mpage.h>
  45. #include <linux/bit_spinlock.h>
  46. #include <linux/pagevec.h>
  47. #include <linux/sched/mm.h>
  48. #include <trace/events/block.h>
  49. #include <linux/fscrypt.h>
  50. #include <linux/fsverity.h>
  51. #include <linux/sched/isolation.h>
  52. #include "internal.h"
  /*
   * Forward declarations of file-local helpers.  Their definitions are not
   * part of this section of the file.
   */
  53. static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
  54. static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
  55. enum rw_hint hint, struct writeback_control *wbc);
  /*
   * Convert a list node that is embedded in a buffer_head as its
   * b_assoc_buffers member back into a pointer to that buffer_head.
   */
  56. #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
  57. inline void touch_buffer(struct buffer_head *bh)
  58. {
  59. trace_block_touch_buffer(bh);
  60. folio_mark_accessed(bh->b_folio);
  61. }
  62. EXPORT_SYMBOL(touch_buffer);
  63. void __lock_buffer(struct buffer_head *bh)
  64. {
  65. wait_on_bit_lock_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
  66. }
  67. EXPORT_SYMBOL(__lock_buffer);
  68. void unlock_buffer(struct buffer_head *bh)
  69. {
  70. clear_bit_unlock(BH_Lock, &bh->b_state);
  71. smp_mb__after_atomic();
  72. wake_up_bit(&bh->b_state, BH_Lock);
  73. }
  74. EXPORT_SYMBOL(unlock_buffer);
  75. /*
  76. * Returns if the folio has dirty or writeback buffers. If all the buffers
  77. * are unlocked and clean then the folio_test_dirty information is stale. If
  78. * any of the buffers are locked, it is assumed they are locked for IO.
  79. */
  80. void buffer_check_dirty_writeback(struct folio *folio,
  81. bool *dirty, bool *writeback)
  82. {
  83. struct buffer_head *head, *bh;
  84. *dirty = false;
  85. *writeback = false;
  86. BUG_ON(!folio_test_locked(folio));
  87. head = folio_buffers(folio);
  88. if (!head)
  89. return;
  90. if (folio_test_writeback(folio))
  91. *writeback = true;
  92. bh = head;
  93. do {
  94. if (buffer_locked(bh))
  95. *writeback = true;
  96. if (buffer_dirty(bh))
  97. *dirty = true;
  98. bh = bh->b_this_page;
  99. } while (bh != head);
  100. }
  101. /*
  102. * Block until a buffer comes unlocked. This doesn't stop it
  103. * from becoming locked again - you have to lock it yourself
  104. * if you want to preserve its state.
  105. */
  106. void __wait_on_buffer(struct buffer_head * bh)
  107. {
  108. wait_on_bit_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
  109. }
  110. EXPORT_SYMBOL(__wait_on_buffer);
  111. static void buffer_io_error(struct buffer_head *bh, char *msg)
  112. {
  113. if (!test_bit(BH_Quiet, &bh->b_state))
  114. printk_ratelimited(KERN_ERR
  115. "Buffer I/O error on dev %pg, logical block %llu%s\n",
  116. bh->b_bdev, (unsigned long long)bh->b_blocknr, msg);
  117. }
  /*
   * Read completion helper that does not touch the bh once it has been
   * unlocked.
   *
   * Strictly speaking unlock_buffer() still uses the bh after the unlock, but
   * only its address (for hashing) and never its contents, so that race is
   * benign.
   */
  static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
  {
      if (!uptodate) {
          /* This happens, due to failed read-ahead attempts. */
          clear_buffer_uptodate(bh);
      } else {
          set_buffer_uptodate(bh);
      }

      unlock_buffer(bh);
  }
  /*
   * Default end-of-I/O handler for synchronous reads: record whether the
   * buffer is up to date and unlock it.
   */
  void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
  {
      /*
       * put_bh() comes first so that the unlock performed by the helper is
       * the last thing this handler does with the bh.
       */
      put_bh(bh);
      __end_buffer_read_notouch(bh, uptodate);
  }
  EXPORT_SYMBOL(end_buffer_read_sync);
  /*
   * End-of-I/O handler for synchronous writes.
   *
   * On success the buffer is marked up to date.  On failure the error is
   * logged, recorded through mark_buffer_write_io_error() and the up-to-date
   * flag is cleared.  In both cases the buffer is then unlocked and put_bh()
   * is called on it.
   */
  void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
  {
      if (!uptodate) {
          buffer_io_error(bh, ", lost sync page write");
          mark_buffer_write_io_error(bh);
          clear_buffer_uptodate(bh);
      } else {
          set_buffer_uptodate(bh);
      }

      unlock_buffer(bh);
      put_bh(bh);
  }
  EXPORT_SYMBOL(end_buffer_write_sync);
  159. static struct buffer_head *
  160. __find_get_block_slow(struct block_device *bdev, sector_t block, bool atomic)
  161. {
  162. struct address_space *bd_mapping = bdev->bd_mapping;
  163. const int blkbits = bd_mapping->host->i_blkbits;
  164. struct buffer_head *ret = NULL;
  165. pgoff_t index;
  166. struct buffer_head *bh;
  167. struct buffer_head *head;
  168. struct folio *folio;
  169. int all_mapped = 1;
  170. static DEFINE_RATELIMIT_STATE(last_warned, HZ, 1);
  171. index = ((loff_t)block << blkbits) / PAGE_SIZE;
  172. folio = __filemap_get_folio(bd_mapping, index, FGP_ACCESSED, 0);
  173. if (IS_ERR(folio))
  174. goto out;
  175. /*
  176. * Folio lock protects the buffers. Callers that cannot block
  177. * will fallback to serializing vs try_to_free_buffers() via
  178. * the i_private_lock.
  179. */
  180. if (atomic)
  181. spin_lock(&bd_mapping->i_private_lock);
  182. else
  183. folio_lock(folio);
  184. head = folio_buffers(folio);
  185. if (!head)
  186. goto out_unlock;
  187. /*
  188. * Upon a noref migration, the folio lock serializes here;
  189. * otherwise bail.
  190. */
  191. if (test_bit_acquire(BH_Migrate, &head->b_state)) {
  192. WARN_ON(!atomic);
  193. goto out_unlock;
  194. }
  195. bh = head;
  196. do {
  197. if (!buffer_mapped(bh))
  198. all_mapped = 0;
  199. else if (bh->b_blocknr == block) {
  200. ret = bh;
  201. get_bh(bh);
  202. goto out_unlock;
  203. }
  204. bh = bh->b_this_page;
  205. } while (bh != head);
  206. /* we might be here because some of the buffers on this page are
  207. * not mapped. This is due to various races between
  208. * file io on the block device and getblk. It gets dealt with
  209. * elsewhere, don't buffer_error if we had some unmapped buffers
  210. */
  211. ratelimit_set_flags(&last_warned, RATELIMIT_MSG_ON_RELEASE);
  212. if (all_mapped && __ratelimit(&last_warned)) {
  213. printk("__find_get_block_slow() failed. block=%llu, "
  214. "b_blocknr=%llu, b_state=0x%08lx, b_size=%zu, "
  215. "device %pg blocksize: %d\n",
  216. (unsigned long long)block,
  217. (unsigned long long)bh->b_blocknr,
  218. bh->b_state, bh->b_size, bdev,
  219. 1 << blkbits);
  220. }
  221. out_unlock:
  222. if (atomic)
  223. spin_unlock(&bd_mapping->i_private_lock);
  224. else
  225. folio_unlock(folio);
  226. folio_put(folio);
  227. out:
  228. return ret;
  229. }
  /*
   * Per-buffer completion of an async folio read.
   *
   * Records the outcome on @bh, clears its async-read flag and unlocks it.
   * If no buffer of the folio is flagged async-read any more, the folio read
   * is ended, reporting whether every buffer was found up to date.
   *
   * @bh:       buffer whose read has finished; its async-read flag must be set
   * @uptodate: non-zero if the read succeeded
   */
  230. static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
  231. {
  232. unsigned long flags;
  233. struct buffer_head *first;
  234. struct buffer_head *tmp;
  235. struct folio *folio;
  236. int folio_uptodate = 1;
  237. BUG_ON(!buffer_async_read(bh));
  238. folio = bh->b_folio;
  239. if (uptodate) {
  240. set_buffer_uptodate(bh);
  241. } else {
  242. clear_buffer_uptodate(bh);
  243. buffer_io_error(bh, ", async page read");
  244. }
  245. /*
  246. * Be _very_ careful from here on. Bad things can happen if
  247. * two buffer heads end IO at almost the same time and both
  248. * decide that the page is now completely done.
  249. */
  /* The scan below runs under the b_uptodate_lock of the folio's first buffer. */
  250. first = folio_buffers(folio);
  251. spin_lock_irqsave(&first->b_uptodate_lock, flags);
  252. clear_buffer_async_read(bh);
  253. unlock_buffer(bh);
  /* Walk the whole b_this_page ring, starting with (and including) bh. */
  254. tmp = bh;
  255. do {
  256. if (!buffer_uptodate(tmp))
  257. folio_uptodate = 0;
  258. if (buffer_async_read(tmp)) {
  259. BUG_ON(!buffer_locked(tmp));
  260. goto still_busy;
  261. }
  262. tmp = tmp->b_this_page;
  263. } while (tmp != bh);
  264. spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
  /* No buffer is flagged async-read any longer: finish the folio read. */
  265. folio_end_read(folio, folio_uptodate);
  266. return;
  /* Some buffer is still flagged async-read: only drop the lock. */
  267. still_busy:
  268. spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
  269. }
  /*
   * Context passed to the decrypt_bh() and verify_bh() work handlers for a
   * buffer whose read needs post-processing.
   */
  270. struct postprocess_bh_ctx {
  /* Work item; the handlers get back to this context via container_of(). */
  271. struct work_struct work;
  /* Buffer being post-processed. */
  272. struct buffer_head *bh;
  /* fsverity info to verify against, or NULL when no verification is wanted. */
  273. struct fsverity_info *vi;
  274. };
  275. static void verify_bh(struct work_struct *work)
  276. {
  277. struct postprocess_bh_ctx *ctx =
  278. container_of(work, struct postprocess_bh_ctx, work);
  279. struct buffer_head *bh = ctx->bh;
  280. bool valid;
  281. valid = fsverity_verify_blocks(ctx->vi, bh->b_folio, bh->b_size,
  282. bh_offset(bh));
  283. end_buffer_async_read(bh, valid);
  284. kfree(ctx);
  285. }
  286. static void decrypt_bh(struct work_struct *work)
  287. {
  288. struct postprocess_bh_ctx *ctx =
  289. container_of(work, struct postprocess_bh_ctx, work);
  290. struct buffer_head *bh = ctx->bh;
  291. int err;
  292. err = fscrypt_decrypt_pagecache_blocks(bh->b_folio, bh->b_size,
  293. bh_offset(bh));
  294. if (err == 0 && ctx->vi) {
  295. /*
  296. * We use different work queues for decryption and for verity
  297. * because verity may require reading metadata pages that need
  298. * decryption, and we shouldn't recurse to the same workqueue.
  299. */
  300. INIT_WORK(&ctx->work, verify_bh);
  301. fsverity_enqueue_verify_work(&ctx->work);
  302. return;
  303. }
  304. end_buffer_async_read(bh, err == 0);
  305. kfree(ctx);
  306. }
  307. /*
  308. * I/O completion handler for block_read_full_folio() - pages
  309. * which come unlocked at the end of I/O.
  310. */
  311. static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate)
  312. {
  313. struct inode *inode = bh->b_folio->mapping->host;
  314. bool decrypt = fscrypt_inode_uses_fs_layer_crypto(inode);
  315. struct fsverity_info *vi = NULL;
  316. /* needed by ext4 */
  317. if (bh->b_folio->index < DIV_ROUND_UP(inode->i_size, PAGE_SIZE))
  318. vi = fsverity_get_info(inode);
  319. /* Decrypt (with fscrypt) and/or verify (with fsverity) if needed. */
  320. if (uptodate && (decrypt || vi)) {
  321. struct postprocess_bh_ctx *ctx = kmalloc_obj(*ctx, GFP_ATOMIC);
  322. if (ctx) {
  323. ctx->bh = bh;
  324. ctx->vi = vi;
  325. if (decrypt) {
  326. INIT_WORK(&ctx->work, decrypt_bh);
  327. fscrypt_enqueue_decrypt_work(&ctx->work);
  328. } else {
  329. INIT_WORK(&ctx->work, verify_bh);
  330. fsverity_enqueue_verify_work(&ctx->work);
  331. }
  332. return;
  333. }
  334. uptodate = 0;
  335. }
  336. end_buffer_async_read(bh, uptodate);
  337. }
  338. /*
  339. * Completion handler for block_write_full_folio() - folios which are unlocked
  340. * during I/O, and which have the writeback flag cleared upon I/O completion.
  341. */
  342. static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
  343. {
  344. unsigned long flags;
  345. struct buffer_head *first;
  346. struct buffer_head *tmp;
  347. struct folio *folio;
  /* Only buffers flagged async-write may be completed through this handler. */
  348. BUG_ON(!buffer_async_write(bh));
  349. folio = bh->b_folio;
  350. if (uptodate) {
  351. set_buffer_uptodate(bh);
  352. } else {
  /* Failed write: log it, record the write error and drop the uptodate flag. */
  353. buffer_io_error(bh, ", lost async page write");
  354. mark_buffer_write_io_error(bh);
  355. clear_buffer_uptodate(bh);
  356. }
  /* The scan below runs under the b_uptodate_lock of the folio's first buffer. */
  357. first = folio_buffers(folio);
  358. spin_lock_irqsave(&first->b_uptodate_lock, flags);
  359. clear_buffer_async_write(bh);
  360. unlock_buffer(bh);
  /* Check every other buffer on the b_this_page ring; bh itself is skipped. */
  361. tmp = bh->b_this_page;
  362. while (tmp != bh) {
  363. if (buffer_async_write(tmp)) {
  364. BUG_ON(!buffer_locked(tmp));
  365. goto still_busy;
  366. }
  367. tmp = tmp->b_this_page;
  368. }
  369. spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
  /* No buffer is flagged async-write any longer: end writeback on the folio. */
  370. folio_end_writeback(folio);
  371. return;
  /* Some other buffer is still flagged async-write: only drop the lock. */
  372. still_busy:
  373. spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
  374. }
  375. /*
  376. * If a page's buffers are under async readin (end_buffer_async_read
  377. * completion) then there is a possibility that another thread of
  378. * control could lock one of the buffers after it has completed
  379. * but while some of the other buffers have not completed. This
  380. * locked buffer would confuse end_buffer_async_read() into not unlocking
  381. * the page. So the absence of BH_Async_Read tells end_buffer_async_read()
  382. * that this buffer is not under async I/O.
  383. *
  384. * The page comes unlocked when it has no locked buffer_async buffers
  385. * left.
  386. *
  387. * PageLocked prevents anyone starting new async I/O reads against any of
  388. * the buffers.
  389. *
  390. * PageWriteback is used to prevent simultaneous writeout of the same
  391. * page.
  392. *
  393. * PageLocked prevents anyone from starting writeback of a page which is
  394. * under read I/O (PageWriteback is only ever set against a locked page).
  395. */
  396. static void mark_buffer_async_read(struct buffer_head *bh)
  397. {
  398. bh->b_end_io = end_buffer_async_read_io;
  399. set_buffer_async_read(bh);
  400. }
  401. static void mark_buffer_async_write_endio(struct buffer_head *bh,
  402. bh_end_io_t *handler)
  403. {
  404. bh->b_end_io = handler;
  405. set_buffer_async_write(bh);
  406. }
  407. void mark_buffer_async_write(struct buffer_head *bh)
  408. {
  409. mark_buffer_async_write_endio(bh, end_buffer_async_write);
  410. }
  411. EXPORT_SYMBOL(mark_buffer_async_write);
  412. /*
  413. * fs/buffer.c contains helper functions for buffer-backed address space's
  414. * fsync functions. A common requirement for buffer-based filesystems is
  415. * that certain data from the backing blockdev needs to be written out for
  416. * a successful fsync(). For example, ext2 indirect blocks need to be
  417. * written back and waited upon before fsync() returns.
  418. *
  419. * The functions mark_buffer_dirty_inode(), fsync_inode_buffers(),
  420. * inode_has_buffers() and invalidate_inode_buffers() are provided for the
  421. * management of a list of dependent buffers at ->i_mapping->i_private_list.
  422. *
  423. * Locking is a little subtle: try_to_free_buffers() will remove buffers
  424. * from their controlling inode's queue when they are being freed. But
  425. * try_to_free_buffers() will be operating against the *blockdev* mapping
  426. * at the time, not against the S_ISREG file which depends on those buffers.
  427. * So the locking for i_private_list is via the i_private_lock in the address_space
  428. * which backs the buffers. Which is different from the address_space
  429. * against which the buffers are listed. So for a particular address_space,
  430. * mapping->i_private_lock does *not* protect mapping->i_private_list! In fact,
  431. * mapping->i_private_list will always be protected by the backing blockdev's
  432. * ->i_private_lock.
  433. *
  434. * Which introduces a requirement: all buffers on an address_space's
  435. * ->i_private_list must be from the same address_space: the blockdev's.
  436. *
  437. * address_spaces which do not place buffers at ->i_private_list via these
  438. * utility functions are free to use i_private_lock and i_private_list for
  439. * whatever they want. The only requirement is that list_empty(i_private_list)
  440. * be true at clear_inode() time.
  441. *
  442. * FIXME: clear_inode should not call invalidate_inode_buffers(). The
  443. * filesystems should do that. invalidate_inode_buffers() should just go
  444. * BUG_ON(!list_empty).
  445. *
  446. * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
  447. * take an address_space, not an inode. And it should be called
  448. * mark_buffer_dirty_fsync() to clearly define why those buffers are being
  449. * queued up.
  450. *
  451. * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
  452. * list if it is already on a list. Because if the buffer is on a list,
  453. * it *must* already be on the right one. If not, the filesystem is being
  454. * silly. This will save a ton of locking. But first we have to ensure
  455. * that buffers are taken *off* the old inode's list when they are freed
  456. * (presumably in truncate). That requires careful auditing of all
  457. * filesystems (do it inside bforget()). It could also be done by bringing
  458. * b_inode back.
  459. */
  460. /*
  461. * The buffer's backing address_space's i_private_lock must be held
  462. */
  463. static void __remove_assoc_queue(struct buffer_head *bh)
  464. {
  465. list_del_init(&bh->b_assoc_buffers);
  466. WARN_ON(!bh->b_assoc_map);
  467. bh->b_assoc_map = NULL;
  468. }
  469. int inode_has_buffers(struct inode *inode)
  470. {
  471. return !list_empty(&inode->i_data.i_private_list);
  472. }
  473. /*
  474. * osync is designed to support O_SYNC io. It waits synchronously for
  475. * all already-submitted IO to complete, but does not queue any new
  476. * writes to the disk.
  477. *
  478. * To do O_SYNC writes, just queue the buffer writes with write_dirty_buffer
  479. * as you dirty the buffers, and then use osync_inode_buffers to wait for
  480. * completion. Any other dirty buffers which are not yet queued for
  481. * write will not be flushed to disk by the osync.
  482. */
  483. static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
  484. {
  485. struct buffer_head *bh;
  486. struct list_head *p;
  487. int err = 0;
  488. spin_lock(lock);
  489. repeat:
  490. list_for_each_prev(p, list) {
  491. bh = BH_ENTRY(p);
  492. if (buffer_locked(bh)) {
  493. get_bh(bh);
  494. spin_unlock(lock);
  495. wait_on_buffer(bh);
  496. if (!buffer_uptodate(bh))
  497. err = -EIO;
  498. brelse(bh);
  499. spin_lock(lock);
  500. goto repeat;
  501. }
  502. }
  503. spin_unlock(lock);
  504. return err;
  505. }
  506. /**
  507. * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
  508. * @mapping: the mapping which wants those buffers written
  509. *
  510. * Starts I/O against the buffers at mapping->i_private_list, and waits upon
  511. * that I/O.
  512. *
  513. * Basically, this is a convenience function for fsync().
  514. * @mapping is a file or directory which needs those buffers to be written for
  515. * a successful fsync().
  516. */
  517. int sync_mapping_buffers(struct address_space *mapping)
  518. {
  519. struct address_space *buffer_mapping = mapping->i_private_data;
  520. if (buffer_mapping == NULL || list_empty(&mapping->i_private_list))
  521. return 0;
  522. return fsync_buffers_list(&buffer_mapping->i_private_lock,
  523. &mapping->i_private_list);
  524. }
  525. EXPORT_SYMBOL(sync_mapping_buffers);
  526. /**
  527. * generic_buffers_fsync_noflush - generic buffer fsync implementation
  528. * for simple filesystems with no inode lock
  529. *
  530. * @file: file to synchronize
  531. * @start: start offset in bytes
  532. * @end: end offset in bytes (inclusive)
  533. * @datasync: only synchronize essential metadata if true
  534. *
  535. * This is a generic implementation of the fsync method for simple
  536. * filesystems which track all non-inode metadata in the buffers list
  537. * hanging off the address_space structure.
  538. */
  539. int generic_buffers_fsync_noflush(struct file *file, loff_t start, loff_t end,
  540. bool datasync)
  541. {
  542. struct inode *inode = file->f_mapping->host;
  543. int err;
  544. int ret;
  545. err = file_write_and_wait_range(file, start, end);
  546. if (err)
  547. return err;
  548. ret = sync_mapping_buffers(inode->i_mapping);
  549. if (!(inode_state_read_once(inode) & I_DIRTY_ALL))
  550. goto out;
  551. if (datasync && !(inode_state_read_once(inode) & I_DIRTY_DATASYNC))
  552. goto out;
  553. err = sync_inode_metadata(inode, 1);
  554. if (ret == 0)
  555. ret = err;
  556. out:
  557. /* check and advance again to catch errors after syncing out buffers */
  558. err = file_check_and_advance_wb_err(file);
  559. if (ret == 0)
  560. ret = err;
  561. return ret;
  562. }
  563. EXPORT_SYMBOL(generic_buffers_fsync_noflush);
  564. /**
  565. * generic_buffers_fsync - generic buffer fsync implementation
  566. * for simple filesystems with no inode lock
  567. *
  568. * @file: file to synchronize
  569. * @start: start offset in bytes
  570. * @end: end offset in bytes (inclusive)
  571. * @datasync: only synchronize essential metadata if true
  572. *
  573. * This is a generic implementation of the fsync method for simple
  574. * filesystems which track all non-inode metadata in the buffers list
  575. * hanging off the address_space structure. This also makes sure that
  576. * a device cache flush operation is called at the end.
  577. */
  578. int generic_buffers_fsync(struct file *file, loff_t start, loff_t end,
  579. bool datasync)
  580. {
  581. struct inode *inode = file->f_mapping->host;
  582. int ret;
  583. ret = generic_buffers_fsync_noflush(file, start, end, datasync);
  584. if (!ret)
  585. ret = blkdev_issue_flush(inode->i_sb->s_bdev);
  586. return ret;
  587. }
  588. EXPORT_SYMBOL(generic_buffers_fsync);
  589. /*
  590. * Called when we've recently written block `bblock', and it is known that
  591. * `bblock' was for a buffer_boundary() buffer. This means that the block at
  592. * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
  593. * dirty, schedule it for IO. So that indirects merge nicely with their data.
  594. */
  595. void write_boundary_block(struct block_device *bdev,
  596. sector_t bblock, unsigned blocksize)
  597. {
  598. struct buffer_head *bh;
  599. bh = __find_get_block_nonatomic(bdev, bblock + 1, blocksize);
  600. if (bh) {
  601. if (buffer_dirty(bh))
  602. write_dirty_buffer(bh, 0);
  603. put_bh(bh);
  604. }
  605. }
  606. void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
  607. {
  608. struct address_space *mapping = inode->i_mapping;
  609. struct address_space *buffer_mapping = bh->b_folio->mapping;
  610. mark_buffer_dirty(bh);
  611. if (!mapping->i_private_data) {
  612. mapping->i_private_data = buffer_mapping;
  613. } else {
  614. BUG_ON(mapping->i_private_data != buffer_mapping);
  615. }
  616. if (!bh->b_assoc_map) {
  617. spin_lock(&buffer_mapping->i_private_lock);
  618. list_move_tail(&bh->b_assoc_buffers,
  619. &mapping->i_private_list);
  620. bh->b_assoc_map = mapping;
  621. spin_unlock(&buffer_mapping->i_private_lock);
  622. }
  623. }
  624. EXPORT_SYMBOL(mark_buffer_dirty_inode);
  625. /**
  626. * block_dirty_folio - Mark a folio as dirty.
  627. * @mapping: The address space containing this folio.
  628. * @folio: The folio to mark dirty.
  629. *
  630. * Filesystems which use buffer_heads can use this function as their
  631. * ->dirty_folio implementation. Some filesystems need to do a little
  632. * work before calling this function. Filesystems which do not use
  633. * buffer_heads should call filemap_dirty_folio() instead.
  634. *
  635. * If the folio has buffers, the uptodate buffers are set dirty, to
  636. * preserve dirty-state coherency between the folio and the buffers.
  637. * Buffers added to a dirty folio are created dirty.
  638. *
  639. * The buffers are dirtied before the folio is dirtied. There's a small
  640. * race window in which writeback may see the folio cleanness but not the
  641. * buffer dirtiness. That's fine. If this code were to set the folio
  642. * dirty before the buffers, writeback could clear the folio dirty flag,
  643. * see a bunch of clean buffers and we'd end up with dirty buffers/clean
  644. * folio on the dirty folio list.
  645. *
  646. * We use i_private_lock to lock against try_to_free_buffers() while
  647. * using the folio's buffer list. This also prevents clean buffers
  648. * being added to the folio after it was set dirty.
  649. *
  650. * Context: May only be called from process context. Does not sleep.
  651. * Caller must ensure that @folio cannot be truncated during this call,
  652. * typically by holding the folio lock or having a page in the folio
  653. * mapped and holding the page table lock.
  654. *
  655. * Return: True if the folio was dirtied; false if it was already dirtied.
  656. */
  657. bool block_dirty_folio(struct address_space *mapping, struct folio *folio)
  658. {
  659. struct buffer_head *head;
  660. bool newly_dirty;
  661. spin_lock(&mapping->i_private_lock);
  662. head = folio_buffers(folio);
  663. if (head) {
  664. struct buffer_head *bh = head;
  665. do {
  666. set_buffer_dirty(bh);
  667. bh = bh->b_this_page;
  668. } while (bh != head);
  669. }
  670. /*
  671. * Lock out page's memcg migration to keep PageDirty
  672. * synchronized with per-memcg dirty page counters.
  673. */
  674. newly_dirty = !folio_test_set_dirty(folio);
  675. spin_unlock(&mapping->i_private_lock);
  676. if (newly_dirty)
  677. __folio_mark_dirty(folio, mapping, 1);
  678. if (newly_dirty)
  679. __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
  680. return newly_dirty;
  681. }
  682. EXPORT_SYMBOL(block_dirty_folio);
  683. /*
  684. * Write out and wait upon a list of buffers.
  685. *
  686. * We have conflicting pressures: we want to make sure that all
  687. * initially dirty buffers get waited on, but that any subsequently
  688. * dirtied buffers don't. After all, we don't want fsync to last
  689. * forever if somebody is actively writing to the file.
  690. *
  691. * Do this in two main stages: first we copy dirty buffers to a
  692. * temporary inode list, queueing the writes as we go. Then we clean
  693. * up, waiting for those writes to complete.
  694. *
  695. * During this second stage, any subsequent updates to the file may end
  696. * up refiling the buffer on the original inode's dirty list again, so
  697. * there is a chance we will end up with a buffer queued for write but
  698. * not yet completed on that list. So, as a final cleanup we go through
  699. * the osync code to catch these locked, dirty buffers without requeuing
  700. * any newly dirty buffers for write.
  701. */
  702. static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
  703. {
  704. struct buffer_head *bh;
  705. struct address_space *mapping;
  706. int err = 0, err2;
  707. struct blk_plug plug;
  708. LIST_HEAD(tmp);
  709. blk_start_plug(&plug);
  710. spin_lock(lock);
  711. while (!list_empty(list)) {
  712. bh = BH_ENTRY(list->next);
  713. mapping = bh->b_assoc_map;
  714. __remove_assoc_queue(bh);
  715. /* Avoid race with mark_buffer_dirty_inode() which does
  716. * a lockless check and we rely on seeing the dirty bit */
  717. smp_mb();
  718. if (buffer_dirty(bh) || buffer_locked(bh)) {
  719. list_add(&bh->b_assoc_buffers, &tmp);
  720. bh->b_assoc_map = mapping;
  721. if (buffer_dirty(bh)) {
  722. get_bh(bh);
  723. spin_unlock(lock);
  724. /*
  725. * Ensure any pending I/O completes so that
  726. * write_dirty_buffer() actually writes the
  727. * current contents - it is a noop if I/O is
  728. * still in flight on potentially older
  729. * contents.
  730. */
  731. write_dirty_buffer(bh, REQ_SYNC);
  732. /*
  733. * Kick off IO for the previous mapping. Note
  734. * that we will not run the very last mapping,
  735. * wait_on_buffer() will do that for us
  736. * through sync_buffer().
  737. */
  738. brelse(bh);
  739. spin_lock(lock);
  740. }
  741. }
  742. }
  743. spin_unlock(lock);
  744. blk_finish_plug(&plug);
  745. spin_lock(lock);
  746. while (!list_empty(&tmp)) {
  747. bh = BH_ENTRY(tmp.prev);
  748. get_bh(bh);
  749. mapping = bh->b_assoc_map;
  750. __remove_assoc_queue(bh);
  751. /* Avoid race with mark_buffer_dirty_inode() which does
  752. * a lockless check and we rely on seeing the dirty bit */
  753. smp_mb();
  754. if (buffer_dirty(bh)) {
  755. list_add(&bh->b_assoc_buffers,
  756. &mapping->i_private_list);
  757. bh->b_assoc_map = mapping;
  758. }
  759. spin_unlock(lock);
  760. wait_on_buffer(bh);
  761. if (!buffer_uptodate(bh))
  762. err = -EIO;
  763. brelse(bh);
  764. spin_lock(lock);
  765. }
  766. spin_unlock(lock);
  767. err2 = osync_buffers_list(lock, list);
  768. if (err)
  769. return err;
  770. else
  771. return err2;
  772. }
  773. /*
  774. * Invalidate any and all dirty buffers on a given inode. We are
  775. * probably unmounting the fs, but that doesn't mean we have already
  776. * done a sync(). Just drop the buffers from the inode list.
  777. *
  778. * NOTE: we take the inode's blockdev's mapping's i_private_lock. Which
  779. * assumes that all the buffers are against the blockdev.
  780. */
  781. void invalidate_inode_buffers(struct inode *inode)
  782. {
  783. if (inode_has_buffers(inode)) {
  784. struct address_space *mapping = &inode->i_data;
  785. struct list_head *list = &mapping->i_private_list;
  786. struct address_space *buffer_mapping = mapping->i_private_data;
  787. spin_lock(&buffer_mapping->i_private_lock);
  788. while (!list_empty(list))
  789. __remove_assoc_queue(BH_ENTRY(list->next));
  790. spin_unlock(&buffer_mapping->i_private_lock);
  791. }
  792. }
  793. EXPORT_SYMBOL(invalidate_inode_buffers);
  794. /*
  795. * Remove any clean buffers from the inode's buffer list. This is called
  796. * when we're trying to free the inode itself. Those buffers can pin it.
  797. *
  798. * Returns true if all buffers were removed.
  799. */
  800. int remove_inode_buffers(struct inode *inode)
  801. {
  802. int ret = 1;
  803. if (inode_has_buffers(inode)) {
  804. struct address_space *mapping = &inode->i_data;
  805. struct list_head *list = &mapping->i_private_list;
  806. struct address_space *buffer_mapping = mapping->i_private_data;
  807. spin_lock(&buffer_mapping->i_private_lock);
  808. while (!list_empty(list)) {
  809. struct buffer_head *bh = BH_ENTRY(list->next);
  810. if (buffer_dirty(bh)) {
  811. ret = 0;
  812. break;
  813. }
  814. __remove_assoc_queue(bh);
  815. }
  816. spin_unlock(&buffer_mapping->i_private_lock);
  817. }
  818. return ret;
  819. }
  820. /*
  821. * Create the appropriate buffers when given a folio for data area and
  822. * the size of each buffer.. Use the bh->b_this_page linked list to
  823. * follow the buffers created. Return NULL if unable to create more
  824. * buffers.
  825. *
  826. * The retry flag is used to differentiate async IO (paging, swapping)
  827. * which may not fail from ordinary buffer allocations.
  828. */
  829. struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size,
  830. gfp_t gfp)
  831. {
  832. struct buffer_head *bh, *head;
  833. long offset;
  834. struct mem_cgroup *memcg, *old_memcg;
  835. /* The folio lock pins the memcg */
  836. memcg = folio_memcg(folio);
  837. old_memcg = set_active_memcg(memcg);
  838. head = NULL;
  839. offset = folio_size(folio);
  840. while ((offset -= size) >= 0) {
  841. bh = alloc_buffer_head(gfp);
  842. if (!bh)
  843. goto no_grow;
  844. bh->b_this_page = head;
  845. bh->b_blocknr = -1;
  846. head = bh;
  847. bh->b_size = size;
  848. /* Link the buffer to its folio */
  849. folio_set_bh(bh, folio, offset);
  850. }
  851. out:
  852. set_active_memcg(old_memcg);
  853. return head;
  854. /*
  855. * In case anything failed, we just free everything we got.
  856. */
  857. no_grow:
  858. if (head) {
  859. do {
  860. bh = head;
  861. head = head->b_this_page;
  862. free_buffer_head(bh);
  863. } while (head);
  864. }
  865. goto out;
  866. }
  867. EXPORT_SYMBOL_GPL(folio_alloc_buffers);
  868. struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size)
  869. {
  870. gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT;
  871. return folio_alloc_buffers(page_folio(page), size, gfp);
  872. }
  873. EXPORT_SYMBOL_GPL(alloc_page_buffers);
  874. static inline void link_dev_buffers(struct folio *folio,
  875. struct buffer_head *head)
  876. {
  877. struct buffer_head *bh, *tail;
  878. bh = head;
  879. do {
  880. tail = bh;
  881. bh = bh->b_this_page;
  882. } while (bh);
  883. tail->b_this_page = head;
  884. folio_attach_private(folio, head);
  885. }
  886. static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
  887. {
  888. sector_t retval = ~((sector_t)0);
  889. loff_t sz = bdev_nr_bytes(bdev);
  890. if (sz) {
  891. unsigned int sizebits = blksize_bits(size);
  892. retval = (sz >> sizebits);
  893. }
  894. return retval;
  895. }
  896. /*
  897. * Initialise the state of a blockdev folio's buffers.
  898. */
  899. static sector_t folio_init_buffers(struct folio *folio,
  900. struct block_device *bdev, unsigned size)
  901. {
  902. struct buffer_head *head = folio_buffers(folio);
  903. struct buffer_head *bh = head;
  904. bool uptodate = folio_test_uptodate(folio);
  905. sector_t block = div_u64(folio_pos(folio), size);
  906. sector_t end_block = blkdev_max_block(bdev, size);
  907. do {
  908. if (!buffer_mapped(bh)) {
  909. bh->b_end_io = NULL;
  910. bh->b_private = NULL;
  911. bh->b_bdev = bdev;
  912. bh->b_blocknr = block;
  913. if (uptodate)
  914. set_buffer_uptodate(bh);
  915. if (block < end_block)
  916. set_buffer_mapped(bh);
  917. }
  918. block++;
  919. bh = bh->b_this_page;
  920. } while (bh != head);
  921. /*
  922. * Caller needs to validate requested block against end of device.
  923. */
  924. return end_block;
  925. }
  926. /*
  927. * Create the page-cache folio that contains the requested block.
  928. *
  929. * This is used purely for blockdev mappings.
  930. *
  931. * Returns false if we have a failure which cannot be cured by retrying
  932. * without sleeping. Returns true if we succeeded, or the caller should retry.
  933. */
  934. static bool grow_dev_folio(struct block_device *bdev, sector_t block,
  935. pgoff_t index, unsigned size, gfp_t gfp)
  936. {
  937. struct address_space *mapping = bdev->bd_mapping;
  938. struct folio *folio;
  939. struct buffer_head *bh;
  940. sector_t end_block = 0;
  941. folio = __filemap_get_folio(mapping, index,
  942. FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
  943. if (IS_ERR(folio))
  944. return false;
  945. bh = folio_buffers(folio);
  946. if (bh) {
  947. if (bh->b_size == size) {
  948. end_block = folio_init_buffers(folio, bdev, size);
  949. goto unlock;
  950. }
  951. /*
  952. * Retrying may succeed; for example the folio may finish
  953. * writeback, or buffers may be cleaned. This should not
  954. * happen very often; maybe we have old buffers attached to
  955. * this blockdev's page cache and we're trying to change
  956. * the block size?
  957. */
  958. if (!try_to_free_buffers(folio)) {
  959. end_block = ~0ULL;
  960. goto unlock;
  961. }
  962. }
  963. bh = folio_alloc_buffers(folio, size, gfp | __GFP_ACCOUNT);
  964. if (!bh)
  965. goto unlock;
  966. /*
  967. * Link the folio to the buffers and initialise them. Take the
  968. * lock to be atomic wrt __find_get_block(), which does not
  969. * run under the folio lock.
  970. */
  971. spin_lock(&mapping->i_private_lock);
  972. link_dev_buffers(folio, bh);
  973. end_block = folio_init_buffers(folio, bdev, size);
  974. spin_unlock(&mapping->i_private_lock);
  975. unlock:
  976. folio_unlock(folio);
  977. folio_put(folio);
  978. return block < end_block;
  979. }
  980. /*
  981. * Create buffers for the specified block device block's folio. If
  982. * that folio was dirty, the buffers are set dirty also. Returns false
  983. * if we've hit a permanent error.
  984. */
  985. static bool grow_buffers(struct block_device *bdev, sector_t block,
  986. unsigned size, gfp_t gfp)
  987. {
  988. loff_t pos;
  989. /*
  990. * Check for a block which lies outside our maximum possible
  991. * pagecache index.
  992. */
  993. if (check_mul_overflow(block, (sector_t)size, &pos) || pos > MAX_LFS_FILESIZE) {
  994. printk(KERN_ERR "%s: requested out-of-range block %llu for device %pg\n",
  995. __func__, (unsigned long long)block,
  996. bdev);
  997. return false;
  998. }
  999. /* Create a folio with the proper size buffers */
  1000. return grow_dev_folio(bdev, block, pos / PAGE_SIZE, size, gfp);
  1001. }
  1002. static struct buffer_head *
  1003. __getblk_slow(struct block_device *bdev, sector_t block,
  1004. unsigned size, gfp_t gfp)
  1005. {
  1006. bool blocking = gfpflags_allow_blocking(gfp);
  1007. if (WARN_ON_ONCE(!IS_ALIGNED(size, bdev_logical_block_size(bdev)))) {
  1008. printk(KERN_ERR "getblk(): block size %d not aligned to logical block size %d\n",
  1009. size, bdev_logical_block_size(bdev));
  1010. return NULL;
  1011. }
  1012. for (;;) {
  1013. struct buffer_head *bh;
  1014. if (!grow_buffers(bdev, block, size, gfp))
  1015. return NULL;
  1016. if (blocking)
  1017. bh = __find_get_block_nonatomic(bdev, block, size);
  1018. else
  1019. bh = __find_get_block(bdev, block, size);
  1020. if (bh)
  1021. return bh;
  1022. }
  1023. }
  1024. /*
  1025. * The relationship between dirty buffers and dirty pages:
  1026. *
  1027. * Whenever a page has any dirty buffers, the page's dirty bit is set, and
  1028. * the page is tagged dirty in the page cache.
  1029. *
  1030. * At all times, the dirtiness of the buffers represents the dirtiness of
  1031. * subsections of the page. If the page has buffers, the page dirty bit is
  1032. * merely a hint about the true dirty state.
  1033. *
  1034. * When a page is set dirty in its entirety, all its buffers are marked dirty
  1035. * (if the page has buffers).
  1036. *
  1037. * When a buffer is marked dirty, its page is dirtied, but the page's other
  1038. * buffers are not.
  1039. *
  1040. * Also. When blockdev buffers are explicitly read with bread(), they
  1041. * individually become uptodate. But their backing page remains not
  1042. * uptodate - even if all of its buffers are uptodate. A subsequent
  1043. * block_read_full_folio() against that folio will discover all the uptodate
  1044. * buffers, will set the folio uptodate and will perform no I/O.
  1045. */
  1046. /**
  1047. * mark_buffer_dirty - mark a buffer_head as needing writeout
  1048. * @bh: the buffer_head to mark dirty
  1049. *
  1050. * mark_buffer_dirty() will set the dirty bit against the buffer, then set
  1051. * its backing page dirty, then tag the page as dirty in the page cache
  1052. * and then attach the address_space's inode to its superblock's dirty
  1053. * inode list.
  1054. *
  1055. * mark_buffer_dirty() is atomic. It takes bh->b_folio->mapping->i_private_lock,
  1056. * i_pages lock and mapping->host->i_lock.
  1057. */
  1058. void mark_buffer_dirty(struct buffer_head *bh)
  1059. {
  1060. WARN_ON_ONCE(!buffer_uptodate(bh));
  1061. trace_block_dirty_buffer(bh);
  1062. /*
  1063. * Very *carefully* optimize the it-is-already-dirty case.
  1064. *
  1065. * Don't let the final "is it dirty" escape to before we
  1066. * perhaps modified the buffer.
  1067. */
  1068. if (buffer_dirty(bh)) {
  1069. smp_mb();
  1070. if (buffer_dirty(bh))
  1071. return;
  1072. }
  1073. if (!test_set_buffer_dirty(bh)) {
  1074. struct folio *folio = bh->b_folio;
  1075. struct address_space *mapping = NULL;
  1076. if (!folio_test_set_dirty(folio)) {
  1077. mapping = folio->mapping;
  1078. if (mapping)
  1079. __folio_mark_dirty(folio, mapping, 0);
  1080. }
  1081. if (mapping)
  1082. __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
  1083. }
  1084. }
  1085. EXPORT_SYMBOL(mark_buffer_dirty);
  1086. void mark_buffer_write_io_error(struct buffer_head *bh)
  1087. {
  1088. set_buffer_write_io_error(bh);
  1089. /* FIXME: do we need to set this in both places? */
  1090. if (bh->b_folio && bh->b_folio->mapping)
  1091. mapping_set_error(bh->b_folio->mapping, -EIO);
  1092. if (bh->b_assoc_map)
  1093. mapping_set_error(bh->b_assoc_map, -EIO);
  1094. }
  1095. EXPORT_SYMBOL(mark_buffer_write_io_error);
  1096. /**
  1097. * __brelse - Release a buffer.
  1098. * @bh: The buffer to release.
  1099. *
  1100. * This variant of brelse() can be called if @bh is guaranteed to not be NULL.
  1101. */
  1102. void __brelse(struct buffer_head *bh)
  1103. {
  1104. if (atomic_read(&bh->b_count)) {
  1105. put_bh(bh);
  1106. return;
  1107. }
  1108. WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
  1109. }
  1110. EXPORT_SYMBOL(__brelse);
  1111. /**
  1112. * __bforget - Discard any dirty data in a buffer.
  1113. * @bh: The buffer to forget.
  1114. *
  1115. * This variant of bforget() can be called if @bh is guaranteed to not
  1116. * be NULL.
  1117. */
  1118. void __bforget(struct buffer_head *bh)
  1119. {
  1120. clear_buffer_dirty(bh);
  1121. if (bh->b_assoc_map) {
  1122. struct address_space *buffer_mapping = bh->b_folio->mapping;
  1123. spin_lock(&buffer_mapping->i_private_lock);
  1124. list_del_init(&bh->b_assoc_buffers);
  1125. bh->b_assoc_map = NULL;
  1126. spin_unlock(&buffer_mapping->i_private_lock);
  1127. }
  1128. __brelse(bh);
  1129. }
  1130. EXPORT_SYMBOL(__bforget);
  1131. static struct buffer_head *__bread_slow(struct buffer_head *bh)
  1132. {
  1133. lock_buffer(bh);
  1134. if (buffer_uptodate(bh)) {
  1135. unlock_buffer(bh);
  1136. return bh;
  1137. } else {
  1138. get_bh(bh);
  1139. bh->b_end_io = end_buffer_read_sync;
  1140. submit_bh(REQ_OP_READ, bh);
  1141. wait_on_buffer(bh);
  1142. if (buffer_uptodate(bh))
  1143. return bh;
  1144. }
  1145. brelse(bh);
  1146. return NULL;
  1147. }
/*
 * Per-cpu buffer LRU implementation.  To reduce the cost of __find_get_block().
 * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
 * refcount elevated by one when they're in an LRU.  A buffer can only appear
 * once in a particular CPU's LRU.  A single buffer can be present in multiple
 * CPU's LRUs at the same time.
 *
 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
 * sb_find_get_block().
 *
 * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
 * a local interrupt disable for that.
 */

/* Number of buffer_head slots in each CPU's LRU. */
#define BH_LRU_SIZE	16

/* One CPU's LRU: a fixed array of buffer_head pointers, empty slots NULL. */
struct bh_lru {
	struct buffer_head *bhs[BH_LRU_SIZE];
};

/* The per-cpu LRU instances; every slot starts out NULL. */
static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};

/*
 * bh_lru_lock()/bh_lru_unlock() bracket every access to this CPU's LRU.
 * On SMP builds they disable and re-enable local interrupts; otherwise
 * they only disable and re-enable preemption.
 */
#ifdef CONFIG_SMP
#define bh_lru_lock()	local_irq_disable()
#define bh_lru_unlock()	local_irq_enable()
#else
#define bh_lru_lock()	preempt_disable()
#define bh_lru_unlock()	preempt_enable()
#endif

/*
 * Sanity check used on entry to the LRU helpers: BUG if interrupts are
 * currently disabled.  Compiles to nothing when irqs_disabled is not
 * defined as a macro.
 */
static inline void check_irqs_on(void)
{
#ifdef irqs_disabled
	BUG_ON(irqs_disabled());
#endif
}
  1179. /*
  1180. * Install a buffer_head into this cpu's LRU. If not already in the LRU, it is
  1181. * inserted at the front, and the buffer_head at the back if any is evicted.
  1182. * Or, if already in the LRU it is moved to the front.
  1183. */
  1184. static void bh_lru_install(struct buffer_head *bh)
  1185. {
  1186. struct buffer_head *evictee = bh;
  1187. struct bh_lru *b;
  1188. int i;
  1189. check_irqs_on();
  1190. bh_lru_lock();
  1191. /*
  1192. * the refcount of buffer_head in bh_lru prevents dropping the
  1193. * attached page(i.e., try_to_free_buffers) so it could cause
  1194. * failing page migration.
  1195. * Skip putting upcoming bh into bh_lru until migration is done.
  1196. */
  1197. if (lru_cache_disabled() || cpu_is_isolated(smp_processor_id())) {
  1198. bh_lru_unlock();
  1199. return;
  1200. }
  1201. b = this_cpu_ptr(&bh_lrus);
  1202. for (i = 0; i < BH_LRU_SIZE; i++) {
  1203. swap(evictee, b->bhs[i]);
  1204. if (evictee == bh) {
  1205. bh_lru_unlock();
  1206. return;
  1207. }
  1208. }
  1209. get_bh(bh);
  1210. bh_lru_unlock();
  1211. brelse(evictee);
  1212. }
  1213. /*
  1214. * Look up the bh in this cpu's LRU. If it's there, move it to the head.
  1215. */
  1216. static struct buffer_head *
  1217. lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
  1218. {
  1219. struct buffer_head *ret = NULL;
  1220. unsigned int i;
  1221. check_irqs_on();
  1222. bh_lru_lock();
  1223. if (cpu_is_isolated(smp_processor_id())) {
  1224. bh_lru_unlock();
  1225. return NULL;
  1226. }
  1227. for (i = 0; i < BH_LRU_SIZE; i++) {
  1228. struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
  1229. if (bh && bh->b_blocknr == block && bh->b_bdev == bdev &&
  1230. bh->b_size == size) {
  1231. if (i) {
  1232. while (i) {
  1233. __this_cpu_write(bh_lrus.bhs[i],
  1234. __this_cpu_read(bh_lrus.bhs[i - 1]));
  1235. i--;
  1236. }
  1237. __this_cpu_write(bh_lrus.bhs[0], bh);
  1238. }
  1239. get_bh(bh);
  1240. ret = bh;
  1241. break;
  1242. }
  1243. }
  1244. bh_lru_unlock();
  1245. return ret;
  1246. }
  1247. /*
  1248. * Perform a pagecache lookup for the matching buffer. If it's there, refresh
  1249. * it in the LRU and mark it as accessed. If it is not present then return
  1250. * NULL. Atomic context callers may also return NULL if the buffer is being
  1251. * migrated; similarly the page is not marked accessed either.
  1252. */
  1253. static struct buffer_head *
  1254. find_get_block_common(struct block_device *bdev, sector_t block,
  1255. unsigned size, bool atomic)
  1256. {
  1257. struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
  1258. if (bh == NULL) {
  1259. /* __find_get_block_slow will mark the page accessed */
  1260. bh = __find_get_block_slow(bdev, block, atomic);
  1261. if (bh)
  1262. bh_lru_install(bh);
  1263. } else
  1264. touch_buffer(bh);
  1265. return bh;
  1266. }
  1267. struct buffer_head *
  1268. __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
  1269. {
  1270. return find_get_block_common(bdev, block, size, true);
  1271. }
  1272. EXPORT_SYMBOL(__find_get_block);
  1273. /* same as __find_get_block() but allows sleeping contexts */
  1274. struct buffer_head *
  1275. __find_get_block_nonatomic(struct block_device *bdev, sector_t block,
  1276. unsigned size)
  1277. {
  1278. return find_get_block_common(bdev, block, size, false);
  1279. }
  1280. EXPORT_SYMBOL(__find_get_block_nonatomic);
  1281. /**
  1282. * bdev_getblk - Get a buffer_head in a block device's buffer cache.
  1283. * @bdev: The block device.
  1284. * @block: The block number.
  1285. * @size: The size of buffer_heads for this @bdev.
  1286. * @gfp: The memory allocation flags to use.
  1287. *
  1288. * The returned buffer head has its reference count incremented, but is
  1289. * not locked. The caller should call brelse() when it has finished
  1290. * with the buffer. The buffer may not be uptodate. If needed, the
  1291. * caller can bring it uptodate either by reading it or overwriting it.
  1292. *
  1293. * Return: The buffer head, or NULL if memory could not be allocated.
  1294. */
  1295. struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block,
  1296. unsigned size, gfp_t gfp)
  1297. {
  1298. struct buffer_head *bh;
  1299. if (gfpflags_allow_blocking(gfp))
  1300. bh = __find_get_block_nonatomic(bdev, block, size);
  1301. else
  1302. bh = __find_get_block(bdev, block, size);
  1303. might_alloc(gfp);
  1304. if (bh)
  1305. return bh;
  1306. return __getblk_slow(bdev, block, size, gfp);
  1307. }
  1308. EXPORT_SYMBOL(bdev_getblk);
  1309. /*
  1310. * Do async read-ahead on a buffer..
  1311. */
  1312. void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
  1313. {
  1314. struct buffer_head *bh = bdev_getblk(bdev, block, size,
  1315. GFP_NOWAIT | __GFP_MOVABLE);
  1316. if (likely(bh)) {
  1317. bh_readahead(bh, REQ_RAHEAD);
  1318. brelse(bh);
  1319. }
  1320. }
  1321. EXPORT_SYMBOL(__breadahead);
  1322. /**
  1323. * __bread_gfp() - Read a block.
  1324. * @bdev: The block device to read from.
  1325. * @block: Block number in units of block size.
  1326. * @size: The block size of this device in bytes.
  1327. * @gfp: Not page allocation flags; see below.
  1328. *
  1329. * You are not expected to call this function. You should use one of
  1330. * sb_bread(), sb_bread_unmovable() or __bread().
  1331. *
  1332. * Read a specified block, and return the buffer head that refers to it.
  1333. * If @gfp is 0, the memory will be allocated using the block device's
  1334. * default GFP flags. If @gfp is __GFP_MOVABLE, the memory may be
  1335. * allocated from a movable area. Do not pass in a complete set of
  1336. * GFP flags.
  1337. *
  1338. * The returned buffer head has its refcount increased. The caller should
  1339. * call brelse() when it has finished with the buffer.
  1340. *
  1341. * Context: May sleep waiting for I/O.
  1342. * Return: NULL if the block was unreadable.
  1343. */
  1344. struct buffer_head *__bread_gfp(struct block_device *bdev, sector_t block,
  1345. unsigned size, gfp_t gfp)
  1346. {
  1347. struct buffer_head *bh;
  1348. gfp |= mapping_gfp_constraint(bdev->bd_mapping, ~__GFP_FS);
  1349. /*
  1350. * Prefer looping in the allocator rather than here, at least that
  1351. * code knows what it's doing.
  1352. */
  1353. gfp |= __GFP_NOFAIL;
  1354. bh = bdev_getblk(bdev, block, size, gfp);
  1355. if (likely(bh) && !buffer_uptodate(bh))
  1356. bh = __bread_slow(bh);
  1357. return bh;
  1358. }
  1359. EXPORT_SYMBOL(__bread_gfp);
  1360. static void __invalidate_bh_lrus(struct bh_lru *b)
  1361. {
  1362. int i;
  1363. for (i = 0; i < BH_LRU_SIZE; i++) {
  1364. brelse(b->bhs[i]);
  1365. b->bhs[i] = NULL;
  1366. }
  1367. }
  1368. /*
  1369. * invalidate_bh_lrus() is called rarely - but not only at unmount.
  1370. * This doesn't race because it runs in each cpu either in irq
  1371. * or with preempt disabled.
  1372. */
  1373. static void invalidate_bh_lru(void *arg)
  1374. {
  1375. struct bh_lru *b = &get_cpu_var(bh_lrus);
  1376. __invalidate_bh_lrus(b);
  1377. put_cpu_var(bh_lrus);
  1378. }
  1379. bool has_bh_in_lru(int cpu, void *dummy)
  1380. {
  1381. struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
  1382. int i;
  1383. for (i = 0; i < BH_LRU_SIZE; i++) {
  1384. if (b->bhs[i])
  1385. return true;
  1386. }
  1387. return false;
  1388. }
  1389. void invalidate_bh_lrus(void)
  1390. {
  1391. on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1);
  1392. }
  1393. EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
  1394. /*
  1395. * It's called from workqueue context so we need a bh_lru_lock to close
  1396. * the race with preemption/irq.
  1397. */
  1398. void invalidate_bh_lrus_cpu(void)
  1399. {
  1400. struct bh_lru *b;
  1401. bh_lru_lock();
  1402. b = this_cpu_ptr(&bh_lrus);
  1403. __invalidate_bh_lrus(b);
  1404. bh_lru_unlock();
  1405. }
  1406. void folio_set_bh(struct buffer_head *bh, struct folio *folio,
  1407. unsigned long offset)
  1408. {
  1409. bh->b_folio = folio;
  1410. BUG_ON(offset >= folio_size(folio));
  1411. if (folio_test_highmem(folio))
  1412. /*
  1413. * This catches illegal uses and preserves the offset:
  1414. */
  1415. bh->b_data = (char *)(0 + offset);
  1416. else
  1417. bh->b_data = folio_address(folio) + offset;
  1418. }
  1419. EXPORT_SYMBOL(folio_set_bh);
  1420. /*
  1421. * Called when truncating a buffer on a page completely.
  1422. */
  1423. /* Bits that are cleared during an invalidate */
  1424. #define BUFFER_FLAGS_DISCARD \
  1425. (1 << BH_Mapped | 1 << BH_New | 1 << BH_Req | \
  1426. 1 << BH_Delay | 1 << BH_Unwritten)
  1427. static void discard_buffer(struct buffer_head * bh)
  1428. {
  1429. unsigned long b_state;
  1430. lock_buffer(bh);
  1431. clear_buffer_dirty(bh);
  1432. bh->b_bdev = NULL;
  1433. b_state = READ_ONCE(bh->b_state);
  1434. do {
  1435. } while (!try_cmpxchg_relaxed(&bh->b_state, &b_state,
  1436. b_state & ~BUFFER_FLAGS_DISCARD));
  1437. unlock_buffer(bh);
  1438. }
  1439. /**
  1440. * block_invalidate_folio - Invalidate part or all of a buffer-backed folio.
  1441. * @folio: The folio which is affected.
  1442. * @offset: start of the range to invalidate
  1443. * @length: length of the range to invalidate
  1444. *
  1445. * block_invalidate_folio() is called when all or part of the folio has been
  1446. * invalidated by a truncate operation.
  1447. *
  1448. * block_invalidate_folio() does not have to release all buffers, but it must
  1449. * ensure that no dirty buffer is left outside @offset and that no I/O
  1450. * is underway against any of the blocks which are outside the truncation
  1451. * point. Because the caller is about to free (and possibly reuse) those
  1452. * blocks on-disk.
  1453. */
  1454. void block_invalidate_folio(struct folio *folio, size_t offset, size_t length)
  1455. {
  1456. struct buffer_head *head, *bh, *next;
  1457. size_t curr_off = 0;
  1458. size_t stop = length + offset;
  1459. BUG_ON(!folio_test_locked(folio));
  1460. /*
  1461. * Check for overflow
  1462. */
  1463. BUG_ON(stop > folio_size(folio) || stop < length);
  1464. head = folio_buffers(folio);
  1465. if (!head)
  1466. return;
  1467. bh = head;
  1468. do {
  1469. size_t next_off = curr_off + bh->b_size;
  1470. next = bh->b_this_page;
  1471. /*
  1472. * Are we still fully in range ?
  1473. */
  1474. if (next_off > stop)
  1475. goto out;
  1476. /*
  1477. * is this block fully invalidated?
  1478. */
  1479. if (offset <= curr_off)
  1480. discard_buffer(bh);
  1481. curr_off = next_off;
  1482. bh = next;
  1483. } while (bh != head);
  1484. /*
  1485. * We release buffers only if the entire folio is being invalidated.
  1486. * The get_block cached value has been unconditionally invalidated,
  1487. * so real IO is not possible anymore.
  1488. */
  1489. if (length == folio_size(folio))
  1490. filemap_release_folio(folio, 0);
  1491. out:
  1492. folio_clear_mappedtodisk(folio);
  1493. }
  1494. EXPORT_SYMBOL(block_invalidate_folio);
  1495. /*
  1496. * We attach and possibly dirty the buffers atomically wrt
  1497. * block_dirty_folio() via i_private_lock. try_to_free_buffers
  1498. * is already excluded via the folio lock.
  1499. */
  1500. struct buffer_head *create_empty_buffers(struct folio *folio,
  1501. unsigned long blocksize, unsigned long b_state)
  1502. {
  1503. struct buffer_head *bh, *head, *tail;
  1504. gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT | __GFP_NOFAIL;
  1505. head = folio_alloc_buffers(folio, blocksize, gfp);
  1506. bh = head;
  1507. do {
  1508. bh->b_state |= b_state;
  1509. tail = bh;
  1510. bh = bh->b_this_page;
  1511. } while (bh);
  1512. tail->b_this_page = head;
  1513. spin_lock(&folio->mapping->i_private_lock);
  1514. if (folio_test_uptodate(folio) || folio_test_dirty(folio)) {
  1515. bh = head;
  1516. do {
  1517. if (folio_test_dirty(folio))
  1518. set_buffer_dirty(bh);
  1519. if (folio_test_uptodate(folio))
  1520. set_buffer_uptodate(bh);
  1521. bh = bh->b_this_page;
  1522. } while (bh != head);
  1523. }
  1524. folio_attach_private(folio, head);
  1525. spin_unlock(&folio->mapping->i_private_lock);
  1526. return head;
  1527. }
  1528. EXPORT_SYMBOL(create_empty_buffers);
  1529. /**
  1530. * clean_bdev_aliases: clean a range of buffers in block device
  1531. * @bdev: Block device to clean buffers in
  1532. * @block: Start of a range of blocks to clean
  1533. * @len: Number of blocks to clean
  1534. *
  1535. * We are taking a range of blocks for data and we don't want writeback of any
  1536. * buffer-cache aliases starting from return from this function and until the
  1537. * moment when something will explicitly mark the buffer dirty (hopefully that
  1538. * will not happen until we will free that block ;-) We don't even need to mark
  1539. * it not-uptodate - nobody can expect anything from a newly allocated buffer
  1540. * anyway. We used to use unmap_buffer() for such invalidation, but that was
  1541. * wrong. We definitely don't want to mark the alias unmapped, for example - it
  1542. * would confuse anyone who might pick it with bread() afterwards...
  1543. *
  1544. * Also.. Note that bforget() doesn't lock the buffer. So there can be
  1545. * writeout I/O going on against recently-freed buffers. We don't wait on that
  1546. * I/O in bforget() - it's more efficient to wait on the I/O only if we really
  1547. * need to. That happens here.
  1548. */
  1549. void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
  1550. {
  1551. struct address_space *bd_mapping = bdev->bd_mapping;
  1552. const int blkbits = bd_mapping->host->i_blkbits;
  1553. struct folio_batch fbatch;
  1554. pgoff_t index = ((loff_t)block << blkbits) / PAGE_SIZE;
  1555. pgoff_t end;
  1556. int i, count;
  1557. struct buffer_head *bh;
  1558. struct buffer_head *head;
  1559. end = ((loff_t)(block + len - 1) << blkbits) / PAGE_SIZE;
  1560. folio_batch_init(&fbatch);
  1561. while (filemap_get_folios(bd_mapping, &index, end, &fbatch)) {
  1562. count = folio_batch_count(&fbatch);
  1563. for (i = 0; i < count; i++) {
  1564. struct folio *folio = fbatch.folios[i];
  1565. if (!folio_buffers(folio))
  1566. continue;
  1567. /*
  1568. * We use folio lock instead of bd_mapping->i_private_lock
  1569. * to pin buffers here since we can afford to sleep and
  1570. * it scales better than a global spinlock lock.
  1571. */
  1572. folio_lock(folio);
  1573. /* Recheck when the folio is locked which pins bhs */
  1574. head = folio_buffers(folio);
  1575. if (!head)
  1576. goto unlock_page;
  1577. bh = head;
  1578. do {
  1579. if (!buffer_mapped(bh) || (bh->b_blocknr < block))
  1580. goto next;
  1581. if (bh->b_blocknr >= block + len)
  1582. break;
  1583. clear_buffer_dirty(bh);
  1584. wait_on_buffer(bh);
  1585. clear_buffer_req(bh);
  1586. next:
  1587. bh = bh->b_this_page;
  1588. } while (bh != head);
  1589. unlock_page:
  1590. folio_unlock(folio);
  1591. }
  1592. folio_batch_release(&fbatch);
  1593. cond_resched();
  1594. /* End of range already reached? */
  1595. if (index > end || !index)
  1596. break;
  1597. }
  1598. }
  1599. EXPORT_SYMBOL(clean_bdev_aliases);
  1600. static struct buffer_head *folio_create_buffers(struct folio *folio,
  1601. struct inode *inode,
  1602. unsigned int b_state)
  1603. {
  1604. struct buffer_head *bh;
  1605. BUG_ON(!folio_test_locked(folio));
  1606. bh = folio_buffers(folio);
  1607. if (!bh)
  1608. bh = create_empty_buffers(folio,
  1609. 1 << READ_ONCE(inode->i_blkbits), b_state);
  1610. return bh;
  1611. }
  1612. /*
  1613. * NOTE! All mapped/uptodate combinations are valid:
  1614. *
  1615. * Mapped Uptodate Meaning
  1616. *
  1617. * No No "unknown" - must do get_block()
  1618. * No Yes "hole" - zero-filled
  1619. * Yes No "allocated" - allocated on disk, not read in
  1620. * Yes Yes "valid" - allocated and up-to-date in memory.
  1621. *
  1622. * "Dirty" is valid only with the last case (mapped+uptodate).
  1623. */
  /*
   * Map and submit for writeback the dirty buffers of @folio.
   *
   * @inode:     inode handed to @get_block; its i_size determines the last
   *             block that is still written
   * @folio:     folio to write; it is unlocked before this function returns
   * @get_block: callback used to map dirty buffers that are unmapped or delayed
   * @wbc:       writeback control; supplies the write flags and sync mode
   *
   * Returns 0 on success, or the error returned by @get_block.  In the error
   * case the already-mapped dirty buffers are still submitted and the error
   * is also recorded on the folio's mapping.
   *
   * While block_write_full_folio is writing back the dirty buffers under
   * the page lock, whoever dirtied the buffers may decide to clean them
   * again at any time.  We handle that by only looking at the buffer
   * state inside lock_buffer().
   *
   * If block_write_full_folio() is called for regular writeback
   * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
   * locked buffer.   This only can happen if someone has written the buffer
   * directly, with submit_bh().  At the address_space level PageWriteback
   * prevents this contention from occurring.
   *
   * If block_write_full_folio() is called with wbc->sync_mode ==
   * WB_SYNC_ALL, the writes are posted using REQ_SYNC; this
   * causes the writes to be flagged as synchronous writes.
   */
  int __block_write_full_folio(struct inode *inode, struct folio *folio,
  			get_block_t *get_block, struct writeback_control *wbc)
  {
  	int err;
  	sector_t block;
  	sector_t last_block;
  	struct buffer_head *bh, *head;
  	size_t blocksize;
  	int nr_underway = 0;	/* number of buffers handed to submit_bh_wbc() */
  	blk_opf_t write_flags = wbc_to_write_flags(wbc);

  	/* Buffers created here start out dirty and uptodate. */
  	head = folio_create_buffers(folio, inode,
  				    (1 << BH_Dirty) | (1 << BH_Uptodate));

  	/*
  	 * Be very careful.  We have no exclusion from block_dirty_folio
  	 * here, and the (potentially unmapped) buffers may become dirty at
  	 * any time.  If a buffer becomes dirty here after we've inspected it
  	 * then we just miss that fact, and the folio stays dirty.
  	 *
  	 * Buffers outside i_size may be dirtied by block_dirty_folio;
  	 * handle that here by just cleaning them.
  	 */

  	bh = head;
  	blocksize = bh->b_size;

  	/* First block of this folio and last block that lies inside i_size. */
  	block = div_u64(folio_pos(folio), blocksize);
  	last_block = div_u64(i_size_read(inode) - 1, blocksize);

  	/*
  	 * Get all the dirty buffers mapped to disk addresses and
  	 * handle any aliases from the underlying blockdev's mapping.
  	 */
  	do {
  		if (block > last_block) {
  			/*
  			 * mapped buffers outside i_size will occur, because
  			 * this folio can be outside i_size when there is a
  			 * truncate in progress.
  			 */
  			/*
  			 * The buffer was zeroed by block_write_full_folio()
  			 */
  			clear_buffer_dirty(bh);
  			set_buffer_uptodate(bh);
  		} else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
  			   buffer_dirty(bh)) {
  			WARN_ON(bh->b_size != blocksize);
  			err = get_block(inode, block, bh, 1);
  			if (err)
  				goto recover;
  			clear_buffer_delay(bh);
  			if (buffer_new(bh)) {
  				/* blockdev mappings never come here */
  				clear_buffer_new(bh);
  				clean_bdev_bh_alias(bh);
  			}
  		}
  		bh = bh->b_this_page;
  		block++;
  	} while (bh != head);

  	/*
  	 * Second pass: bh is back at head (the loop above only ends once it
  	 * has wrapped around).  Lock each mapped buffer and tag the dirty ones
  	 * for async write.  Note that "continue" in a do-while jumps to the
  	 * loop condition, which is what advances bh.
  	 */
  	do {
  		if (!buffer_mapped(bh))
  			continue;
  		/*
  		 * If it's a fully non-blocking write attempt and we cannot
  		 * lock the buffer then redirty the folio.  Note that this can
  		 * potentially cause a busy-wait loop from writeback threads
  		 * and kswapd activity, but those code paths have their own
  		 * higher-level throttling.
  		 */
  		if (wbc->sync_mode != WB_SYNC_NONE) {
  			lock_buffer(bh);
  		} else if (!trylock_buffer(bh)) {
  			folio_redirty_for_writepage(wbc, folio);
  			continue;
  		}
  		if (test_clear_buffer_dirty(bh)) {
  			mark_buffer_async_write_endio(bh,
  				end_buffer_async_write);
  		} else {
  			/* Not dirty after all: nothing to write for this one. */
  			unlock_buffer(bh);
  		}
  	} while ((bh = bh->b_this_page) != head);

  	/*
  	 * The folio and its buffers are protected by the writeback flag,
  	 * so we can drop the bh refcounts early.
  	 */
  	BUG_ON(folio_test_writeback(folio));
  	folio_start_writeback(folio);

  	/* Third pass: submit every buffer tagged for async write above. */
  	do {
  		/* Read the link first; bh is not touched again after submit. */
  		struct buffer_head *next = bh->b_this_page;
  		if (buffer_async_write(bh)) {
  			submit_bh_wbc(REQ_OP_WRITE | write_flags, bh,
  				      inode->i_write_hint, wbc);
  			nr_underway++;
  		}
  		bh = next;
  	} while (bh != head);
  	folio_unlock(folio);

  	err = 0;
  done:
  	if (nr_underway == 0) {
  		/*
  		 * The folio was marked dirty, but the buffers were
  		 * clean.  Someone wrote them back by hand with
  		 * write_dirty_buffer/submit_bh.  A rare case.
  		 */
  		folio_end_writeback(folio);

  		/*
  		 * The folio and buffer_heads can be released at any time from
  		 * here on.
  		 */
  	}
  	return err;

  recover:
  	/*
  	 * ENOSPC, or some other error.  We may already have added some
  	 * blocks to the file, so we need to write these out to avoid
  	 * exposing stale data.
  	 * The folio is currently locked and not marked for writeback
  	 */
  	bh = head;
  	/* Recovery: lock and submit the mapped buffers */
  	do {
  		if (buffer_mapped(bh) && buffer_dirty(bh) &&
  		    !buffer_delay(bh)) {
  			lock_buffer(bh);
  			mark_buffer_async_write_endio(bh,
  				end_buffer_async_write);
  		} else {
  			/*
  			 * The buffer may have been set dirty during
  			 * attachment to a dirty folio.
  			 */
  			clear_buffer_dirty(bh);
  		}
  	} while ((bh = bh->b_this_page) != head);
  	BUG_ON(folio_test_writeback(folio));
  	/* Record the get_block() failure on the mapping before writing out. */
  	mapping_set_error(folio->mapping, err);
  	folio_start_writeback(folio);
  	do {
  		struct buffer_head *next = bh->b_this_page;
  		if (buffer_async_write(bh)) {
  			clear_buffer_dirty(bh);
  			submit_bh_wbc(REQ_OP_WRITE | write_flags, bh,
  				      inode->i_write_hint, wbc);
  			nr_underway++;
  		}
  		bh = next;
  	} while (bh != head);
  	folio_unlock(folio);
  	/* Shared exit: ends writeback if nothing was submitted, returns err. */
  	goto done;
  }
  EXPORT_SYMBOL(__block_write_full_folio);
  1791. /*
  1792. * If a folio has any new buffers, zero them out here, and mark them uptodate
  1793. * and dirty so they'll be written out (in order to prevent uninitialised
  1794. * block data from leaking). And clear the new bit.
  1795. */
  1796. void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to)
  1797. {
  1798. size_t block_start, block_end;
  1799. struct buffer_head *head, *bh;
  1800. BUG_ON(!folio_test_locked(folio));
  1801. head = folio_buffers(folio);
  1802. if (!head)
  1803. return;
  1804. bh = head;
  1805. block_start = 0;
  1806. do {
  1807. block_end = block_start + bh->b_size;
  1808. if (buffer_new(bh)) {
  1809. if (block_end > from && block_start < to) {
  1810. if (!folio_test_uptodate(folio)) {
  1811. size_t start, xend;
  1812. start = max(from, block_start);
  1813. xend = min(to, block_end);
  1814. folio_zero_segment(folio, start, xend);
  1815. set_buffer_uptodate(bh);
  1816. }
  1817. clear_buffer_new(bh);
  1818. mark_buffer_dirty(bh);
  1819. }
  1820. }
  1821. block_start = block_end;
  1822. bh = bh->b_this_page;
  1823. } while (bh != head);
  1824. }
  1825. EXPORT_SYMBOL(folio_zero_new_buffers);
  1826. static int
  1827. iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
  1828. const struct iomap *iomap)
  1829. {
  1830. loff_t offset = (loff_t)block << inode->i_blkbits;
  1831. bh->b_bdev = iomap->bdev;
  1832. /*
  1833. * Block points to offset in file we need to map, iomap contains
  1834. * the offset at which the map starts. If the map ends before the
  1835. * current block, then do not map the buffer and let the caller
  1836. * handle it.
  1837. */
  1838. if (offset >= iomap->offset + iomap->length)
  1839. return -EIO;
  1840. switch (iomap->type) {
  1841. case IOMAP_HOLE:
  1842. /*
  1843. * If the buffer is not up to date or beyond the current EOF,
  1844. * we need to mark it as new to ensure sub-block zeroing is
  1845. * executed if necessary.
  1846. */
  1847. if (!buffer_uptodate(bh) ||
  1848. (offset >= i_size_read(inode)))
  1849. set_buffer_new(bh);
  1850. return 0;
  1851. case IOMAP_DELALLOC:
  1852. if (!buffer_uptodate(bh) ||
  1853. (offset >= i_size_read(inode)))
  1854. set_buffer_new(bh);
  1855. set_buffer_uptodate(bh);
  1856. set_buffer_mapped(bh);
  1857. set_buffer_delay(bh);
  1858. return 0;
  1859. case IOMAP_UNWRITTEN:
  1860. /*
  1861. * For unwritten regions, we always need to ensure that regions
  1862. * in the block we are not writing to are zeroed. Mark the
  1863. * buffer as new to ensure this.
  1864. */
  1865. set_buffer_new(bh);
  1866. set_buffer_unwritten(bh);
  1867. fallthrough;
  1868. case IOMAP_MAPPED:
  1869. if ((iomap->flags & IOMAP_F_NEW) ||
  1870. offset >= i_size_read(inode)) {
  1871. /*
  1872. * This can happen if truncating the block device races
  1873. * with the check in the caller as i_size updates on
  1874. * block devices aren't synchronized by i_rwsem for
  1875. * block devices.
  1876. */
  1877. if (S_ISBLK(inode->i_mode))
  1878. return -EIO;
  1879. set_buffer_new(bh);
  1880. }
  1881. bh->b_blocknr = (iomap->addr + offset - iomap->offset) >>
  1882. inode->i_blkbits;
  1883. set_buffer_mapped(bh);
  1884. return 0;
  1885. default:
  1886. WARN_ON_ONCE(1);
  1887. return -EIO;
  1888. }
  1889. }
  /*
   * Prepare the buffers of a locked @folio for a write of @len bytes at file
   * position @pos.
   *
   * Every buffer overlapping the write range is mapped if necessary, through
   * @get_block when it is non-NULL and through iomap_to_bh() with @iomap
   * otherwise.  Newly allocated buffers get the parts outside the write range
   * zeroed, and partially overwritten buffers that are not uptodate are read
   * in.
   *
   * Returns 0 on success, the error from the mapping callback, or -EIO when
   * one of the reads fails.  On error, new buffers in the write range are
   * zeroed via folio_zero_new_buffers().
   */
  int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len,
  		get_block_t *get_block, const struct iomap *iomap)
  {
  	/* [from, to) is the write range as byte offsets inside the folio. */
  	size_t from = offset_in_folio(folio, pos);
  	size_t to = from + len;
  	struct inode *inode = folio->mapping->host;
  	size_t block_start, block_end;
  	sector_t block;
  	int err = 0;
  	size_t blocksize;
  	/*
  	 * Reads are only issued for buffers that stick out of [from, to) on
  	 * one side, i.e. at most the first and the last buffer of the range,
  	 * so two slots suffice.
  	 */
  	struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;

  	BUG_ON(!folio_test_locked(folio));
  	BUG_ON(to > folio_size(folio));
  	BUG_ON(from > to);

  	head = folio_create_buffers(folio, inode, 0);
  	blocksize = head->b_size;
  	block = div_u64(folio_pos(folio), blocksize);

  	/*
  	 * Walk the buffer ring once: "!block_start" lets the first iteration
  	 * run although bh == head, and the walk ends when bh wraps to head.
  	 */
  	for (bh = head, block_start = 0; bh != head || !block_start;
  	    block++, block_start=block_end, bh = bh->b_this_page) {
  		block_end = block_start + blocksize;
  		/* Buffer entirely outside the write range. */
  		if (block_end <= from || block_start >= to) {
  			if (folio_test_uptodate(folio)) {
  				if (!buffer_uptodate(bh))
  					set_buffer_uptodate(bh);
  			}
  			continue;
  		}
  		/* Drop any stale "new" flag before (re)mapping. */
  		if (buffer_new(bh))
  			clear_buffer_new(bh);
  		if (!buffer_mapped(bh)) {
  			WARN_ON(bh->b_size != blocksize);
  			if (get_block)
  				err = get_block(inode, block, bh, 1);
  			else
  				err = iomap_to_bh(inode, block, bh, iomap);
  			/* Reads already issued are still waited for below. */
  			if (err)
  				break;

  			if (buffer_new(bh)) {
  				clean_bdev_bh_alias(bh);
  				if (folio_test_uptodate(folio)) {
  					clear_buffer_new(bh);
  					set_buffer_uptodate(bh);
  					mark_buffer_dirty(bh);
  					continue;
  				}
  				/* Zero the parts of the buffer not being written. */
  				if (block_end > to || block_start < from)
  					folio_zero_segments(folio,
  						to, block_end,
  						block_start, from);
  				continue;
  			}
  		}
  		if (folio_test_uptodate(folio)) {
  			if (!buffer_uptodate(bh))
  				set_buffer_uptodate(bh);
  			continue;
  		}
  		/* Partially overwritten buffer whose old contents are needed. */
  		if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
  		    !buffer_unwritten(bh) &&
  		     (block_start < from || block_end > to)) {
  			bh_read_nowait(bh, 0);
  			*wait_bh++=bh;
  		}
  	}
  	/*
  	 * If we issued read requests - let them complete.
  	 */
  	while(wait_bh > wait) {
  		wait_on_buffer(*--wait_bh);
  		if (!buffer_uptodate(*wait_bh))
  			err = -EIO;
  	}
  	if (unlikely(err))
  		folio_zero_new_buffers(folio, from, to);
  	return err;
  }
  1966. int __block_write_begin(struct folio *folio, loff_t pos, unsigned len,
  1967. get_block_t *get_block)
  1968. {
  1969. return __block_write_begin_int(folio, pos, len, get_block, NULL);
  1970. }
  1971. EXPORT_SYMBOL(__block_write_begin);
  1972. void block_commit_write(struct folio *folio, size_t from, size_t to)
  1973. {
  1974. size_t block_start, block_end;
  1975. bool partial = false;
  1976. unsigned blocksize;
  1977. struct buffer_head *bh, *head;
  1978. bh = head = folio_buffers(folio);
  1979. if (!bh)
  1980. return;
  1981. blocksize = bh->b_size;
  1982. block_start = 0;
  1983. do {
  1984. block_end = block_start + blocksize;
  1985. if (block_end <= from || block_start >= to) {
  1986. if (!buffer_uptodate(bh))
  1987. partial = true;
  1988. } else {
  1989. set_buffer_uptodate(bh);
  1990. mark_buffer_dirty(bh);
  1991. }
  1992. if (buffer_new(bh))
  1993. clear_buffer_new(bh);
  1994. block_start = block_end;
  1995. bh = bh->b_this_page;
  1996. } while (bh != head);
  1997. /*
  1998. * If this is a partial write which happened to make all buffers
  1999. * uptodate then we can optimize away a bogus read_folio() for
  2000. * the next read(). Here we 'discover' whether the folio went
  2001. * uptodate as a result of this (potentially partial) write.
  2002. */
  2003. if (!partial)
  2004. folio_mark_uptodate(folio);
  2005. }
  2006. EXPORT_SYMBOL(block_commit_write);
  2007. /*
  2008. * block_write_begin takes care of the basic task of block allocation and
  2009. * bringing partial write blocks uptodate first.
  2010. *
  2011. * The filesystem needs to handle block truncation upon failure.
  2012. */
  2013. int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
  2014. struct folio **foliop, get_block_t *get_block)
  2015. {
  2016. pgoff_t index = pos >> PAGE_SHIFT;
  2017. struct folio *folio;
  2018. int status;
  2019. folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
  2020. mapping_gfp_mask(mapping));
  2021. if (IS_ERR(folio))
  2022. return PTR_ERR(folio);
  2023. status = __block_write_begin_int(folio, pos, len, get_block, NULL);
  2024. if (unlikely(status)) {
  2025. folio_unlock(folio);
  2026. folio_put(folio);
  2027. folio = NULL;
  2028. }
  2029. *foliop = folio;
  2030. return status;
  2031. }
  2032. EXPORT_SYMBOL(block_write_begin);
  2033. int block_write_end(loff_t pos, unsigned len, unsigned copied,
  2034. struct folio *folio)
  2035. {
  2036. size_t start = pos - folio_pos(folio);
  2037. if (unlikely(copied < len)) {
  2038. /*
  2039. * The buffers that were written will now be uptodate, so
  2040. * we don't have to worry about a read_folio reading them
  2041. * and overwriting a partial write. However if we have
  2042. * encountered a short write and only partially written
  2043. * into a buffer, it will not be marked uptodate, so a
  2044. * read_folio might come in and destroy our partial write.
  2045. *
  2046. * Do the simplest thing, and just treat any short write to a
  2047. * non uptodate folio as a zero-length write, and force the
  2048. * caller to redo the whole thing.
  2049. */
  2050. if (!folio_test_uptodate(folio))
  2051. copied = 0;
  2052. folio_zero_new_buffers(folio, start+copied, start+len);
  2053. }
  2054. flush_dcache_folio(folio);
  2055. /* This could be a short (even 0-length) commit */
  2056. block_commit_write(folio, start, start + copied);
  2057. return copied;
  2058. }
  2059. EXPORT_SYMBOL(block_write_end);
  2060. int generic_write_end(const struct kiocb *iocb, struct address_space *mapping,
  2061. loff_t pos, unsigned len, unsigned copied,
  2062. struct folio *folio, void *fsdata)
  2063. {
  2064. struct inode *inode = mapping->host;
  2065. loff_t old_size = inode->i_size;
  2066. bool i_size_changed = false;
  2067. copied = block_write_end(pos, len, copied, folio);
  2068. /*
  2069. * No need to use i_size_read() here, the i_size cannot change under us
  2070. * because we hold i_rwsem.
  2071. *
  2072. * But it's important to update i_size while still holding folio lock:
  2073. * page writeout could otherwise come in and zero beyond i_size.
  2074. */
  2075. if (pos + copied > inode->i_size) {
  2076. i_size_write(inode, pos + copied);
  2077. i_size_changed = true;
  2078. }
  2079. folio_unlock(folio);
  2080. folio_put(folio);
  2081. if (old_size < pos)
  2082. pagecache_isize_extended(inode, old_size, pos);
  2083. /*
  2084. * Don't mark the inode dirty under page lock. First, it unnecessarily
  2085. * makes the holding time of page lock longer. Second, it forces lock
  2086. * ordering of page lock and transaction start for journaling
  2087. * filesystems.
  2088. */
  2089. if (i_size_changed)
  2090. mark_inode_dirty(inode);
  2091. return copied;
  2092. }
  2093. EXPORT_SYMBOL(generic_write_end);
  2094. /*
  2095. * block_is_partially_uptodate checks whether buffers within a folio are
  2096. * uptodate or not.
  2097. *
  2098. * Returns true if all buffers which correspond to the specified part
  2099. * of the folio are uptodate.
  2100. */
  2101. bool block_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
  2102. {
  2103. unsigned block_start, block_end, blocksize;
  2104. unsigned to;
  2105. struct buffer_head *bh, *head;
  2106. bool ret = true;
  2107. head = folio_buffers(folio);
  2108. if (!head)
  2109. return false;
  2110. blocksize = head->b_size;
  2111. to = min(folio_size(folio) - from, count);
  2112. to = from + to;
  2113. if (from < blocksize && to > folio_size(folio) - blocksize)
  2114. return false;
  2115. bh = head;
  2116. block_start = 0;
  2117. do {
  2118. block_end = block_start + blocksize;
  2119. if (block_end > from && block_start < to) {
  2120. if (!buffer_uptodate(bh)) {
  2121. ret = false;
  2122. break;
  2123. }
  2124. if (block_end >= to)
  2125. break;
  2126. }
  2127. block_start = block_end;
  2128. bh = bh->b_this_page;
  2129. } while (bh != head);
  2130. return ret;
  2131. }
  2132. EXPORT_SYMBOL(block_is_partially_uptodate);
  /*
   * Generic "read_folio" function for block devices that have the normal
   * get_block functionality.  This is most of the block device filesystems.
   * Reads the folio asynchronously --- the unlock_buffer() and
   * set/clear_buffer_uptodate() functions propagate buffer state into the
   * folio once IO has completed.
   *
   * @folio:     folio to read
   * @get_block: callback used to map buffers that are not mapped yet
   *
   * Always returns 0.  A get_block() failure is reported through
   * folio_end_read() when no read had to be submitted.
   */
  int block_read_full_folio(struct folio *folio, get_block_t *get_block)
  {
  	struct inode *inode = folio->mapping->host;
  	sector_t iblock, lblock;
  	/* prev: buffer marked for async read but not yet submitted */
  	struct buffer_head *bh, *head, *prev = NULL;
  	size_t blocksize;
  	int fully_mapped = 1;
  	bool page_error = false;
  	loff_t limit = i_size_read(inode);

  	/* This is needed for ext4. */
  	if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode))
  		limit = inode->i_sb->s_maxbytes;

  	head = folio_create_buffers(folio, inode, 0);
  	blocksize = head->b_size;

  	/* First block of the folio, and first block at or beyond the limit. */
  	iblock = div_u64(folio_pos(folio), blocksize);
  	lblock = div_u64(limit + blocksize - 1, blocksize);
  	bh = head;

  	/*
  	 * "continue" in this do-while jumps to the loop condition, which
  	 * advances both iblock and bh.
  	 */
  	do {
  		if (buffer_uptodate(bh))
  			continue;

  		if (!buffer_mapped(bh)) {
  			int err = 0;

  			fully_mapped = 0;
  			/* Only blocks below the limit are looked up. */
  			if (iblock < lblock) {
  				WARN_ON(bh->b_size != blocksize);
  				err = get_block(inode, iblock, bh, 0);
  				if (err)
  					page_error = true;
  			}
  			/* Still unmapped: nothing to read, fill with zeroes. */
  			if (!buffer_mapped(bh)) {
  				folio_zero_range(folio, bh_offset(bh),
  						blocksize);
  				if (!err)
  					set_buffer_uptodate(bh);
  				continue;
  			}
  			/*
  			 * get_block() might have updated the buffer
  			 * synchronously
  			 */
  			if (buffer_uptodate(bh))
  				continue;
  		}

  		lock_buffer(bh);
  		/* Recheck now that the buffer is locked. */
  		if (buffer_uptodate(bh)) {
  			unlock_buffer(bh);
  			continue;
  		}

  		mark_buffer_async_read(bh);
  		/*
  		 * Submission runs one buffer behind: the previously marked
  		 * buffer goes out only once the next one has been marked.
  		 */
  		if (prev)
  			submit_bh(REQ_OP_READ, prev);
  		prev = bh;
  	} while (iblock++, (bh = bh->b_this_page) != head);

  	if (fully_mapped)
  		folio_set_mappedtodisk(folio);

  	/*
  	 * All buffers are uptodate or get_block() returned an error
  	 * when trying to map them - we must finish the read because
  	 * end_buffer_async_read() will never be called on any buffer
  	 * in this folio.
  	 */
  	if (prev)
  		submit_bh(REQ_OP_READ, prev);
  	else
  		folio_end_read(folio, !page_error);

  	return 0;
  }
  EXPORT_SYMBOL(block_read_full_folio);
  2208. /* utility function for filesystems that need to do work on expanding
  2209. * truncates. Uses filesystem pagecache writes to allow the filesystem to
  2210. * deal with the hole.
  2211. */
  2212. int generic_cont_expand_simple(struct inode *inode, loff_t size)
  2213. {
  2214. struct address_space *mapping = inode->i_mapping;
  2215. const struct address_space_operations *aops = mapping->a_ops;
  2216. struct folio *folio;
  2217. void *fsdata = NULL;
  2218. int err;
  2219. err = inode_newsize_ok(inode, size);
  2220. if (err)
  2221. goto out;
  2222. err = aops->write_begin(NULL, mapping, size, 0, &folio, &fsdata);
  2223. if (err)
  2224. goto out;
  2225. err = aops->write_end(NULL, mapping, size, 0, 0, folio, fsdata);
  2226. BUG_ON(err > 0);
  2227. out:
  2228. return err;
  2229. }
  2230. EXPORT_SYMBOL(generic_cont_expand_simple);
  /*
   * Zero-fill the file from *@bytes up to @pos, one page at a time, through
   * the address_space's write_begin()/write_end() operations.
   *
   * @iocb:    passed through to write_begin()/write_end()
   * @mapping: address space being extended
   * @pos:     position up to which the file must be zero-filled
   * @bytes:   in/out position from which zeroing starts; rounded up to the
   *           next block boundary here when it is not block aligned
   *
   * Returns 0 on success, a negative error from write_begin()/write_end(), or
   * -EINTR when a fatal signal is pending.
   */
  static int cont_expand_zero(const struct kiocb *iocb,
  			    struct address_space *mapping,
  			    loff_t pos, loff_t *bytes)
  {
  	struct inode *inode = mapping->host;
  	const struct address_space_operations *aops = mapping->a_ops;
  	unsigned int blocksize = i_blocksize(inode);
  	struct folio *folio;
  	void *fsdata = NULL;
  	pgoff_t index, curidx;
  	loff_t curpos;
  	unsigned zerofrom, offset, len;
  	int err = 0;

  	/* Page index and in-page offset of the target position. */
  	index = pos >> PAGE_SHIFT;
  	offset = pos & ~PAGE_MASK;

  	/*
  	 * Zero whole page tails for as long as *bytes lies on a page before
  	 * the one containing pos.  *bytes is re-read on every pass.
  	 * NOTE(review): presumably write_end() advances *bytes (otherwise this
  	 * loop could not make progress) - confirm against the callers.
  	 */
  	while (index > (curidx = (curpos = *bytes)>>PAGE_SHIFT)) {
  		zerofrom = curpos & ~PAGE_MASK;
  		/* Round *bytes up to the next block boundary. */
  		if (zerofrom & (blocksize-1)) {
  			*bytes |= (blocksize-1);
  			(*bytes)++;
  		}
  		/* Zero from curpos to the end of its page. */
  		len = PAGE_SIZE - zerofrom;

  		err = aops->write_begin(iocb, mapping, curpos, len,
  					    &folio, &fsdata);
  		if (err)
  			goto out;
  		folio_zero_range(folio, offset_in_folio(folio, curpos), len);
  		err = aops->write_end(iocb, mapping, curpos, len, len,
  						folio, fsdata);
  		if (err < 0)
  			goto out;
  		BUG_ON(err != len);
  		err = 0;

  		balance_dirty_pages_ratelimited(mapping);

  		if (fatal_signal_pending(current)) {
  			err = -EINTR;
  			goto out;
  		}
  	}

  	/* page covers the boundary, find the boundary offset */
  	if (index == curidx) {
  		zerofrom = curpos & ~PAGE_MASK;
  		/* if we will expand the thing last block will be filled */
  		if (offset <= zerofrom) {
  			goto out;
  		}
  		if (zerofrom & (blocksize-1)) {
  			*bytes |= (blocksize-1);
  			(*bytes)++;
  		}
  		/* Zero only the gap between curpos and pos within this page. */
  		len = offset - zerofrom;

  		err = aops->write_begin(iocb, mapping, curpos, len,
  					    &folio, &fsdata);
  		if (err)
  			goto out;
  		folio_zero_range(folio, offset_in_folio(folio, curpos), len);
  		err = aops->write_end(iocb, mapping, curpos, len, len,
  						folio, fsdata);
  		if (err < 0)
  			goto out;
  		BUG_ON(err != len);
  		err = 0;
  	}
  out:
  	return err;
  }
  2297. /*
  2298. * For moronic filesystems that do not allow holes in file.
  2299. * We may have to extend the file.
  2300. */
  2301. int cont_write_begin(const struct kiocb *iocb, struct address_space *mapping,
  2302. loff_t pos, unsigned len, struct folio **foliop,
  2303. void **fsdata, get_block_t *get_block, loff_t *bytes)
  2304. {
  2305. struct inode *inode = mapping->host;
  2306. unsigned int blocksize = i_blocksize(inode);
  2307. unsigned int zerofrom;
  2308. int err;
  2309. err = cont_expand_zero(iocb, mapping, pos, bytes);
  2310. if (err)
  2311. return err;
  2312. zerofrom = *bytes & ~PAGE_MASK;
  2313. if (pos+len > *bytes && zerofrom & (blocksize-1)) {
  2314. *bytes |= (blocksize-1);
  2315. (*bytes)++;
  2316. }
  2317. return block_write_begin(mapping, pos, len, foliop, get_block);
  2318. }
  2319. EXPORT_SYMBOL(cont_write_begin);
  2320. /*
  2321. * block_page_mkwrite() is not allowed to change the file size as it gets
  2322. * called from a page fault handler when a page is first dirtied. Hence we must
  2323. * be careful to check for EOF conditions here. We set the page up correctly
  2324. * for a written page which means we get ENOSPC checking when writing into
  2325. * holes and correct delalloc and unwritten extent mapping on filesystems that
  2326. * support these features.
  2327. *
  2328. * We are not allowed to take the i_rwsem here so we have to play games to
  2329. * protect against truncate races as the page could now be beyond EOF. Because
  2330. * truncate writes the inode size before removing pages, once we have the
  2331. * page lock we can determine safely if the page is beyond EOF. If it is not
  2332. * beyond EOF, then the page is guaranteed safe against truncation until we
  2333. * unlock the page.
  2334. *
  2335. * Direct callers of this function should protect against filesystem freezing
  2336. * using sb_start_pagefault() - sb_end_pagefault() functions.
  2337. */
  2338. int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
  2339. get_block_t get_block)
  2340. {
  2341. struct folio *folio = page_folio(vmf->page);
  2342. struct inode *inode = file_inode(vma->vm_file);
  2343. unsigned long end;
  2344. loff_t size;
  2345. int ret;
  2346. folio_lock(folio);
  2347. size = i_size_read(inode);
  2348. if ((folio->mapping != inode->i_mapping) ||
  2349. (folio_pos(folio) >= size)) {
  2350. /* We overload EFAULT to mean page got truncated */
  2351. ret = -EFAULT;
  2352. goto out_unlock;
  2353. }
  2354. end = folio_size(folio);
  2355. /* folio is wholly or partially inside EOF */
  2356. if (folio_pos(folio) + end > size)
  2357. end = size - folio_pos(folio);
  2358. ret = __block_write_begin_int(folio, 0, end, get_block, NULL);
  2359. if (unlikely(ret))
  2360. goto out_unlock;
  2361. block_commit_write(folio, 0, end);
  2362. folio_mark_dirty(folio);
  2363. folio_wait_stable(folio);
  2364. return 0;
  2365. out_unlock:
  2366. folio_unlock(folio);
  2367. return ret;
  2368. }
  2369. EXPORT_SYMBOL(block_page_mkwrite);
  2370. int block_truncate_page(struct address_space *mapping,
  2371. loff_t from, get_block_t *get_block)
  2372. {
  2373. pgoff_t index = from >> PAGE_SHIFT;
  2374. unsigned blocksize;
  2375. sector_t iblock;
  2376. size_t offset, length, pos;
  2377. struct inode *inode = mapping->host;
  2378. struct folio *folio;
  2379. struct buffer_head *bh;
  2380. int err = 0;
  2381. blocksize = i_blocksize(inode);
  2382. length = from & (blocksize - 1);
  2383. /* Block boundary? Nothing to do */
  2384. if (!length)
  2385. return 0;
  2386. length = blocksize - length;
  2387. iblock = ((loff_t)index * PAGE_SIZE) >> inode->i_blkbits;
  2388. folio = filemap_grab_folio(mapping, index);
  2389. if (IS_ERR(folio))
  2390. return PTR_ERR(folio);
  2391. bh = folio_buffers(folio);
  2392. if (!bh)
  2393. bh = create_empty_buffers(folio, blocksize, 0);
  2394. /* Find the buffer that contains "offset" */
  2395. offset = offset_in_folio(folio, from);
  2396. pos = blocksize;
  2397. while (offset >= pos) {
  2398. bh = bh->b_this_page;
  2399. iblock++;
  2400. pos += blocksize;
  2401. }
  2402. if (!buffer_mapped(bh)) {
  2403. WARN_ON(bh->b_size != blocksize);
  2404. err = get_block(inode, iblock, bh, 0);
  2405. if (err)
  2406. goto unlock;
  2407. /* unmapped? It's a hole - nothing to do */
  2408. if (!buffer_mapped(bh))
  2409. goto unlock;
  2410. }
  2411. /* Ok, it's mapped. Make sure it's up-to-date */
  2412. if (folio_test_uptodate(folio))
  2413. set_buffer_uptodate(bh);
  2414. if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
  2415. err = bh_read(bh, 0);
  2416. /* Uhhuh. Read error. Complain and punt. */
  2417. if (err < 0)
  2418. goto unlock;
  2419. }
  2420. folio_zero_range(folio, offset, length);
  2421. mark_buffer_dirty(bh);
  2422. unlock:
  2423. folio_unlock(folio);
  2424. folio_put(folio);
  2425. return err;
  2426. }
  2427. EXPORT_SYMBOL(block_truncate_page);
  2428. /*
  2429. * The generic write folio function for buffer-backed address_spaces
  2430. */
  2431. int block_write_full_folio(struct folio *folio, struct writeback_control *wbc,
  2432. void *get_block)
  2433. {
  2434. struct inode * const inode = folio->mapping->host;
  2435. loff_t i_size = i_size_read(inode);
  2436. /* Is the folio fully inside i_size? */
  2437. if (folio_next_pos(folio) <= i_size)
  2438. return __block_write_full_folio(inode, folio, get_block, wbc);
  2439. /* Is the folio fully outside i_size? (truncate in progress) */
  2440. if (folio_pos(folio) >= i_size) {
  2441. folio_unlock(folio);
  2442. return 0; /* don't care */
  2443. }
  2444. /*
  2445. * The folio straddles i_size. It must be zeroed out on each and every
  2446. * writeback invocation because it may be mmapped. "A file is mapped
  2447. * in multiples of the page size. For a file that is not a multiple of
  2448. * the page size, the remaining memory is zeroed when mapped, and
  2449. * writes to that region are not written out to the file."
  2450. */
  2451. folio_zero_segment(folio, offset_in_folio(folio, i_size),
  2452. folio_size(folio));
  2453. return __block_write_full_folio(inode, folio, get_block, wbc);
  2454. }
  2455. sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
  2456. get_block_t *get_block)
  2457. {
  2458. struct inode *inode = mapping->host;
  2459. struct buffer_head tmp = {
  2460. .b_size = i_blocksize(inode),
  2461. };
  2462. get_block(inode, block, &tmp, 0);
  2463. return tmp.b_blocknr;
  2464. }
  2465. EXPORT_SYMBOL(generic_block_bmap);
  2466. static void end_bio_bh_io_sync(struct bio *bio)
  2467. {
  2468. struct buffer_head *bh = bio->bi_private;
  2469. if (unlikely(bio_flagged(bio, BIO_QUIET)))
  2470. set_bit(BH_Quiet, &bh->b_state);
  2471. bh->b_end_io(bh, !bio->bi_status);
  2472. bio_put(bio);
  2473. }
  2474. static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
  2475. enum rw_hint write_hint,
  2476. struct writeback_control *wbc)
  2477. {
  2478. const enum req_op op = opf & REQ_OP_MASK;
  2479. struct bio *bio;
  2480. BUG_ON(!buffer_locked(bh));
  2481. BUG_ON(!buffer_mapped(bh));
  2482. BUG_ON(!bh->b_end_io);
  2483. BUG_ON(buffer_delay(bh));
  2484. BUG_ON(buffer_unwritten(bh));
  2485. /*
  2486. * Only clear out a write error when rewriting
  2487. */
  2488. if (test_set_buffer_req(bh) && (op == REQ_OP_WRITE))
  2489. clear_buffer_write_io_error(bh);
  2490. if (buffer_meta(bh))
  2491. opf |= REQ_META;
  2492. if (buffer_prio(bh))
  2493. opf |= REQ_PRIO;
  2494. bio = bio_alloc(bh->b_bdev, 1, opf, GFP_NOIO);
  2495. fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
  2496. bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
  2497. bio->bi_write_hint = write_hint;
  2498. bio_add_folio_nofail(bio, bh->b_folio, bh->b_size, bh_offset(bh));
  2499. bio->bi_end_io = end_bio_bh_io_sync;
  2500. bio->bi_private = bh;
  2501. /* Take care of bh's that straddle the end of the device */
  2502. guard_bio_eod(bio);
  2503. if (wbc) {
  2504. wbc_init_bio(wbc, bio);
  2505. wbc_account_cgroup_owner(wbc, bh->b_folio, bh->b_size);
  2506. }
  2507. blk_crypto_submit_bio(bio);
  2508. }
  2509. void submit_bh(blk_opf_t opf, struct buffer_head *bh)
  2510. {
  2511. submit_bh_wbc(opf, bh, WRITE_LIFE_NOT_SET, NULL);
  2512. }
  2513. EXPORT_SYMBOL(submit_bh);
  2514. void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags)
  2515. {
  2516. lock_buffer(bh);
  2517. if (!test_clear_buffer_dirty(bh)) {
  2518. unlock_buffer(bh);
  2519. return;
  2520. }
  2521. bh->b_end_io = end_buffer_write_sync;
  2522. get_bh(bh);
  2523. submit_bh(REQ_OP_WRITE | op_flags, bh);
  2524. }
  2525. EXPORT_SYMBOL(write_dirty_buffer);
  2526. /*
  2527. * For a data-integrity writeout, we need to wait upon any in-progress I/O
  2528. * and then start new I/O and then wait upon it. The caller must have a ref on
  2529. * the buffer_head.
  2530. */
  2531. int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags)
  2532. {
  2533. WARN_ON(atomic_read(&bh->b_count) < 1);
  2534. lock_buffer(bh);
  2535. if (test_clear_buffer_dirty(bh)) {
  2536. /*
  2537. * The bh should be mapped, but it might not be if the
  2538. * device was hot-removed. Not much we can do but fail the I/O.
  2539. */
  2540. if (!buffer_mapped(bh)) {
  2541. unlock_buffer(bh);
  2542. return -EIO;
  2543. }
  2544. get_bh(bh);
  2545. bh->b_end_io = end_buffer_write_sync;
  2546. submit_bh(REQ_OP_WRITE | op_flags, bh);
  2547. wait_on_buffer(bh);
  2548. if (!buffer_uptodate(bh))
  2549. return -EIO;
  2550. } else {
  2551. unlock_buffer(bh);
  2552. }
  2553. return 0;
  2554. }
  2555. EXPORT_SYMBOL(__sync_dirty_buffer);
  2556. int sync_dirty_buffer(struct buffer_head *bh)
  2557. {
  2558. return __sync_dirty_buffer(bh, REQ_SYNC);
  2559. }
  2560. EXPORT_SYMBOL(sync_dirty_buffer);
  2561. static inline int buffer_busy(struct buffer_head *bh)
  2562. {
  2563. return atomic_read(&bh->b_count) |
  2564. (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
  2565. }
  2566. static bool
  2567. drop_buffers(struct folio *folio, struct buffer_head **buffers_to_free)
  2568. {
  2569. struct buffer_head *head = folio_buffers(folio);
  2570. struct buffer_head *bh;
  2571. bh = head;
  2572. do {
  2573. if (buffer_busy(bh))
  2574. goto failed;
  2575. bh = bh->b_this_page;
  2576. } while (bh != head);
  2577. do {
  2578. struct buffer_head *next = bh->b_this_page;
  2579. if (bh->b_assoc_map)
  2580. __remove_assoc_queue(bh);
  2581. bh = next;
  2582. } while (bh != head);
  2583. *buffers_to_free = head;
  2584. folio_detach_private(folio);
  2585. return true;
  2586. failed:
  2587. return false;
  2588. }
  2589. /**
  2590. * try_to_free_buffers - Release buffers attached to this folio.
  2591. * @folio: The folio.
  2592. *
  2593. * If any buffers are in use (dirty, under writeback, elevated refcount),
  2594. * no buffers will be freed.
  2595. *
  2596. * If the folio is dirty but all the buffers are clean then we need to
  2597. * be sure to mark the folio clean as well. This is because the folio
  2598. * may be against a block device, and a later reattachment of buffers
  2599. * to a dirty folio will set *all* buffers dirty. Which would corrupt
  2600. * filesystem data on the same device.
  2601. *
  2602. * The same applies to regular filesystem folios: if all the buffers are
  2603. * clean then we set the folio clean and proceed. To do that, we require
  2604. * total exclusion from block_dirty_folio(). That is obtained with
  2605. * i_private_lock.
  2606. *
  2607. * Exclusion against try_to_free_buffers may be obtained by either
  2608. * locking the folio or by holding its mapping's i_private_lock.
  2609. *
  2610. * Context: Process context. @folio must be locked. Will not sleep.
  2611. * Return: true if all buffers attached to this folio were freed.
  2612. */
  2613. bool try_to_free_buffers(struct folio *folio)
  2614. {
  2615. struct address_space * const mapping = folio->mapping;
  2616. struct buffer_head *buffers_to_free = NULL;
  2617. bool ret = 0;
  2618. BUG_ON(!folio_test_locked(folio));
  2619. if (folio_test_writeback(folio))
  2620. return false;
  2621. /* Misconfigured folio check */
  2622. if (WARN_ON_ONCE(!folio_buffers(folio)))
  2623. return true;
  2624. if (mapping == NULL) { /* can this still happen? */
  2625. ret = drop_buffers(folio, &buffers_to_free);
  2626. goto out;
  2627. }
  2628. spin_lock(&mapping->i_private_lock);
  2629. ret = drop_buffers(folio, &buffers_to_free);
  2630. /*
  2631. * If the filesystem writes its buffers by hand (eg ext3)
  2632. * then we can have clean buffers against a dirty folio. We
  2633. * clean the folio here; otherwise the VM will never notice
  2634. * that the filesystem did any IO at all.
  2635. *
  2636. * Also, during truncate, discard_buffer will have marked all
  2637. * the folio's buffers clean. We discover that here and clean
  2638. * the folio also.
  2639. *
  2640. * i_private_lock must be held over this entire operation in order
  2641. * to synchronise against block_dirty_folio and prevent the
  2642. * dirty bit from being lost.
  2643. */
  2644. if (ret)
  2645. folio_cancel_dirty(folio);
  2646. spin_unlock(&mapping->i_private_lock);
  2647. out:
  2648. if (buffers_to_free) {
  2649. struct buffer_head *bh = buffers_to_free;
  2650. do {
  2651. struct buffer_head *next = bh->b_this_page;
  2652. free_buffer_head(bh);
  2653. bh = next;
  2654. } while (bh != buffers_to_free);
  2655. }
  2656. return ret;
  2657. }
  2658. EXPORT_SYMBOL(try_to_free_buffers);
  2659. /*
  2660. * Buffer-head allocation
  2661. */
  2662. static struct kmem_cache *bh_cachep __ro_after_init;
  2663. /*
  2664. * Once the number of bh's in the machine exceeds this level, we start
  2665. * stripping them in writeback.
  2666. */
  2667. static unsigned long max_buffer_heads __ro_after_init;
  2668. int buffer_heads_over_limit;
  2669. struct bh_accounting {
  2670. int nr; /* Number of live bh's */
  2671. int ratelimit; /* Limit cacheline bouncing */
  2672. };
  2673. static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
  2674. static void recalc_bh_state(void)
  2675. {
  2676. int i;
  2677. int tot = 0;
  2678. if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
  2679. return;
  2680. __this_cpu_write(bh_accounting.ratelimit, 0);
  2681. for_each_online_cpu(i)
  2682. tot += per_cpu(bh_accounting, i).nr;
  2683. buffer_heads_over_limit = (tot > max_buffer_heads);
  2684. }
  2685. struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
  2686. {
  2687. struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
  2688. if (ret) {
  2689. INIT_LIST_HEAD(&ret->b_assoc_buffers);
  2690. spin_lock_init(&ret->b_uptodate_lock);
  2691. preempt_disable();
  2692. __this_cpu_inc(bh_accounting.nr);
  2693. recalc_bh_state();
  2694. preempt_enable();
  2695. }
  2696. return ret;
  2697. }
  2698. EXPORT_SYMBOL(alloc_buffer_head);
  2699. void free_buffer_head(struct buffer_head *bh)
  2700. {
  2701. BUG_ON(!list_empty(&bh->b_assoc_buffers));
  2702. kmem_cache_free(bh_cachep, bh);
  2703. preempt_disable();
  2704. __this_cpu_dec(bh_accounting.nr);
  2705. recalc_bh_state();
  2706. preempt_enable();
  2707. }
  2708. EXPORT_SYMBOL(free_buffer_head);
  2709. static int buffer_exit_cpu_dead(unsigned int cpu)
  2710. {
  2711. int i;
  2712. struct bh_lru *b = &per_cpu(bh_lrus, cpu);
  2713. for (i = 0; i < BH_LRU_SIZE; i++) {
  2714. brelse(b->bhs[i]);
  2715. b->bhs[i] = NULL;
  2716. }
  2717. this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
  2718. per_cpu(bh_accounting, cpu).nr = 0;
  2719. return 0;
  2720. }
  2721. /**
  2722. * bh_uptodate_or_lock - Test whether the buffer is uptodate
  2723. * @bh: struct buffer_head
  2724. *
  2725. * Return true if the buffer is up-to-date and false,
  2726. * with the buffer locked, if not.
  2727. */
  2728. int bh_uptodate_or_lock(struct buffer_head *bh)
  2729. {
  2730. if (!buffer_uptodate(bh)) {
  2731. lock_buffer(bh);
  2732. if (!buffer_uptodate(bh))
  2733. return 0;
  2734. unlock_buffer(bh);
  2735. }
  2736. return 1;
  2737. }
  2738. EXPORT_SYMBOL(bh_uptodate_or_lock);
  2739. /**
  2740. * __bh_read - Submit read for a locked buffer
  2741. * @bh: struct buffer_head
  2742. * @op_flags: appending REQ_OP_* flags besides REQ_OP_READ
  2743. * @wait: wait until reading finish
  2744. *
  2745. * Returns zero on success or don't wait, and -EIO on error.
  2746. */
  2747. int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait)
  2748. {
  2749. int ret = 0;
  2750. BUG_ON(!buffer_locked(bh));
  2751. get_bh(bh);
  2752. bh->b_end_io = end_buffer_read_sync;
  2753. submit_bh(REQ_OP_READ | op_flags, bh);
  2754. if (wait) {
  2755. wait_on_buffer(bh);
  2756. if (!buffer_uptodate(bh))
  2757. ret = -EIO;
  2758. }
  2759. return ret;
  2760. }
  2761. EXPORT_SYMBOL(__bh_read);
  2762. /**
  2763. * __bh_read_batch - Submit read for a batch of unlocked buffers
  2764. * @nr: entry number of the buffer batch
  2765. * @bhs: a batch of struct buffer_head
  2766. * @op_flags: appending REQ_OP_* flags besides REQ_OP_READ
  2767. * @force_lock: force to get a lock on the buffer if set, otherwise drops any
  2768. * buffer that cannot lock.
  2769. *
  2770. * Returns zero on success or don't wait, and -EIO on error.
  2771. */
  2772. void __bh_read_batch(int nr, struct buffer_head *bhs[],
  2773. blk_opf_t op_flags, bool force_lock)
  2774. {
  2775. int i;
  2776. for (i = 0; i < nr; i++) {
  2777. struct buffer_head *bh = bhs[i];
  2778. if (buffer_uptodate(bh))
  2779. continue;
  2780. if (force_lock)
  2781. lock_buffer(bh);
  2782. else
  2783. if (!trylock_buffer(bh))
  2784. continue;
  2785. if (buffer_uptodate(bh)) {
  2786. unlock_buffer(bh);
  2787. continue;
  2788. }
  2789. bh->b_end_io = end_buffer_read_sync;
  2790. get_bh(bh);
  2791. submit_bh(REQ_OP_READ | op_flags, bh);
  2792. }
  2793. }
  2794. EXPORT_SYMBOL(__bh_read_batch);
  2795. void __init buffer_init(void)
  2796. {
  2797. unsigned long nrpages;
  2798. int ret;
  2799. bh_cachep = KMEM_CACHE(buffer_head,
  2800. SLAB_RECLAIM_ACCOUNT|SLAB_PANIC);
  2801. /*
  2802. * Limit the bh occupancy to 10% of ZONE_NORMAL
  2803. */
  2804. nrpages = (nr_free_buffer_pages() * 10) / 100;
  2805. max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
  2806. ret = cpuhp_setup_state_nocalls(CPUHP_FS_BUFF_DEAD, "fs/buffer:dead",
  2807. NULL, buffer_exit_cpu_dead);
  2808. WARN_ON(ret < 0);
  2809. }