bio.c 53 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Copyright (C) 2001 Jens Axboe <axboe@kernel.dk>
  4. */
  5. #include <linux/mm.h>
  6. #include <linux/swap.h>
  7. #include <linux/bio-integrity.h>
  8. #include <linux/blkdev.h>
  9. #include <linux/uio.h>
  10. #include <linux/iocontext.h>
  11. #include <linux/slab.h>
  12. #include <linux/init.h>
  13. #include <linux/kernel.h>
  14. #include <linux/export.h>
  15. #include <linux/mempool.h>
  16. #include <linux/workqueue.h>
  17. #include <linux/cgroup.h>
  18. #include <linux/highmem.h>
  19. #include <linux/blk-crypto.h>
  20. #include <linux/xarray.h>
  21. #include <trace/events/block.h>
  22. #include "blk.h"
  23. #include "blk-rq-qos.h"
  24. #include "blk-cgroup.h"
/*
 * Per-cpu bio cache tuning.
 *
 * ALLOC_CACHE_THRESHOLD: when the task-side list is empty and at least this
 * many bios sit on the irq-side list, the allocator splices them over.
 * ALLOC_CACHE_MAX: once both lists together hold more than this many bios,
 * a put frees the bio instead of caching it.
 */
#define ALLOC_CACHE_THRESHOLD	16
#define ALLOC_CACHE_MAX		256

/* Cache of free bios; accessed through per_cpu_ptr() on a bio_set's ->cache. */
struct bio_alloc_cache {
	/* Singly linked (via bi_next) list filled from task context. */
	struct bio		*free_list;
	/* Singly linked (via bi_next) list filled from hard-irq context. */
	struct bio		*free_list_irq;
	/* Number of bios on free_list. */
	unsigned int		nr;
	/* Number of bios on free_list_irq. */
	unsigned int		nr_irq;
};
/*
 * Size classes for out-of-line bio_vec arrays, in ascending capacity.
 * biovec_slab() selects an entry by array index, so the order of the entries
 * must stay in sync with that function.
 *
 * The ->slab pointers are not set by this initializer; they are presumably
 * created during init elsewhere in the file (not visible here).
 */
static struct biovec_slab {
	int nr_vecs;			/* capacity of one allocation, in bio_vecs */
	char *name;			/* label for this class (presumably the kmem_cache name) */
	struct kmem_cache *slab;	/* cache that allocations of this class come from */
} bvec_slabs[] __read_mostly = {
	{ .nr_vecs = 16, .name = "biovec-16" },
	{ .nr_vecs = 64, .name = "biovec-64" },
	{ .nr_vecs = 128, .name = "biovec-128" },
	{ .nr_vecs = BIO_MAX_VECS, .name = "biovec-max" },
};
  43. static struct biovec_slab *biovec_slab(unsigned short nr_vecs)
  44. {
  45. switch (nr_vecs) {
  46. /* smaller bios use inline vecs */
  47. case 5 ... 16:
  48. return &bvec_slabs[0];
  49. case 17 ... 64:
  50. return &bvec_slabs[1];
  51. case 65 ... 128:
  52. return &bvec_slabs[2];
  53. case 129 ... BIO_MAX_VECS:
  54. return &bvec_slabs[3];
  55. default:
  56. BUG();
  57. return NULL;
  58. }
  59. }
/*
 * fs_bio_set is the shared bio_set, holding the bio and iovec memory pools,
 * for I/O code that does not need private memory pools of its own.
 */
struct bio_set fs_bio_set;
EXPORT_SYMBOL(fs_bio_set);
/*
 * Our slab pool management
 *
 * bio_sets whose bios have the same total size (front_pad + struct bio +
 * back_pad) share one kmem_cache. Each cache is tracked by a refcounted
 * struct bio_slab stored in the bio_slabs xarray, indexed by that size.
 */
struct bio_slab {
	struct kmem_cache *slab;	/* the shared cache */
	unsigned int slab_ref;		/* number of users; the cache is destroyed at zero */
	unsigned int slab_size;		/* object size, also the xarray index */
	char name[12];			/* cache name, formatted as "bio-<size>" */
};
/* Held around lookup, creation, refcounting and teardown of bio_slabs entries. */
static DEFINE_MUTEX(bio_slab_lock);
/* Maps object size to its struct bio_slab. */
static DEFINE_XARRAY(bio_slabs);
  77. static struct bio_slab *create_bio_slab(unsigned int size)
  78. {
  79. struct bio_slab *bslab = kzalloc_obj(*bslab);
  80. if (!bslab)
  81. return NULL;
  82. snprintf(bslab->name, sizeof(bslab->name), "bio-%d", size);
  83. bslab->slab = kmem_cache_create(bslab->name, size,
  84. ARCH_KMALLOC_MINALIGN,
  85. SLAB_HWCACHE_ALIGN | SLAB_TYPESAFE_BY_RCU, NULL);
  86. if (!bslab->slab)
  87. goto fail_alloc_slab;
  88. bslab->slab_ref = 1;
  89. bslab->slab_size = size;
  90. if (!xa_err(xa_store(&bio_slabs, size, bslab, GFP_KERNEL)))
  91. return bslab;
  92. kmem_cache_destroy(bslab->slab);
  93. fail_alloc_slab:
  94. kfree(bslab);
  95. return NULL;
  96. }
  97. static inline unsigned int bs_bio_slab_size(struct bio_set *bs)
  98. {
  99. return bs->front_pad + sizeof(struct bio) + bs->back_pad;
  100. }
  101. static struct kmem_cache *bio_find_or_create_slab(struct bio_set *bs)
  102. {
  103. unsigned int size = bs_bio_slab_size(bs);
  104. struct bio_slab *bslab;
  105. mutex_lock(&bio_slab_lock);
  106. bslab = xa_load(&bio_slabs, size);
  107. if (bslab)
  108. bslab->slab_ref++;
  109. else
  110. bslab = create_bio_slab(size);
  111. mutex_unlock(&bio_slab_lock);
  112. if (bslab)
  113. return bslab->slab;
  114. return NULL;
  115. }
  116. static void bio_put_slab(struct bio_set *bs)
  117. {
  118. struct bio_slab *bslab = NULL;
  119. unsigned int slab_size = bs_bio_slab_size(bs);
  120. mutex_lock(&bio_slab_lock);
  121. bslab = xa_load(&bio_slabs, slab_size);
  122. if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n"))
  123. goto out;
  124. WARN_ON_ONCE(bslab->slab != bs->bio_slab);
  125. WARN_ON(!bslab->slab_ref);
  126. if (--bslab->slab_ref)
  127. goto out;
  128. xa_erase(&bio_slabs, slab_size);
  129. kmem_cache_destroy(bslab->slab);
  130. kfree(bslab);
  131. out:
  132. mutex_unlock(&bio_slab_lock);
  133. }
  134. void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs)
  135. {
  136. BUG_ON(nr_vecs > BIO_MAX_VECS);
  137. if (nr_vecs == BIO_MAX_VECS)
  138. mempool_free(bv, pool);
  139. else if (nr_vecs > BIO_INLINE_VECS)
  140. kmem_cache_free(biovec_slab(nr_vecs)->slab, bv);
  141. }
  142. /*
  143. * Make the first allocation restricted and don't dump info on allocation
  144. * failures, since we'll fall back to the mempool in case of failure.
  145. */
  146. static inline gfp_t bvec_alloc_gfp(gfp_t gfp)
  147. {
  148. return (gfp & ~(__GFP_DIRECT_RECLAIM | __GFP_IO)) |
  149. __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
  150. }
  151. struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs,
  152. gfp_t gfp_mask)
  153. {
  154. struct biovec_slab *bvs = biovec_slab(*nr_vecs);
  155. if (WARN_ON_ONCE(!bvs))
  156. return NULL;
  157. /*
  158. * Upgrade the nr_vecs request to take full advantage of the allocation.
  159. * We also rely on this in the bvec_free path.
  160. */
  161. *nr_vecs = bvs->nr_vecs;
  162. /*
  163. * Try a slab allocation first for all smaller allocations. If that
  164. * fails and __GFP_DIRECT_RECLAIM is set retry with the mempool.
  165. * The mempool is sized to handle up to BIO_MAX_VECS entries.
  166. */
  167. if (*nr_vecs < BIO_MAX_VECS) {
  168. struct bio_vec *bvl;
  169. bvl = kmem_cache_alloc(bvs->slab, bvec_alloc_gfp(gfp_mask));
  170. if (likely(bvl) || !(gfp_mask & __GFP_DIRECT_RECLAIM))
  171. return bvl;
  172. *nr_vecs = BIO_MAX_VECS;
  173. }
  174. return mempool_alloc(pool, gfp_mask);
  175. }
/*
 * Release the resources attached to @bio without freeing the bio itself:
 * the blkg reference (when CONFIG_BLK_CGROUP is enabled), the integrity
 * payload if one is attached, and the crypto context.
 */
void bio_uninit(struct bio *bio)
{
#ifdef CONFIG_BLK_CGROUP
	if (bio->bi_blkg) {
		blkg_put(bio->bi_blkg);
		/* Clear the pointer so a later bio_uninit() skips the put. */
		bio->bi_blkg = NULL;
	}
#endif
	if (bio_integrity(bio))
		bio_integrity_free(bio);

	bio_crypt_free_ctx(bio);
}
EXPORT_SYMBOL(bio_uninit);
  189. static void bio_free(struct bio *bio)
  190. {
  191. struct bio_set *bs = bio->bi_pool;
  192. void *p = bio;
  193. WARN_ON_ONCE(!bs);
  194. bio_uninit(bio);
  195. bvec_free(&bs->bvec_pool, bio->bi_io_vec, bio->bi_max_vecs);
  196. mempool_free(p - bs->front_pad, &bs->bio_pool);
  197. }
/*
 * Initialize a caller-provided bio.
 *
 * @bio:	bio to initialize
 * @bdev:	block device the bio targets, may be NULL
 * @table:	bio_vec array the bio should use, may be NULL
 * @max_vecs:	number of entries in @table
 * @opf:	operation and flags for the bio
 *
 * Users of this function have their own bio allocation. Subsequently,
 * they must remember to pair any call to bio_init() with bio_uninit()
 * when IO has completed, or when the bio is released.
 */
void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
	      unsigned short max_vecs, blk_opf_t opf)
{
	bio->bi_next = NULL;
	bio->bi_bdev = bdev;
	bio->bi_opf = opf;
	bio->bi_flags = 0;
	bio->bi_ioprio = 0;
	bio->bi_write_hint = 0;
	bio->bi_write_stream = 0;
	bio->bi_status = 0;
	bio->bi_bvec_gap_bit = 0;
	bio->bi_iter.bi_sector = 0;
	bio->bi_iter.bi_size = 0;
	bio->bi_iter.bi_idx = 0;
	bio->bi_iter.bi_bvec_done = 0;
	bio->bi_end_io = NULL;
	bio->bi_private = NULL;
#ifdef CONFIG_BLK_CGROUP
	bio->bi_blkg = NULL;
	bio->issue_time_ns = 0;
	/* Association is only attempted when a device is known; runs after bi_bdev is set. */
	if (bdev)
		bio_associate_blkg(bio);
#ifdef CONFIG_BLK_CGROUP_IOCOST
	bio->bi_iocost_cost = 0;
#endif
#endif
#ifdef CONFIG_BLK_INLINE_ENCRYPTION
	bio->bi_crypt_context = NULL;
#endif
#ifdef CONFIG_BLK_DEV_INTEGRITY
	bio->bi_integrity = NULL;
#endif
	bio->bi_vcnt = 0;

	/* Both counters start at one. */
	atomic_set(&bio->__bi_remaining, 1);
	atomic_set(&bio->__bi_cnt, 1);
	bio->bi_cookie = BLK_QC_T_NONE;

	bio->bi_max_vecs = max_vecs;
	bio->bi_io_vec = table;
	/* No owning bio_set; the bio_set allocators in this file set it afterwards. */
	bio->bi_pool = NULL;
}
EXPORT_SYMBOL(bio_init);
  245. /**
  246. * bio_reset - reinitialize a bio
  247. * @bio: bio to reset
  248. * @bdev: block device to use the bio for
  249. * @opf: operation and flags for bio
  250. *
  251. * Description:
  252. * After calling bio_reset(), @bio will be in the same state as a freshly
  253. * allocated bio returned bio bio_alloc_bioset() - the only fields that are
  254. * preserved are the ones that are initialized by bio_alloc_bioset(). See
  255. * comment in struct bio.
  256. */
  257. void bio_reset(struct bio *bio, struct block_device *bdev, blk_opf_t opf)
  258. {
  259. struct bio_vec *bv = bio->bi_io_vec;
  260. bio_uninit(bio);
  261. memset(bio, 0, BIO_RESET_BYTES);
  262. atomic_set(&bio->__bi_remaining, 1);
  263. bio->bi_io_vec = bv;
  264. bio->bi_bdev = bdev;
  265. if (bio->bi_bdev)
  266. bio_associate_blkg(bio);
  267. bio->bi_opf = opf;
  268. }
  269. EXPORT_SYMBOL(bio_reset);
  270. /**
  271. * bio_reuse - reuse a bio with the payload left intact
  272. * @bio: bio to reuse
  273. * @opf: operation and flags for the next I/O
  274. *
  275. * Allow reusing an existing bio for another operation with all set up
  276. * fields including the payload, device and end_io handler left intact.
  277. *
  278. * Typically used when @bio is first used to read data which is then written
  279. * to another location without modification. @bio must not be in-flight and
  280. * owned by the caller. Can't be used for cloned bios.
  281. *
  282. * Note: Can't be used when @bio has integrity or blk-crypto contexts for now.
  283. * Feel free to add that support when you need it, though.
  284. */
  285. void bio_reuse(struct bio *bio, blk_opf_t opf)
  286. {
  287. unsigned short vcnt = bio->bi_vcnt, i;
  288. bio_end_io_t *end_io = bio->bi_end_io;
  289. void *private = bio->bi_private;
  290. WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
  291. WARN_ON_ONCE(bio_integrity(bio));
  292. WARN_ON_ONCE(bio_has_crypt_ctx(bio));
  293. bio_reset(bio, bio->bi_bdev, opf);
  294. for (i = 0; i < vcnt; i++)
  295. bio->bi_iter.bi_size += bio->bi_io_vec[i].bv_len;
  296. bio->bi_vcnt = vcnt;
  297. bio->bi_private = private;
  298. bio->bi_end_io = end_io;
  299. }
  300. EXPORT_SYMBOL_GPL(bio_reuse);
  301. static struct bio *__bio_chain_endio(struct bio *bio)
  302. {
  303. struct bio *parent = bio->bi_private;
  304. if (bio->bi_status && !parent->bi_status)
  305. parent->bi_status = bio->bi_status;
  306. bio_put(bio);
  307. return parent;
  308. }
/*
 * Marker value stored in bi_end_io by bio_chain() to identify a chained bio.
 *
 * This function should only be used as a flag and must never be called.
 * If execution reaches here, it indicates a serious programming error.
 */
static void bio_chain_endio(struct bio *bio)
{
	BUG();
}
  317. /**
  318. * bio_chain - chain bio completions
  319. * @bio: the target bio
  320. * @parent: the parent bio of @bio
  321. *
  322. * The caller won't have a bi_end_io called when @bio completes - instead,
  323. * @parent's bi_end_io won't be called until both @parent and @bio have
  324. * completed; the chained bio will also be freed when it completes.
  325. *
  326. * The caller must not set bi_private or bi_end_io in @bio.
  327. */
  328. void bio_chain(struct bio *bio, struct bio *parent)
  329. {
  330. BUG_ON(bio->bi_private || bio->bi_end_io);
  331. bio->bi_private = parent;
  332. bio->bi_end_io = bio_chain_endio;
  333. bio_inc_remaining(parent);
  334. }
  335. EXPORT_SYMBOL(bio_chain);
  336. /**
  337. * bio_chain_and_submit - submit a bio after chaining it to another one
  338. * @prev: bio to chain and submit
  339. * @new: bio to chain to
  340. *
  341. * If @prev is non-NULL, chain it to @new and submit it.
  342. *
  343. * Return: @new.
  344. */
  345. struct bio *bio_chain_and_submit(struct bio *prev, struct bio *new)
  346. {
  347. if (prev) {
  348. bio_chain(prev, new);
  349. submit_bio(prev);
  350. }
  351. return new;
  352. }
  353. struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev,
  354. unsigned int nr_pages, blk_opf_t opf, gfp_t gfp)
  355. {
  356. return bio_chain_and_submit(bio, bio_alloc(bdev, nr_pages, opf, gfp));
  357. }
  358. EXPORT_SYMBOL_GPL(blk_next_bio);
  359. static void bio_alloc_rescue(struct work_struct *work)
  360. {
  361. struct bio_set *bs = container_of(work, struct bio_set, rescue_work);
  362. struct bio *bio;
  363. while (1) {
  364. spin_lock(&bs->rescue_lock);
  365. bio = bio_list_pop(&bs->rescue_list);
  366. spin_unlock(&bs->rescue_lock);
  367. if (!bio)
  368. break;
  369. submit_bio_noacct(bio);
  370. }
  371. }
  372. static void punt_bios_to_rescuer(struct bio_set *bs)
  373. {
  374. struct bio_list punt, nopunt;
  375. struct bio *bio;
  376. if (WARN_ON_ONCE(!bs->rescue_workqueue))
  377. return;
  378. /*
  379. * In order to guarantee forward progress we must punt only bios that
  380. * were allocated from this bio_set; otherwise, if there was a bio on
  381. * there for a stacking driver higher up in the stack, processing it
  382. * could require allocating bios from this bio_set, and doing that from
  383. * our own rescuer would be bad.
  384. *
  385. * Since bio lists are singly linked, pop them all instead of trying to
  386. * remove from the middle of the list:
  387. */
  388. bio_list_init(&punt);
  389. bio_list_init(&nopunt);
  390. while ((bio = bio_list_pop(&current->bio_list[0])))
  391. bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio);
  392. current->bio_list[0] = nopunt;
  393. bio_list_init(&nopunt);
  394. while ((bio = bio_list_pop(&current->bio_list[1])))
  395. bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio);
  396. current->bio_list[1] = nopunt;
  397. spin_lock(&bs->rescue_lock);
  398. bio_list_merge(&bs->rescue_list, &punt);
  399. spin_unlock(&bs->rescue_lock);
  400. queue_work(bs->rescue_workqueue, &bs->rescue_work);
  401. }
/*
 * Move all bios from @cache's irq-side free list onto its (empty) task-side
 * free list and fold the irq-side count into the task-side count.
 */
static void bio_alloc_irq_cache_splice(struct bio_alloc_cache *cache)
{
	unsigned long flags;

	/* cache->free_list must be empty */
	if (WARN_ON_ONCE(cache->free_list))
		return;

	/*
	 * The irq-side list is filled from hard-irq context, so local
	 * interrupts are disabled while it is taken over.
	 */
	local_irq_save(flags);
	cache->free_list = cache->free_list_irq;
	cache->free_list_irq = NULL;
	cache->nr += cache->nr_irq;
	cache->nr_irq = 0;
	local_irq_restore(flags);
}
  415. static struct bio *bio_alloc_percpu_cache(struct block_device *bdev,
  416. unsigned short nr_vecs, blk_opf_t opf, gfp_t gfp,
  417. struct bio_set *bs)
  418. {
  419. struct bio_alloc_cache *cache;
  420. struct bio *bio;
  421. cache = per_cpu_ptr(bs->cache, get_cpu());
  422. if (!cache->free_list) {
  423. if (READ_ONCE(cache->nr_irq) >= ALLOC_CACHE_THRESHOLD)
  424. bio_alloc_irq_cache_splice(cache);
  425. if (!cache->free_list) {
  426. put_cpu();
  427. return NULL;
  428. }
  429. }
  430. bio = cache->free_list;
  431. cache->free_list = bio->bi_next;
  432. cache->nr--;
  433. put_cpu();
  434. if (nr_vecs)
  435. bio_init_inline(bio, bdev, nr_vecs, opf);
  436. else
  437. bio_init(bio, bdev, NULL, nr_vecs, opf);
  438. bio->bi_pool = bs;
  439. return bio;
  440. }
/**
 * bio_alloc_bioset - allocate a bio for I/O
 * @bdev:	block device to allocate the bio for (can be %NULL)
 * @nr_vecs:	number of bvecs to pre-allocate
 * @opf:	operation and flags for bio
 * @gfp_mask:   the GFP_* mask given to the slab allocator
 * @bs:		the bio_set to allocate from.
 *
 * Allocate a bio from the mempools in @bs.
 *
 * If %__GFP_DIRECT_RECLAIM is set then bio_alloc will always be able to
 * allocate a bio.  This is due to the mempool guarantees.  To make this work,
 * callers must never allocate more than 1 bio at a time from the general pool.
 * Callers that need to allocate more than 1 bio must always submit the
 * previously allocated bio for IO before attempting to allocate a new one.
 * Failure to do so can cause deadlocks under memory pressure.
 *
 * Note that when running under submit_bio_noacct() (i.e. any block driver),
 * bios are not submitted until after you return - see the code in
 * submit_bio_noacct() that converts recursion into iteration, to prevent
 * stack overflows.
 *
 * This would normally mean allocating multiple bios under submit_bio_noacct()
 * would be susceptible to deadlocks, but we have
 * deadlock avoidance code that resubmits any blocked bios from a rescuer
 * thread.
 *
 * However, we do not guarantee forward progress for allocations from other
 * mempools. Doing multiple allocations from the same mempool under
 * submit_bio_noacct() should be avoided - instead, use bio_set's front_pad
 * for per bio allocations.
 *
 * Returns: Pointer to new bio on success, NULL on failure.
 */
struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs,
			     blk_opf_t opf, gfp_t gfp_mask,
			     struct bio_set *bs)
{
	/* Kept so the original mask can be restored after a restricted first try. */
	gfp_t saved_gfp = gfp_mask;
	struct bio *bio;
	void *p;

	/* should not use nobvec bioset for nr_vecs > 0 */
	if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && nr_vecs > 0))
		return NULL;

	/* Only bios that fit in the inline vecs take part in the per-cpu cache. */
	if (bs->cache && nr_vecs <= BIO_INLINE_VECS) {
		opf |= REQ_ALLOC_CACHE;
		bio = bio_alloc_percpu_cache(bdev, nr_vecs, opf,
					     gfp_mask, bs);
		if (bio)
			return bio;
		/*
		 * No cached bio available, bio returned below marked with
		 * REQ_ALLOC_CACHE to participate in per-cpu alloc cache.
		 */
	} else
		opf &= ~REQ_ALLOC_CACHE;

	/*
	 * submit_bio_noacct() converts recursion to iteration; this means if
	 * we're running beneath it, any bios we allocate and submit will not be
	 * submitted (and thus freed) until after we return.
	 *
	 * This exposes us to a potential deadlock if we allocate multiple bios
	 * from the same bio_set() while running underneath submit_bio_noacct().
	 * If we were to allocate multiple bios (say a stacking block driver
	 * that was splitting bios), we would deadlock if we exhausted the
	 * mempool's reserve.
	 *
	 * We solve this, and guarantee forward progress, with a rescuer
	 * workqueue per bio_set. If we go to allocate and there are bios on
	 * current->bio_list, we first try the allocation without
	 * __GFP_DIRECT_RECLAIM; if that fails, we punt those bios we would be
	 * blocking to the rescuer workqueue before we retry with the original
	 * gfp_flags.
	 */
	if (current->bio_list &&
	    (!bio_list_empty(&current->bio_list[0]) ||
	     !bio_list_empty(&current->bio_list[1])) &&
	    bs->rescue_workqueue)
		gfp_mask &= ~__GFP_DIRECT_RECLAIM;

	p = mempool_alloc(&bs->bio_pool, gfp_mask);
	/* gfp_mask differs from saved_gfp only if the restricted try was used. */
	if (!p && gfp_mask != saved_gfp) {
		punt_bios_to_rescuer(bs);
		gfp_mask = saved_gfp;
		p = mempool_alloc(&bs->bio_pool, gfp_mask);
	}
	if (unlikely(!p))
		return NULL;
	/*
	 * NOTE(review): presumably keeps the bio from being parked in the
	 * per-cpu cache while the mempool reserve is not full - confirm
	 * against mempool_is_saturated().
	 */
	if (!mempool_is_saturated(&bs->bio_pool))
		opf &= ~REQ_ALLOC_CACHE;

	/* The bio lives front_pad bytes into the mempool element. */
	bio = p + bs->front_pad;
	if (nr_vecs > BIO_INLINE_VECS) {
		struct bio_vec *bvl = NULL;

		/* bvec_alloc() may round nr_vecs up to the size class it used. */
		bvl = bvec_alloc(&bs->bvec_pool, &nr_vecs, gfp_mask);
		if (!bvl && gfp_mask != saved_gfp) {
			punt_bios_to_rescuer(bs);
			gfp_mask = saved_gfp;
			bvl = bvec_alloc(&bs->bvec_pool, &nr_vecs, gfp_mask);
		}
		if (unlikely(!bvl))
			goto err_free;

		bio_init(bio, bdev, bvl, nr_vecs, opf);
	} else if (nr_vecs) {
		/* Small requests get the full inline capacity. */
		bio_init_inline(bio, bdev, BIO_INLINE_VECS, opf);
	} else {
		bio_init(bio, bdev, NULL, 0, opf);
	}

	bio->bi_pool = bs;
	return bio;

err_free:
	mempool_free(p, &bs->bio_pool);
	return NULL;
}
EXPORT_SYMBOL(bio_alloc_bioset);
  554. /**
  555. * bio_kmalloc - kmalloc a bio
  556. * @nr_vecs: number of bio_vecs to allocate
  557. * @gfp_mask: the GFP_* mask given to the slab allocator
  558. *
  559. * Use kmalloc to allocate a bio (including bvecs). The bio must be initialized
  560. * using bio_init() before use. To free a bio returned from this function use
  561. * kfree() after calling bio_uninit(). A bio returned from this function can
  562. * be reused by calling bio_uninit() before calling bio_init() again.
  563. *
  564. * Note that unlike bio_alloc() or bio_alloc_bioset() allocations from this
  565. * function are not backed by a mempool can fail. Do not use this function
  566. * for allocations in the file system I/O path.
  567. *
  568. * Returns: Pointer to new bio on success, NULL on failure.
  569. */
  570. struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask)
  571. {
  572. struct bio *bio;
  573. if (nr_vecs > BIO_MAX_INLINE_VECS)
  574. return NULL;
  575. return kmalloc(sizeof(*bio) + nr_vecs * sizeof(struct bio_vec),
  576. gfp_mask);
  577. }
  578. EXPORT_SYMBOL(bio_kmalloc);
  579. void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start)
  580. {
  581. struct bio_vec bv;
  582. struct bvec_iter iter;
  583. __bio_for_each_segment(bv, bio, iter, start)
  584. memzero_bvec(&bv);
  585. }
  586. EXPORT_SYMBOL(zero_fill_bio_iter);
  587. /**
  588. * bio_truncate - truncate the bio to small size of @new_size
  589. * @bio: the bio to be truncated
  590. * @new_size: new size for truncating the bio
  591. *
  592. * Description:
  593. * Truncate the bio to new size of @new_size. If bio_op(bio) is
  594. * REQ_OP_READ, zero the truncated part. This function should only
  595. * be used for handling corner cases, such as bio eod.
  596. */
  597. static void bio_truncate(struct bio *bio, unsigned new_size)
  598. {
  599. struct bio_vec bv;
  600. struct bvec_iter iter;
  601. unsigned int done = 0;
  602. bool truncated = false;
  603. if (new_size >= bio->bi_iter.bi_size)
  604. return;
  605. if (bio_op(bio) != REQ_OP_READ)
  606. goto exit;
  607. bio_for_each_segment(bv, bio, iter) {
  608. if (done + bv.bv_len > new_size) {
  609. size_t offset;
  610. if (!truncated)
  611. offset = new_size - done;
  612. else
  613. offset = 0;
  614. memzero_page(bv.bv_page, bv.bv_offset + offset,
  615. bv.bv_len - offset);
  616. truncated = true;
  617. }
  618. done += bv.bv_len;
  619. }
  620. exit:
  621. /*
  622. * Don't touch bvec table here and make it really immutable, since
  623. * fs bio user has to retrieve all pages via bio_for_each_segment_all
  624. * in its .end_bio() callback.
  625. *
  626. * It is enough to truncate bio by updating .bi_size since we can make
  627. * correct bvec with the updated .bi_size for drivers.
  628. */
  629. bio->bi_iter.bi_size = new_size;
  630. }
  631. /**
  632. * guard_bio_eod - truncate a BIO to fit the block device
  633. * @bio: bio to truncate
  634. *
  635. * This allows us to do IO even on the odd last sectors of a device, even if the
  636. * block size is some multiple of the physical sector size.
  637. *
  638. * We'll just truncate the bio to the size of the device, and clear the end of
  639. * the buffer head manually. Truly out-of-range accesses will turn into actual
  640. * I/O errors, this only handles the "we need to be able to do I/O at the final
  641. * sector" case.
  642. */
  643. void guard_bio_eod(struct bio *bio)
  644. {
  645. sector_t maxsector = bdev_nr_sectors(bio->bi_bdev);
  646. if (!maxsector)
  647. return;
  648. /*
  649. * If the *whole* IO is past the end of the device,
  650. * let it through, and the IO layer will turn it into
  651. * an EIO.
  652. */
  653. if (unlikely(bio->bi_iter.bi_sector >= maxsector))
  654. return;
  655. maxsector -= bio->bi_iter.bi_sector;
  656. if (likely((bio->bi_iter.bi_size >> 9) <= maxsector))
  657. return;
  658. bio_truncate(bio, maxsector << 9);
  659. }
  660. static int __bio_alloc_cache_prune(struct bio_alloc_cache *cache,
  661. unsigned int nr)
  662. {
  663. unsigned int i = 0;
  664. struct bio *bio;
  665. while ((bio = cache->free_list) != NULL) {
  666. cache->free_list = bio->bi_next;
  667. cache->nr--;
  668. bio_free(bio);
  669. if (++i == nr)
  670. break;
  671. }
  672. return i;
  673. }
  674. static void bio_alloc_cache_prune(struct bio_alloc_cache *cache,
  675. unsigned int nr)
  676. {
  677. nr -= __bio_alloc_cache_prune(cache, nr);
  678. if (!READ_ONCE(cache->free_list)) {
  679. bio_alloc_irq_cache_splice(cache);
  680. __bio_alloc_cache_prune(cache, nr);
  681. }
  682. }
  683. static int bio_cpu_dead(unsigned int cpu, struct hlist_node *node)
  684. {
  685. struct bio_set *bs;
  686. bs = hlist_entry_safe(node, struct bio_set, cpuhp_dead);
  687. if (bs->cache) {
  688. struct bio_alloc_cache *cache = per_cpu_ptr(bs->cache, cpu);
  689. bio_alloc_cache_prune(cache, -1U);
  690. }
  691. return 0;
  692. }
  693. static void bio_alloc_cache_destroy(struct bio_set *bs)
  694. {
  695. int cpu;
  696. if (!bs->cache)
  697. return;
  698. cpuhp_state_remove_instance_nocalls(CPUHP_BIO_DEAD, &bs->cpuhp_dead);
  699. for_each_possible_cpu(cpu) {
  700. struct bio_alloc_cache *cache;
  701. cache = per_cpu_ptr(bs->cache, cpu);
  702. bio_alloc_cache_prune(cache, -1U);
  703. }
  704. free_percpu(bs->cache);
  705. bs->cache = NULL;
  706. }
  707. static inline void bio_put_percpu_cache(struct bio *bio)
  708. {
  709. struct bio_alloc_cache *cache;
  710. cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu());
  711. if (READ_ONCE(cache->nr_irq) + cache->nr > ALLOC_CACHE_MAX)
  712. goto out_free;
  713. if (in_task()) {
  714. bio_uninit(bio);
  715. bio->bi_next = cache->free_list;
  716. /* Not necessary but helps not to iopoll already freed bios */
  717. bio->bi_bdev = NULL;
  718. cache->free_list = bio;
  719. cache->nr++;
  720. } else if (in_hardirq()) {
  721. lockdep_assert_irqs_disabled();
  722. bio_uninit(bio);
  723. bio->bi_next = cache->free_list_irq;
  724. cache->free_list_irq = bio;
  725. cache->nr_irq++;
  726. } else {
  727. goto out_free;
  728. }
  729. put_cpu();
  730. return;
  731. out_free:
  732. put_cpu();
  733. bio_free(bio);
  734. }
  735. /**
  736. * bio_put - release a reference to a bio
  737. * @bio: bio to release reference to
  738. *
  739. * Description:
  740. * Put a reference to a &struct bio, either one you have gotten with
  741. * bio_alloc, bio_get or bio_clone_*. The last put of a bio will free it.
  742. **/
  743. void bio_put(struct bio *bio)
  744. {
  745. if (unlikely(bio_flagged(bio, BIO_REFFED))) {
  746. BUG_ON(!atomic_read(&bio->__bi_cnt));
  747. if (!atomic_dec_and_test(&bio->__bi_cnt))
  748. return;
  749. }
  750. if (bio->bi_opf & REQ_ALLOC_CACHE)
  751. bio_put_percpu_cache(bio);
  752. else
  753. bio_free(bio);
  754. }
  755. EXPORT_SYMBOL(bio_put);
  756. static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp)
  757. {
  758. bio_set_flag(bio, BIO_CLONED);
  759. bio->bi_ioprio = bio_src->bi_ioprio;
  760. bio->bi_write_hint = bio_src->bi_write_hint;
  761. bio->bi_write_stream = bio_src->bi_write_stream;
  762. bio->bi_iter = bio_src->bi_iter;
  763. if (bio->bi_bdev) {
  764. if (bio->bi_bdev == bio_src->bi_bdev &&
  765. bio_flagged(bio_src, BIO_REMAPPED))
  766. bio_set_flag(bio, BIO_REMAPPED);
  767. bio_clone_blkg_association(bio, bio_src);
  768. }
  769. if (bio_crypt_clone(bio, bio_src, gfp) < 0)
  770. return -ENOMEM;
  771. if (bio_integrity(bio_src) &&
  772. bio_integrity_clone(bio, bio_src, gfp) < 0)
  773. return -ENOMEM;
  774. return 0;
  775. }
  776. /**
  777. * bio_alloc_clone - clone a bio that shares the original bio's biovec
  778. * @bdev: block_device to clone onto
  779. * @bio_src: bio to clone from
  780. * @gfp: allocation priority
  781. * @bs: bio_set to allocate from
  782. *
  783. * Allocate a new bio that is a clone of @bio_src. The caller owns the returned
  784. * bio, but not the actual data it points to.
  785. *
  786. * The caller must ensure that the return bio is not freed before @bio_src.
  787. */
  788. struct bio *bio_alloc_clone(struct block_device *bdev, struct bio *bio_src,
  789. gfp_t gfp, struct bio_set *bs)
  790. {
  791. struct bio *bio;
  792. bio = bio_alloc_bioset(bdev, 0, bio_src->bi_opf, gfp, bs);
  793. if (!bio)
  794. return NULL;
  795. if (__bio_clone(bio, bio_src, gfp) < 0) {
  796. bio_put(bio);
  797. return NULL;
  798. }
  799. bio->bi_io_vec = bio_src->bi_io_vec;
  800. return bio;
  801. }
  802. EXPORT_SYMBOL(bio_alloc_clone);
  803. /**
  804. * bio_init_clone - clone a bio that shares the original bio's biovec
  805. * @bdev: block_device to clone onto
  806. * @bio: bio to clone into
  807. * @bio_src: bio to clone from
  808. * @gfp: allocation priority
  809. *
  810. * Initialize a new bio in caller provided memory that is a clone of @bio_src.
  811. * The caller owns the returned bio, but not the actual data it points to.
  812. *
  813. * The caller must ensure that @bio_src is not freed before @bio.
  814. */
  815. int bio_init_clone(struct block_device *bdev, struct bio *bio,
  816. struct bio *bio_src, gfp_t gfp)
  817. {
  818. int ret;
  819. bio_init(bio, bdev, bio_src->bi_io_vec, 0, bio_src->bi_opf);
  820. ret = __bio_clone(bio, bio_src, gfp);
  821. if (ret)
  822. bio_uninit(bio);
  823. return ret;
  824. }
  825. EXPORT_SYMBOL(bio_init_clone);
  826. /**
  827. * bio_full - check if the bio is full
  828. * @bio: bio to check
  829. * @len: length of one segment to be added
  830. *
  831. * Return true if @bio is full and one segment with @len bytes can't be
  832. * added to the bio, otherwise return false
  833. */
  834. static inline bool bio_full(struct bio *bio, unsigned len)
  835. {
  836. if (bio->bi_vcnt >= bio->bi_max_vecs)
  837. return true;
  838. if (bio->bi_iter.bi_size > BIO_MAX_SIZE - len)
  839. return true;
  840. return false;
  841. }
  842. static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page,
  843. unsigned int len, unsigned int off)
  844. {
  845. size_t bv_end = bv->bv_offset + bv->bv_len;
  846. phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) + bv_end - 1;
  847. phys_addr_t page_addr = page_to_phys(page);
  848. if (vec_end_addr + 1 != page_addr + off)
  849. return false;
  850. if (xen_domain() && !xen_biovec_phys_mergeable(bv, page))
  851. return false;
  852. if ((vec_end_addr & PAGE_MASK) != ((page_addr + off) & PAGE_MASK)) {
  853. if (IS_ENABLED(CONFIG_KMSAN))
  854. return false;
  855. if (bv->bv_page + bv_end / PAGE_SIZE != page + off / PAGE_SIZE)
  856. return false;
  857. }
  858. bv->bv_len += len;
  859. return true;
  860. }
  861. /*
  862. * Try to merge a page into a segment, while obeying the hardware segment
  863. * size limit.
  864. *
  865. * This is kept around for the integrity metadata, which is still tries
  866. * to build the initial bio to the hardware limit and doesn't have proper
  867. * helpers to split. Hopefully this will go away soon.
  868. */
  869. bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv,
  870. struct page *page, unsigned len, unsigned offset)
  871. {
  872. unsigned long mask = queue_segment_boundary(q);
  873. phys_addr_t addr1 = bvec_phys(bv);
  874. phys_addr_t addr2 = page_to_phys(page) + offset + len - 1;
  875. if ((addr1 | mask) != (addr2 | mask))
  876. return false;
  877. if (len > queue_max_segment_size(q) - bv->bv_len)
  878. return false;
  879. return bvec_try_merge_page(bv, page, len, offset);
  880. }
  881. /**
  882. * __bio_add_page - add page(s) to a bio in a new segment
  883. * @bio: destination bio
  884. * @page: start page to add
  885. * @len: length of the data to add, may cross pages
  886. * @off: offset of the data relative to @page, may cross pages
  887. *
  888. * Add the data at @page + @off to @bio as a new bvec. The caller must ensure
  889. * that @bio has space for another bvec.
  890. */
  891. void __bio_add_page(struct bio *bio, struct page *page,
  892. unsigned int len, unsigned int off)
  893. {
  894. WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
  895. WARN_ON_ONCE(bio_full(bio, len));
  896. if (is_pci_p2pdma_page(page))
  897. bio->bi_opf |= REQ_NOMERGE;
  898. bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, off);
  899. bio->bi_iter.bi_size += len;
  900. bio->bi_vcnt++;
  901. }
  902. EXPORT_SYMBOL_GPL(__bio_add_page);
  /**
   * bio_add_virt_nofail - add data in the direct kernel mapping to a bio
   * @bio: destination bio
   * @vaddr: data to add
   * @len: length of the data to add, may cross pages
   *
   * Add the data at @vaddr to @bio. The caller must have ensured that a
   * segment is available for the added data. No merging into an existing
   * segment will be performed.
   */
  void bio_add_virt_nofail(struct bio *bio, void *vaddr, unsigned len)
  {
  	struct page *page = virt_to_page(vaddr);
  	unsigned int off = offset_in_page(vaddr);

  	__bio_add_page(bio, page, len, off);
  }
  EXPORT_SYMBOL_GPL(bio_add_virt_nofail);
  918. /**
  919. * bio_add_page - attempt to add page(s) to bio
  920. * @bio: destination bio
  921. * @page: start page to add
  922. * @len: vec entry length, may cross pages
  923. * @offset: vec entry offset relative to @page, may cross pages
  924. *
  925. * Attempt to add page(s) to the bio_vec maplist. This will only fail
  926. * if either bio->bi_vcnt == bio->bi_max_vecs or it's a cloned bio.
  927. */
  928. int bio_add_page(struct bio *bio, struct page *page,
  929. unsigned int len, unsigned int offset)
  930. {
  931. if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
  932. return 0;
  933. if (bio->bi_iter.bi_size > BIO_MAX_SIZE - len)
  934. return 0;
  935. if (bio->bi_vcnt > 0) {
  936. struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
  937. if (!zone_device_pages_have_same_pgmap(bv->bv_page, page))
  938. return 0;
  939. if (bvec_try_merge_page(bv, page, len, offset)) {
  940. bio->bi_iter.bi_size += len;
  941. return len;
  942. }
  943. }
  944. if (bio->bi_vcnt >= bio->bi_max_vecs)
  945. return 0;
  946. __bio_add_page(bio, page, len, offset);
  947. return len;
  948. }
  949. EXPORT_SYMBOL(bio_add_page);
  950. void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len,
  951. size_t off)
  952. {
  953. unsigned long nr = off / PAGE_SIZE;
  954. WARN_ON_ONCE(len > BIO_MAX_SIZE);
  955. __bio_add_page(bio, folio_page(folio, nr), len, off % PAGE_SIZE);
  956. }
  957. EXPORT_SYMBOL_GPL(bio_add_folio_nofail);
  958. /**
  959. * bio_add_folio - Attempt to add part of a folio to a bio.
  960. * @bio: BIO to add to.
  961. * @folio: Folio to add.
  962. * @len: How many bytes from the folio to add.
  963. * @off: First byte in this folio to add.
  964. *
  965. * Filesystems that use folios can call this function instead of calling
  966. * bio_add_page() for each page in the folio. If @off is bigger than
  967. * PAGE_SIZE, this function can create a bio_vec that starts in a page
  968. * after the bv_page. BIOs do not support folios that are 4GiB or larger.
  969. *
  970. * Return: Whether the addition was successful.
  971. */
  972. bool bio_add_folio(struct bio *bio, struct folio *folio, size_t len,
  973. size_t off)
  974. {
  975. unsigned long nr = off / PAGE_SIZE;
  976. if (len > BIO_MAX_SIZE)
  977. return false;
  978. return bio_add_page(bio, folio_page(folio, nr), len, off % PAGE_SIZE) > 0;
  979. }
  980. EXPORT_SYMBOL(bio_add_folio);
  981. /**
  982. * bio_add_vmalloc_chunk - add a vmalloc chunk to a bio
  983. * @bio: destination bio
  984. * @vaddr: vmalloc address to add
  985. * @len: total length in bytes of the data to add
  986. *
  987. * Add data starting at @vaddr to @bio and return how many bytes were added.
  988. * This may be less than the amount originally asked. Returns 0 if no data
  989. * could be added to @bio.
  990. *
  991. * This helper calls flush_kernel_vmap_range() for the range added. For reads
  992. * the caller still needs to manually call invalidate_kernel_vmap_range() in
  993. * the completion handler.
  994. */
  995. unsigned int bio_add_vmalloc_chunk(struct bio *bio, void *vaddr, unsigned len)
  996. {
  997. unsigned int offset = offset_in_page(vaddr);
  998. len = min(len, PAGE_SIZE - offset);
  999. if (bio_add_page(bio, vmalloc_to_page(vaddr), len, offset) < len)
  1000. return 0;
  1001. if (op_is_write(bio_op(bio)))
  1002. flush_kernel_vmap_range(vaddr, len);
  1003. return len;
  1004. }
  1005. EXPORT_SYMBOL_GPL(bio_add_vmalloc_chunk);
  1006. /**
  1007. * bio_add_vmalloc - add a vmalloc region to a bio
  1008. * @bio: destination bio
  1009. * @vaddr: vmalloc address to add
  1010. * @len: total length in bytes of the data to add
  1011. *
  1012. * Add data starting at @vaddr to @bio. Return %true on success or %false if
  1013. * @bio does not have enough space for the payload.
  1014. *
  1015. * This helper calls flush_kernel_vmap_range() for the range added. For reads
  1016. * the caller still needs to manually call invalidate_kernel_vmap_range() in
  1017. * the completion handler.
  1018. */
  1019. bool bio_add_vmalloc(struct bio *bio, void *vaddr, unsigned int len)
  1020. {
  1021. do {
  1022. unsigned int added = bio_add_vmalloc_chunk(bio, vaddr, len);
  1023. if (!added)
  1024. return false;
  1025. vaddr += added;
  1026. len -= added;
  1027. } while (len);
  1028. return true;
  1029. }
  1030. EXPORT_SYMBOL_GPL(bio_add_vmalloc);
  1031. void __bio_release_pages(struct bio *bio, bool mark_dirty)
  1032. {
  1033. struct folio_iter fi;
  1034. bio_for_each_folio_all(fi, bio) {
  1035. size_t nr_pages;
  1036. if (mark_dirty) {
  1037. folio_lock(fi.folio);
  1038. folio_mark_dirty(fi.folio);
  1039. folio_unlock(fi.folio);
  1040. }
  1041. nr_pages = (fi.offset + fi.length - 1) / PAGE_SIZE -
  1042. fi.offset / PAGE_SIZE + 1;
  1043. unpin_user_folio(fi.folio, nr_pages);
  1044. }
  1045. }
  1046. EXPORT_SYMBOL_GPL(__bio_release_pages);
  1047. void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter)
  1048. {
  1049. WARN_ON_ONCE(bio->bi_max_vecs);
  1050. bio->bi_io_vec = (struct bio_vec *)iter->bvec;
  1051. bio->bi_iter.bi_idx = 0;
  1052. bio->bi_iter.bi_bvec_done = iter->iov_offset;
  1053. bio->bi_iter.bi_size = iov_iter_count(iter);
  1054. bio_set_flag(bio, BIO_CLONED);
  1055. }
  1056. /*
  1057. * Aligns the bio size to the len_align_mask, releasing excessive bio vecs that
  1058. * __bio_iov_iter_get_pages may have inserted, and reverts the trimmed length
  1059. * for the next iteration.
  1060. */
  1061. static int bio_iov_iter_align_down(struct bio *bio, struct iov_iter *iter,
  1062. unsigned len_align_mask)
  1063. {
  1064. size_t nbytes = bio->bi_iter.bi_size & len_align_mask;
  1065. if (!nbytes)
  1066. return 0;
  1067. iov_iter_revert(iter, nbytes);
  1068. bio->bi_iter.bi_size -= nbytes;
  1069. do {
  1070. struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
  1071. if (nbytes < bv->bv_len) {
  1072. bv->bv_len -= nbytes;
  1073. break;
  1074. }
  1075. if (bio_flagged(bio, BIO_PAGE_PINNED))
  1076. unpin_user_page(bv->bv_page);
  1077. bio->bi_vcnt--;
  1078. nbytes -= bv->bv_len;
  1079. } while (nbytes);
  1080. if (!bio->bi_vcnt)
  1081. return -EFAULT;
  1082. return 0;
  1083. }
  1084. /**
  1085. * bio_iov_iter_get_pages - add user or kernel pages to a bio
  1086. * @bio: bio to add pages to
  1087. * @iter: iov iterator describing the region to be added
  1088. * @len_align_mask: the mask to align the total size to, 0 for any length
  1089. *
  1090. * This takes either an iterator pointing to user memory, or one pointing to
  1091. * kernel pages (BVEC iterator). If we're adding user pages, we pin them and
  1092. * map them into the kernel. On IO completion, the caller should put those
  1093. * pages. For bvec based iterators bio_iov_iter_get_pages() uses the provided
  1094. * bvecs rather than copying them. Hence anyone issuing kiocb based IO needs
  1095. * to ensure the bvecs and pages stay referenced until the submitted I/O is
  1096. * completed by a call to ->ki_complete() or returns with an error other than
  1097. * -EIOCBQUEUED. The caller needs to check if the bio is flagged BIO_NO_PAGE_REF
  1098. * on IO completion. If it isn't, then pages should be released.
  1099. *
  1100. * The function tries, but does not guarantee, to pin as many pages as
  1101. * fit into the bio, or are requested in @iter, whatever is smaller. If
  1102. * MM encounters an error pinning the requested pages, it stops. Error
  1103. * is returned only if 0 pages could be pinned.
  1104. */
  1105. int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter,
  1106. unsigned len_align_mask)
  1107. {
  1108. iov_iter_extraction_t flags = 0;
  1109. if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
  1110. return -EIO;
  1111. if (iov_iter_is_bvec(iter)) {
  1112. bio_iov_bvec_set(bio, iter);
  1113. iov_iter_advance(iter, bio->bi_iter.bi_size);
  1114. return 0;
  1115. }
  1116. if (iov_iter_extract_will_pin(iter))
  1117. bio_set_flag(bio, BIO_PAGE_PINNED);
  1118. if (bio->bi_bdev && blk_queue_pci_p2pdma(bio->bi_bdev->bd_disk->queue))
  1119. flags |= ITER_ALLOW_P2PDMA;
  1120. do {
  1121. ssize_t ret;
  1122. ret = iov_iter_extract_bvecs(iter, bio->bi_io_vec,
  1123. BIO_MAX_SIZE - bio->bi_iter.bi_size,
  1124. &bio->bi_vcnt, bio->bi_max_vecs, flags);
  1125. if (ret <= 0) {
  1126. if (!bio->bi_vcnt)
  1127. return ret;
  1128. break;
  1129. }
  1130. bio->bi_iter.bi_size += ret;
  1131. } while (iov_iter_count(iter) && !bio_full(bio, 0));
  1132. if (is_pci_p2pdma_page(bio->bi_io_vec->bv_page))
  1133. bio->bi_opf |= REQ_NOMERGE;
  1134. return bio_iov_iter_align_down(bio, iter, len_align_mask);
  1135. }
  1136. static struct folio *folio_alloc_greedy(gfp_t gfp, size_t *size)
  1137. {
  1138. struct folio *folio;
  1139. while (*size > PAGE_SIZE) {
  1140. folio = folio_alloc(gfp | __GFP_NORETRY, get_order(*size));
  1141. if (folio)
  1142. return folio;
  1143. *size = rounddown_pow_of_two(*size - 1);
  1144. }
  1145. return folio_alloc(gfp, get_order(*size));
  1146. }
  1147. static void bio_free_folios(struct bio *bio)
  1148. {
  1149. struct bio_vec *bv;
  1150. int i;
  1151. bio_for_each_bvec_all(bv, bio, i) {
  1152. struct folio *folio = page_folio(bv->bv_page);
  1153. if (!is_zero_folio(folio))
  1154. folio_put(folio);
  1155. }
  1156. }
  1157. static int bio_iov_iter_bounce_write(struct bio *bio, struct iov_iter *iter)
  1158. {
  1159. size_t total_len = iov_iter_count(iter);
  1160. if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
  1161. return -EINVAL;
  1162. if (WARN_ON_ONCE(bio->bi_iter.bi_size))
  1163. return -EINVAL;
  1164. if (WARN_ON_ONCE(bio->bi_vcnt >= bio->bi_max_vecs))
  1165. return -EINVAL;
  1166. do {
  1167. size_t this_len = min(total_len, SZ_1M);
  1168. struct folio *folio;
  1169. if (this_len > PAGE_SIZE * 2)
  1170. this_len = rounddown_pow_of_two(this_len);
  1171. if (bio->bi_iter.bi_size > BIO_MAX_SIZE - this_len)
  1172. break;
  1173. folio = folio_alloc_greedy(GFP_KERNEL, &this_len);
  1174. if (!folio)
  1175. break;
  1176. bio_add_folio_nofail(bio, folio, this_len, 0);
  1177. if (copy_from_iter(folio_address(folio), this_len, iter) !=
  1178. this_len) {
  1179. bio_free_folios(bio);
  1180. return -EFAULT;
  1181. }
  1182. total_len -= this_len;
  1183. } while (total_len && bio->bi_vcnt < bio->bi_max_vecs);
  1184. if (!bio->bi_iter.bi_size)
  1185. return -ENOMEM;
  1186. return 0;
  1187. }
  1188. static int bio_iov_iter_bounce_read(struct bio *bio, struct iov_iter *iter)
  1189. {
  1190. size_t len = min(iov_iter_count(iter), SZ_1M);
  1191. struct folio *folio;
  1192. folio = folio_alloc_greedy(GFP_KERNEL, &len);
  1193. if (!folio)
  1194. return -ENOMEM;
  1195. do {
  1196. ssize_t ret;
  1197. ret = iov_iter_extract_bvecs(iter, bio->bi_io_vec + 1, len,
  1198. &bio->bi_vcnt, bio->bi_max_vecs - 1, 0);
  1199. if (ret <= 0) {
  1200. if (!bio->bi_vcnt) {
  1201. folio_put(folio);
  1202. return ret;
  1203. }
  1204. break;
  1205. }
  1206. len -= ret;
  1207. bio->bi_iter.bi_size += ret;
  1208. } while (len && bio->bi_vcnt < bio->bi_max_vecs - 1);
  1209. /*
  1210. * Set the folio directly here. The above loop has already calculated
  1211. * the correct bi_size, and we use bi_vcnt for the user buffers. That
  1212. * is safe as bi_vcnt is only used by the submitter and not the actual
  1213. * I/O path.
  1214. */
  1215. bvec_set_folio(&bio->bi_io_vec[0], folio, bio->bi_iter.bi_size, 0);
  1216. if (iov_iter_extract_will_pin(iter))
  1217. bio_set_flag(bio, BIO_PAGE_PINNED);
  1218. return 0;
  1219. }
  /**
   * bio_iov_iter_bounce - bounce buffer data from an iter into a bio
   * @bio:	bio to send
   * @iter:	iter to read from / write into
   *
   * Helper for direct I/O implementations that need to bounce buffer because
   * we need to checksum the data or perform other operations that require
   * consistency.  Allocates folios to back the bounce buffer, and for writes
   * copies the data into it.  Needs to be paired with bio_iov_iter_unbounce()
   * called on completion.
   *
   * Return: 0 on success or a negative errno from the write or read setup.
   */
  int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter)
  {
  	if (!op_is_write(bio_op(bio)))
  		return bio_iov_iter_bounce_read(bio, iter);

  	return bio_iov_iter_bounce_write(bio, iter);
  }
  1237. static void bvec_unpin(struct bio_vec *bv, bool mark_dirty)
  1238. {
  1239. struct folio *folio = page_folio(bv->bv_page);
  1240. size_t nr_pages = (bv->bv_offset + bv->bv_len - 1) / PAGE_SIZE -
  1241. bv->bv_offset / PAGE_SIZE + 1;
  1242. if (mark_dirty)
  1243. folio_mark_dirty_lock(folio);
  1244. unpin_user_folio(folio, nr_pages);
  1245. }
  1246. static void bio_iov_iter_unbounce_read(struct bio *bio, bool is_error,
  1247. bool mark_dirty)
  1248. {
  1249. unsigned int len = bio->bi_io_vec[0].bv_len;
  1250. if (likely(!is_error)) {
  1251. void *buf = bvec_virt(&bio->bi_io_vec[0]);
  1252. struct iov_iter to;
  1253. iov_iter_bvec(&to, ITER_DEST, bio->bi_io_vec + 1, bio->bi_vcnt,
  1254. len);
  1255. /* copying to pinned pages should always work */
  1256. WARN_ON_ONCE(copy_to_iter(buf, len, &to) != len);
  1257. } else {
  1258. /* No need to mark folios dirty if never copied to them */
  1259. mark_dirty = false;
  1260. }
  1261. if (bio_flagged(bio, BIO_PAGE_PINNED)) {
  1262. int i;
  1263. for (i = 0; i < bio->bi_vcnt; i++)
  1264. bvec_unpin(&bio->bi_io_vec[1 + i], mark_dirty);
  1265. }
  1266. folio_put(page_folio(bio->bi_io_vec[0].bv_page));
  1267. }
  1268. /**
  1269. * bio_iov_iter_unbounce - finish a bounce buffer operation
  1270. * @bio: completed bio
  1271. * @is_error: %true if an I/O error occurred and data should not be copied
  1272. * @mark_dirty: If %true, folios will be marked dirty.
  1273. *
  1274. * Helper for direct I/O implementations that need to bounce buffer because
  1275. * we need to checksum the data or perform other operations that require
  1276. * consistency. Called to complete a bio set up by bio_iov_iter_bounce().
  1277. * Copies data back for reads, and marks the original folios dirty if
  1278. * requested and then frees the bounce buffer.
  1279. */
  1280. void bio_iov_iter_unbounce(struct bio *bio, bool is_error, bool mark_dirty)
  1281. {
  1282. if (op_is_write(bio_op(bio)))
  1283. bio_free_folios(bio);
  1284. else
  1285. bio_iov_iter_unbounce_read(bio, is_error, mark_dirty);
  1286. }
  1287. static void submit_bio_wait_endio(struct bio *bio)
  1288. {
  1289. complete(bio->bi_private);
  1290. }
  1291. /**
  1292. * submit_bio_wait - submit a bio, and wait until it completes
  1293. * @bio: The &struct bio which describes the I/O
  1294. *
  1295. * Simple wrapper around submit_bio(). Returns 0 on success, or the error from
  1296. * bio_endio() on failure.
  1297. *
  1298. * WARNING: Unlike to how submit_bio() is usually used, this function does not
  1299. * result in bio reference to be consumed. The caller must drop the reference
  1300. * on his own.
  1301. */
  1302. int submit_bio_wait(struct bio *bio)
  1303. {
  1304. DECLARE_COMPLETION_ONSTACK_MAP(done,
  1305. bio->bi_bdev->bd_disk->lockdep_map);
  1306. bio->bi_private = &done;
  1307. bio->bi_end_io = submit_bio_wait_endio;
  1308. bio->bi_opf |= REQ_SYNC;
  1309. submit_bio(bio);
  1310. blk_wait_io(&done);
  1311. return blk_status_to_errno(bio->bi_status);
  1312. }
  1313. EXPORT_SYMBOL(submit_bio_wait);
  1314. /**
  1315. * bdev_rw_virt - synchronously read into / write from kernel mapping
  1316. * @bdev: block device to access
  1317. * @sector: sector to access
  1318. * @data: data to read/write
  1319. * @len: length in byte to read/write
  1320. * @op: operation (e.g. REQ_OP_READ/REQ_OP_WRITE)
  1321. *
  1322. * Performs synchronous I/O to @bdev for @data/@len. @data must be in
  1323. * the kernel direct mapping and not a vmalloc address.
  1324. */
  1325. int bdev_rw_virt(struct block_device *bdev, sector_t sector, void *data,
  1326. size_t len, enum req_op op)
  1327. {
  1328. struct bio_vec bv;
  1329. struct bio bio;
  1330. int error;
  1331. if (WARN_ON_ONCE(is_vmalloc_addr(data)))
  1332. return -EIO;
  1333. bio_init(&bio, bdev, &bv, 1, op);
  1334. bio.bi_iter.bi_sector = sector;
  1335. bio_add_virt_nofail(&bio, data, len);
  1336. error = submit_bio_wait(&bio);
  1337. bio_uninit(&bio);
  1338. return error;
  1339. }
  1340. EXPORT_SYMBOL_GPL(bdev_rw_virt);
  1341. static void bio_wait_end_io(struct bio *bio)
  1342. {
  1343. complete(bio->bi_private);
  1344. bio_put(bio);
  1345. }
  1346. /*
  1347. * bio_await_chain - ends @bio and waits for every chained bio to complete
  1348. */
  1349. void bio_await_chain(struct bio *bio)
  1350. {
  1351. DECLARE_COMPLETION_ONSTACK_MAP(done,
  1352. bio->bi_bdev->bd_disk->lockdep_map);
  1353. bio->bi_private = &done;
  1354. bio->bi_end_io = bio_wait_end_io;
  1355. bio_endio(bio);
  1356. blk_wait_io(&done);
  1357. }
  1358. void __bio_advance(struct bio *bio, unsigned bytes)
  1359. {
  1360. if (bio_integrity(bio))
  1361. bio_integrity_advance(bio, bytes);
  1362. bio_crypt_advance(bio, bytes);
  1363. bio_advance_iter(bio, &bio->bi_iter, bytes);
  1364. }
  1365. EXPORT_SYMBOL(__bio_advance);
  1366. void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
  1367. struct bio *src, struct bvec_iter *src_iter)
  1368. {
  1369. while (src_iter->bi_size && dst_iter->bi_size) {
  1370. struct bio_vec src_bv = bio_iter_iovec(src, *src_iter);
  1371. struct bio_vec dst_bv = bio_iter_iovec(dst, *dst_iter);
  1372. unsigned int bytes = min(src_bv.bv_len, dst_bv.bv_len);
  1373. void *src_buf = bvec_kmap_local(&src_bv);
  1374. void *dst_buf = bvec_kmap_local(&dst_bv);
  1375. memcpy(dst_buf, src_buf, bytes);
  1376. kunmap_local(dst_buf);
  1377. kunmap_local(src_buf);
  1378. bio_advance_iter_single(src, src_iter, bytes);
  1379. bio_advance_iter_single(dst, dst_iter, bytes);
  1380. }
  1381. }
  1382. EXPORT_SYMBOL(bio_copy_data_iter);
  1383. /**
  1384. * bio_copy_data - copy contents of data buffers from one bio to another
  1385. * @src: source bio
  1386. * @dst: destination bio
  1387. *
  1388. * Stops when it reaches the end of either @src or @dst - that is, copies
  1389. * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios).
  1390. */
  1391. void bio_copy_data(struct bio *dst, struct bio *src)
  1392. {
  1393. struct bvec_iter src_iter = src->bi_iter;
  1394. struct bvec_iter dst_iter = dst->bi_iter;
  1395. bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
  1396. }
  1397. EXPORT_SYMBOL(bio_copy_data);
  1398. void bio_free_pages(struct bio *bio)
  1399. {
  1400. struct bio_vec *bvec;
  1401. struct bvec_iter_all iter_all;
  1402. bio_for_each_segment_all(bvec, bio, iter_all)
  1403. __free_page(bvec->bv_page);
  1404. }
  1405. EXPORT_SYMBOL(bio_free_pages);
  1406. /*
  1407. * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions
  1408. * for performing direct-IO in BIOs.
  1409. *
  1410. * The problem is that we cannot run folio_mark_dirty() from interrupt context
  1411. * because the required locks are not interrupt-safe. So what we can do is to
  1412. * mark the pages dirty _before_ performing IO. And in interrupt context,
  1413. * check that the pages are still dirty. If so, fine. If not, redirty them
  1414. * in process context.
  1415. *
  1416. * Note that this code is very hard to test under normal circumstances because
  1417. * direct-io pins the pages with get_user_pages(). This makes
  1418. * is_page_cache_freeable return false, and the VM will not clean the pages.
  1419. * But other code (eg, flusher threads) could clean the pages if they are mapped
  1420. * pagecache.
  1421. *
  1422. * Simply disabling the call to bio_set_pages_dirty() is a good way to test the
  1423. * deferred bio dirtying paths.
  1424. */
  1425. /*
  1426. * bio_set_pages_dirty() will mark all the bio's pages as dirty.
  1427. */
  1428. void bio_set_pages_dirty(struct bio *bio)
  1429. {
  1430. struct folio_iter fi;
  1431. bio_for_each_folio_all(fi, bio) {
  1432. folio_lock(fi.folio);
  1433. folio_mark_dirty(fi.folio);
  1434. folio_unlock(fi.folio);
  1435. }
  1436. }
  1437. EXPORT_SYMBOL_GPL(bio_set_pages_dirty);
  1438. /*
  1439. * bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
  1440. * If they are, then fine. If, however, some pages are clean then they must
  1441. * have been written out during the direct-IO read. So we take another ref on
  1442. * the BIO and re-dirty the pages in process context.
  1443. *
  1444. * It is expected that bio_check_pages_dirty() will wholly own the BIO from
  1445. * here on. It will unpin each page and will run one bio_put() against the
  1446. * BIO.
  1447. */
  static void bio_dirty_fn(struct work_struct *work);

  /*
   * Bios waiting to be re-dirtied in process context. The list is singly
   * linked through bi_private and is only modified with bio_dirty_lock held.
   */
  static DEFINE_SPINLOCK(bio_dirty_lock);
  static struct bio *bio_dirty_list;

  /* Work item whose handler, bio_dirty_fn(), drains bio_dirty_list. */
  static DECLARE_WORK(bio_dirty_work, bio_dirty_fn);
  1452. /*
  1453. * This runs in process context
  1454. */
  1455. static void bio_dirty_fn(struct work_struct *work)
  1456. {
  1457. struct bio *bio, *next;
  1458. spin_lock_irq(&bio_dirty_lock);
  1459. next = bio_dirty_list;
  1460. bio_dirty_list = NULL;
  1461. spin_unlock_irq(&bio_dirty_lock);
  1462. while ((bio = next) != NULL) {
  1463. next = bio->bi_private;
  1464. bio_release_pages(bio, true);
  1465. bio_put(bio);
  1466. }
  1467. }
  1468. void bio_check_pages_dirty(struct bio *bio)
  1469. {
  1470. struct folio_iter fi;
  1471. unsigned long flags;
  1472. bio_for_each_folio_all(fi, bio) {
  1473. if (!folio_test_dirty(fi.folio))
  1474. goto defer;
  1475. }
  1476. bio_release_pages(bio, false);
  1477. bio_put(bio);
  1478. return;
  1479. defer:
  1480. spin_lock_irqsave(&bio_dirty_lock, flags);
  1481. bio->bi_private = bio_dirty_list;
  1482. bio_dirty_list = bio;
  1483. spin_unlock_irqrestore(&bio_dirty_lock, flags);
  1484. schedule_work(&bio_dirty_work);
  1485. }
  1486. EXPORT_SYMBOL_GPL(bio_check_pages_dirty);
  1487. static inline bool bio_remaining_done(struct bio *bio)
  1488. {
  1489. /*
  1490. * If we're not chaining, then ->__bi_remaining is always 1 and
  1491. * we always end io on the first invocation.
  1492. */
  1493. if (!bio_flagged(bio, BIO_CHAIN))
  1494. return true;
  1495. BUG_ON(atomic_read(&bio->__bi_remaining) <= 0);
  1496. if (atomic_dec_and_test(&bio->__bi_remaining)) {
  1497. bio_clear_flag(bio, BIO_CHAIN);
  1498. return true;
  1499. }
  1500. return false;
  1501. }
  1502. /**
  1503. * bio_endio - end I/O on a bio
  1504. * @bio: bio
  1505. *
  1506. * Description:
  1507. * bio_endio() will end I/O on the whole bio. bio_endio() is the preferred
  1508. * way to end I/O on a bio. No one should call bi_end_io() directly on a
  1509. * bio unless they own it and thus know that it has an end_io function.
  1510. *
  1511. * bio_endio() can be called several times on a bio that has been chained
  1512. * using bio_chain(). The ->bi_end_io() function will only be called the
  1513. * last time.
  1514. **/
  1515. void bio_endio(struct bio *bio)
  1516. {
  1517. again:
  1518. if (!bio_remaining_done(bio))
  1519. return;
  1520. if (!bio_integrity_endio(bio))
  1521. return;
  1522. blk_zone_bio_endio(bio);
  1523. rq_qos_done_bio(bio);
  1524. if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
  1525. trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio);
  1526. bio_clear_flag(bio, BIO_TRACE_COMPLETION);
  1527. }
  1528. /*
  1529. * Need to have a real endio function for chained bios, otherwise
  1530. * various corner cases will break (like stacking block devices that
  1531. * save/restore bi_end_io) - however, we want to avoid unbounded
  1532. * recursion and blowing the stack. Tail call optimization would
  1533. * handle this, but compiling with frame pointers also disables
  1534. * gcc's sibling call optimization.
  1535. */
  1536. if (bio->bi_end_io == bio_chain_endio) {
  1537. bio = __bio_chain_endio(bio);
  1538. goto again;
  1539. }
  1540. #ifdef CONFIG_BLK_CGROUP
  1541. /*
  1542. * Release cgroup info. We shouldn't have to do this here, but quite
  1543. * a few callers of bio_init fail to call bio_uninit, so we cover up
  1544. * for that here at least for now.
  1545. */
  1546. if (bio->bi_blkg) {
  1547. blkg_put(bio->bi_blkg);
  1548. bio->bi_blkg = NULL;
  1549. }
  1550. #endif
  1551. if (bio->bi_end_io)
  1552. bio->bi_end_io(bio);
  1553. }
  1554. EXPORT_SYMBOL(bio_endio);
  1555. /**
  1556. * bio_split - split a bio
  1557. * @bio: bio to split
  1558. * @sectors: number of sectors to split from the front of @bio
  1559. * @gfp: gfp mask
  1560. * @bs: bio set to allocate from
  1561. *
  1562. * Allocates and returns a new bio which represents @sectors from the start of
  1563. * @bio, and updates @bio to represent the remaining sectors.
  1564. *
  1565. * Unless this is a discard request the newly allocated bio will point
  1566. * to @bio's bi_io_vec. It is the caller's responsibility to ensure that
  1567. * neither @bio nor @bs are freed before the split bio.
  1568. */
  1569. struct bio *bio_split(struct bio *bio, int sectors,
  1570. gfp_t gfp, struct bio_set *bs)
  1571. {
  1572. struct bio *split;
  1573. if (WARN_ON_ONCE(sectors <= 0))
  1574. return ERR_PTR(-EINVAL);
  1575. if (WARN_ON_ONCE(sectors >= bio_sectors(bio)))
  1576. return ERR_PTR(-EINVAL);
  1577. /* Zone append commands cannot be split */
  1578. if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND))
  1579. return ERR_PTR(-EINVAL);
  1580. /* atomic writes cannot be split */
  1581. if (bio->bi_opf & REQ_ATOMIC)
  1582. return ERR_PTR(-EINVAL);
  1583. split = bio_alloc_clone(bio->bi_bdev, bio, gfp, bs);
  1584. if (!split)
  1585. return ERR_PTR(-ENOMEM);
  1586. split->bi_iter.bi_size = sectors << 9;
  1587. if (bio_integrity(split))
  1588. bio_integrity_trim(split);
  1589. bio_advance(bio, split->bi_iter.bi_size);
  1590. if (bio_flagged(bio, BIO_TRACE_COMPLETION))
  1591. bio_set_flag(split, BIO_TRACE_COMPLETION);
  1592. return split;
  1593. }
  1594. EXPORT_SYMBOL(bio_split);
  1595. /**
  1596. * bio_trim - trim a bio
  1597. * @bio: bio to trim
  1598. * @offset: number of sectors to trim from the front of @bio
  1599. * @size: size we want to trim @bio to, in sectors
  1600. *
  1601. * This function is typically used for bios that are cloned and submitted
  1602. * to the underlying device in parts.
  1603. */
  1604. void bio_trim(struct bio *bio, sector_t offset, sector_t size)
  1605. {
  1606. /* We should never trim an atomic write */
  1607. if (WARN_ON_ONCE(bio->bi_opf & REQ_ATOMIC && size))
  1608. return;
  1609. if (WARN_ON_ONCE(offset > BIO_MAX_SECTORS || size > BIO_MAX_SECTORS ||
  1610. offset + size > bio_sectors(bio)))
  1611. return;
  1612. size <<= 9;
  1613. if (offset == 0 && size == bio->bi_iter.bi_size)
  1614. return;
  1615. bio_advance(bio, offset << 9);
  1616. bio->bi_iter.bi_size = size;
  1617. if (bio_integrity(bio))
  1618. bio_integrity_trim(bio);
  1619. }
  1620. EXPORT_SYMBOL_GPL(bio_trim);
  1621. /*
  1622. * create memory pools for biovec's in a bio_set.
  1623. * use the global biovec slabs created for general use.
  1624. */
  1625. int biovec_init_pool(mempool_t *pool, int pool_entries)
  1626. {
  1627. struct biovec_slab *bp = bvec_slabs + ARRAY_SIZE(bvec_slabs) - 1;
  1628. return mempool_init_slab_pool(pool, pool_entries, bp->slab);
  1629. }
  1630. /*
  1631. * bioset_exit - exit a bioset initialized with bioset_init()
  1632. *
  1633. * May be called on a zeroed but uninitialized bioset (i.e. allocated with
  1634. * kzalloc()).
  1635. */
  1636. void bioset_exit(struct bio_set *bs)
  1637. {
  1638. bio_alloc_cache_destroy(bs);
  1639. if (bs->rescue_workqueue)
  1640. destroy_workqueue(bs->rescue_workqueue);
  1641. bs->rescue_workqueue = NULL;
  1642. mempool_exit(&bs->bio_pool);
  1643. mempool_exit(&bs->bvec_pool);
  1644. if (bs->bio_slab)
  1645. bio_put_slab(bs);
  1646. bs->bio_slab = NULL;
  1647. }
  1648. EXPORT_SYMBOL(bioset_exit);
  1649. /**
  1650. * bioset_init - Initialize a bio_set
  1651. * @bs: pool to initialize
  1652. * @pool_size: Number of bio and bio_vecs to cache in the mempool
  1653. * @front_pad: Number of bytes to allocate in front of the returned bio
  1654. * @flags: Flags to modify behavior, currently %BIOSET_NEED_BVECS
  1655. * and %BIOSET_NEED_RESCUER
  1656. *
  1657. * Description:
  1658. * Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller
  1659. * to ask for a number of bytes to be allocated in front of the bio.
  1660. * Front pad allocation is useful for embedding the bio inside
  1661. * another structure, to avoid allocating extra data to go with the bio.
  1662. * Note that the bio must be embedded at the END of that structure always,
  1663. * or things will break badly.
  1664. * If %BIOSET_NEED_BVECS is set in @flags, a separate pool will be allocated
  1665. * for allocating iovecs. This pool is not needed e.g. for bio_init_clone().
  1666. * If %BIOSET_NEED_RESCUER is set, a workqueue is created which can be used
  1667. * to dispatch queued requests when the mempool runs out of space.
  1668. *
  1669. */
  1670. int bioset_init(struct bio_set *bs,
  1671. unsigned int pool_size,
  1672. unsigned int front_pad,
  1673. int flags)
  1674. {
  1675. bs->front_pad = front_pad;
  1676. if (flags & BIOSET_NEED_BVECS)
  1677. bs->back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec);
  1678. else
  1679. bs->back_pad = 0;
  1680. spin_lock_init(&bs->rescue_lock);
  1681. bio_list_init(&bs->rescue_list);
  1682. INIT_WORK(&bs->rescue_work, bio_alloc_rescue);
  1683. bs->bio_slab = bio_find_or_create_slab(bs);
  1684. if (!bs->bio_slab)
  1685. return -ENOMEM;
  1686. if (mempool_init_slab_pool(&bs->bio_pool, pool_size, bs->bio_slab))
  1687. goto bad;
  1688. if ((flags & BIOSET_NEED_BVECS) &&
  1689. biovec_init_pool(&bs->bvec_pool, pool_size))
  1690. goto bad;
  1691. if (flags & BIOSET_NEED_RESCUER) {
  1692. bs->rescue_workqueue = alloc_workqueue("bioset",
  1693. WQ_MEM_RECLAIM, 0);
  1694. if (!bs->rescue_workqueue)
  1695. goto bad;
  1696. }
  1697. if (flags & BIOSET_PERCPU_CACHE) {
  1698. bs->cache = alloc_percpu(struct bio_alloc_cache);
  1699. if (!bs->cache)
  1700. goto bad;
  1701. cpuhp_state_add_instance_nocalls(CPUHP_BIO_DEAD, &bs->cpuhp_dead);
  1702. }
  1703. return 0;
  1704. bad:
  1705. bioset_exit(bs);
  1706. return -ENOMEM;
  1707. }
  1708. EXPORT_SYMBOL(bioset_init);
  1709. static int __init init_bio(void)
  1710. {
  1711. int i;
  1712. BUILD_BUG_ON(BIO_FLAG_LAST > 8 * sizeof_field(struct bio, bi_flags));
  1713. for (i = 0; i < ARRAY_SIZE(bvec_slabs); i++) {
  1714. struct biovec_slab *bvs = bvec_slabs + i;
  1715. bvs->slab = kmem_cache_create(bvs->name,
  1716. bvs->nr_vecs * sizeof(struct bio_vec), 0,
  1717. SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
  1718. }
  1719. cpuhp_setup_state_multi(CPUHP_BIO_DEAD, "block/bio:dead", NULL,
  1720. bio_cpu_dead);
  1721. if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0,
  1722. BIOSET_NEED_BVECS | BIOSET_PERCPU_CACHE))
  1723. panic("bio: can't allocate bios\n");
  1724. return 0;
  1725. }
  1726. subsys_initcall(init_bio);