bio.c 31 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Copyright (C) 2007 Oracle. All rights reserved.
  4. * Copyright (C) 2022 Christoph Hellwig.
  5. */
  6. #include <linux/bio.h>
  7. #include "bio.h"
  8. #include "ctree.h"
  9. #include "volumes.h"
  10. #include "raid56.h"
  11. #include "async-thread.h"
  12. #include "dev-replace.h"
  13. #include "zoned.h"
  14. #include "file-item.h"
  15. #include "raid-stripe-tree.h"
  16. static struct bio_set btrfs_bioset;
  17. static struct bio_set btrfs_clone_bioset;
  18. static struct bio_set btrfs_repair_bioset;
  19. static mempool_t btrfs_failed_bio_pool;
  20. struct btrfs_failed_bio {
  21. struct btrfs_bio *bbio;
  22. int num_copies;
  23. atomic_t repair_count;
  24. };
  25. /* Is this a data path I/O that needs storage layer checksum and repair? */
  26. static inline bool is_data_bbio(const struct btrfs_bio *bbio)
  27. {
  28. return bbio->inode && is_data_inode(bbio->inode);
  29. }
  30. static bool bbio_has_ordered_extent(const struct btrfs_bio *bbio)
  31. {
  32. return is_data_bbio(bbio) && btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE;
  33. }
  34. /*
  35. * Initialize a btrfs_bio structure. This skips the embedded bio itself as it
  36. * is already initialized by the block layer.
  37. */
  38. void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, u64 file_offset,
  39. btrfs_bio_end_io_t end_io, void *private)
  40. {
  41. /* @inode parameter is mandatory. */
  42. ASSERT(inode);
  43. memset(bbio, 0, offsetof(struct btrfs_bio, bio));
  44. bbio->inode = inode;
  45. bbio->end_io = end_io;
  46. bbio->private = private;
  47. bbio->file_offset = file_offset;
  48. atomic_set(&bbio->pending_ios, 1);
  49. WRITE_ONCE(bbio->status, BLK_STS_OK);
  50. }
  51. /*
  52. * Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for
  53. * btrfs, and is used for all I/O submitted through btrfs_submit_bbio().
  54. *
  55. * Just like the underlying bio_alloc_bioset it will not fail as it is backed by
  56. * a mempool.
  57. */
  58. struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
  59. struct btrfs_inode *inode, u64 file_offset,
  60. btrfs_bio_end_io_t end_io, void *private)
  61. {
  62. struct btrfs_bio *bbio;
  63. struct bio *bio;
  64. bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset);
  65. bbio = btrfs_bio(bio);
  66. btrfs_bio_init(bbio, inode, file_offset, end_io, private);
  67. return bbio;
  68. }
  69. static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
  70. struct btrfs_bio *orig_bbio,
  71. u64 map_length)
  72. {
  73. struct btrfs_bio *bbio;
  74. struct bio *bio;
  75. bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT, GFP_NOFS,
  76. &btrfs_clone_bioset);
  77. if (IS_ERR(bio))
  78. return ERR_CAST(bio);
  79. bbio = btrfs_bio(bio);
  80. btrfs_bio_init(bbio, orig_bbio->inode, orig_bbio->file_offset, NULL, orig_bbio);
  81. orig_bbio->file_offset += map_length;
  82. if (bbio_has_ordered_extent(bbio)) {
  83. refcount_inc(&orig_bbio->ordered->refs);
  84. bbio->ordered = orig_bbio->ordered;
  85. bbio->orig_logical = orig_bbio->orig_logical;
  86. orig_bbio->orig_logical += map_length;
  87. }
  88. bbio->csum_search_commit_root = orig_bbio->csum_search_commit_root;
  89. bbio->can_use_append = orig_bbio->can_use_append;
  90. bbio->is_scrub = orig_bbio->is_scrub;
  91. bbio->is_remap = orig_bbio->is_remap;
  92. bbio->async_csum = orig_bbio->async_csum;
  93. atomic_inc(&orig_bbio->pending_ios);
  94. return bbio;
  95. }
  96. void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
  97. {
  98. /* Make sure we're already in task context. */
  99. ASSERT(in_task());
  100. if (bbio->async_csum)
  101. wait_for_completion(&bbio->csum_done);
  102. bbio->bio.bi_status = status;
  103. if (bbio->bio.bi_pool == &btrfs_clone_bioset) {
  104. struct btrfs_bio *orig_bbio = bbio->private;
  105. /* Free bio that was never submitted to the underlying device. */
  106. if (bbio_has_ordered_extent(bbio))
  107. btrfs_put_ordered_extent(bbio->ordered);
  108. bio_put(&bbio->bio);
  109. bbio = orig_bbio;
  110. }
  111. /*
  112. * At this point, bbio always points to the original btrfs_bio. Save
  113. * the first error in it.
  114. */
  115. if (status != BLK_STS_OK)
  116. cmpxchg(&bbio->status, BLK_STS_OK, status);
  117. if (atomic_dec_and_test(&bbio->pending_ios)) {
  118. /* Load split bio's error which might be set above. */
  119. if (status == BLK_STS_OK)
  120. bbio->bio.bi_status = READ_ONCE(bbio->status);
  121. if (bbio_has_ordered_extent(bbio)) {
  122. struct btrfs_ordered_extent *ordered = bbio->ordered;
  123. bbio->end_io(bbio);
  124. btrfs_put_ordered_extent(ordered);
  125. } else {
  126. bbio->end_io(bbio);
  127. }
  128. }
  129. }
  130. static int next_repair_mirror(const struct btrfs_failed_bio *fbio, int cur_mirror)
  131. {
  132. if (cur_mirror == fbio->num_copies)
  133. return cur_mirror + 1 - fbio->num_copies;
  134. return cur_mirror + 1;
  135. }
  136. static int prev_repair_mirror(const struct btrfs_failed_bio *fbio, int cur_mirror)
  137. {
  138. if (cur_mirror == 1)
  139. return fbio->num_copies;
  140. return cur_mirror - 1;
  141. }
  142. static void btrfs_repair_done(struct btrfs_failed_bio *fbio)
  143. {
  144. if (atomic_dec_and_test(&fbio->repair_count)) {
  145. btrfs_bio_end_io(fbio->bbio, fbio->bbio->bio.bi_status);
  146. mempool_free(fbio, &btrfs_failed_bio_pool);
  147. }
  148. }
  149. static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
  150. struct btrfs_device *dev)
  151. {
  152. struct btrfs_failed_bio *fbio = repair_bbio->private;
  153. struct btrfs_inode *inode = repair_bbio->inode;
  154. struct btrfs_fs_info *fs_info = inode->root->fs_info;
  155. /*
  156. * We can not move forward the saved_iter, as it will be later
  157. * utilized by repair_bbio again.
  158. */
  159. struct bvec_iter saved_iter = repair_bbio->saved_iter;
  160. const u32 step = min(fs_info->sectorsize, PAGE_SIZE);
  161. const u64 logical = repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT;
  162. const u32 nr_steps = repair_bbio->saved_iter.bi_size / step;
  163. int mirror = repair_bbio->mirror_num;
  164. phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE];
  165. phys_addr_t paddr;
  166. unsigned int slot = 0;
  167. /* Repair bbio should be eaxctly one block sized. */
  168. ASSERT(repair_bbio->saved_iter.bi_size == fs_info->sectorsize);
  169. btrfs_bio_for_each_block(paddr, &repair_bbio->bio, &saved_iter, step) {
  170. ASSERT(slot < nr_steps);
  171. paddrs[slot] = paddr;
  172. slot++;
  173. }
  174. if (repair_bbio->bio.bi_status ||
  175. !btrfs_data_csum_ok(repair_bbio, dev, 0, paddrs)) {
  176. bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ);
  177. repair_bbio->bio.bi_iter = repair_bbio->saved_iter;
  178. mirror = next_repair_mirror(fbio, mirror);
  179. if (mirror == fbio->bbio->mirror_num) {
  180. btrfs_debug(fs_info, "no mirror left");
  181. fbio->bbio->bio.bi_status = BLK_STS_IOERR;
  182. goto done;
  183. }
  184. btrfs_submit_bbio(repair_bbio, mirror);
  185. return;
  186. }
  187. do {
  188. mirror = prev_repair_mirror(fbio, mirror);
  189. btrfs_repair_io_failure(fs_info, btrfs_ino(inode),
  190. repair_bbio->file_offset, fs_info->sectorsize,
  191. logical, paddrs, step, mirror);
  192. } while (mirror != fbio->bbio->mirror_num);
  193. done:
  194. btrfs_repair_done(fbio);
  195. bio_put(&repair_bbio->bio);
  196. }
  197. /*
  198. * Try to kick off a repair read to the next available mirror for a bad sector.
  199. *
  200. * This primarily tries to recover good data to serve the actual read request,
  201. * but also tries to write the good data back to the bad mirror(s) when a
  202. * read succeeded to restore the redundancy.
  203. */
  204. static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio,
  205. u32 bio_offset,
  206. phys_addr_t paddrs[],
  207. struct btrfs_failed_bio *fbio)
  208. {
  209. struct btrfs_inode *inode = failed_bbio->inode;
  210. struct btrfs_fs_info *fs_info = inode->root->fs_info;
  211. const u32 sectorsize = fs_info->sectorsize;
  212. const u32 step = min(fs_info->sectorsize, PAGE_SIZE);
  213. const u32 nr_steps = sectorsize / step;
  214. /*
  215. * For bs > ps cases, the saved_iter can be partially moved forward.
  216. * In that case we should round it down to the block boundary.
  217. */
  218. const u64 logical = round_down(failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT,
  219. sectorsize);
  220. struct btrfs_bio *repair_bbio;
  221. struct bio *repair_bio;
  222. int num_copies;
  223. int mirror;
  224. btrfs_debug(fs_info, "repair read error: read error at %llu",
  225. failed_bbio->file_offset + bio_offset);
  226. num_copies = btrfs_num_copies(fs_info, logical, sectorsize);
  227. if (num_copies == 1) {
  228. btrfs_debug(fs_info, "no copy to repair from");
  229. failed_bbio->bio.bi_status = BLK_STS_IOERR;
  230. return fbio;
  231. }
  232. if (!fbio) {
  233. fbio = mempool_alloc(&btrfs_failed_bio_pool, GFP_NOFS);
  234. fbio->bbio = failed_bbio;
  235. fbio->num_copies = num_copies;
  236. atomic_set(&fbio->repair_count, 1);
  237. }
  238. atomic_inc(&fbio->repair_count);
  239. repair_bio = bio_alloc_bioset(NULL, nr_steps, REQ_OP_READ, GFP_NOFS,
  240. &btrfs_repair_bioset);
  241. repair_bio->bi_iter.bi_sector = logical >> SECTOR_SHIFT;
  242. for (int i = 0; i < nr_steps; i++) {
  243. int ret;
  244. ASSERT(offset_in_page(paddrs[i]) + step <= PAGE_SIZE);
  245. ret = bio_add_page(repair_bio, phys_to_page(paddrs[i]), step,
  246. offset_in_page(paddrs[i]));
  247. ASSERT(ret == step);
  248. }
  249. repair_bbio = btrfs_bio(repair_bio);
  250. btrfs_bio_init(repair_bbio, failed_bbio->inode, failed_bbio->file_offset + bio_offset,
  251. NULL, fbio);
  252. mirror = next_repair_mirror(fbio, failed_bbio->mirror_num);
  253. btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror);
  254. btrfs_submit_bbio(repair_bbio, mirror);
  255. return fbio;
  256. }
  257. static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *dev)
  258. {
  259. struct btrfs_inode *inode = bbio->inode;
  260. struct btrfs_fs_info *fs_info = inode->root->fs_info;
  261. const u32 sectorsize = fs_info->sectorsize;
  262. const u32 step = min(sectorsize, PAGE_SIZE);
  263. const u32 nr_steps = sectorsize / step;
  264. struct bvec_iter *iter = &bbio->saved_iter;
  265. blk_status_t status = bbio->bio.bi_status;
  266. struct btrfs_failed_bio *fbio = NULL;
  267. phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE];
  268. phys_addr_t paddr;
  269. u32 offset = 0;
  270. /* Read-repair requires the inode field to be set by the submitter. */
  271. ASSERT(inode);
  272. /*
  273. * Hand off repair bios to the repair code as there is no upper level
  274. * submitter for them.
  275. */
  276. if (bbio->bio.bi_pool == &btrfs_repair_bioset) {
  277. btrfs_end_repair_bio(bbio, dev);
  278. return;
  279. }
  280. /* Clear the I/O error. A failed repair will reset it. */
  281. bbio->bio.bi_status = BLK_STS_OK;
  282. btrfs_bio_for_each_block(paddr, &bbio->bio, iter, step) {
  283. paddrs[(offset / step) % nr_steps] = paddr;
  284. offset += step;
  285. if (IS_ALIGNED(offset, sectorsize)) {
  286. if (status ||
  287. !btrfs_data_csum_ok(bbio, dev, offset - sectorsize, paddrs))
  288. fbio = repair_one_sector(bbio, offset - sectorsize,
  289. paddrs, fbio);
  290. }
  291. }
  292. if (bbio->csum != bbio->csum_inline)
  293. kvfree(bbio->csum);
  294. if (fbio)
  295. btrfs_repair_done(fbio);
  296. else
  297. btrfs_bio_end_io(bbio, bbio->bio.bi_status);
  298. }
  299. static void btrfs_log_dev_io_error(const struct bio *bio, struct btrfs_device *dev)
  300. {
  301. if (!dev || !dev->bdev)
  302. return;
  303. if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET)
  304. return;
  305. if (btrfs_op(bio) == BTRFS_MAP_WRITE)
  306. btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
  307. else if (!(bio->bi_opf & REQ_RAHEAD))
  308. btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
  309. if (bio->bi_opf & REQ_PREFLUSH)
  310. btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS);
  311. }
  312. static struct workqueue_struct *btrfs_end_io_wq(const struct btrfs_fs_info *fs_info,
  313. const struct bio *bio)
  314. {
  315. if (bio->bi_opf & REQ_META)
  316. return fs_info->endio_meta_workers;
  317. return fs_info->endio_workers;
  318. }
  319. static void simple_end_io_work(struct work_struct *work)
  320. {
  321. struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);
  322. struct bio *bio = &bbio->bio;
  323. if (bio_op(bio) == REQ_OP_READ) {
  324. /* Metadata reads are checked and repaired by the submitter. */
  325. if (is_data_bbio(bbio))
  326. return btrfs_check_read_bio(bbio, bbio->bio.bi_private);
  327. return btrfs_bio_end_io(bbio, bbio->bio.bi_status);
  328. }
  329. if (bio_is_zone_append(bio) && !bio->bi_status)
  330. btrfs_record_physical_zoned(bbio);
  331. btrfs_bio_end_io(bbio, bbio->bio.bi_status);
  332. }
  333. static void btrfs_simple_end_io(struct bio *bio)
  334. {
  335. struct btrfs_bio *bbio = btrfs_bio(bio);
  336. struct btrfs_device *dev = bio->bi_private;
  337. struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
  338. btrfs_bio_counter_dec(fs_info);
  339. if (bio->bi_status)
  340. btrfs_log_dev_io_error(bio, dev);
  341. INIT_WORK(&bbio->end_io_work, simple_end_io_work);
  342. queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work);
  343. }
  344. static void btrfs_raid56_end_io(struct bio *bio)
  345. {
  346. struct btrfs_io_context *bioc = bio->bi_private;
  347. struct btrfs_bio *bbio = btrfs_bio(bio);
  348. /* RAID56 endio is always handled in workqueue. */
  349. ASSERT(in_task());
  350. btrfs_bio_counter_dec(bioc->fs_info);
  351. bbio->mirror_num = bioc->mirror_num;
  352. if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio))
  353. btrfs_check_read_bio(bbio, NULL);
  354. else
  355. btrfs_bio_end_io(bbio, bbio->bio.bi_status);
  356. btrfs_put_bioc(bioc);
  357. }
  358. static void orig_write_end_io_work(struct work_struct *work)
  359. {
  360. struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);
  361. struct bio *bio = &bbio->bio;
  362. struct btrfs_io_stripe *stripe = bio->bi_private;
  363. struct btrfs_io_context *bioc = stripe->bioc;
  364. btrfs_bio_counter_dec(bioc->fs_info);
  365. if (bio->bi_status) {
  366. atomic_inc(&bioc->error);
  367. btrfs_log_dev_io_error(bio, stripe->dev);
  368. }
  369. /*
  370. * Only send an error to the higher layers if it is beyond the tolerance
  371. * threshold.
  372. */
  373. if (atomic_read(&bioc->error) > bioc->max_errors)
  374. bio->bi_status = BLK_STS_IOERR;
  375. else
  376. bio->bi_status = BLK_STS_OK;
  377. if (bio_is_zone_append(bio) && !bio->bi_status)
  378. stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
  379. btrfs_bio_end_io(bbio, bbio->bio.bi_status);
  380. btrfs_put_bioc(bioc);
  381. }
  382. static void btrfs_orig_write_end_io(struct bio *bio)
  383. {
  384. struct btrfs_bio *bbio = btrfs_bio(bio);
  385. INIT_WORK(&bbio->end_io_work, orig_write_end_io_work);
  386. queue_work(btrfs_end_io_wq(bbio->inode->root->fs_info, bio), &bbio->end_io_work);
  387. }
  388. static void clone_write_end_io_work(struct work_struct *work)
  389. {
  390. struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);
  391. struct bio *bio = &bbio->bio;
  392. struct btrfs_io_stripe *stripe = bio->bi_private;
  393. if (bio->bi_status) {
  394. atomic_inc(&stripe->bioc->error);
  395. btrfs_log_dev_io_error(bio, stripe->dev);
  396. } else if (bio_is_zone_append(bio)) {
  397. stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
  398. }
  399. /* Pass on control to the original bio this one was cloned from */
  400. bio_endio(stripe->bioc->orig_bio);
  401. bio_put(bio);
  402. }
  403. static void btrfs_clone_write_end_io(struct bio *bio)
  404. {
  405. struct btrfs_bio *bbio = btrfs_bio(bio);
  406. INIT_WORK(&bbio->end_io_work, clone_write_end_io_work);
  407. queue_work(btrfs_end_io_wq(bbio->inode->root->fs_info, bio), &bbio->end_io_work);
  408. }
  409. static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
  410. {
  411. u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
  412. if (!dev || !dev->bdev ||
  413. test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
  414. (btrfs_op(bio) == BTRFS_MAP_WRITE &&
  415. !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
  416. bio_io_error(bio);
  417. return;
  418. }
  419. bio_set_dev(bio, dev->bdev);
  420. /*
  421. * For zone append writing, bi_sector must point the beginning of the
  422. * zone
  423. */
  424. if (btrfs_bio(bio)->can_use_append && btrfs_dev_is_sequential(dev, physical)) {
  425. u64 zone_start = round_down(physical, dev->fs_info->zone_size);
  426. ASSERT(btrfs_dev_is_sequential(dev, physical));
  427. bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
  428. bio->bi_opf &= ~REQ_OP_WRITE;
  429. bio->bi_opf |= REQ_OP_ZONE_APPEND;
  430. }
  431. btrfs_debug(dev->fs_info,
  432. "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
  433. __func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
  434. (unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev),
  435. dev->devid, bio->bi_iter.bi_size);
  436. /*
  437. * Track reads if tracking is enabled; ignore I/O operations before the
  438. * filesystem is fully initialized.
  439. */
  440. if (dev->fs_devices->collect_fs_stats && bio_op(bio) == REQ_OP_READ && dev->fs_info)
  441. percpu_counter_add(&dev->fs_info->stats_read_blocks,
  442. bio->bi_iter.bi_size >> dev->fs_info->sectorsize_bits);
  443. if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT)
  444. blkcg_punt_bio_submit(bio);
  445. else
  446. submit_bio(bio);
  447. }
  448. static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
  449. {
  450. struct bio *orig_bio = bioc->orig_bio, *bio;
  451. struct btrfs_bio *orig_bbio = btrfs_bio(orig_bio);
  452. ASSERT(bio_op(orig_bio) != REQ_OP_READ);
  453. /* Reuse the bio embedded into the btrfs_bio for the last mirror */
  454. if (dev_nr == bioc->num_stripes - 1) {
  455. bio = orig_bio;
  456. bio->bi_end_io = btrfs_orig_write_end_io;
  457. } else {
  458. /* We need to use endio_work to run end_io in task context. */
  459. bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &btrfs_bioset);
  460. bio_inc_remaining(orig_bio);
  461. btrfs_bio_init(btrfs_bio(bio), orig_bbio->inode,
  462. orig_bbio->file_offset, NULL, NULL);
  463. bio->bi_end_io = btrfs_clone_write_end_io;
  464. }
  465. bio->bi_private = &bioc->stripes[dev_nr];
  466. bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT;
  467. bioc->stripes[dev_nr].bioc = bioc;
  468. bioc->size = bio->bi_iter.bi_size;
  469. btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio);
  470. }
  471. static void btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc,
  472. struct btrfs_io_stripe *smap, int mirror_num)
  473. {
  474. if (!bioc) {
  475. /* Single mirror read/write fast path. */
  476. btrfs_bio(bio)->mirror_num = mirror_num;
  477. bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT;
  478. if (bio_op(bio) != REQ_OP_READ)
  479. btrfs_bio(bio)->orig_physical = smap->physical;
  480. bio->bi_private = smap->dev;
  481. bio->bi_end_io = btrfs_simple_end_io;
  482. btrfs_submit_dev_bio(smap->dev, bio);
  483. } else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
  484. /* Parity RAID write or read recovery. */
  485. bio->bi_private = bioc;
  486. bio->bi_end_io = btrfs_raid56_end_io;
  487. if (bio_op(bio) == REQ_OP_READ)
  488. raid56_parity_recover(bio, bioc, mirror_num);
  489. else
  490. raid56_parity_write(bio, bioc);
  491. } else {
  492. /* Write to multiple mirrors. */
  493. int total_devs = bioc->num_stripes;
  494. bioc->orig_bio = bio;
  495. for (int dev_nr = 0; dev_nr < total_devs; dev_nr++)
  496. btrfs_submit_mirrored_bio(bioc, dev_nr);
  497. }
  498. }
  499. static int btrfs_bio_csum(struct btrfs_bio *bbio)
  500. {
  501. if (bbio->bio.bi_opf & REQ_META)
  502. return btree_csum_one_bio(bbio);
  503. #ifdef CONFIG_BTRFS_EXPERIMENTAL
  504. return btrfs_csum_one_bio(bbio, true);
  505. #else
  506. return btrfs_csum_one_bio(bbio, false);
  507. #endif
  508. }
  509. /*
  510. * Async submit bios are used to offload expensive checksumming onto the worker
  511. * threads.
  512. */
  513. struct async_submit_bio {
  514. struct btrfs_bio *bbio;
  515. struct btrfs_io_context *bioc;
  516. struct btrfs_io_stripe smap;
  517. int mirror_num;
  518. struct btrfs_work work;
  519. };
  520. /*
  521. * In order to insert checksums into the metadata in large chunks, we wait
  522. * until bio submission time. All the pages in the bio are checksummed and
  523. * sums are attached onto the ordered extent record.
  524. *
  525. * At IO completion time the csums attached on the ordered extent record are
  526. * inserted into the btree.
  527. */
  528. static void run_one_async_start(struct btrfs_work *work)
  529. {
  530. struct async_submit_bio *async =
  531. container_of(work, struct async_submit_bio, work);
  532. int ret;
  533. ret = btrfs_bio_csum(async->bbio);
  534. if (ret)
  535. async->bbio->bio.bi_status = errno_to_blk_status(ret);
  536. }
  537. /*
  538. * In order to insert checksums into the metadata in large chunks, we wait
  539. * until bio submission time. All the pages in the bio are checksummed and
  540. * sums are attached onto the ordered extent record.
  541. *
  542. * At IO completion time the csums attached on the ordered extent record are
  543. * inserted into the tree.
  544. *
  545. * If called with @do_free == true, then it will free the work struct.
  546. */
  547. static void run_one_async_done(struct btrfs_work *work, bool do_free)
  548. {
  549. struct async_submit_bio *async =
  550. container_of(work, struct async_submit_bio, work);
  551. struct bio *bio = &async->bbio->bio;
  552. if (do_free) {
  553. kfree(container_of(work, struct async_submit_bio, work));
  554. return;
  555. }
  556. /* If an error occurred we just want to clean up the bio and move on. */
  557. if (bio->bi_status) {
  558. btrfs_bio_end_io(async->bbio, bio->bi_status);
  559. return;
  560. }
  561. /*
  562. * All of the bios that pass through here are from async helpers.
  563. * Use REQ_BTRFS_CGROUP_PUNT to issue them from the owning cgroup's
  564. * context. This changes nothing when cgroups aren't in use.
  565. */
  566. bio->bi_opf |= REQ_BTRFS_CGROUP_PUNT;
  567. btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num);
  568. }
  569. static bool should_async_write(struct btrfs_bio *bbio)
  570. {
  571. struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
  572. bool auto_csum_mode = true;
  573. #ifdef CONFIG_BTRFS_EXPERIMENTAL
  574. /*
  575. * Write bios will calculate checksum and submit bio at the same time.
  576. * Unless explicitly required don't offload serial csum calculate and bio
  577. * submit into a workqueue.
  578. */
  579. return false;
  580. #endif
  581. /* Submit synchronously if the checksum implementation is fast. */
  582. if (auto_csum_mode && test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags))
  583. return false;
  584. /*
  585. * Try to defer the submission to a workqueue to parallelize the
  586. * checksum calculation unless the I/O is issued synchronously.
  587. */
  588. if (op_is_sync(bbio->bio.bi_opf))
  589. return false;
  590. /* Zoned devices require I/O to be submitted in order. */
  591. if ((bbio->bio.bi_opf & REQ_META) && btrfs_is_zoned(fs_info))
  592. return false;
  593. return true;
  594. }
  595. /*
  596. * Submit bio to an async queue.
  597. *
  598. * Return true if the work has been successfully submitted, else false.
  599. */
  600. static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
  601. struct btrfs_io_context *bioc,
  602. struct btrfs_io_stripe *smap, int mirror_num)
  603. {
  604. struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
  605. struct async_submit_bio *async;
  606. async = kmalloc_obj(*async, GFP_NOFS);
  607. if (!async)
  608. return false;
  609. async->bbio = bbio;
  610. async->bioc = bioc;
  611. async->smap = *smap;
  612. async->mirror_num = mirror_num;
  613. btrfs_init_work(&async->work, run_one_async_start, run_one_async_done);
  614. btrfs_queue_work(fs_info->workers, &async->work);
  615. return true;
  616. }
  617. static u64 btrfs_append_map_length(struct btrfs_bio *bbio, u64 map_length)
  618. {
  619. struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
  620. unsigned int nr_segs;
  621. int sector_offset;
  622. map_length = min(map_length, fs_info->max_zone_append_size);
  623. sector_offset = bio_split_rw_at(&bbio->bio, &fs_info->limits,
  624. &nr_segs, map_length);
  625. if (sector_offset) {
  626. /*
  627. * bio_split_rw_at() could split at a size smaller than our
  628. * sectorsize and thus cause unaligned I/Os. Fix that by
  629. * always rounding down to the nearest boundary.
  630. */
  631. return ALIGN_DOWN(sector_offset << SECTOR_SHIFT, fs_info->sectorsize);
  632. }
  633. return map_length;
  634. }
  635. static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
  636. {
  637. struct btrfs_inode *inode = bbio->inode;
  638. struct btrfs_fs_info *fs_info = inode->root->fs_info;
  639. struct bio *bio = &bbio->bio;
  640. u64 logical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
  641. u64 length = bio->bi_iter.bi_size;
  642. u64 map_length = length;
  643. struct btrfs_io_context *bioc = NULL;
  644. struct btrfs_io_stripe smap;
  645. blk_status_t status;
  646. int ret;
  647. if (bbio->is_scrub || btrfs_is_data_reloc_root(inode->root))
  648. smap.rst_search_commit_root = true;
  649. else
  650. smap.rst_search_commit_root = false;
  651. btrfs_bio_counter_inc_blocked(fs_info);
  652. ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
  653. &bioc, &smap, &mirror_num);
  654. if (ret) {
  655. status = errno_to_blk_status(ret);
  656. btrfs_bio_counter_dec(fs_info);
  657. goto end_bbio;
  658. }
  659. /*
  660. * For fscrypt writes we will get the encrypted bio after we've remapped
  661. * our bio to the physical disk location, so we need to save the
  662. * original bytenr so we know what we're checksumming.
  663. */
  664. if (bio_op(bio) == REQ_OP_WRITE && is_data_bbio(bbio))
  665. bbio->orig_logical = logical;
  666. bbio->can_use_append = btrfs_use_zone_append(bbio);
  667. map_length = min(map_length, length);
  668. if (bbio->can_use_append)
  669. map_length = btrfs_append_map_length(bbio, map_length);
  670. if (map_length < length) {
  671. struct btrfs_bio *split;
  672. split = btrfs_split_bio(fs_info, bbio, map_length);
  673. if (IS_ERR(split)) {
  674. status = errno_to_blk_status(PTR_ERR(split));
  675. btrfs_bio_counter_dec(fs_info);
  676. goto end_bbio;
  677. }
  678. bbio = split;
  679. bio = &bbio->bio;
  680. }
  681. /*
  682. * Save the iter for the end_io handler and preload the checksums for
  683. * data reads.
  684. */
  685. if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio)) {
  686. bbio->saved_iter = bio->bi_iter;
  687. ret = btrfs_lookup_bio_sums(bbio);
  688. status = errno_to_blk_status(ret);
  689. if (status)
  690. goto fail;
  691. }
  692. if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
  693. if (is_data_bbio(bbio) && bioc && bioc->use_rst) {
  694. /*
  695. * No locking for the list update, as we only add to
  696. * the list in the I/O submission path, and list
  697. * iteration only happens in the completion path, which
  698. * can't happen until after the last submission.
  699. */
  700. btrfs_get_bioc(bioc);
  701. list_add_tail(&bioc->rst_ordered_entry, &bbio->ordered->bioc_list);
  702. }
  703. /*
  704. * Csum items for reloc roots have already been cloned at this
  705. * point, so they are handled as part of the no-checksum case.
  706. */
  707. if (!(inode->flags & BTRFS_INODE_NODATASUM) &&
  708. !test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state) &&
  709. !btrfs_is_data_reloc_root(inode->root) && !bbio->is_remap) {
  710. if (should_async_write(bbio) &&
  711. btrfs_wq_submit_bio(bbio, bioc, &smap, mirror_num))
  712. goto done;
  713. ret = btrfs_bio_csum(bbio);
  714. status = errno_to_blk_status(ret);
  715. if (status)
  716. goto fail;
  717. } else if (bbio->can_use_append ||
  718. (btrfs_is_zoned(fs_info) && inode->flags & BTRFS_INODE_NODATASUM)) {
  719. ret = btrfs_alloc_dummy_sum(bbio);
  720. status = errno_to_blk_status(ret);
  721. if (status)
  722. goto fail;
  723. }
  724. }
  725. btrfs_submit_bio(bio, bioc, &smap, mirror_num);
  726. done:
  727. return map_length == length;
  728. fail:
  729. btrfs_bio_counter_dec(fs_info);
  730. /*
  731. * We have split the original bbio, now we have to end both the current
  732. * @bbio and remaining one, as the remaining one will never be submitted.
  733. */
  734. if (map_length < length) {
  735. struct btrfs_bio *remaining = bbio->private;
  736. ASSERT(bbio->bio.bi_pool == &btrfs_clone_bioset);
  737. ASSERT(remaining);
  738. btrfs_bio_end_io(remaining, status);
  739. }
  740. end_bbio:
  741. btrfs_bio_end_io(bbio, status);
  742. /* Do not submit another chunk */
  743. return true;
  744. }
  /*
   * Debug-only sanity check of a btrfs bio that is about to be submitted.
   *
   * With CONFIG_BTRFS_ASSERT enabled this asserts that:
   * - the logical start and the length of the bio are aligned to the fs
   *   sectorsize, and the length is not zero
   * - the offset and length of every bvec are aligned to
   *   min(sectorsize, PAGE_SIZE)
   *
   * Without CONFIG_BTRFS_ASSERT the function body is empty.
   */
  static void assert_bbio_alignment(struct btrfs_bio *bbio)
  {
  #ifdef CONFIG_BTRFS_ASSERT
  	struct btrfs_inode *inode = bbio->inode;
  	struct btrfs_root *root = inode->root;
  	const u32 blocksize = root->fs_info->sectorsize;
  	const u32 alignment = min(blocksize, PAGE_SIZE);
  	const u64 logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
  	const u32 length = bbio->bio.bi_iter.bi_size;
  	struct bvec_iter cur;
  	struct bio_vec bvec;

  	/* The logical and length should still be aligned to blocksize. */
  	ASSERT(IS_ALIGNED(logical, blocksize) && IS_ALIGNED(length, blocksize) &&
  	       length != 0, "root=%llu inode=%llu logical=%llu length=%u",
  	       btrfs_root_id(root), btrfs_ino(inode), logical, length);

  	/* Each bvec must be aligned on its own, not just the bio as a whole. */
  	bio_for_each_bvec(bvec, &bbio->bio, cur) {
  		ASSERT(IS_ALIGNED(bvec.bv_offset, alignment) &&
  		       IS_ALIGNED(bvec.bv_len, alignment),
  		       "root=%llu inode=%llu logical=%llu length=%u index=%u bv_offset=%u bv_len=%u",
  		       btrfs_root_id(root), btrfs_ino(inode), logical, length,
  		       cur.bi_idx, bvec.bv_offset, bvec.bv_len);
  	}
  #endif
  }
  769. void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num)
  770. {
  771. /* If bbio->inode is not populated, its file_offset must be 0. */
  772. ASSERT(bbio->inode || bbio->file_offset == 0);
  773. assert_bbio_alignment(bbio);
  774. while (!btrfs_submit_chunk(bbio, mirror_num))
  775. ;
  776. }
  777. /*
  778. * Submit a repair write.
  779. *
  780. * This bypasses btrfs_submit_bbio() deliberately, as that writes all copies in a
  781. * RAID setup. Here we only want to write the one bad copy, so we do the
  782. * mapping ourselves and submit the bio directly.
  783. *
  784. * The I/O is issued synchronously to block the repair read completion from
  785. * freeing the bio.
  786. *
  787. * @ino: Offending inode number
  788. * @fileoff: File offset inside the inode
  789. * @length: Length of the repair write
  790. * @logical: Logical address of the range
  791. * @paddrs: Physical address array of the content
  792. * @step: Length of for each paddrs
  793. * @mirror_num: Mirror number to write to. Must not be zero
  794. */
  795. int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 fileoff,
  796. u32 length, u64 logical, const phys_addr_t paddrs[],
  797. unsigned int step, int mirror_num)
  798. {
  799. const u32 nr_steps = DIV_ROUND_UP_POW2(length, step);
  800. struct btrfs_io_stripe smap = { 0 };
  801. struct bio *bio = NULL;
  802. int ret = 0;
  803. BUG_ON(!mirror_num);
  804. /* Basic alignment checks. */
  805. ASSERT(IS_ALIGNED(logical, fs_info->sectorsize));
  806. ASSERT(IS_ALIGNED(length, fs_info->sectorsize));
  807. ASSERT(IS_ALIGNED(fileoff, fs_info->sectorsize));
  808. /* Either it's a single data or metadata block. */
  809. ASSERT(length <= BTRFS_MAX_BLOCKSIZE);
  810. ASSERT(step <= length);
  811. ASSERT(is_power_of_2(step));
  812. /*
  813. * The fs either mounted RO or hit critical errors, no need
  814. * to continue repairing.
  815. */
  816. if (unlikely(sb_rdonly(fs_info->sb)))
  817. return 0;
  818. if (btrfs_repair_one_zone(fs_info, logical))
  819. return 0;
  820. /*
  821. * Avoid races with device replace and make sure our bioc has devices
  822. * associated to its stripes that don't go away while we are doing the
  823. * read repair operation.
  824. */
  825. btrfs_bio_counter_inc_blocked(fs_info);
  826. ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num);
  827. if (ret < 0)
  828. goto out_counter_dec;
  829. if (unlikely(!smap.dev->bdev ||
  830. !test_bit(BTRFS_DEV_STATE_WRITEABLE, &smap.dev->dev_state))) {
  831. ret = -EIO;
  832. goto out_counter_dec;
  833. }
  834. bio = bio_alloc(smap.dev->bdev, nr_steps, REQ_OP_WRITE | REQ_SYNC, GFP_NOFS);
  835. bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
  836. for (int i = 0; i < nr_steps; i++) {
  837. ret = bio_add_page(bio, phys_to_page(paddrs[i]), step, offset_in_page(paddrs[i]));
  838. /* We should have allocated enough slots to contain all the different pages. */
  839. ASSERT(ret == step);
  840. }
  841. ret = submit_bio_wait(bio);
  842. bio_put(bio);
  843. if (ret) {
  844. /* try to remap that extent elsewhere? */
  845. btrfs_dev_stat_inc_and_print(smap.dev, BTRFS_DEV_STAT_WRITE_ERRS);
  846. goto out_counter_dec;
  847. }
  848. btrfs_info_rl(fs_info,
  849. "read error corrected: ino %llu off %llu (dev %s sector %llu)",
  850. ino, fileoff, btrfs_dev_name(smap.dev),
  851. smap.physical >> SECTOR_SHIFT);
  852. ret = 0;
  853. out_counter_dec:
  854. btrfs_bio_counter_dec(fs_info);
  855. return ret;
  856. }
  857. /*
  858. * Submit a btrfs_bio based repair write.
  859. *
  860. * If @dev_replace is true, the write would be submitted to dev-replace target.
  861. */
  862. void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace)
  863. {
  864. struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
  865. u64 logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
  866. u64 length = bbio->bio.bi_iter.bi_size;
  867. struct btrfs_io_stripe smap = { 0 };
  868. int ret;
  869. ASSERT(mirror_num > 0);
  870. ASSERT(btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE);
  871. ASSERT(!is_data_inode(bbio->inode));
  872. ASSERT(bbio->is_scrub);
  873. btrfs_bio_counter_inc_blocked(fs_info);
  874. ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num);
  875. if (ret < 0)
  876. goto fail;
  877. if (dev_replace) {
  878. ASSERT(smap.dev == fs_info->dev_replace.srcdev);
  879. smap.dev = fs_info->dev_replace.tgtdev;
  880. }
  881. btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num);
  882. return;
  883. fail:
  884. btrfs_bio_counter_dec(fs_info);
  885. btrfs_bio_end_io(bbio, errno_to_blk_status(ret));
  886. }
  887. int __init btrfs_bioset_init(void)
  888. {
  889. if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
  890. offsetof(struct btrfs_bio, bio),
  891. BIOSET_NEED_BVECS))
  892. return -ENOMEM;
  893. if (bioset_init(&btrfs_clone_bioset, BIO_POOL_SIZE,
  894. offsetof(struct btrfs_bio, bio), 0))
  895. goto out;
  896. if (bioset_init(&btrfs_repair_bioset, BIO_POOL_SIZE,
  897. offsetof(struct btrfs_bio, bio),
  898. BIOSET_NEED_BVECS))
  899. goto out;
  900. if (mempool_init_kmalloc_pool(&btrfs_failed_bio_pool, BIO_POOL_SIZE,
  901. sizeof(struct btrfs_failed_bio)))
  902. goto out;
  903. return 0;
  904. out:
  905. btrfs_bioset_exit();
  906. return -ENOMEM;
  907. }
  908. void __cold btrfs_bioset_exit(void)
  909. {
  910. mempool_exit(&btrfs_failed_bio_pool);
  911. bioset_exit(&btrfs_repair_bioset);
  912. bioset_exit(&btrfs_clone_bioset);
  913. bioset_exit(&btrfs_bioset);
  914. }