ioend.c 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Copyright (c) 2016-2025 Christoph Hellwig.
  4. */
  5. #include <linux/iomap.h>
  6. #include <linux/list_sort.h>
  7. #include <linux/pagemap.h>
  8. #include <linux/writeback.h>
  9. #include <linux/fserror.h>
  10. #include "internal.h"
  11. #include "trace.h"
/*
 * Bio set that ioend bios in this file are allocated from and split into.
 * It is initialised by iomap_ioend_init() with a front pad of
 * offsetof(struct iomap_ioend, io_bio).
 */
struct bio_set iomap_ioend_bioset;
EXPORT_SYMBOL_GPL(iomap_ioend_bioset);
  14. struct iomap_ioend *iomap_init_ioend(struct inode *inode,
  15. struct bio *bio, loff_t file_offset, u16 ioend_flags)
  16. {
  17. struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
  18. atomic_set(&ioend->io_remaining, 1);
  19. ioend->io_error = 0;
  20. ioend->io_parent = NULL;
  21. INIT_LIST_HEAD(&ioend->io_list);
  22. ioend->io_flags = ioend_flags;
  23. ioend->io_inode = inode;
  24. ioend->io_offset = file_offset;
  25. ioend->io_size = bio->bi_iter.bi_size;
  26. ioend->io_sector = bio->bi_iter.bi_sector;
  27. ioend->io_private = NULL;
  28. return ioend;
  29. }
  30. EXPORT_SYMBOL_GPL(iomap_init_ioend);
  31. /*
  32. * We're now finished for good with this ioend structure. Update the folio
  33. * state, release holds on bios, and finally free up memory. Do not use the
  34. * ioend after this.
  35. */
  36. static u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend)
  37. {
  38. struct inode *inode = ioend->io_inode;
  39. struct bio *bio = &ioend->io_bio;
  40. struct folio_iter fi;
  41. u32 folio_count = 0;
  42. if (ioend->io_error) {
  43. mapping_set_error(inode->i_mapping, ioend->io_error);
  44. if (!bio_flagged(bio, BIO_QUIET)) {
  45. pr_err_ratelimited(
  46. "%s: writeback error on inode %lu, offset %lld, sector %llu",
  47. inode->i_sb->s_id, inode->i_ino,
  48. ioend->io_offset, ioend->io_sector);
  49. }
  50. }
  51. /* walk all folios in bio, ending page IO on them */
  52. bio_for_each_folio_all(fi, bio) {
  53. if (ioend->io_error)
  54. fserror_report_io(inode, FSERR_BUFFERED_WRITE,
  55. folio_pos(fi.folio) + fi.offset,
  56. fi.length, ioend->io_error,
  57. GFP_ATOMIC);
  58. iomap_finish_folio_write(inode, fi.folio, fi.length);
  59. folio_count++;
  60. }
  61. bio_put(bio); /* frees the ioend */
  62. return folio_count;
  63. }
/*
 * Buffered ioends that completed with an error are queued on
 * failed_ioend_list (linked through io_list) and finished later from
 * iomap_fail_ioends().  failed_ioend_lock protects the list and is always
 * taken with interrupts disabled.
 */
static DEFINE_SPINLOCK(failed_ioend_lock);
static LIST_HEAD(failed_ioend_list);
  66. static void
  67. iomap_fail_ioends(
  68. struct work_struct *work)
  69. {
  70. struct iomap_ioend *ioend;
  71. struct list_head tmp;
  72. unsigned long flags;
  73. spin_lock_irqsave(&failed_ioend_lock, flags);
  74. list_replace_init(&failed_ioend_list, &tmp);
  75. spin_unlock_irqrestore(&failed_ioend_lock, flags);
  76. while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend,
  77. io_list))) {
  78. list_del_init(&ioend->io_list);
  79. iomap_finish_ioend_buffered(ioend);
  80. cond_resched();
  81. }
  82. }
  83. static DECLARE_WORK(failed_ioend_work, iomap_fail_ioends);
  84. static void iomap_fail_ioend_buffered(struct iomap_ioend *ioend)
  85. {
  86. unsigned long flags;
  87. /*
  88. * Bounce I/O errors to a workqueue to avoid nested i_lock acquisitions
  89. * in the fserror code. The caller no longer owns the ioend reference
  90. * after the spinlock drops.
  91. */
  92. spin_lock_irqsave(&failed_ioend_lock, flags);
  93. if (list_empty(&failed_ioend_list))
  94. WARN_ON_ONCE(!schedule_work(&failed_ioend_work));
  95. list_add_tail(&ioend->io_list, &failed_ioend_list);
  96. spin_unlock_irqrestore(&failed_ioend_lock, flags);
  97. }
  98. static void ioend_writeback_end_bio(struct bio *bio)
  99. {
  100. struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
  101. ioend->io_error = blk_status_to_errno(bio->bi_status);
  102. if (ioend->io_error) {
  103. iomap_fail_ioend_buffered(ioend);
  104. return;
  105. }
  106. iomap_finish_ioend_buffered(ioend);
  107. }
  108. /*
  109. * We cannot cancel the ioend directly in case of an error, so call the bio end
  110. * I/O handler with the error status here to run the normal I/O completion
  111. * handler.
  112. */
  113. int iomap_ioend_writeback_submit(struct iomap_writepage_ctx *wpc, int error)
  114. {
  115. struct iomap_ioend *ioend = wpc->wb_ctx;
  116. if (!ioend->io_bio.bi_end_io)
  117. ioend->io_bio.bi_end_io = ioend_writeback_end_bio;
  118. if (WARN_ON_ONCE(wpc->iomap.flags & IOMAP_F_ANON_WRITE))
  119. error = -EIO;
  120. if (error) {
  121. ioend->io_bio.bi_status = errno_to_blk_status(error);
  122. bio_endio(&ioend->io_bio);
  123. return error;
  124. }
  125. submit_bio(&ioend->io_bio);
  126. return 0;
  127. }
  128. EXPORT_SYMBOL_GPL(iomap_ioend_writeback_submit);
  129. static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
  130. loff_t pos, u16 ioend_flags)
  131. {
  132. struct bio *bio;
  133. bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS,
  134. REQ_OP_WRITE | wbc_to_write_flags(wpc->wbc),
  135. GFP_NOFS, &iomap_ioend_bioset);
  136. bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos);
  137. bio->bi_write_hint = wpc->inode->i_write_hint;
  138. wbc_init_bio(wpc->wbc, bio);
  139. wpc->nr_folios = 0;
  140. return iomap_init_ioend(wpc->inode, bio, pos, ioend_flags);
  141. }
  142. static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos,
  143. u16 ioend_flags)
  144. {
  145. struct iomap_ioend *ioend = wpc->wb_ctx;
  146. if (ioend_flags & IOMAP_IOEND_BOUNDARY)
  147. return false;
  148. if ((ioend_flags & IOMAP_IOEND_NOMERGE_FLAGS) !=
  149. (ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS))
  150. return false;
  151. if (pos != ioend->io_offset + ioend->io_size)
  152. return false;
  153. if (!(wpc->iomap.flags & IOMAP_F_ANON_WRITE) &&
  154. iomap_sector(&wpc->iomap, pos) != bio_end_sector(&ioend->io_bio))
  155. return false;
  156. /*
  157. * Limit ioend bio chain lengths to minimise IO completion latency. This
  158. * also prevents long tight loops ending page writeback on all the
  159. * folios in the ioend.
  160. */
  161. if (wpc->nr_folios >= IOEND_BATCH_SIZE)
  162. return false;
  163. return true;
  164. }
  165. /*
  166. * Test to see if we have an existing ioend structure that we could append to
  167. * first; otherwise finish off the current ioend and start another.
  168. *
  169. * If a new ioend is created and cached, the old ioend is submitted to the block
  170. * layer instantly. Batching optimisations are provided by higher level block
  171. * plugging.
  172. *
  173. * At the end of a writeback pass, there will be a cached ioend remaining on the
  174. * writepage context that the caller will need to submit.
  175. */
  176. ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio,
  177. loff_t pos, loff_t end_pos, unsigned int dirty_len)
  178. {
  179. struct iomap_ioend *ioend = wpc->wb_ctx;
  180. size_t poff = offset_in_folio(folio, pos);
  181. unsigned int ioend_flags = 0;
  182. unsigned int map_len = min_t(u64, dirty_len,
  183. wpc->iomap.offset + wpc->iomap.length - pos);
  184. int error;
  185. trace_iomap_add_to_ioend(wpc->inode, pos, dirty_len, &wpc->iomap);
  186. WARN_ON_ONCE(!folio->private && map_len < dirty_len);
  187. switch (wpc->iomap.type) {
  188. case IOMAP_UNWRITTEN:
  189. ioend_flags |= IOMAP_IOEND_UNWRITTEN;
  190. break;
  191. case IOMAP_MAPPED:
  192. break;
  193. case IOMAP_HOLE:
  194. return map_len;
  195. default:
  196. WARN_ON_ONCE(1);
  197. return -EIO;
  198. }
  199. if (wpc->iomap.flags & IOMAP_F_SHARED)
  200. ioend_flags |= IOMAP_IOEND_SHARED;
  201. if (folio_test_dropbehind(folio))
  202. ioend_flags |= IOMAP_IOEND_DONTCACHE;
  203. if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY))
  204. ioend_flags |= IOMAP_IOEND_BOUNDARY;
  205. if (!ioend || !iomap_can_add_to_ioend(wpc, pos, ioend_flags)) {
  206. new_ioend:
  207. if (ioend) {
  208. error = wpc->ops->writeback_submit(wpc, 0);
  209. if (error)
  210. return error;
  211. }
  212. wpc->wb_ctx = ioend = iomap_alloc_ioend(wpc, pos, ioend_flags);
  213. }
  214. if (!bio_add_folio(&ioend->io_bio, folio, map_len, poff))
  215. goto new_ioend;
  216. /*
  217. * Clamp io_offset and io_size to the incore EOF so that ondisk
  218. * file size updates in the ioend completion are byte-accurate.
  219. * This avoids recovering files with zeroed tail regions when
  220. * writeback races with appending writes:
  221. *
  222. * Thread 1: Thread 2:
  223. * ------------ -----------
  224. * write [A, A+B]
  225. * update inode size to A+B
  226. * submit I/O [A, A+BS]
  227. * write [A+B, A+B+C]
  228. * update inode size to A+B+C
  229. * <I/O completes, updates disk size to min(A+B+C, A+BS)>
  230. * <power failure>
  231. *
  232. * After reboot:
  233. * 1) with A+B+C < A+BS, the file has zero padding in range
  234. * [A+B, A+B+C]
  235. *
  236. * |< Block Size (BS) >|
  237. * |DDDDDDDDDDDD0000000000000|
  238. * ^ ^ ^
  239. * A A+B A+B+C
  240. * (EOF)
  241. *
  242. * 2) with A+B+C > A+BS, the file has zero padding in range
  243. * [A+B, A+BS]
  244. *
  245. * |< Block Size (BS) >|< Block Size (BS) >|
  246. * |DDDDDDDDDDDD0000000000000|00000000000000000000000000|
  247. * ^ ^ ^ ^
  248. * A A+B A+BS A+B+C
  249. * (EOF)
  250. *
  251. * D = Valid Data
  252. * 0 = Zero Padding
  253. *
  254. * Note that this defeats the ability to chain the ioends of
  255. * appending writes.
  256. */
  257. ioend->io_size += map_len;
  258. if (ioend->io_offset + ioend->io_size > end_pos)
  259. ioend->io_size = end_pos - ioend->io_offset;
  260. wbc_account_cgroup_owner(wpc->wbc, folio, map_len);
  261. return map_len;
  262. }
  263. EXPORT_SYMBOL_GPL(iomap_add_to_ioend);
  264. static u32 iomap_finish_ioend(struct iomap_ioend *ioend, int error)
  265. {
  266. if (ioend->io_parent) {
  267. struct bio *bio = &ioend->io_bio;
  268. ioend = ioend->io_parent;
  269. bio_put(bio);
  270. }
  271. if (error)
  272. cmpxchg(&ioend->io_error, 0, error);
  273. if (!atomic_dec_and_test(&ioend->io_remaining))
  274. return 0;
  275. if (ioend->io_flags & IOMAP_IOEND_DIRECT)
  276. return iomap_finish_ioend_direct(ioend);
  277. return iomap_finish_ioend_buffered(ioend);
  278. }
  279. /*
  280. * Ioend completion routine for merged bios. This can only be called from task
  281. * contexts as merged ioends can be of unbound length. Hence we have to break up
  282. * the writeback completions into manageable chunks to avoid long scheduler
  283. * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get
  284. * good batch processing throughput without creating adverse scheduler latency
  285. * conditions.
  286. */
  287. void iomap_finish_ioends(struct iomap_ioend *ioend, int error)
  288. {
  289. struct list_head tmp;
  290. u32 completions;
  291. might_sleep();
  292. list_replace_init(&ioend->io_list, &tmp);
  293. completions = iomap_finish_ioend(ioend, error);
  294. while (!list_empty(&tmp)) {
  295. if (completions > IOEND_BATCH_SIZE * 8) {
  296. cond_resched();
  297. completions = 0;
  298. }
  299. ioend = list_first_entry(&tmp, struct iomap_ioend, io_list);
  300. list_del_init(&ioend->io_list);
  301. completions += iomap_finish_ioend(ioend, error);
  302. }
  303. }
  304. EXPORT_SYMBOL_GPL(iomap_finish_ioends);
  305. /*
  306. * We can merge two adjacent ioends if they have the same set of work to do.
  307. */
  308. static bool iomap_ioend_can_merge(struct iomap_ioend *ioend,
  309. struct iomap_ioend *next)
  310. {
  311. /*
  312. * There is no point in merging reads as there is no completion
  313. * processing that can be easily batched up for them.
  314. */
  315. if (bio_op(&ioend->io_bio) == REQ_OP_READ ||
  316. bio_op(&next->io_bio) == REQ_OP_READ)
  317. return false;
  318. if (ioend->io_bio.bi_status != next->io_bio.bi_status)
  319. return false;
  320. if (next->io_flags & IOMAP_IOEND_BOUNDARY)
  321. return false;
  322. if ((ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS) !=
  323. (next->io_flags & IOMAP_IOEND_NOMERGE_FLAGS))
  324. return false;
  325. if (ioend->io_offset + ioend->io_size != next->io_offset)
  326. return false;
  327. /*
  328. * Do not merge physically discontiguous ioends. The filesystem
  329. * completion functions will have to iterate the physical
  330. * discontiguities even if we merge the ioends at a logical level, so
  331. * we don't gain anything by merging physical discontiguities here.
  332. *
  333. * We cannot use bio->bi_iter.bi_sector here as it is modified during
  334. * submission so does not point to the start sector of the bio at
  335. * completion.
  336. */
  337. if (ioend->io_sector + (ioend->io_size >> SECTOR_SHIFT) !=
  338. next->io_sector)
  339. return false;
  340. return true;
  341. }
  342. void iomap_ioend_try_merge(struct iomap_ioend *ioend,
  343. struct list_head *more_ioends)
  344. {
  345. struct iomap_ioend *next;
  346. INIT_LIST_HEAD(&ioend->io_list);
  347. while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend,
  348. io_list))) {
  349. if (!iomap_ioend_can_merge(ioend, next))
  350. break;
  351. list_move_tail(&next->io_list, &ioend->io_list);
  352. ioend->io_size += next->io_size;
  353. }
  354. }
  355. EXPORT_SYMBOL_GPL(iomap_ioend_try_merge);
  356. static int iomap_ioend_compare(void *priv, const struct list_head *a,
  357. const struct list_head *b)
  358. {
  359. struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list);
  360. struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list);
  361. if (ia->io_offset < ib->io_offset)
  362. return -1;
  363. if (ia->io_offset > ib->io_offset)
  364. return 1;
  365. return 0;
  366. }
/*
 * Sort @ioend_list in place with list_sort(), using iomap_ioend_compare() as
 * the comparison callback (ordering by io_offset).
 */
void iomap_sort_ioends(struct list_head *ioend_list)
{
	list_sort(NULL, ioend_list, iomap_ioend_compare);
}
EXPORT_SYMBOL_GPL(iomap_sort_ioends);
  372. /*
  373. * Split up to the first @max_len bytes from @ioend if the ioend covers more
  374. * than @max_len bytes.
  375. *
  376. * If @is_append is set, the split will be based on the hardware limits for
  377. * REQ_OP_ZONE_APPEND commands and can be less than @max_len if the hardware
  378. * limits don't allow the entire @max_len length.
  379. *
  380. * The bio embedded into @ioend must be a REQ_OP_WRITE because the block layer
  381. * does not allow splitting REQ_OP_ZONE_APPEND bios. The file systems has to
  382. * switch the operation after this call, but before submitting the bio.
  383. */
  384. struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend,
  385. unsigned int max_len, bool is_append)
  386. {
  387. struct bio *bio = &ioend->io_bio;
  388. struct iomap_ioend *split_ioend;
  389. unsigned int nr_segs;
  390. int sector_offset;
  391. struct bio *split;
  392. if (is_append) {
  393. struct queue_limits *lim = bdev_limits(bio->bi_bdev);
  394. max_len = min(max_len,
  395. lim->max_zone_append_sectors << SECTOR_SHIFT);
  396. sector_offset = bio_split_rw_at(bio, lim, &nr_segs, max_len);
  397. if (unlikely(sector_offset < 0))
  398. return ERR_PTR(sector_offset);
  399. if (!sector_offset)
  400. return NULL;
  401. } else {
  402. if (bio->bi_iter.bi_size <= max_len)
  403. return NULL;
  404. sector_offset = max_len >> SECTOR_SHIFT;
  405. }
  406. /* ensure the split ioend is still block size aligned */
  407. sector_offset = ALIGN_DOWN(sector_offset << SECTOR_SHIFT,
  408. i_blocksize(ioend->io_inode)) >> SECTOR_SHIFT;
  409. split = bio_split(bio, sector_offset, GFP_NOFS, &iomap_ioend_bioset);
  410. if (IS_ERR(split))
  411. return ERR_CAST(split);
  412. split->bi_private = bio->bi_private;
  413. split->bi_end_io = bio->bi_end_io;
  414. split_ioend = iomap_init_ioend(ioend->io_inode, split, ioend->io_offset,
  415. ioend->io_flags);
  416. split_ioend->io_parent = ioend;
  417. atomic_inc(&ioend->io_remaining);
  418. ioend->io_offset += split_ioend->io_size;
  419. ioend->io_size -= split_ioend->io_size;
  420. split_ioend->io_sector = ioend->io_sector;
  421. if (!is_append)
  422. ioend->io_sector += (split_ioend->io_size >> SECTOR_SHIFT);
  423. return split_ioend;
  424. }
  425. EXPORT_SYMBOL_GPL(iomap_split_ioend);
  426. static int __init iomap_ioend_init(void)
  427. {
  428. return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
  429. offsetof(struct iomap_ioend, io_bio),
  430. BIOSET_NEED_BVECS);
  431. }
  432. fs_initcall(iomap_ioend_init);