fops.c 24 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * Copyright (C) 1991, 1992 Linus Torvalds
  4. * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
  5. * Copyright (C) 2016 - 2020 Christoph Hellwig
  6. */
  7. #include <linux/init.h>
  8. #include <linux/mm.h>
  9. #include <linux/blkdev.h>
  10. #include <linux/blk-integrity.h>
  11. #include <linux/buffer_head.h>
  12. #include <linux/mpage.h>
  13. #include <linux/uio.h>
  14. #include <linux/namei.h>
  15. #include <linux/task_io_accounting_ops.h>
  16. #include <linux/falloc.h>
  17. #include <linux/suspend.h>
  18. #include <linux/fs.h>
  19. #include <linux/iomap.h>
  20. #include <linux/module.h>
  21. #include <linux/io_uring/cmd.h>
  22. #include "blk.h"
  23. static inline struct inode *bdev_file_inode(struct file *file)
  24. {
  25. return file->f_mapping->host;
  26. }
  27. static blk_opf_t dio_bio_write_op(struct kiocb *iocb)
  28. {
  29. blk_opf_t opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
  30. /* avoid the need for a I/O completion work item */
  31. if (iocb_is_dsync(iocb))
  32. opf |= REQ_FUA;
  33. return opf;
  34. }
  35. static bool blkdev_dio_invalid(struct block_device *bdev, struct kiocb *iocb,
  36. struct iov_iter *iter)
  37. {
  38. return (iocb->ki_pos | iov_iter_count(iter)) &
  39. (bdev_logical_block_size(bdev) - 1);
  40. }
  /*
   * Thin wrapper around bio_iov_iter_get_pages() that passes the device's
   * logical-block-size mask (block size minus one) as the third argument.
   */
  static inline int blkdev_iov_iter_get_pages(struct bio *bio,
  		struct iov_iter *iter, struct block_device *bdev)
  {
  	unsigned int lbs_mask = bdev_logical_block_size(bdev) - 1;

  	return bio_iov_iter_get_pages(bio, iter, lbs_mask);
  }
  47. #define DIO_INLINE_BIO_VECS 4
  48. static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
  49. struct iov_iter *iter, struct block_device *bdev,
  50. unsigned int nr_pages)
  51. {
  52. struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs;
  53. loff_t pos = iocb->ki_pos;
  54. bool should_dirty = false;
  55. struct bio bio;
  56. ssize_t ret;
  57. if (nr_pages <= DIO_INLINE_BIO_VECS)
  58. vecs = inline_vecs;
  59. else {
  60. vecs = kmalloc_objs(struct bio_vec, nr_pages);
  61. if (!vecs)
  62. return -ENOMEM;
  63. }
  64. if (iov_iter_rw(iter) == READ) {
  65. bio_init(&bio, bdev, vecs, nr_pages, REQ_OP_READ);
  66. if (user_backed_iter(iter))
  67. should_dirty = true;
  68. } else {
  69. bio_init(&bio, bdev, vecs, nr_pages, dio_bio_write_op(iocb));
  70. }
  71. bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT;
  72. bio.bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
  73. bio.bi_write_stream = iocb->ki_write_stream;
  74. bio.bi_ioprio = iocb->ki_ioprio;
  75. if (iocb->ki_flags & IOCB_ATOMIC)
  76. bio.bi_opf |= REQ_ATOMIC;
  77. ret = blkdev_iov_iter_get_pages(&bio, iter, bdev);
  78. if (unlikely(ret))
  79. goto out;
  80. ret = bio.bi_iter.bi_size;
  81. if (iov_iter_rw(iter) == WRITE)
  82. task_io_account_write(ret);
  83. if (iocb->ki_flags & IOCB_NOWAIT)
  84. bio.bi_opf |= REQ_NOWAIT;
  85. submit_bio_wait(&bio);
  86. bio_release_pages(&bio, should_dirty);
  87. if (unlikely(bio.bi_status))
  88. ret = blk_status_to_errno(bio.bi_status);
  89. out:
  90. if (vecs != inline_vecs)
  91. kfree(vecs);
  92. bio_uninit(&bio);
  93. return ret;
  94. }
  /* Bits kept in blkdev_dio::flags. */
  enum {
  	/* set for reads into a user-backed iterator */
  	DIO_SHOULD_DIRTY	= 1 << 0,
  	/* set when the request is a synchronous kiocb */
  	DIO_IS_SYNC		= 1 << 1,
  };
  99. struct blkdev_dio {
  100. union {
  101. struct kiocb *iocb;
  102. struct task_struct *waiter;
  103. };
  104. size_t size;
  105. atomic_t ref;
  106. unsigned int flags;
  107. struct bio bio ____cacheline_aligned_in_smp;
  108. };
  109. static struct bio_set blkdev_dio_pool;
  110. static void blkdev_bio_end_io(struct bio *bio)
  111. {
  112. struct blkdev_dio *dio = bio->bi_private;
  113. bool should_dirty = dio->flags & DIO_SHOULD_DIRTY;
  114. bool is_sync = dio->flags & DIO_IS_SYNC;
  115. if (bio->bi_status && !dio->bio.bi_status)
  116. dio->bio.bi_status = bio->bi_status;
  117. if (bio_integrity(bio))
  118. bio_integrity_unmap_user(bio);
  119. if (atomic_dec_and_test(&dio->ref)) {
  120. if (!is_sync) {
  121. struct kiocb *iocb = dio->iocb;
  122. ssize_t ret;
  123. WRITE_ONCE(iocb->private, NULL);
  124. if (likely(!dio->bio.bi_status)) {
  125. ret = dio->size;
  126. iocb->ki_pos += ret;
  127. } else {
  128. ret = blk_status_to_errno(dio->bio.bi_status);
  129. }
  130. dio->iocb->ki_complete(iocb, ret);
  131. bio_put(&dio->bio);
  132. } else {
  133. struct task_struct *waiter = dio->waiter;
  134. WRITE_ONCE(dio->waiter, NULL);
  135. blk_wake_io_task(waiter);
  136. }
  137. }
  138. if (should_dirty) {
  139. bio_check_pages_dirty(bio);
  140. } else {
  141. bio_release_pages(bio, false);
  142. bio_put(bio);
  143. }
  144. }
  145. static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
  146. struct block_device *bdev, unsigned int nr_pages)
  147. {
  148. struct blk_plug plug;
  149. struct blkdev_dio *dio;
  150. struct bio *bio;
  151. bool is_read = (iov_iter_rw(iter) == READ), is_sync;
  152. blk_opf_t opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb);
  153. loff_t pos = iocb->ki_pos;
  154. int ret = 0;
  155. bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL,
  156. &blkdev_dio_pool);
  157. dio = container_of(bio, struct blkdev_dio, bio);
  158. atomic_set(&dio->ref, 1);
  159. /*
  160. * Grab an extra reference to ensure the dio structure which is embedded
  161. * into the first bio stays around.
  162. */
  163. bio_get(bio);
  164. is_sync = is_sync_kiocb(iocb);
  165. if (is_sync) {
  166. dio->flags = DIO_IS_SYNC;
  167. dio->waiter = current;
  168. } else {
  169. dio->flags = 0;
  170. dio->iocb = iocb;
  171. }
  172. dio->size = 0;
  173. if (is_read && user_backed_iter(iter))
  174. dio->flags |= DIO_SHOULD_DIRTY;
  175. blk_start_plug(&plug);
  176. for (;;) {
  177. bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
  178. bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
  179. bio->bi_write_stream = iocb->ki_write_stream;
  180. bio->bi_private = dio;
  181. bio->bi_end_io = blkdev_bio_end_io;
  182. bio->bi_ioprio = iocb->ki_ioprio;
  183. ret = blkdev_iov_iter_get_pages(bio, iter, bdev);
  184. if (unlikely(ret)) {
  185. bio->bi_status = BLK_STS_IOERR;
  186. bio_endio(bio);
  187. break;
  188. }
  189. if (iocb->ki_flags & IOCB_NOWAIT) {
  190. /*
  191. * This is nonblocking IO, and we need to allocate
  192. * another bio if we have data left to map. As we
  193. * cannot guarantee that one of the sub bios will not
  194. * fail getting issued FOR NOWAIT and as error results
  195. * are coalesced across all of them, be safe and ask for
  196. * a retry of this from blocking context.
  197. */
  198. if (unlikely(iov_iter_count(iter))) {
  199. ret = -EAGAIN;
  200. goto fail;
  201. }
  202. bio->bi_opf |= REQ_NOWAIT;
  203. }
  204. if (iocb->ki_flags & IOCB_HAS_METADATA) {
  205. ret = bio_integrity_map_iter(bio, iocb->private);
  206. if (unlikely(ret))
  207. goto fail;
  208. }
  209. if (is_read) {
  210. if (dio->flags & DIO_SHOULD_DIRTY)
  211. bio_set_pages_dirty(bio);
  212. } else {
  213. task_io_account_write(bio->bi_iter.bi_size);
  214. }
  215. dio->size += bio->bi_iter.bi_size;
  216. pos += bio->bi_iter.bi_size;
  217. nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS);
  218. if (!nr_pages) {
  219. submit_bio(bio);
  220. break;
  221. }
  222. atomic_inc(&dio->ref);
  223. submit_bio(bio);
  224. bio = bio_alloc(bdev, nr_pages, opf, GFP_KERNEL);
  225. }
  226. blk_finish_plug(&plug);
  227. if (!is_sync)
  228. return -EIOCBQUEUED;
  229. for (;;) {
  230. set_current_state(TASK_UNINTERRUPTIBLE);
  231. if (!READ_ONCE(dio->waiter))
  232. break;
  233. blk_io_schedule();
  234. }
  235. __set_current_state(TASK_RUNNING);
  236. if (!ret)
  237. ret = blk_status_to_errno(dio->bio.bi_status);
  238. if (likely(!ret))
  239. ret = dio->size;
  240. bio_put(&dio->bio);
  241. return ret;
  242. fail:
  243. bio_release_pages(bio, false);
  244. bio_clear_flag(bio, BIO_REFFED);
  245. bio_put(bio);
  246. blk_finish_plug(&plug);
  247. return ret;
  248. }
  249. static void blkdev_bio_end_io_async(struct bio *bio)
  250. {
  251. struct blkdev_dio *dio = container_of(bio, struct blkdev_dio, bio);
  252. struct kiocb *iocb = dio->iocb;
  253. ssize_t ret;
  254. WRITE_ONCE(iocb->private, NULL);
  255. if (likely(!bio->bi_status)) {
  256. ret = dio->size;
  257. iocb->ki_pos += ret;
  258. } else {
  259. ret = blk_status_to_errno(bio->bi_status);
  260. }
  261. if (bio_integrity(bio))
  262. bio_integrity_unmap_user(bio);
  263. iocb->ki_complete(iocb, ret);
  264. if (dio->flags & DIO_SHOULD_DIRTY) {
  265. bio_check_pages_dirty(bio);
  266. } else {
  267. bio_release_pages(bio, false);
  268. bio_put(bio);
  269. }
  270. }
  271. static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
  272. struct iov_iter *iter,
  273. struct block_device *bdev,
  274. unsigned int nr_pages)
  275. {
  276. bool is_read = iov_iter_rw(iter) == READ;
  277. blk_opf_t opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb);
  278. struct blkdev_dio *dio;
  279. struct bio *bio;
  280. loff_t pos = iocb->ki_pos;
  281. int ret = 0;
  282. bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL,
  283. &blkdev_dio_pool);
  284. dio = container_of(bio, struct blkdev_dio, bio);
  285. dio->flags = 0;
  286. dio->iocb = iocb;
  287. bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
  288. bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
  289. bio->bi_write_stream = iocb->ki_write_stream;
  290. bio->bi_end_io = blkdev_bio_end_io_async;
  291. bio->bi_ioprio = iocb->ki_ioprio;
  292. if (iov_iter_is_bvec(iter)) {
  293. /*
  294. * Users don't rely on the iterator being in any particular
  295. * state for async I/O returning -EIOCBQUEUED, hence we can
  296. * avoid expensive iov_iter_advance(). Bypass
  297. * bio_iov_iter_get_pages() and set the bvec directly.
  298. */
  299. bio_iov_bvec_set(bio, iter);
  300. } else {
  301. ret = blkdev_iov_iter_get_pages(bio, iter, bdev);
  302. if (unlikely(ret))
  303. goto out_bio_put;
  304. }
  305. dio->size = bio->bi_iter.bi_size;
  306. if (is_read) {
  307. if (user_backed_iter(iter)) {
  308. dio->flags |= DIO_SHOULD_DIRTY;
  309. bio_set_pages_dirty(bio);
  310. }
  311. } else {
  312. task_io_account_write(bio->bi_iter.bi_size);
  313. }
  314. if (iocb->ki_flags & IOCB_HAS_METADATA) {
  315. ret = bio_integrity_map_iter(bio, iocb->private);
  316. WRITE_ONCE(iocb->private, NULL);
  317. if (unlikely(ret))
  318. goto out_bio_put;
  319. }
  320. if (iocb->ki_flags & IOCB_ATOMIC)
  321. bio->bi_opf |= REQ_ATOMIC;
  322. if (iocb->ki_flags & IOCB_NOWAIT)
  323. bio->bi_opf |= REQ_NOWAIT;
  324. if (iocb->ki_flags & IOCB_HIPRI) {
  325. bio->bi_opf |= REQ_POLLED;
  326. submit_bio(bio);
  327. WRITE_ONCE(iocb->private, bio);
  328. } else {
  329. submit_bio(bio);
  330. }
  331. return -EIOCBQUEUED;
  332. out_bio_put:
  333. bio_put(bio);
  334. return ret;
  335. }
  336. static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
  337. {
  338. struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
  339. unsigned int nr_pages;
  340. if (!iov_iter_count(iter))
  341. return 0;
  342. if (blkdev_dio_invalid(bdev, iocb, iter))
  343. return -EINVAL;
  344. if (iov_iter_rw(iter) == WRITE) {
  345. u16 max_write_streams = bdev_max_write_streams(bdev);
  346. if (iocb->ki_write_stream) {
  347. if (iocb->ki_write_stream > max_write_streams)
  348. return -EINVAL;
  349. } else if (max_write_streams) {
  350. enum rw_hint write_hint =
  351. file_inode(iocb->ki_filp)->i_write_hint;
  352. /*
  353. * Just use the write hint as write stream for block
  354. * device writes. This assumes no file system is
  355. * mounted that would use the streams differently.
  356. */
  357. if (write_hint <= max_write_streams)
  358. iocb->ki_write_stream = write_hint;
  359. }
  360. }
  361. nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
  362. if (likely(nr_pages <= BIO_MAX_VECS &&
  363. !(iocb->ki_flags & IOCB_HAS_METADATA))) {
  364. if (is_sync_kiocb(iocb))
  365. return __blkdev_direct_IO_simple(iocb, iter, bdev,
  366. nr_pages);
  367. return __blkdev_direct_IO_async(iocb, iter, bdev, nr_pages);
  368. } else if (iocb->ki_flags & IOCB_ATOMIC) {
  369. return -EINVAL;
  370. }
  371. return __blkdev_direct_IO(iocb, iter, bdev, bio_max_segs(nr_pages));
  372. }
  373. static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
  374. unsigned int flags, struct iomap *iomap, struct iomap *srcmap)
  375. {
  376. struct block_device *bdev = I_BDEV(inode);
  377. loff_t isize = i_size_read(inode);
  378. if (offset >= isize)
  379. return -EIO;
  380. iomap->bdev = bdev;
  381. iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev));
  382. iomap->type = IOMAP_MAPPED;
  383. iomap->addr = iomap->offset;
  384. iomap->length = isize - iomap->offset;
  385. iomap->flags |= IOMAP_F_BUFFER_HEAD; /* noop for !CONFIG_BUFFER_HEAD */
  386. return 0;
  387. }
  388. static const struct iomap_ops blkdev_iomap_ops = {
  389. .iomap_begin = blkdev_iomap_begin,
  390. };
  391. #ifdef CONFIG_BUFFER_HEAD
  392. static int blkdev_get_block(struct inode *inode, sector_t iblock,
  393. struct buffer_head *bh, int create)
  394. {
  395. bh->b_bdev = I_BDEV(inode);
  396. bh->b_blocknr = iblock;
  397. set_buffer_mapped(bh);
  398. return 0;
  399. }
  400. /*
  401. * We cannot call mpage_writepages() as it does not take the buffer lock.
  402. * We must use block_write_full_folio() directly which holds the buffer
  403. * lock. The buffer lock provides the synchronisation with writeback
  404. * that filesystems rely on when they use the blockdev's mapping.
  405. */
  406. static int blkdev_writepages(struct address_space *mapping,
  407. struct writeback_control *wbc)
  408. {
  409. struct folio *folio = NULL;
  410. struct blk_plug plug;
  411. int err;
  412. blk_start_plug(&plug);
  413. while ((folio = writeback_iter(mapping, wbc, folio, &err)))
  414. err = block_write_full_folio(folio, wbc, blkdev_get_block);
  415. blk_finish_plug(&plug);
  416. return err;
  417. }
  418. static int blkdev_read_folio(struct file *file, struct folio *folio)
  419. {
  420. return block_read_full_folio(folio, blkdev_get_block);
  421. }
  422. static void blkdev_readahead(struct readahead_control *rac)
  423. {
  424. mpage_readahead(rac, blkdev_get_block);
  425. }
  426. static int blkdev_write_begin(const struct kiocb *iocb,
  427. struct address_space *mapping, loff_t pos,
  428. unsigned len, struct folio **foliop,
  429. void **fsdata)
  430. {
  431. return block_write_begin(mapping, pos, len, foliop, blkdev_get_block);
  432. }
  433. static int blkdev_write_end(const struct kiocb *iocb,
  434. struct address_space *mapping,
  435. loff_t pos, unsigned len, unsigned copied,
  436. struct folio *folio, void *fsdata)
  437. {
  438. int ret;
  439. ret = block_write_end(pos, len, copied, folio);
  440. folio_unlock(folio);
  441. folio_put(folio);
  442. return ret;
  443. }
  444. const struct address_space_operations def_blk_aops = {
  445. .dirty_folio = block_dirty_folio,
  446. .invalidate_folio = block_invalidate_folio,
  447. .read_folio = blkdev_read_folio,
  448. .readahead = blkdev_readahead,
  449. .writepages = blkdev_writepages,
  450. .write_begin = blkdev_write_begin,
  451. .write_end = blkdev_write_end,
  452. .migrate_folio = buffer_migrate_folio_norefs,
  453. .is_dirty_writeback = buffer_check_dirty_writeback,
  454. };
  455. #else /* CONFIG_BUFFER_HEAD */
  456. static int blkdev_read_folio(struct file *file, struct folio *folio)
  457. {
  458. iomap_bio_read_folio(folio, &blkdev_iomap_ops);
  459. return 0;
  460. }
  461. static void blkdev_readahead(struct readahead_control *rac)
  462. {
  463. iomap_bio_readahead(rac, &blkdev_iomap_ops);
  464. }
  465. static ssize_t blkdev_writeback_range(struct iomap_writepage_ctx *wpc,
  466. struct folio *folio, u64 offset, unsigned int len, u64 end_pos)
  467. {
  468. loff_t isize = i_size_read(wpc->inode);
  469. if (WARN_ON_ONCE(offset >= isize))
  470. return -EIO;
  471. if (offset < wpc->iomap.offset ||
  472. offset >= wpc->iomap.offset + wpc->iomap.length) {
  473. int error;
  474. error = blkdev_iomap_begin(wpc->inode, offset, isize - offset,
  475. IOMAP_WRITE, &wpc->iomap, NULL);
  476. if (error)
  477. return error;
  478. }
  479. return iomap_add_to_ioend(wpc, folio, offset, end_pos, len);
  480. }
  481. static const struct iomap_writeback_ops blkdev_writeback_ops = {
  482. .writeback_range = blkdev_writeback_range,
  483. .writeback_submit = iomap_ioend_writeback_submit,
  484. };
  485. static int blkdev_writepages(struct address_space *mapping,
  486. struct writeback_control *wbc)
  487. {
  488. struct iomap_writepage_ctx wpc = {
  489. .inode = mapping->host,
  490. .wbc = wbc,
  491. .ops = &blkdev_writeback_ops
  492. };
  493. return iomap_writepages(&wpc);
  494. }
  495. const struct address_space_operations def_blk_aops = {
  496. .dirty_folio = filemap_dirty_folio,
  497. .release_folio = iomap_release_folio,
  498. .invalidate_folio = iomap_invalidate_folio,
  499. .read_folio = blkdev_read_folio,
  500. .readahead = blkdev_readahead,
  501. .writepages = blkdev_writepages,
  502. .is_partially_uptodate = iomap_is_partially_uptodate,
  503. .error_remove_folio = generic_error_remove_folio,
  504. .migrate_folio = filemap_migrate_folio,
  505. };
  506. #endif /* CONFIG_BUFFER_HEAD */
  507. /*
  508. * for a block special file file_inode(file)->i_size is zero
  509. * so we compute the size by hand (just as in block_read/write above)
  510. */
  511. static loff_t blkdev_llseek(struct file *file, loff_t offset, int whence)
  512. {
  513. struct inode *bd_inode = bdev_file_inode(file);
  514. loff_t retval;
  515. inode_lock(bd_inode);
  516. retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode));
  517. inode_unlock(bd_inode);
  518. return retval;
  519. }
  520. static int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
  521. int datasync)
  522. {
  523. struct block_device *bdev = I_BDEV(filp->f_mapping->host);
  524. int error;
  525. error = file_write_and_wait_range(filp, start, end);
  526. if (error)
  527. return error;
  528. /*
  529. * There is no need to serialise calls to blkdev_issue_flush with
  530. * i_mutex and doing so causes performance issues with concurrent
  531. * O_SYNC writers to a block device.
  532. */
  533. error = blkdev_issue_flush(bdev);
  534. if (error == -EOPNOTSUPP)
  535. error = 0;
  536. return error;
  537. }
  538. /**
  539. * file_to_blk_mode - get block open flags from file flags
  540. * @file: file whose open flags should be converted
  541. *
  542. * Look at file open flags and generate corresponding block open flags from
  543. * them. The function works both for file just being open (e.g. during ->open
  544. * callback) and for file that is already open. This is actually non-trivial
  545. * (see comment in the function).
  546. */
  547. blk_mode_t file_to_blk_mode(struct file *file)
  548. {
  549. blk_mode_t mode = 0;
  550. if (file->f_mode & FMODE_READ)
  551. mode |= BLK_OPEN_READ;
  552. if (file->f_mode & FMODE_WRITE)
  553. mode |= BLK_OPEN_WRITE;
  554. /*
  555. * do_dentry_open() clears O_EXCL from f_flags, use file->private_data
  556. * to determine whether the open was exclusive for already open files.
  557. */
  558. if (file->private_data)
  559. mode |= BLK_OPEN_EXCL;
  560. else if (file->f_flags & O_EXCL)
  561. mode |= BLK_OPEN_EXCL;
  562. if (file->f_flags & O_NDELAY)
  563. mode |= BLK_OPEN_NDELAY;
  564. /*
  565. * If all bits in O_ACCMODE set (aka O_RDWR | O_WRONLY), the floppy
  566. * driver has historically allowed ioctls as if the file was opened for
  567. * writing, but does not allow and actual reads or writes.
  568. */
  569. if ((file->f_flags & O_ACCMODE) == (O_RDWR | O_WRONLY))
  570. mode |= BLK_OPEN_WRITE_IOCTL;
  571. return mode;
  572. }
  573. static int blkdev_open(struct inode *inode, struct file *filp)
  574. {
  575. struct block_device *bdev;
  576. blk_mode_t mode;
  577. int ret;
  578. mode = file_to_blk_mode(filp);
  579. /* Use the file as the holder. */
  580. if (mode & BLK_OPEN_EXCL)
  581. filp->private_data = filp;
  582. ret = bdev_permission(inode->i_rdev, mode, filp->private_data);
  583. if (ret)
  584. return ret;
  585. bdev = blkdev_get_no_open(inode->i_rdev, true);
  586. if (!bdev)
  587. return -ENXIO;
  588. if (bdev_can_atomic_write(bdev))
  589. filp->f_mode |= FMODE_CAN_ATOMIC_WRITE;
  590. if (blk_get_integrity(bdev->bd_disk))
  591. filp->f_mode |= FMODE_HAS_METADATA;
  592. ret = bdev_open(bdev, mode, filp->private_data, NULL, filp);
  593. if (ret)
  594. blkdev_put_no_open(bdev);
  595. return ret;
  596. }
  /* ->release: hand the file to bdev_release(); always returns 0. */
  static int blkdev_release(struct inode *inode, struct file *filp)
  {
  	bdev_release(filp);

  	return 0;
  }
  602. static ssize_t
  603. blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from)
  604. {
  605. size_t count = iov_iter_count(from);
  606. ssize_t written;
  607. written = kiocb_invalidate_pages(iocb, count);
  608. if (written) {
  609. if (written == -EBUSY)
  610. return 0;
  611. return written;
  612. }
  613. written = blkdev_direct_IO(iocb, from);
  614. if (written > 0) {
  615. kiocb_invalidate_post_direct_write(iocb, count);
  616. iocb->ki_pos += written;
  617. count -= written;
  618. }
  619. if (written != -EIOCBQUEUED)
  620. iov_iter_revert(from, count - iov_iter_count(from));
  621. return written;
  622. }
  623. static ssize_t blkdev_buffered_write(struct kiocb *iocb, struct iov_iter *from)
  624. {
  625. return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops, NULL,
  626. NULL);
  627. }
  628. /*
  629. * Write data to the block device. Only intended for the block device itself
  630. * and the raw driver which basically is a fake block device.
  631. *
  632. * Does not take i_mutex for the write and thus is not for general purpose
  633. * use.
  634. */
  635. static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
  636. {
  637. struct file *file = iocb->ki_filp;
  638. struct inode *bd_inode = bdev_file_inode(file);
  639. struct block_device *bdev = I_BDEV(bd_inode);
  640. bool atomic = iocb->ki_flags & IOCB_ATOMIC;
  641. loff_t size = bdev_nr_bytes(bdev);
  642. size_t shorted = 0;
  643. ssize_t ret;
  644. if (bdev_read_only(bdev))
  645. return -EPERM;
  646. if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev))
  647. return -ETXTBSY;
  648. if (!iov_iter_count(from))
  649. return 0;
  650. if (iocb->ki_pos >= size)
  651. return -ENOSPC;
  652. if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT)
  653. return -EOPNOTSUPP;
  654. if (atomic) {
  655. ret = generic_atomic_write_valid(iocb, from);
  656. if (ret)
  657. return ret;
  658. }
  659. size -= iocb->ki_pos;
  660. if (iov_iter_count(from) > size) {
  661. if (atomic)
  662. return -EINVAL;
  663. shorted = iov_iter_count(from) - size;
  664. iov_iter_truncate(from, size);
  665. }
  666. ret = file_update_time(file);
  667. if (ret)
  668. return ret;
  669. if (iocb->ki_flags & IOCB_DIRECT) {
  670. ret = blkdev_direct_write(iocb, from);
  671. if (ret >= 0 && iov_iter_count(from))
  672. ret = direct_write_fallback(iocb, from, ret,
  673. blkdev_buffered_write(iocb, from));
  674. } else {
  675. /*
  676. * Take i_rwsem and invalidate_lock to avoid racing with
  677. * set_blocksize changing i_blkbits/folio order and punching
  678. * out the pagecache.
  679. */
  680. inode_lock_shared(bd_inode);
  681. ret = blkdev_buffered_write(iocb, from);
  682. inode_unlock_shared(bd_inode);
  683. }
  684. if (ret > 0)
  685. ret = generic_write_sync(iocb, ret);
  686. iov_iter_reexpand(from, iov_iter_count(from) + shorted);
  687. return ret;
  688. }
  689. static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
  690. {
  691. struct inode *bd_inode = bdev_file_inode(iocb->ki_filp);
  692. struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
  693. loff_t size = bdev_nr_bytes(bdev);
  694. loff_t pos = iocb->ki_pos;
  695. size_t shorted = 0;
  696. ssize_t ret = 0;
  697. size_t count;
  698. if (unlikely(pos + iov_iter_count(to) > size)) {
  699. if (pos >= size)
  700. return 0;
  701. size -= pos;
  702. shorted = iov_iter_count(to) - size;
  703. iov_iter_truncate(to, size);
  704. }
  705. count = iov_iter_count(to);
  706. if (!count)
  707. goto reexpand; /* skip atime */
  708. if (iocb->ki_flags & IOCB_DIRECT) {
  709. ret = kiocb_write_and_wait(iocb, count);
  710. if (ret < 0)
  711. goto reexpand;
  712. file_accessed(iocb->ki_filp);
  713. ret = blkdev_direct_IO(iocb, to);
  714. if (ret > 0) {
  715. iocb->ki_pos += ret;
  716. count -= ret;
  717. }
  718. if (ret != -EIOCBQUEUED)
  719. iov_iter_revert(to, count - iov_iter_count(to));
  720. if (ret < 0 || !count)
  721. goto reexpand;
  722. }
  723. /*
  724. * Take i_rwsem and invalidate_lock to avoid racing with set_blocksize
  725. * changing i_blkbits/folio order and punching out the pagecache.
  726. */
  727. inode_lock_shared(bd_inode);
  728. ret = filemap_read(iocb, to, ret);
  729. inode_unlock_shared(bd_inode);
  730. reexpand:
  731. if (unlikely(shorted))
  732. iov_iter_reexpand(to, iov_iter_count(to) + shorted);
  733. return ret;
  734. }
  735. #define BLKDEV_FALLOC_FL_SUPPORTED \
  736. (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \
  737. FALLOC_FL_ZERO_RANGE | FALLOC_FL_WRITE_ZEROES)
  738. static long blkdev_fallocate(struct file *file, int mode, loff_t start,
  739. loff_t len)
  740. {
  741. struct inode *inode = bdev_file_inode(file);
  742. struct block_device *bdev = I_BDEV(inode);
  743. loff_t end = start + len - 1;
  744. loff_t isize;
  745. unsigned int flags;
  746. int error;
  747. /* Fail if we don't recognize the flags. */
  748. if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED)
  749. return -EOPNOTSUPP;
  750. /*
  751. * Don't allow writing zeroes if the device does not enable the
  752. * unmap write zeroes operation.
  753. */
  754. if ((mode & FALLOC_FL_WRITE_ZEROES) &&
  755. !bdev_write_zeroes_unmap_sectors(bdev))
  756. return -EOPNOTSUPP;
  757. /* Don't go off the end of the device. */
  758. isize = bdev_nr_bytes(bdev);
  759. if (start >= isize)
  760. return -EINVAL;
  761. if (end >= isize) {
  762. if (mode & FALLOC_FL_KEEP_SIZE) {
  763. len = isize - start;
  764. end = start + len - 1;
  765. } else
  766. return -EINVAL;
  767. }
  768. /*
  769. * Don't allow IO that isn't aligned to logical block size.
  770. */
  771. if ((start | len) & (bdev_logical_block_size(bdev) - 1))
  772. return -EINVAL;
  773. inode_lock(inode);
  774. filemap_invalidate_lock(inode->i_mapping);
  775. switch (mode) {
  776. case FALLOC_FL_ZERO_RANGE:
  777. case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE:
  778. flags = BLKDEV_ZERO_NOUNMAP;
  779. break;
  780. case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE:
  781. flags = BLKDEV_ZERO_NOFALLBACK;
  782. break;
  783. case FALLOC_FL_WRITE_ZEROES:
  784. flags = 0;
  785. break;
  786. default:
  787. error = -EOPNOTSUPP;
  788. goto fail;
  789. }
  790. /*
  791. * Invalidate the page cache, including dirty pages, for valid
  792. * de-allocate mode calls to fallocate().
  793. */
  794. error = truncate_bdev_range(bdev, file_to_blk_mode(file), start, end);
  795. if (error)
  796. goto fail;
  797. error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT,
  798. len >> SECTOR_SHIFT, GFP_KERNEL, flags);
  799. fail:
  800. filemap_invalidate_unlock(inode->i_mapping);
  801. inode_unlock(inode);
  802. return error;
  803. }
  804. static int blkdev_mmap_prepare(struct vm_area_desc *desc)
  805. {
  806. struct file *file = desc->file;
  807. if (bdev_read_only(I_BDEV(bdev_file_inode(file))))
  808. return generic_file_readonly_mmap_prepare(desc);
  809. return generic_file_mmap_prepare(desc);
  810. }
  811. const struct file_operations def_blk_fops = {
  812. .open = blkdev_open,
  813. .release = blkdev_release,
  814. .llseek = blkdev_llseek,
  815. .read_iter = blkdev_read_iter,
  816. .write_iter = blkdev_write_iter,
  817. .iopoll = iocb_bio_iopoll,
  818. .mmap_prepare = blkdev_mmap_prepare,
  819. .fsync = blkdev_fsync,
  820. .unlocked_ioctl = blkdev_ioctl,
  821. #ifdef CONFIG_COMPAT
  822. .compat_ioctl = compat_blkdev_ioctl,
  823. #endif
  824. .splice_read = filemap_splice_read,
  825. .splice_write = iter_file_splice_write,
  826. .fallocate = blkdev_fallocate,
  827. .uring_cmd = blkdev_uring_cmd,
  828. .fop_flags = FOP_BUFFER_RASYNC,
  829. };
  830. static __init int blkdev_init(void)
  831. {
  832. return bioset_init(&blkdev_dio_pool, 4,
  833. offsetof(struct blkdev_dio, bio),
  834. BIOSET_NEED_BVECS|BIOSET_PERCPU_CACHE);
  835. }
  836. module_init(blkdev_init);