file.c 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Simple file system for zoned block devices exposing zones as files.
  4. *
  5. * Copyright (C) 2022 Western Digital Corporation or its affiliates.
  6. */
  7. #include <linux/module.h>
  8. #include <linux/pagemap.h>
  9. #include <linux/iomap.h>
  10. #include <linux/init.h>
  11. #include <linux/slab.h>
  12. #include <linux/blkdev.h>
  13. #include <linux/statfs.h>
  14. #include <linux/writeback.h>
  15. #include <linux/quotaops.h>
  16. #include <linux/seq_file.h>
  17. #include <linux/parser.h>
  18. #include <linux/uio.h>
  19. #include <linux/mman.h>
  20. #include <linux/sched/mm.h>
  21. #include <linux/task_io_accounting_ops.h>
  22. #include "zonefs.h"
  23. #include "trace.h"
  24. static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset,
  25. loff_t length, unsigned int flags,
  26. struct iomap *iomap, struct iomap *srcmap)
  27. {
  28. struct zonefs_inode_info *zi = ZONEFS_I(inode);
  29. struct zonefs_zone *z = zonefs_inode_zone(inode);
  30. struct super_block *sb = inode->i_sb;
  31. loff_t isize;
  32. /*
  33. * All blocks are always mapped below EOF. If reading past EOF,
  34. * act as if there is a hole up to the file maximum size.
  35. */
  36. mutex_lock(&zi->i_truncate_mutex);
  37. iomap->bdev = inode->i_sb->s_bdev;
  38. iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
  39. isize = i_size_read(inode);
  40. if (iomap->offset >= isize) {
  41. iomap->type = IOMAP_HOLE;
  42. iomap->addr = IOMAP_NULL_ADDR;
  43. iomap->length = length;
  44. } else {
  45. iomap->type = IOMAP_MAPPED;
  46. iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset;
  47. iomap->length = isize - iomap->offset;
  48. }
  49. mutex_unlock(&zi->i_truncate_mutex);
  50. trace_zonefs_iomap_begin(inode, iomap);
  51. return 0;
  52. }
  53. static const struct iomap_ops zonefs_read_iomap_ops = {
  54. .iomap_begin = zonefs_read_iomap_begin,
  55. };
  56. static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset,
  57. loff_t length, unsigned int flags,
  58. struct iomap *iomap, struct iomap *srcmap)
  59. {
  60. struct zonefs_inode_info *zi = ZONEFS_I(inode);
  61. struct zonefs_zone *z = zonefs_inode_zone(inode);
  62. struct super_block *sb = inode->i_sb;
  63. loff_t isize;
  64. /* All write I/Os should always be within the file maximum size */
  65. if (WARN_ON_ONCE(offset + length > z->z_capacity))
  66. return -EIO;
  67. /*
  68. * Sequential zones can only accept direct writes. This is already
  69. * checked when writes are issued, so warn if we see a page writeback
  70. * operation.
  71. */
  72. if (WARN_ON_ONCE(zonefs_zone_is_seq(z) && !(flags & IOMAP_DIRECT)))
  73. return -EIO;
  74. /*
  75. * For conventional zones, all blocks are always mapped. For sequential
  76. * zones, all blocks after always mapped below the inode size (zone
  77. * write pointer) and unwritten beyond.
  78. */
  79. mutex_lock(&zi->i_truncate_mutex);
  80. iomap->bdev = inode->i_sb->s_bdev;
  81. iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
  82. iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset;
  83. isize = i_size_read(inode);
  84. if (iomap->offset >= isize) {
  85. iomap->type = IOMAP_UNWRITTEN;
  86. iomap->length = z->z_capacity - iomap->offset;
  87. } else {
  88. iomap->type = IOMAP_MAPPED;
  89. iomap->length = isize - iomap->offset;
  90. }
  91. mutex_unlock(&zi->i_truncate_mutex);
  92. trace_zonefs_iomap_begin(inode, iomap);
  93. return 0;
  94. }
  95. static const struct iomap_ops zonefs_write_iomap_ops = {
  96. .iomap_begin = zonefs_write_iomap_begin,
  97. };
  98. static int zonefs_read_folio(struct file *unused, struct folio *folio)
  99. {
  100. iomap_bio_read_folio(folio, &zonefs_read_iomap_ops);
  101. return 0;
  102. }
  103. static void zonefs_readahead(struct readahead_control *rac)
  104. {
  105. iomap_bio_readahead(rac, &zonefs_read_iomap_ops);
  106. }
  107. /*
  108. * Map blocks for page writeback. This is used only on conventional zone files,
  109. * which implies that the page range can only be within the fixed inode size.
  110. */
  111. static ssize_t zonefs_writeback_range(struct iomap_writepage_ctx *wpc,
  112. struct folio *folio, u64 offset, unsigned len, u64 end_pos)
  113. {
  114. struct zonefs_zone *z = zonefs_inode_zone(wpc->inode);
  115. if (WARN_ON_ONCE(zonefs_zone_is_seq(z)))
  116. return -EIO;
  117. if (WARN_ON_ONCE(offset >= i_size_read(wpc->inode)))
  118. return -EIO;
  119. /* If the mapping is already OK, nothing needs to be done */
  120. if (offset < wpc->iomap.offset ||
  121. offset >= wpc->iomap.offset + wpc->iomap.length) {
  122. int error;
  123. error = zonefs_write_iomap_begin(wpc->inode, offset,
  124. z->z_capacity - offset, IOMAP_WRITE,
  125. &wpc->iomap, NULL);
  126. if (error)
  127. return error;
  128. }
  129. return iomap_add_to_ioend(wpc, folio, offset, end_pos, len);
  130. }
  131. static const struct iomap_writeback_ops zonefs_writeback_ops = {
  132. .writeback_range = zonefs_writeback_range,
  133. .writeback_submit = iomap_ioend_writeback_submit,
  134. };
  135. static int zonefs_writepages(struct address_space *mapping,
  136. struct writeback_control *wbc)
  137. {
  138. struct iomap_writepage_ctx wpc = {
  139. .inode = mapping->host,
  140. .wbc = wbc,
  141. .ops = &zonefs_writeback_ops,
  142. };
  143. return iomap_writepages(&wpc);
  144. }
  145. static int zonefs_swap_activate(struct swap_info_struct *sis,
  146. struct file *swap_file, sector_t *span)
  147. {
  148. struct inode *inode = file_inode(swap_file);
  149. if (zonefs_inode_is_seq(inode)) {
  150. zonefs_err(inode->i_sb,
  151. "swap file: not a conventional zone file\n");
  152. return -EINVAL;
  153. }
  154. return iomap_swapfile_activate(sis, swap_file, span,
  155. &zonefs_read_iomap_ops);
  156. }
  157. const struct address_space_operations zonefs_file_aops = {
  158. .read_folio = zonefs_read_folio,
  159. .readahead = zonefs_readahead,
  160. .writepages = zonefs_writepages,
  161. .dirty_folio = iomap_dirty_folio,
  162. .release_folio = iomap_release_folio,
  163. .invalidate_folio = iomap_invalidate_folio,
  164. .migrate_folio = filemap_migrate_folio,
  165. .is_partially_uptodate = iomap_is_partially_uptodate,
  166. .error_remove_folio = generic_error_remove_folio,
  167. .swap_activate = zonefs_swap_activate,
  168. };
  169. int zonefs_file_truncate(struct inode *inode, loff_t isize)
  170. {
  171. struct zonefs_inode_info *zi = ZONEFS_I(inode);
  172. struct zonefs_zone *z = zonefs_inode_zone(inode);
  173. loff_t old_isize;
  174. enum req_op op;
  175. int ret = 0;
  176. /*
  177. * Only sequential zone files can be truncated and truncation is allowed
  178. * only down to a 0 size, which is equivalent to a zone reset, and to
  179. * the maximum file size, which is equivalent to a zone finish.
  180. */
  181. if (!zonefs_zone_is_seq(z))
  182. return -EPERM;
  183. if (!isize)
  184. op = REQ_OP_ZONE_RESET;
  185. else if (isize == z->z_capacity)
  186. op = REQ_OP_ZONE_FINISH;
  187. else
  188. return -EPERM;
  189. inode_dio_wait(inode);
  190. /* Serialize against page faults */
  191. filemap_invalidate_lock(inode->i_mapping);
  192. /* Serialize against zonefs_iomap_begin() */
  193. mutex_lock(&zi->i_truncate_mutex);
  194. old_isize = i_size_read(inode);
  195. if (isize == old_isize)
  196. goto unlock;
  197. ret = zonefs_inode_zone_mgmt(inode, op);
  198. if (ret)
  199. goto unlock;
  200. /*
  201. * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set,
  202. * take care of open zones.
  203. */
  204. if (z->z_flags & ZONEFS_ZONE_OPEN) {
  205. /*
  206. * Truncating a zone to EMPTY or FULL is the equivalent of
  207. * closing the zone. For a truncation to 0, we need to
  208. * re-open the zone to ensure new writes can be processed.
  209. * For a truncation to the maximum file size, the zone is
  210. * closed and writes cannot be accepted anymore, so clear
  211. * the open flag.
  212. */
  213. if (!isize)
  214. ret = zonefs_inode_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
  215. else
  216. z->z_flags &= ~ZONEFS_ZONE_OPEN;
  217. }
  218. zonefs_update_stats(inode, isize);
  219. truncate_setsize(inode, isize);
  220. z->z_wpoffset = isize;
  221. zonefs_inode_account_active(inode);
  222. unlock:
  223. mutex_unlock(&zi->i_truncate_mutex);
  224. filemap_invalidate_unlock(inode->i_mapping);
  225. return ret;
  226. }
  227. static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end,
  228. int datasync)
  229. {
  230. struct inode *inode = file_inode(file);
  231. int ret = 0;
  232. if (unlikely(IS_IMMUTABLE(inode)))
  233. return -EPERM;
  234. /*
  235. * Since only direct writes are allowed in sequential files, page cache
  236. * flush is needed only for conventional zone files.
  237. */
  238. if (zonefs_inode_is_cnv(inode))
  239. ret = file_write_and_wait_range(file, start, end);
  240. if (!ret)
  241. ret = blkdev_issue_flush(inode->i_sb->s_bdev);
  242. if (ret)
  243. zonefs_io_error(inode, true);
  244. return ret;
  245. }
  246. static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf)
  247. {
  248. struct inode *inode = file_inode(vmf->vma->vm_file);
  249. vm_fault_t ret;
  250. if (unlikely(IS_IMMUTABLE(inode)))
  251. return VM_FAULT_SIGBUS;
  252. /*
  253. * Sanity check: only conventional zone files can have shared
  254. * writeable mappings.
  255. */
  256. if (zonefs_inode_is_seq(inode))
  257. return VM_FAULT_NOPAGE;
  258. sb_start_pagefault(inode->i_sb);
  259. file_update_time(vmf->vma->vm_file);
  260. /* Serialize against truncates */
  261. filemap_invalidate_lock_shared(inode->i_mapping);
  262. ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops, NULL);
  263. filemap_invalidate_unlock_shared(inode->i_mapping);
  264. sb_end_pagefault(inode->i_sb);
  265. return ret;
  266. }
  267. static const struct vm_operations_struct zonefs_file_vm_ops = {
  268. .fault = filemap_fault,
  269. .map_pages = filemap_map_pages,
  270. .page_mkwrite = zonefs_filemap_page_mkwrite,
  271. };
  272. static int zonefs_file_mmap_prepare(struct vm_area_desc *desc)
  273. {
  274. struct file *file = desc->file;
  275. /*
  276. * Conventional zones accept random writes, so their files can support
  277. * shared writable mappings. For sequential zone files, only read
  278. * mappings are possible since there are no guarantees for write
  279. * ordering between msync() and page cache writeback.
  280. */
  281. if (zonefs_inode_is_seq(file_inode(file)) &&
  282. vma_desc_test_flags(desc, VMA_SHARED_BIT) &&
  283. vma_desc_test_flags(desc, VMA_MAYWRITE_BIT))
  284. return -EINVAL;
  285. file_accessed(file);
  286. desc->vm_ops = &zonefs_file_vm_ops;
  287. return 0;
  288. }
  289. static loff_t zonefs_file_llseek(struct file *file, loff_t offset, int whence)
  290. {
  291. loff_t isize = i_size_read(file_inode(file));
  292. /*
  293. * Seeks are limited to below the zone size for conventional zones
  294. * and below the zone write pointer for sequential zones. In both
  295. * cases, this limit is the inode size.
  296. */
  297. return generic_file_llseek_size(file, offset, whence, isize, isize);
  298. }
  299. static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size,
  300. int error, unsigned int flags)
  301. {
  302. struct inode *inode = file_inode(iocb->ki_filp);
  303. struct zonefs_inode_info *zi = ZONEFS_I(inode);
  304. if (error) {
  305. /*
  306. * For Sync IOs, error recovery is called from
  307. * zonefs_file_dio_write().
  308. */
  309. if (!is_sync_kiocb(iocb))
  310. zonefs_io_error(inode, true);
  311. return error;
  312. }
  313. if (size && zonefs_inode_is_seq(inode)) {
  314. /*
  315. * Note that we may be seeing completions out of order,
  316. * but that is not a problem since a write completed
  317. * successfully necessarily means that all preceding writes
  318. * were also successful. So we can safely increase the inode
  319. * size to the write end location.
  320. */
  321. mutex_lock(&zi->i_truncate_mutex);
  322. if (i_size_read(inode) < iocb->ki_pos + size) {
  323. zonefs_update_stats(inode, iocb->ki_pos + size);
  324. zonefs_i_size_write(inode, iocb->ki_pos + size);
  325. }
  326. mutex_unlock(&zi->i_truncate_mutex);
  327. }
  328. return 0;
  329. }
  330. static const struct iomap_dio_ops zonefs_write_dio_ops = {
  331. .end_io = zonefs_file_write_dio_end_io,
  332. };
  333. /*
  334. * Do not exceed the LFS limits nor the file zone size. If pos is under the
  335. * limit it becomes a short access. If it exceeds the limit, return -EFBIG.
  336. */
  337. static loff_t zonefs_write_check_limits(struct file *file, loff_t pos,
  338. loff_t count)
  339. {
  340. struct inode *inode = file_inode(file);
  341. struct zonefs_zone *z = zonefs_inode_zone(inode);
  342. loff_t limit = rlimit(RLIMIT_FSIZE);
  343. loff_t max_size = z->z_capacity;
  344. if (limit != RLIM_INFINITY) {
  345. if (pos >= limit) {
  346. send_sig(SIGXFSZ, current, 0);
  347. return -EFBIG;
  348. }
  349. count = min(count, limit - pos);
  350. }
  351. if (!(file->f_flags & O_LARGEFILE))
  352. max_size = min_t(loff_t, MAX_NON_LFS, max_size);
  353. if (unlikely(pos >= max_size))
  354. return -EFBIG;
  355. return min(count, max_size - pos);
  356. }
  357. static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from)
  358. {
  359. struct file *file = iocb->ki_filp;
  360. struct inode *inode = file_inode(file);
  361. struct zonefs_inode_info *zi = ZONEFS_I(inode);
  362. struct zonefs_zone *z = zonefs_inode_zone(inode);
  363. loff_t count;
  364. if (IS_SWAPFILE(inode))
  365. return -ETXTBSY;
  366. if (!iov_iter_count(from))
  367. return 0;
  368. if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
  369. return -EINVAL;
  370. if (iocb->ki_flags & IOCB_APPEND) {
  371. if (zonefs_zone_is_cnv(z))
  372. return -EINVAL;
  373. mutex_lock(&zi->i_truncate_mutex);
  374. iocb->ki_pos = z->z_wpoffset;
  375. mutex_unlock(&zi->i_truncate_mutex);
  376. }
  377. count = zonefs_write_check_limits(file, iocb->ki_pos,
  378. iov_iter_count(from));
  379. if (count < 0)
  380. return count;
  381. iov_iter_truncate(from, count);
  382. return iov_iter_count(from);
  383. }
  384. /*
  385. * Handle direct writes. For sequential zone files, this is the only possible
  386. * write path. For these files, check that the user is issuing writes
  387. * sequentially from the end of the file. This code assumes that the block layer
  388. * delivers write requests to the device in sequential order. This is always the
  389. * case if a block IO scheduler implementing the ELEVATOR_F_ZBD_SEQ_WRITE
  390. * elevator feature is being used (e.g. mq-deadline). The block layer always
  391. * automatically select such an elevator for zoned block devices during the
  392. * device initialization.
  393. */
  394. static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
  395. {
  396. struct inode *inode = file_inode(iocb->ki_filp);
  397. struct zonefs_inode_info *zi = ZONEFS_I(inode);
  398. struct zonefs_zone *z = zonefs_inode_zone(inode);
  399. struct super_block *sb = inode->i_sb;
  400. ssize_t ret, count;
  401. /*
  402. * For async direct IOs to sequential zone files, refuse IOCB_NOWAIT
  403. * as this can cause write reordering (e.g. the first aio gets EAGAIN
  404. * on the inode lock but the second goes through but is now unaligned).
  405. */
  406. if (zonefs_zone_is_seq(z) && !is_sync_kiocb(iocb) &&
  407. (iocb->ki_flags & IOCB_NOWAIT))
  408. return -EOPNOTSUPP;
  409. if (iocb->ki_flags & IOCB_NOWAIT) {
  410. if (!inode_trylock(inode))
  411. return -EAGAIN;
  412. } else {
  413. inode_lock(inode);
  414. }
  415. count = zonefs_write_checks(iocb, from);
  416. if (count <= 0) {
  417. ret = count;
  418. goto inode_unlock;
  419. }
  420. if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) {
  421. ret = -EINVAL;
  422. goto inode_unlock;
  423. }
  424. /* Enforce sequential writes (append only) in sequential zones */
  425. if (zonefs_zone_is_seq(z)) {
  426. mutex_lock(&zi->i_truncate_mutex);
  427. if (iocb->ki_pos != z->z_wpoffset) {
  428. mutex_unlock(&zi->i_truncate_mutex);
  429. ret = -EINVAL;
  430. goto inode_unlock;
  431. }
  432. /*
  433. * Advance the zone write pointer offset. This assumes that the
  434. * IO will succeed, which is OK to do because we do not allow
  435. * partial writes (IOMAP_DIO_PARTIAL is not set) and if the IO
  436. * fails, the error path will correct the write pointer offset.
  437. */
  438. z->z_wpoffset += count;
  439. zonefs_inode_account_active(inode);
  440. mutex_unlock(&zi->i_truncate_mutex);
  441. }
  442. /*
  443. * iomap_dio_rw() may return ENOTBLK if there was an issue with
  444. * page invalidation. Overwrite that error code with EBUSY so that
  445. * the user can make sense of the error.
  446. */
  447. ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops,
  448. &zonefs_write_dio_ops, 0, NULL, 0);
  449. if (ret == -ENOTBLK)
  450. ret = -EBUSY;
  451. /*
  452. * For a failed IO or partial completion, trigger error recovery
  453. * to update the zone write pointer offset to a correct value.
  454. * For asynchronous IOs, zonefs_file_write_dio_end_io() may already
  455. * have executed error recovery if the IO already completed when we
  456. * reach here. However, we cannot know that and execute error recovery
  457. * again (that will not change anything).
  458. */
  459. if (zonefs_zone_is_seq(z)) {
  460. if (ret > 0 && ret != count)
  461. ret = -EIO;
  462. if (ret < 0 && ret != -EIOCBQUEUED)
  463. zonefs_io_error(inode, true);
  464. }
  465. inode_unlock:
  466. inode_unlock(inode);
  467. return ret;
  468. }
  469. static ssize_t zonefs_file_buffered_write(struct kiocb *iocb,
  470. struct iov_iter *from)
  471. {
  472. struct inode *inode = file_inode(iocb->ki_filp);
  473. ssize_t ret;
  474. /*
  475. * Direct IO writes are mandatory for sequential zone files so that the
  476. * write IO issuing order is preserved.
  477. */
  478. if (zonefs_inode_is_seq(inode))
  479. return -EIO;
  480. if (iocb->ki_flags & IOCB_NOWAIT) {
  481. if (!inode_trylock(inode))
  482. return -EAGAIN;
  483. } else {
  484. inode_lock(inode);
  485. }
  486. ret = zonefs_write_checks(iocb, from);
  487. if (ret <= 0)
  488. goto inode_unlock;
  489. ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops,
  490. NULL, NULL);
  491. if (ret == -EIO)
  492. zonefs_io_error(inode, true);
  493. inode_unlock:
  494. inode_unlock(inode);
  495. if (ret > 0)
  496. ret = generic_write_sync(iocb, ret);
  497. return ret;
  498. }
  499. static ssize_t zonefs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
  500. {
  501. struct inode *inode = file_inode(iocb->ki_filp);
  502. struct zonefs_zone *z = zonefs_inode_zone(inode);
  503. if (unlikely(IS_IMMUTABLE(inode)))
  504. return -EPERM;
  505. if (sb_rdonly(inode->i_sb))
  506. return -EROFS;
  507. /* Write operations beyond the zone capacity are not allowed */
  508. if (iocb->ki_pos >= z->z_capacity)
  509. return -EFBIG;
  510. if (iocb->ki_flags & IOCB_DIRECT) {
  511. ssize_t ret = zonefs_file_dio_write(iocb, from);
  512. if (ret != -ENOTBLK)
  513. return ret;
  514. }
  515. return zonefs_file_buffered_write(iocb, from);
  516. }
  517. static int zonefs_file_read_dio_end_io(struct kiocb *iocb, ssize_t size,
  518. int error, unsigned int flags)
  519. {
  520. if (error) {
  521. zonefs_io_error(file_inode(iocb->ki_filp), false);
  522. return error;
  523. }
  524. return 0;
  525. }
  526. static const struct iomap_dio_ops zonefs_read_dio_ops = {
  527. .end_io = zonefs_file_read_dio_end_io,
  528. };
  529. static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
  530. {
  531. struct inode *inode = file_inode(iocb->ki_filp);
  532. struct zonefs_inode_info *zi = ZONEFS_I(inode);
  533. struct zonefs_zone *z = zonefs_inode_zone(inode);
  534. struct super_block *sb = inode->i_sb;
  535. loff_t isize;
  536. ssize_t ret;
  537. /* Offline zones cannot be read */
  538. if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777)))
  539. return -EPERM;
  540. if (iocb->ki_pos >= z->z_capacity)
  541. return 0;
  542. if (iocb->ki_flags & IOCB_NOWAIT) {
  543. if (!inode_trylock_shared(inode))
  544. return -EAGAIN;
  545. } else {
  546. inode_lock_shared(inode);
  547. }
  548. /* Limit read operations to written data */
  549. mutex_lock(&zi->i_truncate_mutex);
  550. isize = i_size_read(inode);
  551. if (iocb->ki_pos >= isize) {
  552. mutex_unlock(&zi->i_truncate_mutex);
  553. ret = 0;
  554. goto inode_unlock;
  555. }
  556. iov_iter_truncate(to, isize - iocb->ki_pos);
  557. mutex_unlock(&zi->i_truncate_mutex);
  558. if (iocb->ki_flags & IOCB_DIRECT) {
  559. size_t count = iov_iter_count(to);
  560. if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) {
  561. ret = -EINVAL;
  562. goto inode_unlock;
  563. }
  564. file_accessed(iocb->ki_filp);
  565. ret = iomap_dio_rw(iocb, to, &zonefs_read_iomap_ops,
  566. &zonefs_read_dio_ops, 0, NULL, 0);
  567. } else {
  568. ret = generic_file_read_iter(iocb, to);
  569. if (ret == -EIO)
  570. zonefs_io_error(inode, false);
  571. }
  572. inode_unlock:
  573. inode_unlock_shared(inode);
  574. return ret;
  575. }
  576. static ssize_t zonefs_file_splice_read(struct file *in, loff_t *ppos,
  577. struct pipe_inode_info *pipe,
  578. size_t len, unsigned int flags)
  579. {
  580. struct inode *inode = file_inode(in);
  581. struct zonefs_inode_info *zi = ZONEFS_I(inode);
  582. struct zonefs_zone *z = zonefs_inode_zone(inode);
  583. loff_t isize;
  584. ssize_t ret = 0;
  585. /* Offline zones cannot be read */
  586. if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777)))
  587. return -EPERM;
  588. if (*ppos >= z->z_capacity)
  589. return 0;
  590. inode_lock_shared(inode);
  591. /* Limit read operations to written data */
  592. mutex_lock(&zi->i_truncate_mutex);
  593. isize = i_size_read(inode);
  594. if (*ppos >= isize)
  595. len = 0;
  596. else
  597. len = min_t(loff_t, len, isize - *ppos);
  598. mutex_unlock(&zi->i_truncate_mutex);
  599. if (len > 0) {
  600. ret = filemap_splice_read(in, ppos, pipe, len, flags);
  601. if (ret == -EIO)
  602. zonefs_io_error(inode, false);
  603. }
  604. inode_unlock_shared(inode);
  605. return ret;
  606. }
  607. /*
  608. * Write open accounting is done only for sequential files.
  609. */
  610. static inline bool zonefs_seq_file_need_wro(struct inode *inode,
  611. struct file *file)
  612. {
  613. if (zonefs_inode_is_cnv(inode))
  614. return false;
  615. if (!(file->f_mode & FMODE_WRITE))
  616. return false;
  617. return true;
  618. }
  619. static int zonefs_seq_file_write_open(struct inode *inode)
  620. {
  621. struct zonefs_inode_info *zi = ZONEFS_I(inode);
  622. struct zonefs_zone *z = zonefs_inode_zone(inode);
  623. int ret = 0;
  624. mutex_lock(&zi->i_truncate_mutex);
  625. if (!zi->i_wr_refcnt) {
  626. struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
  627. unsigned int wro = atomic_inc_return(&sbi->s_wro_seq_files);
  628. if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {
  629. if (sbi->s_max_wro_seq_files
  630. && wro > sbi->s_max_wro_seq_files) {
  631. atomic_dec(&sbi->s_wro_seq_files);
  632. ret = -EBUSY;
  633. goto unlock;
  634. }
  635. if (i_size_read(inode) < z->z_capacity) {
  636. ret = zonefs_inode_zone_mgmt(inode,
  637. REQ_OP_ZONE_OPEN);
  638. if (ret) {
  639. atomic_dec(&sbi->s_wro_seq_files);
  640. goto unlock;
  641. }
  642. z->z_flags |= ZONEFS_ZONE_OPEN;
  643. zonefs_inode_account_active(inode);
  644. }
  645. }
  646. }
  647. zi->i_wr_refcnt++;
  648. unlock:
  649. mutex_unlock(&zi->i_truncate_mutex);
  650. return ret;
  651. }
  652. static int zonefs_file_open(struct inode *inode, struct file *file)
  653. {
  654. int ret;
  655. file->f_mode |= FMODE_CAN_ODIRECT;
  656. ret = generic_file_open(inode, file);
  657. if (ret)
  658. return ret;
  659. if (zonefs_seq_file_need_wro(inode, file))
  660. return zonefs_seq_file_write_open(inode);
  661. return 0;
  662. }
  663. static void zonefs_seq_file_write_close(struct inode *inode)
  664. {
  665. struct zonefs_inode_info *zi = ZONEFS_I(inode);
  666. struct zonefs_zone *z = zonefs_inode_zone(inode);
  667. struct super_block *sb = inode->i_sb;
  668. struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
  669. int ret = 0;
  670. mutex_lock(&zi->i_truncate_mutex);
  671. zi->i_wr_refcnt--;
  672. if (zi->i_wr_refcnt)
  673. goto unlock;
  674. /*
  675. * The file zone may not be open anymore (e.g. the file was truncated to
  676. * its maximum size or it was fully written). For this case, we only
  677. * need to decrement the write open count.
  678. */
  679. if (z->z_flags & ZONEFS_ZONE_OPEN) {
  680. ret = zonefs_inode_zone_mgmt(inode, REQ_OP_ZONE_CLOSE);
  681. if (ret) {
  682. __zonefs_io_error(inode, false);
  683. /*
  684. * Leaving zones explicitly open may lead to a state
  685. * where most zones cannot be written (zone resources
  686. * exhausted). So take preventive action by remounting
  687. * read-only.
  688. */
  689. if (z->z_flags & ZONEFS_ZONE_OPEN &&
  690. !(sb->s_flags & SB_RDONLY)) {
  691. zonefs_warn(sb,
  692. "closing zone at %llu failed %d\n",
  693. z->z_sector, ret);
  694. zonefs_warn(sb,
  695. "remounting filesystem read-only\n");
  696. sb->s_flags |= SB_RDONLY;
  697. }
  698. goto unlock;
  699. }
  700. z->z_flags &= ~ZONEFS_ZONE_OPEN;
  701. zonefs_inode_account_active(inode);
  702. }
  703. atomic_dec(&sbi->s_wro_seq_files);
  704. unlock:
  705. mutex_unlock(&zi->i_truncate_mutex);
  706. }
  707. static int zonefs_file_release(struct inode *inode, struct file *file)
  708. {
  709. /*
  710. * If we explicitly open a zone we must close it again as well, but the
  711. * zone management operation can fail (either due to an IO error or as
  712. * the zone has gone offline or read-only). Make sure we don't fail the
  713. * close(2) for user-space.
  714. */
  715. if (zonefs_seq_file_need_wro(inode, file))
  716. zonefs_seq_file_write_close(inode);
  717. return 0;
  718. }
  719. const struct file_operations zonefs_file_operations = {
  720. .open = zonefs_file_open,
  721. .release = zonefs_file_release,
  722. .fsync = zonefs_file_fsync,
  723. .mmap_prepare = zonefs_file_mmap_prepare,
  724. .llseek = zonefs_file_llseek,
  725. .read_iter = zonefs_file_read_iter,
  726. .write_iter = zonefs_file_write_iter,
  727. .splice_read = zonefs_file_splice_read,
  728. .splice_write = iter_file_splice_write,
  729. .iopoll = iocb_bio_iopoll,
  730. };