data.c 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * Copyright (C) 2017-2018 HUAWEI, Inc.
  4. * https://www.huawei.com/
  5. * Copyright (C) 2021, Alibaba Cloud
  6. */
  7. #include "internal.h"
  8. #include <linux/filelock.h>
  9. #include <linux/sched/mm.h>
  10. #include <trace/events/erofs.h>
  11. void erofs_unmap_metabuf(struct erofs_buf *buf)
  12. {
  13. if (!buf->base)
  14. return;
  15. kunmap_local(buf->base);
  16. buf->base = NULL;
  17. }
  18. void erofs_put_metabuf(struct erofs_buf *buf)
  19. {
  20. if (!buf->page)
  21. return;
  22. erofs_unmap_metabuf(buf);
  23. folio_put(page_folio(buf->page));
  24. buf->page = NULL;
  25. }
  26. void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, bool need_kmap)
  27. {
  28. pgoff_t index = (buf->off + offset) >> PAGE_SHIFT;
  29. struct folio *folio = NULL;
  30. if (buf->page) {
  31. folio = page_folio(buf->page);
  32. if (folio_file_page(folio, index) != buf->page)
  33. erofs_unmap_metabuf(buf);
  34. }
  35. if (!folio || !folio_contains(folio, index)) {
  36. erofs_put_metabuf(buf);
  37. folio = read_mapping_folio(buf->mapping, index, buf->file);
  38. if (IS_ERR(folio))
  39. return folio;
  40. }
  41. buf->page = folio_file_page(folio, index);
  42. if (!need_kmap)
  43. return NULL;
  44. if (!buf->base)
  45. buf->base = kmap_local_page(buf->page);
  46. return buf->base + (offset & ~PAGE_MASK);
  47. }
  48. int erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb,
  49. bool in_metabox)
  50. {
  51. struct erofs_sb_info *sbi = EROFS_SB(sb);
  52. buf->file = NULL;
  53. if (in_metabox) {
  54. if (unlikely(!sbi->metabox_inode))
  55. return -EFSCORRUPTED;
  56. buf->mapping = sbi->metabox_inode->i_mapping;
  57. return 0;
  58. }
  59. buf->off = sbi->dif0.fsoff;
  60. if (erofs_is_fileio_mode(sbi)) {
  61. buf->file = sbi->dif0.file; /* some fs like FUSE needs it */
  62. buf->mapping = buf->file->f_mapping;
  63. } else if (erofs_is_fscache_mode(sb))
  64. buf->mapping = sbi->dif0.fscache->inode->i_mapping;
  65. else
  66. buf->mapping = sb->s_bdev->bd_mapping;
  67. return 0;
  68. }
  69. void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
  70. erofs_off_t offset, bool in_metabox)
  71. {
  72. int err;
  73. err = erofs_init_metabuf(buf, sb, in_metabox);
  74. if (err)
  75. return ERR_PTR(err);
  76. return erofs_bread(buf, offset, true);
  77. }
  78. int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map)
  79. {
  80. struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
  81. struct super_block *sb = inode->i_sb;
  82. unsigned int unit, blksz = sb->s_blocksize;
  83. struct erofs_inode *vi = EROFS_I(inode);
  84. struct erofs_inode_chunk_index *idx;
  85. erofs_blk_t startblk, addrmask;
  86. bool tailpacking;
  87. erofs_off_t pos;
  88. u64 chunknr;
  89. int err = 0;
  90. trace_erofs_map_blocks_enter(inode, map, 0);
  91. map->m_deviceid = 0;
  92. map->m_flags = 0;
  93. if (map->m_la >= inode->i_size)
  94. goto out;
  95. if (vi->datalayout != EROFS_INODE_CHUNK_BASED) {
  96. tailpacking = (vi->datalayout == EROFS_INODE_FLAT_INLINE);
  97. if (!tailpacking && vi->startblk == EROFS_NULL_ADDR)
  98. goto out;
  99. pos = erofs_pos(sb, erofs_iblks(inode) - tailpacking);
  100. map->m_flags = EROFS_MAP_MAPPED;
  101. if (map->m_la < pos) {
  102. map->m_pa = erofs_pos(sb, vi->startblk) + map->m_la;
  103. map->m_llen = pos - map->m_la;
  104. } else {
  105. map->m_pa = erofs_iloc(inode) + vi->inode_isize +
  106. vi->xattr_isize + erofs_blkoff(sb, map->m_la);
  107. map->m_llen = inode->i_size - map->m_la;
  108. map->m_flags |= EROFS_MAP_META;
  109. }
  110. goto out;
  111. }
  112. if (vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES)
  113. unit = sizeof(*idx); /* chunk index */
  114. else
  115. unit = EROFS_BLOCK_MAP_ENTRY_SIZE; /* block map */
  116. chunknr = map->m_la >> vi->chunkbits;
  117. pos = ALIGN(erofs_iloc(inode) + vi->inode_isize +
  118. vi->xattr_isize, unit) + unit * chunknr;
  119. idx = erofs_read_metabuf(&buf, sb, pos, erofs_inode_in_metabox(inode));
  120. if (IS_ERR(idx)) {
  121. err = PTR_ERR(idx);
  122. goto out;
  123. }
  124. map->m_la = chunknr << vi->chunkbits;
  125. map->m_llen = min_t(erofs_off_t, 1UL << vi->chunkbits,
  126. round_up(inode->i_size - map->m_la, blksz));
  127. if (vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES) {
  128. addrmask = (vi->chunkformat & EROFS_CHUNK_FORMAT_48BIT) ?
  129. BIT_ULL(48) - 1 : BIT_ULL(32) - 1;
  130. startblk = (((u64)le16_to_cpu(idx->startblk_hi) << 32) |
  131. le32_to_cpu(idx->startblk_lo)) & addrmask;
  132. if ((startblk ^ EROFS_NULL_ADDR) & addrmask) {
  133. map->m_deviceid = le16_to_cpu(idx->device_id) &
  134. EROFS_SB(sb)->device_id_mask;
  135. map->m_pa = erofs_pos(sb, startblk);
  136. map->m_flags = EROFS_MAP_MAPPED;
  137. }
  138. } else {
  139. startblk = le32_to_cpu(*(__le32 *)idx);
  140. if (startblk != (u32)EROFS_NULL_ADDR) {
  141. map->m_pa = erofs_pos(sb, startblk);
  142. map->m_flags = EROFS_MAP_MAPPED;
  143. }
  144. }
  145. erofs_put_metabuf(&buf);
  146. out:
  147. if (!err) {
  148. map->m_plen = map->m_llen;
  149. /* inline data should be located in the same meta block */
  150. if ((map->m_flags & EROFS_MAP_META) &&
  151. erofs_blkoff(sb, map->m_pa) + map->m_plen > blksz) {
  152. erofs_err(sb, "inline data across blocks @ nid %llu", vi->nid);
  153. DBG_BUGON(1);
  154. return -EFSCORRUPTED;
  155. }
  156. }
  157. trace_erofs_map_blocks_exit(inode, map, 0, err);
  158. return err;
  159. }
  160. static void erofs_fill_from_devinfo(struct erofs_map_dev *map,
  161. struct super_block *sb, struct erofs_device_info *dif)
  162. {
  163. map->m_sb = sb;
  164. map->m_dif = dif;
  165. map->m_bdev = NULL;
  166. if (dif->file && S_ISBLK(file_inode(dif->file)->i_mode))
  167. map->m_bdev = file_bdev(dif->file);
  168. }
  169. int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
  170. {
  171. struct erofs_dev_context *devs = EROFS_SB(sb)->devs;
  172. struct erofs_device_info *dif;
  173. erofs_off_t startoff;
  174. int id;
  175. erofs_fill_from_devinfo(map, sb, &EROFS_SB(sb)->dif0);
  176. map->m_bdev = sb->s_bdev; /* use s_bdev for the primary device */
  177. if (map->m_deviceid) {
  178. down_read(&devs->rwsem);
  179. dif = idr_find(&devs->tree, map->m_deviceid - 1);
  180. if (!dif) {
  181. up_read(&devs->rwsem);
  182. return -ENODEV;
  183. }
  184. if (devs->flatdev) {
  185. map->m_pa += erofs_pos(sb, dif->uniaddr);
  186. up_read(&devs->rwsem);
  187. return 0;
  188. }
  189. erofs_fill_from_devinfo(map, sb, dif);
  190. up_read(&devs->rwsem);
  191. } else if (devs->extra_devices && !devs->flatdev) {
  192. down_read(&devs->rwsem);
  193. idr_for_each_entry(&devs->tree, dif, id) {
  194. if (!dif->uniaddr)
  195. continue;
  196. startoff = erofs_pos(sb, dif->uniaddr);
  197. if (map->m_pa >= startoff &&
  198. map->m_pa < startoff + erofs_pos(sb, dif->blocks)) {
  199. map->m_pa -= startoff;
  200. erofs_fill_from_devinfo(map, sb, dif);
  201. break;
  202. }
  203. }
  204. up_read(&devs->rwsem);
  205. }
  206. return 0;
  207. }
  208. /*
  209. * bit 30: I/O error occurred on this folio
  210. * bit 29: CPU has dirty data in D-cache (needs aliasing handling);
  211. * bit 0 - 29: remaining parts to complete this folio
  212. */
  213. #define EROFS_ONLINEFOLIO_EIO 30
  214. #define EROFS_ONLINEFOLIO_DIRTY 29
  215. void erofs_onlinefolio_init(struct folio *folio)
  216. {
  217. union {
  218. atomic_t o;
  219. void *v;
  220. } u = { .o = ATOMIC_INIT(1) };
  221. folio->private = u.v; /* valid only if file-backed folio is locked */
  222. }
  223. void erofs_onlinefolio_split(struct folio *folio)
  224. {
  225. atomic_inc((atomic_t *)&folio->private);
  226. }
  227. void erofs_onlinefolio_end(struct folio *folio, int err, bool dirty)
  228. {
  229. int orig, v;
  230. do {
  231. orig = atomic_read((atomic_t *)&folio->private);
  232. DBG_BUGON(orig <= 0);
  233. v = dirty << EROFS_ONLINEFOLIO_DIRTY;
  234. v |= (orig - 1) | (!!err << EROFS_ONLINEFOLIO_EIO);
  235. } while (atomic_cmpxchg((atomic_t *)&folio->private, orig, v) != orig);
  236. if (v & (BIT(EROFS_ONLINEFOLIO_DIRTY) - 1))
  237. return;
  238. folio->private = 0;
  239. if (v & BIT(EROFS_ONLINEFOLIO_DIRTY))
  240. flush_dcache_folio(folio);
  241. folio_end_read(folio, !(v & BIT(EROFS_ONLINEFOLIO_EIO)));
  242. }
  243. struct erofs_iomap_iter_ctx {
  244. struct page *page;
  245. void *base;
  246. struct inode *realinode;
  247. };
  248. static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
  249. unsigned int flags, struct iomap *iomap, struct iomap *srcmap)
  250. {
  251. struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
  252. struct erofs_iomap_iter_ctx *ctx = iter->private;
  253. struct inode *realinode = ctx ? ctx->realinode : inode;
  254. struct super_block *sb = realinode->i_sb;
  255. struct erofs_map_blocks map;
  256. struct erofs_map_dev mdev;
  257. int ret;
  258. map.m_la = offset;
  259. map.m_llen = length;
  260. ret = erofs_map_blocks(realinode, &map);
  261. if (ret < 0)
  262. return ret;
  263. iomap->offset = map.m_la;
  264. iomap->length = map.m_llen;
  265. iomap->flags = 0;
  266. iomap->addr = IOMAP_NULL_ADDR;
  267. if (!(map.m_flags & EROFS_MAP_MAPPED)) {
  268. iomap->type = IOMAP_HOLE;
  269. return 0;
  270. }
  271. if (!(map.m_flags & EROFS_MAP_META) || !erofs_inode_in_metabox(realinode)) {
  272. mdev = (struct erofs_map_dev) {
  273. .m_deviceid = map.m_deviceid,
  274. .m_pa = map.m_pa,
  275. };
  276. ret = erofs_map_dev(sb, &mdev);
  277. if (ret)
  278. return ret;
  279. if (flags & IOMAP_DAX)
  280. iomap->dax_dev = mdev.m_dif->dax_dev;
  281. else
  282. iomap->bdev = mdev.m_bdev;
  283. iomap->addr = mdev.m_dif->fsoff + mdev.m_pa;
  284. if (flags & IOMAP_DAX)
  285. iomap->addr += mdev.m_dif->dax_part_off;
  286. }
  287. if (map.m_flags & EROFS_MAP_META) {
  288. iomap->type = IOMAP_INLINE;
  289. /* read context should read the inlined data */
  290. if (ctx) {
  291. struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
  292. void *ptr;
  293. ptr = erofs_read_metabuf(&buf, sb, map.m_pa,
  294. erofs_inode_in_metabox(realinode));
  295. if (IS_ERR(ptr))
  296. return PTR_ERR(ptr);
  297. iomap->inline_data = ptr;
  298. ctx->page = buf.page;
  299. ctx->base = buf.base;
  300. }
  301. } else {
  302. iomap->type = IOMAP_MAPPED;
  303. }
  304. return 0;
  305. }
  306. static int erofs_iomap_end(struct inode *inode, loff_t pos, loff_t length,
  307. ssize_t written, unsigned int flags, struct iomap *iomap)
  308. {
  309. struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
  310. struct erofs_iomap_iter_ctx *ctx = iter->private;
  311. if (ctx && ctx->base) {
  312. struct erofs_buf buf = {
  313. .page = ctx->page,
  314. .base = ctx->base,
  315. };
  316. DBG_BUGON(iomap->type != IOMAP_INLINE);
  317. erofs_put_metabuf(&buf);
  318. ctx->base = NULL;
  319. }
  320. return written;
  321. }
  322. static const struct iomap_ops erofs_iomap_ops = {
  323. .iomap_begin = erofs_iomap_begin,
  324. .iomap_end = erofs_iomap_end,
  325. };
  326. int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
  327. u64 start, u64 len)
  328. {
  329. if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout)) {
  330. if (!IS_ENABLED(CONFIG_EROFS_FS_ZIP))
  331. return -EOPNOTSUPP;
  332. return iomap_fiemap(inode, fieinfo, start, len,
  333. &z_erofs_iomap_report_ops);
  334. }
  335. return iomap_fiemap(inode, fieinfo, start, len, &erofs_iomap_ops);
  336. }
  337. /*
  338. * since we dont have write or truncate flows, so no inode
  339. * locking needs to be held at the moment.
  340. */
  341. static int erofs_read_folio(struct file *file, struct folio *folio)
  342. {
  343. struct iomap_read_folio_ctx read_ctx = {
  344. .ops = &iomap_bio_read_ops,
  345. .cur_folio = folio,
  346. };
  347. bool need_iput;
  348. struct erofs_iomap_iter_ctx iter_ctx = {
  349. .realinode = erofs_real_inode(folio_inode(folio), &need_iput),
  350. };
  351. trace_erofs_read_folio(iter_ctx.realinode, folio, true);
  352. iomap_read_folio(&erofs_iomap_ops, &read_ctx, &iter_ctx);
  353. if (need_iput)
  354. iput(iter_ctx.realinode);
  355. return 0;
  356. }
  357. static void erofs_readahead(struct readahead_control *rac)
  358. {
  359. struct iomap_read_folio_ctx read_ctx = {
  360. .ops = &iomap_bio_read_ops,
  361. .rac = rac,
  362. };
  363. bool need_iput;
  364. struct erofs_iomap_iter_ctx iter_ctx = {
  365. .realinode = erofs_real_inode(rac->mapping->host, &need_iput),
  366. };
  367. trace_erofs_readahead(iter_ctx.realinode, readahead_index(rac),
  368. readahead_count(rac), true);
  369. iomap_readahead(&erofs_iomap_ops, &read_ctx, &iter_ctx);
  370. if (need_iput)
  371. iput(iter_ctx.realinode);
  372. }
  373. static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
  374. {
  375. return iomap_bmap(mapping, block, &erofs_iomap_ops);
  376. }
  377. static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
  378. {
  379. struct inode *inode = file_inode(iocb->ki_filp);
  380. /* no need taking (shared) inode lock since it's a ro filesystem */
  381. if (!iov_iter_count(to))
  382. return 0;
  383. if (IS_ENABLED(CONFIG_FS_DAX) && IS_DAX(inode))
  384. return dax_iomap_rw(iocb, to, &erofs_iomap_ops);
  385. if ((iocb->ki_flags & IOCB_DIRECT) && inode->i_sb->s_bdev) {
  386. struct erofs_iomap_iter_ctx iter_ctx = {
  387. .realinode = inode,
  388. };
  389. return iomap_dio_rw(iocb, to, &erofs_iomap_ops,
  390. NULL, 0, &iter_ctx, 0);
  391. }
  392. return filemap_read(iocb, to, 0);
  393. }
  394. /* for uncompressed (aligned) files and raw access for other files */
  395. const struct address_space_operations erofs_aops = {
  396. .read_folio = erofs_read_folio,
  397. .readahead = erofs_readahead,
  398. .bmap = erofs_bmap,
  399. .direct_IO = noop_direct_IO,
  400. .release_folio = iomap_release_folio,
  401. .invalidate_folio = iomap_invalidate_folio,
  402. };
  403. #ifdef CONFIG_FS_DAX
  404. static vm_fault_t erofs_dax_huge_fault(struct vm_fault *vmf,
  405. unsigned int order)
  406. {
  407. return dax_iomap_fault(vmf, order, NULL, NULL, &erofs_iomap_ops);
  408. }
  409. static vm_fault_t erofs_dax_fault(struct vm_fault *vmf)
  410. {
  411. return erofs_dax_huge_fault(vmf, 0);
  412. }
  413. static const struct vm_operations_struct erofs_dax_vm_ops = {
  414. .fault = erofs_dax_fault,
  415. .huge_fault = erofs_dax_huge_fault,
  416. };
  417. static int erofs_file_mmap_prepare(struct vm_area_desc *desc)
  418. {
  419. if (!IS_DAX(file_inode(desc->file)))
  420. return generic_file_readonly_mmap_prepare(desc);
  421. if (vma_desc_test_flags(desc, VMA_SHARED_BIT) &&
  422. vma_desc_test_flags(desc, VMA_MAYWRITE_BIT))
  423. return -EINVAL;
  424. desc->vm_ops = &erofs_dax_vm_ops;
  425. vma_desc_set_flags(desc, VMA_HUGEPAGE_BIT);
  426. return 0;
  427. }
  428. #else
  429. #define erofs_file_mmap_prepare generic_file_readonly_mmap_prepare
  430. #endif
  431. static loff_t erofs_file_llseek(struct file *file, loff_t offset, int whence)
  432. {
  433. struct inode *inode = file->f_mapping->host;
  434. const struct iomap_ops *ops = &erofs_iomap_ops;
  435. if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout)) {
  436. if (!IS_ENABLED(CONFIG_EROFS_FS_ZIP))
  437. return generic_file_llseek(file, offset, whence);
  438. ops = &z_erofs_iomap_report_ops;
  439. }
  440. if (whence == SEEK_HOLE)
  441. offset = iomap_seek_hole(inode, offset, ops);
  442. else if (whence == SEEK_DATA)
  443. offset = iomap_seek_data(inode, offset, ops);
  444. else
  445. return generic_file_llseek(file, offset, whence);
  446. if (offset < 0)
  447. return offset;
  448. return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
  449. }
  450. const struct file_operations erofs_file_fops = {
  451. .llseek = erofs_file_llseek,
  452. .read_iter = erofs_file_read_iter,
  453. .unlocked_ioctl = erofs_ioctl,
  454. #ifdef CONFIG_COMPAT
  455. .compat_ioctl = erofs_compat_ioctl,
  456. #endif
  457. .mmap_prepare = erofs_file_mmap_prepare,
  458. .get_unmapped_area = thp_get_unmapped_area,
  459. .splice_read = filemap_splice_read,
  460. .setlease = generic_setlease,
  461. };