validation.c 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484
  1. // SPDX-License-Identifier: GPL-2.0-or-later
  2. /* vnode and volume validity verification.
  3. *
  4. * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
  5. * Written by David Howells (dhowells@redhat.com)
  6. */
  7. #include <linux/kernel.h>
  8. #include <linux/module.h>
  9. #include <linux/sched.h>
  10. #include "internal.h"
  11. /*
  12. * Data validation is managed through a number of mechanisms from the server:
  13. *
  14. * (1) On first contact with a server (such as if it has just been rebooted),
  15. * the server sends us a CB.InitCallBackState* request.
  16. *
  17. * (2) On a RW volume, in response to certain vnode (inode)-accessing RPC
  18. * calls, the server maintains a time-limited per-vnode promise that it
  19. * will send us a CB.CallBack request if a third party alters the vnodes
  20. * accessed.
  21. *
  22. * Note that vnode-level callbacks may also be sent for other reasons,
  23. * such as filelock release.
  24. *
  25. * (3) On a RO (or Backup) volume, in response to certain vnode-accessing RPC
  26. * calls, each server maintains a time-limited per-volume promise that it
  27. * will send us a CB.CallBack request if the RO volume is updated to a
  28. * snapshot of the RW volume ("vos release"). This is an atomic event
  29. * that cuts over all instances of the RO volume across multiple servers
  30. * simultaneously.
  31. *
  32. * Note that volume-level callbacks may also be sent for other reasons,
  33. * such as the volumeserver taking over control of the volume from the
  34. * fileserver.
  35. *
  36. * Note also that each server maintains an independent time limit on an
  37. * independent callback.
  38. *
  39. * (4) Certain RPC calls include a volume information record "VolSync" in
  40. * their reply. This contains a creation date for the volume that should
  41. * remain unchanged for a RW volume (but will be changed if the volume is
  42. * restored from backup) or will be bumped to the time of snapshotting
  43. * when a RO volume is released.
  44. *
  45. * In order to track these events, the following are provided:
  46. *
  47. * ->cb_v_break. A counter of events that might mean that the contents of
  48. * a volume have been altered since we last checked a vnode.
  49. *
  50. * ->cb_v_check. A counter of the number of events that we've sent a
  51. * query to the server for. Everything's up to date if this equals
  52. * cb_v_break.
  53. *
  54. * ->cb_scrub. A counter of the number of regression events for which we
  55. * have to completely wipe the cache.
  56. *
  57. * ->cb_ro_snapshot. A counter of the number of times that we've
  58. * recognised that a RO volume has been updated.
  59. *
  60. * ->cb_break. A counter of events that might mean that the contents of a
  61. * vnode have been altered.
  62. *
  63. * ->cb_expires_at. The time at which the callback promise expires or
  64. * AFS_NO_CB_PROMISE if we have no promise.
  65. *
  66. * The way we manage things is:
  67. *
  68. * (1) When a volume-level CB.CallBack occurs, we increment ->cb_v_break on
  69. * the volume and reset ->cb_expires_at (ie. set AFS_NO_CB_PROMISE) on the
  70. * volume and volume's server record.
  71. *
  72. * (2) When a CB.InitCallBackState occurs, we treat this as a volume-level
  73. * callback break on all the volumes that have been using that server
  74. * (ie. increment ->cb_v_break and reset ->cb_expires_at).
  75. *
  76. * (3) When a vnode-level CB.CallBack occurs, we increment ->cb_break on the
  77. * vnode and reset its ->cb_expires_at. If the vnode is mmapped, we also
  78. * dispatch a work item to unmap all PTEs to the vnode's pagecache to
  79. * force reentry to the filesystem for revalidation.
  80. *
  81. * (4) When entering the filesystem, we call afs_validate() to check the
  82. * validity of a vnode. This first checks to see if ->cb_v_check and
  83. * ->cb_v_break match, and if they don't, we lock volume->cb_check_lock
  84. * exclusively and perform an FS.FetchStatus on the vnode.
  85. *
  86. * After checking the volume, we check the vnode. If there's a mismatch
  87. * between the volume counters and the vnode's mirrors of those counters,
  88. * we lock vnode->validate_lock and issue an FS.FetchStatus on the vnode.
  89. *
  90. * (5) When the reply from FS.FetchStatus arrives, the VolSync record is
  91. * parsed:
  92. *
  93. * (A) If the Creation timestamp has changed on a RW volume or regressed
  94. * on a RO volume, we try to increment ->cb_scrub; if it advances on a
  95. * RO volume, we assume "vos release" happened and try to increment
  96. * ->cb_ro_snapshot.
  97. *
  98. * (B) If the Update timestamp has regressed, we try to increment
  99. * ->cb_scrub.
  100. *
  101. * Note that in both of these cases, we only do the increment if we can
  102. * cmpxchg the value of the timestamp from the value we noted before the
  103. * op. This tries to prevent parallel ops from fighting one another.
  104. *
  105. * volume->cb_v_check is then set to ->cb_v_break.
  106. *
  107. * (6) The AFSCallBack record included in the FS.FetchStatus reply is also
  108. * parsed and used to set the promise in ->cb_expires_at for the vnode,
  109. * the volume and the volume's server record.
  110. *
  111. * (7) If ->cb_scrub is seen to have advanced, we invalidate the pagecache for
  112. * the vnode.
  113. */
  114. /*
  115. * Check the validity of a vnode/inode and its parent volume.
  116. */
  117. bool afs_check_validity(const struct afs_vnode *vnode)
  118. {
  119. const struct afs_volume *volume = vnode->volume;
  120. enum afs_vnode_invalid_trace trace = afs_vnode_valid_trace;
  121. time64_t cb_expires_at = atomic64_read(&vnode->cb_expires_at);
  122. time64_t deadline = ktime_get_real_seconds() + 10;
  123. if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
  124. return true;
  125. if (atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break))
  126. trace = afs_vnode_invalid_trace_cb_v_break;
  127. else if (cb_expires_at == AFS_NO_CB_PROMISE)
  128. trace = afs_vnode_invalid_trace_no_cb_promise;
  129. else if (cb_expires_at <= deadline)
  130. trace = afs_vnode_invalid_trace_expired;
  131. else if (volume->cb_expires_at <= deadline)
  132. trace = afs_vnode_invalid_trace_vol_expired;
  133. else if (vnode->cb_ro_snapshot != atomic_read(&volume->cb_ro_snapshot))
  134. trace = afs_vnode_invalid_trace_cb_ro_snapshot;
  135. else if (vnode->cb_scrub != atomic_read(&volume->cb_scrub))
  136. trace = afs_vnode_invalid_trace_cb_scrub;
  137. else if (test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags))
  138. trace = afs_vnode_invalid_trace_zap_data;
  139. else
  140. return true;
  141. trace_afs_vnode_invalid(vnode, trace);
  142. return false;
  143. }
  144. /*
  145. * See if the server we've just talked to is currently excluded.
  146. */
  147. static bool __afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume)
  148. {
  149. const struct afs_server_entry *se;
  150. const struct afs_server_list *slist;
  151. bool is_excluded = true;
  152. int i;
  153. rcu_read_lock();
  154. slist = rcu_dereference(volume->servers);
  155. for (i = 0; i < slist->nr_servers; i++) {
  156. se = &slist->servers[i];
  157. if (op->server == se->server) {
  158. is_excluded = test_bit(AFS_SE_EXCLUDED, &se->flags);
  159. break;
  160. }
  161. }
  162. rcu_read_unlock();
  163. return is_excluded;
  164. }
  165. /*
  166. * Update the volume's server list when the creation time changes and see if
  167. * the server we've just talked to is currently excluded.
  168. */
  169. static int afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume)
  170. {
  171. int ret;
  172. if (__afs_is_server_excluded(op, volume))
  173. return 1;
  174. set_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags);
  175. ret = afs_check_volume_status(op->volume, op);
  176. if (ret < 0)
  177. return ret;
  178. return __afs_is_server_excluded(op, volume);
  179. }
  180. /*
  181. * Handle a change to the volume creation time in the VolSync record.
  182. */
  183. static int afs_update_volume_creation_time(struct afs_operation *op, struct afs_volume *volume)
  184. {
  185. unsigned int snap;
  186. time64_t cur = volume->creation_time;
  187. time64_t old = op->pre_volsync.creation;
  188. time64_t new = op->volsync.creation;
  189. int ret;
  190. _enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new);
  191. if (cur == TIME64_MIN) {
  192. volume->creation_time = new;
  193. return 0;
  194. }
  195. if (new == cur)
  196. return 0;
  197. /* Try to advance the creation timestamp from what we had before the
  198. * operation to what we got back from the server. This should
  199. * hopefully ensure that in a race between multiple operations only one
  200. * of them will do this.
  201. */
  202. if (cur != old)
  203. return 0;
  204. /* If the creation time changes in an unexpected way, we need to scrub
  205. * our caches. For a RW vol, this will only change if the volume is
  206. * restored from a backup; for a RO/Backup vol, this will advance when
  207. * the volume is updated to a new snapshot (eg. "vos release").
  208. */
  209. if (volume->type == AFSVL_RWVOL)
  210. goto regressed;
  211. if (volume->type == AFSVL_BACKVOL) {
  212. if (new < old)
  213. goto regressed;
  214. goto advance;
  215. }
  216. /* We have an RO volume, we need to query the VL server and look at the
  217. * server flags to see if RW->RO replication is in progress.
  218. */
  219. ret = afs_is_server_excluded(op, volume);
  220. if (ret < 0)
  221. return ret;
  222. if (ret > 0) {
  223. snap = atomic_read(&volume->cb_ro_snapshot);
  224. trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_volume_excluded);
  225. return ret;
  226. }
  227. advance:
  228. snap = atomic_inc_return(&volume->cb_ro_snapshot);
  229. trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_for_vos_release);
  230. volume->creation_time = new;
  231. return 0;
  232. regressed:
  233. atomic_inc(&volume->cb_scrub);
  234. trace_afs_cb_v_break(volume->vid, 0, afs_cb_break_for_creation_regress);
  235. volume->creation_time = new;
  236. return 0;
  237. }
  238. /*
  239. * Handle a change to the volume update time in the VolSync record.
  240. */
  241. static void afs_update_volume_update_time(struct afs_operation *op, struct afs_volume *volume)
  242. {
  243. enum afs_cb_break_reason reason = afs_cb_break_no_break;
  244. time64_t cur = volume->update_time;
  245. time64_t old = op->pre_volsync.update;
  246. time64_t new = op->volsync.update;
  247. _enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new);
  248. if (cur == TIME64_MIN) {
  249. volume->update_time = new;
  250. return;
  251. }
  252. if (new == cur)
  253. return;
  254. /* If the volume update time changes in an unexpected way, we need to
  255. * scrub our caches. For a RW vol, this will advance on every
  256. * modification op; for a RO/Backup vol, this will advance when the
  257. * volume is updated to a new snapshot (eg. "vos release").
  258. */
  259. if (new < old)
  260. reason = afs_cb_break_for_update_regress;
  261. /* Try to advance the update timestamp from what we had before the
  262. * operation to what we got back from the server. This should
  263. * hopefully ensure that in a race between multiple operations only one
  264. * of them will do this.
  265. */
  266. if (cur == old) {
  267. if (reason == afs_cb_break_for_update_regress) {
  268. atomic_inc(&volume->cb_scrub);
  269. trace_afs_cb_v_break(volume->vid, 0, reason);
  270. }
  271. volume->update_time = new;
  272. }
  273. }
  274. static int afs_update_volume_times(struct afs_operation *op, struct afs_volume *volume)
  275. {
  276. int ret = 0;
  277. if (likely(op->volsync.creation == volume->creation_time &&
  278. op->volsync.update == volume->update_time))
  279. return 0;
  280. mutex_lock(&volume->volsync_lock);
  281. if (op->volsync.creation != volume->creation_time) {
  282. ret = afs_update_volume_creation_time(op, volume);
  283. if (ret < 0)
  284. goto out;
  285. }
  286. if (op->volsync.update != volume->update_time)
  287. afs_update_volume_update_time(op, volume);
  288. out:
  289. mutex_unlock(&volume->volsync_lock);
  290. return ret;
  291. }
  292. /*
  293. * Update the state of a volume, including recording the expiration time of the
  294. * callback promise. Returns 1 to redo the operation from the start.
  295. */
  296. int afs_update_volume_state(struct afs_operation *op)
  297. {
  298. struct afs_server_list *slist = op->server_list;
  299. struct afs_server_entry *se = &slist->servers[op->server_index];
  300. struct afs_callback *cb = &op->file[0].scb.callback;
  301. struct afs_volume *volume = op->volume;
  302. unsigned int cb_v_break = atomic_read(&volume->cb_v_break);
  303. unsigned int cb_v_check = atomic_read(&volume->cb_v_check);
  304. int ret;
  305. _enter("%llx", op->volume->vid);
  306. if (op->volsync.creation != TIME64_MIN || op->volsync.update != TIME64_MIN) {
  307. ret = afs_update_volume_times(op, volume);
  308. if (ret != 0) {
  309. _leave(" = %d", ret);
  310. return ret;
  311. }
  312. }
  313. if (op->cb_v_break == cb_v_break &&
  314. (op->file[0].scb.have_cb || op->file[1].scb.have_cb)) {
  315. time64_t expires_at = cb->expires_at;
  316. if (!op->file[0].scb.have_cb)
  317. expires_at = op->file[1].scb.callback.expires_at;
  318. se->cb_expires_at = expires_at;
  319. volume->cb_expires_at = expires_at;
  320. }
  321. if (cb_v_check < op->cb_v_break)
  322. atomic_cmpxchg(&volume->cb_v_check, cb_v_check, op->cb_v_break);
  323. return 0;
  324. }
  325. /*
  326. * mark the data attached to an inode as obsolete due to a write on the server
  327. * - might also want to ditch all the outstanding writes and dirty pages
  328. */
  329. static void afs_zap_data(struct afs_vnode *vnode)
  330. {
  331. _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode);
  332. afs_invalidate_cache(vnode, 0);
  333. /* nuke all the non-dirty pages that aren't locked, mapped or being
  334. * written back in a regular file and completely discard the pages in a
  335. * directory or symlink */
  336. if (S_ISREG(vnode->netfs.inode.i_mode))
  337. filemap_invalidate_inode(&vnode->netfs.inode, true, 0, LLONG_MAX);
  338. else
  339. filemap_invalidate_inode(&vnode->netfs.inode, false, 0, LLONG_MAX);
  340. }
  341. /*
  342. * validate a vnode/inode
  343. * - there are several things we need to check
  344. * - parent dir data changes (rm, rmdir, rename, mkdir, create, link,
  345. * symlink)
  346. * - parent dir metadata changed (security changes)
  347. * - dentry data changed (write, truncate)
  348. * - dentry metadata changed (security changes)
  349. */
  350. int afs_validate(struct afs_vnode *vnode, struct key *key)
  351. {
  352. struct afs_volume *volume = vnode->volume;
  353. unsigned int cb_ro_snapshot, cb_scrub;
  354. time64_t deadline = ktime_get_real_seconds() + 10;
  355. bool zap = false, locked_vol = false;
  356. int ret;
  357. _enter("{v={%llx:%llu} fl=%lx},%x",
  358. vnode->fid.vid, vnode->fid.vnode, vnode->flags,
  359. key_serial(key));
  360. if (afs_check_validity(vnode))
  361. return test_bit(AFS_VNODE_DELETED, &vnode->flags) ? -ESTALE : 0;
  362. ret = down_write_killable(&vnode->validate_lock);
  363. if (ret < 0)
  364. goto error;
  365. if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
  366. ret = -ESTALE;
  367. goto error_unlock;
  368. }
  369. /* Validate a volume after the v_break has changed or the volume
  370. * callback expired. We only want to do this once per volume per
  371. * v_break change. The actual work will be done when parsing the
  372. * status fetch reply.
  373. */
  374. if (volume->cb_expires_at <= deadline ||
  375. atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break)) {
  376. ret = mutex_lock_interruptible(&volume->cb_check_lock);
  377. if (ret < 0)
  378. goto error_unlock;
  379. locked_vol = true;
  380. }
  381. cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot);
  382. cb_scrub = atomic_read(&volume->cb_scrub);
  383. if (vnode->cb_ro_snapshot != cb_ro_snapshot ||
  384. vnode->cb_scrub != cb_scrub)
  385. unmap_mapping_pages(vnode->netfs.inode.i_mapping, 0, 0, false);
  386. if (vnode->cb_ro_snapshot != cb_ro_snapshot ||
  387. vnode->cb_scrub != cb_scrub ||
  388. volume->cb_expires_at <= deadline ||
  389. atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) ||
  390. atomic64_read(&vnode->cb_expires_at) <= deadline
  391. ) {
  392. ret = afs_fetch_status(vnode, key, false, NULL);
  393. if (ret < 0) {
  394. if (ret == -ENOENT) {
  395. set_bit(AFS_VNODE_DELETED, &vnode->flags);
  396. ret = -ESTALE;
  397. }
  398. goto error_unlock;
  399. }
  400. _debug("new promise [fl=%lx]", vnode->flags);
  401. }
  402. /* We can drop the volume lock now as. */
  403. if (locked_vol) {
  404. mutex_unlock(&volume->cb_check_lock);
  405. locked_vol = false;
  406. }
  407. cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot);
  408. cb_scrub = atomic_read(&volume->cb_scrub);
  409. _debug("vnode inval %x==%x %x==%x",
  410. vnode->cb_ro_snapshot, cb_ro_snapshot,
  411. vnode->cb_scrub, cb_scrub);
  412. if (vnode->cb_scrub != cb_scrub)
  413. zap = true;
  414. vnode->cb_ro_snapshot = cb_ro_snapshot;
  415. vnode->cb_scrub = cb_scrub;
  416. /* if the vnode's data version number changed then its contents are
  417. * different */
  418. zap |= test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
  419. if (zap)
  420. afs_zap_data(vnode);
  421. up_write(&vnode->validate_lock);
  422. _leave(" = 0");
  423. return 0;
  424. error_unlock:
  425. if (locked_vol)
  426. mutex_unlock(&volume->cb_check_lock);
  427. up_write(&vnode->validate_lock);
  428. error:
  429. _leave(" = %d", ret);
  430. return ret;
  431. }