recover.c 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /******************************************************************************
  3. *******************************************************************************
  4. **
  5. ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
  6. ** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
  7. **
  8. **
  9. *******************************************************************************
  10. ******************************************************************************/
  11. #include "dlm_internal.h"
  12. #include "lockspace.h"
  13. #include "dir.h"
  14. #include "config.h"
  15. #include "ast.h"
  16. #include "memory.h"
  17. #include "rcom.h"
  18. #include "lock.h"
  19. #include "lowcomms.h"
  20. #include "member.h"
  21. #include "recover.h"
  22. /*
  23. * Recovery waiting routines: these functions wait for a particular reply from
  24. * a remote node, or for the remote node to report a certain status. They need
  25. * to abort if the lockspace is stopped indicating a node has failed (perhaps
  26. * the one being waited for).
  27. */
  28. /*
  29. * Wait until given function returns non-zero or lockspace is stopped
  30. * (LS_RECOVERY_STOP set due to failure of a node in ls_nodes). When another
  31. * function thinks it could have completed the waited-on task, they should wake
  32. * up ls_wait_general to get an immediate response rather than waiting for the
  33. * timeout. This uses a timeout so it can check periodically if the wait
  34. * should abort due to node failure (which doesn't cause a wake_up).
  35. * This should only be called by the dlm_recoverd thread.
  36. */
  37. int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls))
  38. {
  39. int error = 0;
  40. int rv;
  41. while (1) {
  42. rv = wait_event_timeout(ls->ls_wait_general,
  43. testfn(ls) || dlm_recovery_stopped(ls),
  44. dlm_config.ci_recover_timer * HZ);
  45. if (rv)
  46. break;
  47. if (test_bit(LSFL_RCOM_WAIT, &ls->ls_flags)) {
  48. log_debug(ls, "dlm_wait_function timed out");
  49. return -ETIMEDOUT;
  50. }
  51. }
  52. if (dlm_recovery_stopped(ls)) {
  53. log_debug(ls, "dlm_wait_function aborted");
  54. error = -EINTR;
  55. }
  56. return error;
  57. }
  58. /*
  59. * An efficient way for all nodes to wait for all others to have a certain
  60. * status. The node with the lowest nodeid polls all the others for their
  61. * status (wait_status_all) and all the others poll the node with the low id
  62. * for its accumulated result (wait_status_low). When all nodes have set
  63. * status flag X, then status flag X_ALL will be set on the low nodeid.
  64. */
  65. uint32_t dlm_recover_status(struct dlm_ls *ls)
  66. {
  67. uint32_t status;
  68. spin_lock_bh(&ls->ls_recover_lock);
  69. status = ls->ls_recover_status;
  70. spin_unlock_bh(&ls->ls_recover_lock);
  71. return status;
  72. }
  73. static void _set_recover_status(struct dlm_ls *ls, uint32_t status)
  74. {
  75. ls->ls_recover_status |= status;
  76. }
  77. void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status)
  78. {
  79. spin_lock_bh(&ls->ls_recover_lock);
  80. _set_recover_status(ls, status);
  81. spin_unlock_bh(&ls->ls_recover_lock);
  82. }
  83. static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status,
  84. int save_slots, uint64_t seq)
  85. {
  86. struct dlm_rcom *rc = ls->ls_recover_buf;
  87. struct dlm_member *memb;
  88. int error = 0, delay;
  89. list_for_each_entry(memb, &ls->ls_nodes, list) {
  90. delay = 0;
  91. for (;;) {
  92. if (dlm_recovery_stopped(ls)) {
  93. error = -EINTR;
  94. goto out;
  95. }
  96. error = dlm_rcom_status(ls, memb->nodeid, 0, seq);
  97. if (error)
  98. goto out;
  99. if (save_slots)
  100. dlm_slot_save(ls, rc, memb);
  101. if (le32_to_cpu(rc->rc_result) & wait_status)
  102. break;
  103. if (delay < 1000)
  104. delay += 20;
  105. msleep(delay);
  106. }
  107. }
  108. out:
  109. return error;
  110. }
  111. static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status,
  112. uint32_t status_flags, uint64_t seq)
  113. {
  114. struct dlm_rcom *rc = ls->ls_recover_buf;
  115. int error = 0, delay = 0, nodeid = ls->ls_low_nodeid;
  116. for (;;) {
  117. if (dlm_recovery_stopped(ls)) {
  118. error = -EINTR;
  119. goto out;
  120. }
  121. error = dlm_rcom_status(ls, nodeid, status_flags, seq);
  122. if (error)
  123. break;
  124. if (le32_to_cpu(rc->rc_result) & wait_status)
  125. break;
  126. if (delay < 1000)
  127. delay += 20;
  128. msleep(delay);
  129. }
  130. out:
  131. return error;
  132. }
  133. static int wait_status(struct dlm_ls *ls, uint32_t status, uint64_t seq)
  134. {
  135. uint32_t status_all = status << 1;
  136. int error;
  137. if (ls->ls_low_nodeid == dlm_our_nodeid()) {
  138. error = wait_status_all(ls, status, 0, seq);
  139. if (!error)
  140. dlm_set_recover_status(ls, status_all);
  141. } else
  142. error = wait_status_low(ls, status_all, 0, seq);
  143. return error;
  144. }
  145. int dlm_recover_members_wait(struct dlm_ls *ls, uint64_t seq)
  146. {
  147. struct dlm_member *memb;
  148. struct dlm_slot *slots;
  149. int num_slots, slots_size;
  150. int error, rv;
  151. uint32_t gen;
  152. list_for_each_entry(memb, &ls->ls_nodes, list) {
  153. memb->slot = -1;
  154. memb->generation = 0;
  155. }
  156. if (ls->ls_low_nodeid == dlm_our_nodeid()) {
  157. error = wait_status_all(ls, DLM_RS_NODES, 1, seq);
  158. if (error)
  159. goto out;
  160. /* slots array is sparse, slots_size may be > num_slots */
  161. rv = dlm_slots_assign(ls, &num_slots, &slots_size, &slots, &gen);
  162. if (!rv) {
  163. spin_lock_bh(&ls->ls_recover_lock);
  164. _set_recover_status(ls, DLM_RS_NODES_ALL);
  165. ls->ls_num_slots = num_slots;
  166. ls->ls_slots_size = slots_size;
  167. ls->ls_slots = slots;
  168. ls->ls_generation = gen;
  169. spin_unlock_bh(&ls->ls_recover_lock);
  170. } else {
  171. dlm_set_recover_status(ls, DLM_RS_NODES_ALL);
  172. }
  173. } else {
  174. error = wait_status_low(ls, DLM_RS_NODES_ALL,
  175. DLM_RSF_NEED_SLOTS, seq);
  176. if (error)
  177. goto out;
  178. dlm_slots_copy_in(ls);
  179. }
  180. out:
  181. return error;
  182. }
  183. int dlm_recover_directory_wait(struct dlm_ls *ls, uint64_t seq)
  184. {
  185. return wait_status(ls, DLM_RS_DIR, seq);
  186. }
  187. int dlm_recover_locks_wait(struct dlm_ls *ls, uint64_t seq)
  188. {
  189. return wait_status(ls, DLM_RS_LOCKS, seq);
  190. }
  191. int dlm_recover_done_wait(struct dlm_ls *ls, uint64_t seq)
  192. {
  193. return wait_status(ls, DLM_RS_DONE, seq);
  194. }
  195. /*
  196. * The recover_list contains all the rsb's for which we've requested the new
  197. * master nodeid. As replies are returned from the resource directories the
  198. * rsb's are removed from the list. When the list is empty we're done.
  199. *
  200. * The recover_list is later similarly used for all rsb's for which we've sent
  201. * new lkb's and need to receive new corresponding lkid's.
  202. *
  203. * We use the address of the rsb struct as a simple local identifier for the
  204. * rsb so we can match an rcom reply with the rsb it was sent for.
  205. */
  206. static int recover_list_empty(struct dlm_ls *ls)
  207. {
  208. int empty;
  209. spin_lock_bh(&ls->ls_recover_list_lock);
  210. empty = list_empty(&ls->ls_recover_list);
  211. spin_unlock_bh(&ls->ls_recover_list_lock);
  212. return empty;
  213. }
  214. static void recover_list_add(struct dlm_rsb *r)
  215. {
  216. struct dlm_ls *ls = r->res_ls;
  217. spin_lock_bh(&ls->ls_recover_list_lock);
  218. if (list_empty(&r->res_recover_list)) {
  219. list_add_tail(&r->res_recover_list, &ls->ls_recover_list);
  220. ls->ls_recover_list_count++;
  221. dlm_hold_rsb(r);
  222. }
  223. spin_unlock_bh(&ls->ls_recover_list_lock);
  224. }
  225. static void recover_list_del(struct dlm_rsb *r)
  226. {
  227. struct dlm_ls *ls = r->res_ls;
  228. spin_lock_bh(&ls->ls_recover_list_lock);
  229. list_del_init(&r->res_recover_list);
  230. ls->ls_recover_list_count--;
  231. spin_unlock_bh(&ls->ls_recover_list_lock);
  232. dlm_put_rsb(r);
  233. }
  234. static void recover_list_clear(struct dlm_ls *ls)
  235. {
  236. struct dlm_rsb *r, *s;
  237. spin_lock_bh(&ls->ls_recover_list_lock);
  238. list_for_each_entry_safe(r, s, &ls->ls_recover_list, res_recover_list) {
  239. list_del_init(&r->res_recover_list);
  240. r->res_recover_locks_count = 0;
  241. dlm_put_rsb(r);
  242. ls->ls_recover_list_count--;
  243. }
  244. if (ls->ls_recover_list_count != 0) {
  245. log_error(ls, "warning: recover_list_count %d",
  246. ls->ls_recover_list_count);
  247. ls->ls_recover_list_count = 0;
  248. }
  249. spin_unlock_bh(&ls->ls_recover_list_lock);
  250. }
  251. static int recover_xa_empty(struct dlm_ls *ls)
  252. {
  253. int empty = 1;
  254. spin_lock_bh(&ls->ls_recover_xa_lock);
  255. if (ls->ls_recover_list_count)
  256. empty = 0;
  257. spin_unlock_bh(&ls->ls_recover_xa_lock);
  258. return empty;
  259. }
  260. static int recover_xa_add(struct dlm_rsb *r)
  261. {
  262. struct dlm_ls *ls = r->res_ls;
  263. struct xa_limit limit = {
  264. .min = 1,
  265. .max = UINT_MAX,
  266. };
  267. uint32_t id;
  268. int rv;
  269. spin_lock_bh(&ls->ls_recover_xa_lock);
  270. if (r->res_id) {
  271. rv = -1;
  272. goto out_unlock;
  273. }
  274. rv = xa_alloc(&ls->ls_recover_xa, &id, r, limit, GFP_ATOMIC);
  275. if (rv < 0)
  276. goto out_unlock;
  277. r->res_id = id;
  278. ls->ls_recover_list_count++;
  279. dlm_hold_rsb(r);
  280. rv = 0;
  281. out_unlock:
  282. spin_unlock_bh(&ls->ls_recover_xa_lock);
  283. return rv;
  284. }
  285. static void recover_xa_del(struct dlm_rsb *r)
  286. {
  287. struct dlm_ls *ls = r->res_ls;
  288. spin_lock_bh(&ls->ls_recover_xa_lock);
  289. xa_erase_bh(&ls->ls_recover_xa, r->res_id);
  290. r->res_id = 0;
  291. ls->ls_recover_list_count--;
  292. spin_unlock_bh(&ls->ls_recover_xa_lock);
  293. dlm_put_rsb(r);
  294. }
  295. static struct dlm_rsb *recover_xa_find(struct dlm_ls *ls, uint64_t id)
  296. {
  297. struct dlm_rsb *r;
  298. spin_lock_bh(&ls->ls_recover_xa_lock);
  299. r = xa_load(&ls->ls_recover_xa, (int)id);
  300. spin_unlock_bh(&ls->ls_recover_xa_lock);
  301. return r;
  302. }
  303. static void recover_xa_clear(struct dlm_ls *ls)
  304. {
  305. struct dlm_rsb *r;
  306. unsigned long id;
  307. spin_lock_bh(&ls->ls_recover_xa_lock);
  308. xa_for_each(&ls->ls_recover_xa, id, r) {
  309. xa_erase_bh(&ls->ls_recover_xa, id);
  310. r->res_id = 0;
  311. r->res_recover_locks_count = 0;
  312. ls->ls_recover_list_count--;
  313. dlm_put_rsb(r);
  314. }
  315. if (ls->ls_recover_list_count != 0) {
  316. log_error(ls, "warning: recover_list_count %d",
  317. ls->ls_recover_list_count);
  318. ls->ls_recover_list_count = 0;
  319. }
  320. spin_unlock_bh(&ls->ls_recover_xa_lock);
  321. }
  322. /* Master recovery: find new master node for rsb's that were
  323. mastered on nodes that have been removed.
  324. dlm_recover_masters
  325. recover_master
  326. dlm_send_rcom_lookup -> receive_rcom_lookup
  327. dlm_dir_lookup
  328. receive_rcom_lookup_reply <-
  329. dlm_recover_master_reply
  330. set_new_master
  331. set_master_lkbs
  332. set_lock_master
  333. */
  334. /*
  335. * Set the lock master for all LKBs in a lock queue
  336. * If we are the new master of the rsb, we may have received new
  337. * MSTCPY locks from other nodes already which we need to ignore
  338. * when setting the new nodeid.
  339. */
  340. static void set_lock_master(struct list_head *queue, int nodeid)
  341. {
  342. struct dlm_lkb *lkb;
  343. list_for_each_entry(lkb, queue, lkb_statequeue) {
  344. if (!test_bit(DLM_IFL_MSTCPY_BIT, &lkb->lkb_iflags)) {
  345. lkb->lkb_nodeid = nodeid;
  346. lkb->lkb_remid = 0;
  347. }
  348. }
  349. }
  350. static void set_master_lkbs(struct dlm_rsb *r)
  351. {
  352. set_lock_master(&r->res_grantqueue, r->res_nodeid);
  353. set_lock_master(&r->res_convertqueue, r->res_nodeid);
  354. set_lock_master(&r->res_waitqueue, r->res_nodeid);
  355. }
  356. /*
  357. * Propagate the new master nodeid to locks
  358. * The NEW_MASTER flag tells dlm_recover_locks() which rsb's to consider.
  359. * The NEW_MASTER2 flag tells recover_lvb() and recover_grant() which
  360. * rsb's to consider.
  361. */
  362. static void set_new_master(struct dlm_rsb *r)
  363. {
  364. set_master_lkbs(r);
  365. rsb_set_flag(r, RSB_NEW_MASTER);
  366. rsb_set_flag(r, RSB_NEW_MASTER2);
  367. }
  368. /*
  369. * We do async lookups on rsb's that need new masters. The rsb's
  370. * waiting for a lookup reply are kept on the recover_list.
  371. *
  372. * Another node recovering the master may have sent us a rcom lookup,
  373. * and our dlm_master_lookup() set it as the new master, along with
  374. * NEW_MASTER so that we'll recover it here (this implies dir_nodeid
  375. * equals our_nodeid below).
  376. */
  377. static int recover_master(struct dlm_rsb *r, unsigned int *count, uint64_t seq)
  378. {
  379. struct dlm_ls *ls = r->res_ls;
  380. int our_nodeid, dir_nodeid;
  381. int is_removed = 0;
  382. int error;
  383. if (r->res_nodeid != -1 && is_master(r))
  384. return 0;
  385. if (r->res_nodeid != -1)
  386. is_removed = dlm_is_removed(ls, r->res_nodeid);
  387. if (!is_removed && !rsb_flag(r, RSB_NEW_MASTER))
  388. return 0;
  389. our_nodeid = dlm_our_nodeid();
  390. dir_nodeid = dlm_dir_nodeid(r);
  391. if (dir_nodeid == our_nodeid) {
  392. if (is_removed) {
  393. r->res_master_nodeid = our_nodeid;
  394. r->res_nodeid = 0;
  395. }
  396. /* set master of lkbs to ourself when is_removed, or to
  397. another new master which we set along with NEW_MASTER
  398. in dlm_master_lookup */
  399. set_new_master(r);
  400. error = 0;
  401. } else {
  402. recover_xa_add(r);
  403. error = dlm_send_rcom_lookup(r, dir_nodeid, seq);
  404. }
  405. (*count)++;
  406. return error;
  407. }
  408. /*
  409. * All MSTCPY locks are purged and rebuilt, even if the master stayed the same.
  410. * This is necessary because recovery can be started, aborted and restarted,
  411. * causing the master nodeid to briefly change during the aborted recovery, and
  412. * change back to the original value in the second recovery. The MSTCPY locks
  413. * may or may not have been purged during the aborted recovery. Another node
  414. * with an outstanding request in waiters list and a request reply saved in the
  415. * requestqueue, cannot know whether it should ignore the reply and resend the
  416. * request, or accept the reply and complete the request. It must do the
  417. * former if the remote node purged MSTCPY locks, and it must do the later if
  418. * the remote node did not. This is solved by always purging MSTCPY locks, in
  419. * which case, the request reply would always be ignored and the request
  420. * resent.
  421. */
  422. static int recover_master_static(struct dlm_rsb *r, unsigned int *count)
  423. {
  424. int dir_nodeid = dlm_dir_nodeid(r);
  425. int new_master = dir_nodeid;
  426. if (dir_nodeid == dlm_our_nodeid())
  427. new_master = 0;
  428. dlm_purge_mstcpy_locks(r);
  429. r->res_master_nodeid = dir_nodeid;
  430. r->res_nodeid = new_master;
  431. set_new_master(r);
  432. (*count)++;
  433. return 0;
  434. }
  435. /*
  436. * Go through local root resources and for each rsb which has a master which
  437. * has departed, get the new master nodeid from the directory. The dir will
  438. * assign mastery to the first node to look up the new master. That means
  439. * we'll discover in this lookup if we're the new master of any rsb's.
  440. *
  441. * We fire off all the dir lookup requests individually and asynchronously to
  442. * the correct dir node.
  443. */
  444. int dlm_recover_masters(struct dlm_ls *ls, uint64_t seq,
  445. const struct list_head *root_list)
  446. {
  447. struct dlm_rsb *r;
  448. unsigned int total = 0;
  449. unsigned int count = 0;
  450. int nodir = dlm_no_directory(ls);
  451. int error;
  452. log_rinfo(ls, "dlm_recover_masters");
  453. list_for_each_entry(r, root_list, res_root_list) {
  454. if (dlm_recovery_stopped(ls)) {
  455. error = -EINTR;
  456. goto out;
  457. }
  458. lock_rsb(r);
  459. if (nodir)
  460. error = recover_master_static(r, &count);
  461. else
  462. error = recover_master(r, &count, seq);
  463. unlock_rsb(r);
  464. cond_resched();
  465. total++;
  466. if (error)
  467. goto out;
  468. }
  469. log_rinfo(ls, "dlm_recover_masters %u of %u", count, total);
  470. error = dlm_wait_function(ls, &recover_xa_empty);
  471. out:
  472. if (error)
  473. recover_xa_clear(ls);
  474. return error;
  475. }
  476. int dlm_recover_master_reply(struct dlm_ls *ls, const struct dlm_rcom *rc)
  477. {
  478. struct dlm_rsb *r;
  479. int ret_nodeid, new_master;
  480. r = recover_xa_find(ls, le64_to_cpu(rc->rc_id));
  481. if (!r) {
  482. log_error(ls, "dlm_recover_master_reply no id %llx",
  483. (unsigned long long)le64_to_cpu(rc->rc_id));
  484. goto out;
  485. }
  486. ret_nodeid = le32_to_cpu(rc->rc_result);
  487. if (ret_nodeid == dlm_our_nodeid())
  488. new_master = 0;
  489. else
  490. new_master = ret_nodeid;
  491. lock_rsb(r);
  492. r->res_master_nodeid = ret_nodeid;
  493. r->res_nodeid = new_master;
  494. set_new_master(r);
  495. unlock_rsb(r);
  496. recover_xa_del(r);
  497. if (recover_xa_empty(ls))
  498. wake_up(&ls->ls_wait_general);
  499. out:
  500. return 0;
  501. }
  502. /* Lock recovery: rebuild the process-copy locks we hold on a
  503. remastered rsb on the new rsb master.
  504. dlm_recover_locks
  505. recover_locks
  506. recover_locks_queue
  507. dlm_send_rcom_lock -> receive_rcom_lock
  508. dlm_recover_master_copy
  509. receive_rcom_lock_reply <-
  510. dlm_recover_process_copy
  511. */
  512. /*
  513. * keep a count of the number of lkb's we send to the new master; when we get
  514. * an equal number of replies then recovery for the rsb is done
  515. */
  516. static int recover_locks_queue(struct dlm_rsb *r, struct list_head *head,
  517. uint64_t seq)
  518. {
  519. struct dlm_lkb *lkb;
  520. int error = 0;
  521. list_for_each_entry(lkb, head, lkb_statequeue) {
  522. error = dlm_send_rcom_lock(r, lkb, seq);
  523. if (error)
  524. break;
  525. r->res_recover_locks_count++;
  526. }
  527. return error;
  528. }
  529. static int recover_locks(struct dlm_rsb *r, uint64_t seq)
  530. {
  531. int error = 0;
  532. lock_rsb(r);
  533. DLM_ASSERT(!r->res_recover_locks_count, dlm_dump_rsb(r););
  534. error = recover_locks_queue(r, &r->res_grantqueue, seq);
  535. if (error)
  536. goto out;
  537. error = recover_locks_queue(r, &r->res_convertqueue, seq);
  538. if (error)
  539. goto out;
  540. error = recover_locks_queue(r, &r->res_waitqueue, seq);
  541. if (error)
  542. goto out;
  543. if (r->res_recover_locks_count)
  544. recover_list_add(r);
  545. else
  546. rsb_clear_flag(r, RSB_NEW_MASTER);
  547. out:
  548. unlock_rsb(r);
  549. return error;
  550. }
  551. int dlm_recover_locks(struct dlm_ls *ls, uint64_t seq,
  552. const struct list_head *root_list)
  553. {
  554. struct dlm_rsb *r;
  555. int error, count = 0;
  556. list_for_each_entry(r, root_list, res_root_list) {
  557. if (r->res_nodeid != -1 && is_master(r)) {
  558. rsb_clear_flag(r, RSB_NEW_MASTER);
  559. continue;
  560. }
  561. if (!rsb_flag(r, RSB_NEW_MASTER))
  562. continue;
  563. if (dlm_recovery_stopped(ls)) {
  564. error = -EINTR;
  565. goto out;
  566. }
  567. error = recover_locks(r, seq);
  568. if (error)
  569. goto out;
  570. count += r->res_recover_locks_count;
  571. }
  572. log_rinfo(ls, "dlm_recover_locks %d out", count);
  573. error = dlm_wait_function(ls, &recover_list_empty);
  574. out:
  575. if (error)
  576. recover_list_clear(ls);
  577. return error;
  578. }
  579. void dlm_recovered_lock(struct dlm_rsb *r)
  580. {
  581. DLM_ASSERT(rsb_flag(r, RSB_NEW_MASTER), dlm_dump_rsb(r););
  582. r->res_recover_locks_count--;
  583. if (!r->res_recover_locks_count) {
  584. rsb_clear_flag(r, RSB_NEW_MASTER);
  585. recover_list_del(r);
  586. }
  587. if (recover_list_empty(r->res_ls))
  588. wake_up(&r->res_ls->ls_wait_general);
  589. }
  590. /*
  591. * The lvb needs to be recovered on all master rsb's. This includes setting
  592. * the VALNOTVALID flag if necessary, and determining the correct lvb contents
  593. * based on the lvb's of the locks held on the rsb.
  594. *
  595. * RSB_VALNOTVALID is set in two cases:
  596. *
  597. * 1. we are master, but not new, and we purged an EX/PW lock held by a
  598. * failed node (in dlm_recover_purge which set RSB_RECOVER_LVB_INVAL)
  599. *
  600. * 2. we are a new master, and there are only NL/CR locks left.
  601. * (We could probably improve this by only invaliding in this way when
  602. * the previous master left uncleanly. VMS docs mention that.)
  603. *
  604. * The LVB contents are only considered for changing when this is a new master
  605. * of the rsb (NEW_MASTER2). Then, the rsb's lvb is taken from any lkb with
  606. * mode > CR. If no lkb's exist with mode above CR, the lvb contents are taken
  607. * from the lkb with the largest lvb sequence number.
  608. */
  609. static void recover_lvb(struct dlm_rsb *r)
  610. {
  611. struct dlm_lkb *big_lkb = NULL, *iter, *high_lkb = NULL;
  612. uint32_t high_seq = 0;
  613. int lock_lvb_exists = 0;
  614. int lvblen = r->res_ls->ls_lvblen;
  615. if (!rsb_flag(r, RSB_NEW_MASTER2) &&
  616. rsb_flag(r, RSB_RECOVER_LVB_INVAL)) {
  617. /* case 1 above */
  618. rsb_set_flag(r, RSB_VALNOTVALID);
  619. return;
  620. }
  621. if (!rsb_flag(r, RSB_NEW_MASTER2))
  622. return;
  623. /* we are the new master, so figure out if VALNOTVALID should
  624. be set, and set the rsb lvb from the best lkb available. */
  625. list_for_each_entry(iter, &r->res_grantqueue, lkb_statequeue) {
  626. if (!(iter->lkb_exflags & DLM_LKF_VALBLK))
  627. continue;
  628. lock_lvb_exists = 1;
  629. if (iter->lkb_grmode > DLM_LOCK_CR) {
  630. big_lkb = iter;
  631. goto setflag;
  632. }
  633. if (((int)iter->lkb_lvbseq - (int)high_seq) >= 0) {
  634. high_lkb = iter;
  635. high_seq = iter->lkb_lvbseq;
  636. }
  637. }
  638. list_for_each_entry(iter, &r->res_convertqueue, lkb_statequeue) {
  639. if (!(iter->lkb_exflags & DLM_LKF_VALBLK))
  640. continue;
  641. lock_lvb_exists = 1;
  642. if (iter->lkb_grmode > DLM_LOCK_CR) {
  643. big_lkb = iter;
  644. goto setflag;
  645. }
  646. if (((int)iter->lkb_lvbseq - (int)high_seq) >= 0) {
  647. high_lkb = iter;
  648. high_seq = iter->lkb_lvbseq;
  649. }
  650. }
  651. setflag:
  652. if (!lock_lvb_exists)
  653. goto out;
  654. /* lvb is invalidated if only NL/CR locks remain */
  655. if (!big_lkb)
  656. rsb_set_flag(r, RSB_VALNOTVALID);
  657. if (!r->res_lvbptr) {
  658. r->res_lvbptr = dlm_allocate_lvb(r->res_ls);
  659. if (!r->res_lvbptr)
  660. goto out;
  661. }
  662. if (big_lkb) {
  663. r->res_lvbseq = big_lkb->lkb_lvbseq;
  664. memcpy(r->res_lvbptr, big_lkb->lkb_lvbptr, lvblen);
  665. } else if (high_lkb) {
  666. r->res_lvbseq = high_lkb->lkb_lvbseq;
  667. memcpy(r->res_lvbptr, high_lkb->lkb_lvbptr, lvblen);
  668. } else {
  669. r->res_lvbseq = 0;
  670. memset(r->res_lvbptr, 0, lvblen);
  671. }
  672. out:
  673. return;
  674. }
  675. /* All master rsb's flagged RECOVER_CONVERT need to be looked at. The locks
  676. * converting PR->CW or CW->PR may need to have their lkb_grmode changed.
  677. */
  678. static void recover_conversion(struct dlm_rsb *r)
  679. {
  680. struct dlm_ls *ls = r->res_ls;
  681. uint32_t other_lkid = 0;
  682. int other_grmode = -1;
  683. struct dlm_lkb *lkb;
  684. list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
  685. if (lkb->lkb_grmode == DLM_LOCK_PR ||
  686. lkb->lkb_grmode == DLM_LOCK_CW) {
  687. other_grmode = lkb->lkb_grmode;
  688. other_lkid = lkb->lkb_id;
  689. break;
  690. }
  691. }
  692. if (other_grmode == -1)
  693. return;
  694. list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
  695. /* Lock recovery created incompatible granted modes, so
  696. * change the granted mode of the converting lock to
  697. * NL. The rqmode of the converting lock should be CW,
  698. * which means the converting lock should be granted at
  699. * the end of recovery.
  700. */
  701. if (((lkb->lkb_grmode == DLM_LOCK_PR) && (other_grmode == DLM_LOCK_CW)) ||
  702. ((lkb->lkb_grmode == DLM_LOCK_CW) && (other_grmode == DLM_LOCK_PR))) {
  703. log_rinfo(ls, "%s %x gr %d rq %d, remote %d %x, other_lkid %u, other gr %d, set gr=NL",
  704. __func__, lkb->lkb_id, lkb->lkb_grmode,
  705. lkb->lkb_rqmode, lkb->lkb_nodeid,
  706. lkb->lkb_remid, other_lkid, other_grmode);
  707. lkb->lkb_grmode = DLM_LOCK_NL;
  708. }
  709. }
  710. }
  711. /* We've become the new master for this rsb and waiting/converting locks may
  712. need to be granted in dlm_recover_grant() due to locks that may have
  713. existed from a removed node. */
  714. static void recover_grant(struct dlm_rsb *r)
  715. {
  716. if (!list_empty(&r->res_waitqueue) || !list_empty(&r->res_convertqueue))
  717. rsb_set_flag(r, RSB_RECOVER_GRANT);
  718. }
  719. void dlm_recover_rsbs(struct dlm_ls *ls, const struct list_head *root_list)
  720. {
  721. struct dlm_rsb *r;
  722. unsigned int count = 0;
  723. list_for_each_entry(r, root_list, res_root_list) {
  724. lock_rsb(r);
  725. if (r->res_nodeid != -1 && is_master(r)) {
  726. if (rsb_flag(r, RSB_RECOVER_CONVERT))
  727. recover_conversion(r);
  728. /* recover lvb before granting locks so the updated
  729. lvb/VALNOTVALID is presented in the completion */
  730. recover_lvb(r);
  731. if (rsb_flag(r, RSB_NEW_MASTER2))
  732. recover_grant(r);
  733. count++;
  734. } else {
  735. rsb_clear_flag(r, RSB_VALNOTVALID);
  736. }
  737. rsb_clear_flag(r, RSB_RECOVER_CONVERT);
  738. rsb_clear_flag(r, RSB_RECOVER_LVB_INVAL);
  739. rsb_clear_flag(r, RSB_NEW_MASTER2);
  740. unlock_rsb(r);
  741. }
  742. if (count)
  743. log_rinfo(ls, "dlm_recover_rsbs %d done", count);
  744. }
  745. void dlm_clear_inactive(struct dlm_ls *ls)
  746. {
  747. struct dlm_rsb *r, *safe;
  748. unsigned int count = 0;
  749. write_lock_bh(&ls->ls_rsbtbl_lock);
  750. list_for_each_entry_safe(r, safe, &ls->ls_slow_inactive, res_slow_list) {
  751. list_del(&r->res_slow_list);
  752. rhashtable_remove_fast(&ls->ls_rsbtbl, &r->res_node,
  753. dlm_rhash_rsb_params);
  754. if (!list_empty(&r->res_scan_list))
  755. list_del_init(&r->res_scan_list);
  756. free_inactive_rsb(r);
  757. count++;
  758. }
  759. write_unlock_bh(&ls->ls_rsbtbl_lock);
  760. if (count)
  761. log_rinfo(ls, "dlm_clear_inactive %u done", count);
  762. }