connection.c 27 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976
  1. /*
  2. * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
  3. *
  4. * This software is available to you under a choice of one of two
  5. * licenses. You may choose to be licensed under the terms of the GNU
  6. * General Public License (GPL) Version 2, available from the file
  7. * COPYING in the main directory of this source tree, or the
  8. * OpenIB.org BSD license below:
  9. *
  10. * Redistribution and use in source and binary forms, with or
  11. * without modification, are permitted provided that the following
  12. * conditions are met:
  13. *
  14. * - Redistributions of source code must retain the above
  15. * copyright notice, this list of conditions and the following
  16. * disclaimer.
  17. *
  18. * - Redistributions in binary form must reproduce the above
  19. * copyright notice, this list of conditions and the following
  20. * disclaimer in the documentation and/or other materials
  21. * provided with the distribution.
  22. *
  23. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  24. * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  25. * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  26. * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  27. * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  28. * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  29. * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  30. * SOFTWARE.
  31. *
  32. */
  33. #include <linux/kernel.h>
  34. #include <linux/list.h>
  35. #include <linux/slab.h>
  36. #include <linux/export.h>
  37. #include <net/ipv6.h>
  38. #include <net/inet6_hashtables.h>
  39. #include <net/addrconf.h>
  40. #include "rds.h"
  41. #include "loop.h"
  /* The connection hash table has 2^RDS_CONNECTION_HASH_BITS buckets. */
  #define RDS_CONNECTION_HASH_BITS 12
  #define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS)
  /* Mask that reduces a hash value to a valid bucket index. */
  #define RDS_CONNECTION_HASH_MASK (RDS_CONNECTION_HASH_ENTRIES - 1)

  /* converting this to RCU is a chore for another day.. */
  /* Held around hash-table insertion/removal and rds_conn_count updates. */
  static DEFINE_SPINLOCK(rds_conn_lock);
  /* Number of connections created and not yet destroyed (passive ones included). */
  static unsigned long rds_conn_count;
  /* Buckets of struct rds_connection, linked through c_hash_node. */
  static struct hlist_head rds_conn_hash[RDS_CONNECTION_HASH_ENTRIES];
  /* Slab cache from which struct rds_connection objects are allocated. */
  static struct kmem_cache *rds_conn_slab;
  50. static struct hlist_head *rds_conn_bucket(const struct in6_addr *laddr,
  51. const struct in6_addr *faddr)
  52. {
  53. static u32 rds6_hash_secret __read_mostly;
  54. static u32 rds_hash_secret __read_mostly;
  55. __be32 lhash, fhash;
  56. u32 hash;
  57. net_get_random_once(&rds_hash_secret, sizeof(rds_hash_secret));
  58. net_get_random_once(&rds6_hash_secret, sizeof(rds6_hash_secret));
  59. lhash = laddr->s6_addr32[3];
  60. #if IS_ENABLED(CONFIG_IPV6)
  61. fhash = (__force __be32)__ipv6_addr_jhash(faddr, rds6_hash_secret);
  62. #else
  63. fhash = faddr->s6_addr32[3];
  64. #endif
  65. hash = __inet_ehashfn(lhash, 0, fhash, 0, rds_hash_secret);
  66. return &rds_conn_hash[hash & RDS_CONNECTION_HASH_MASK];
  67. }
  /*
   * OR RDS_INFO_CONNECTION_FLAG_<suffix> into 'var' when 'test' is true.
   * 'var' is parenthesised so that any lvalue expression (e.g. *ptr) can be
   * passed safely; 'test' is evaluated exactly once.
   */
  #define rds_conn_info_set(var, test, suffix) do {			\
  	if (test)							\
  		(var) |= RDS_INFO_CONNECTION_FLAG_##suffix;		\
  } while (0)
  72. /* rcu read lock must be held or the connection spinlock */
  73. static struct rds_connection *rds_conn_lookup(struct net *net,
  74. struct hlist_head *head,
  75. const struct in6_addr *laddr,
  76. const struct in6_addr *faddr,
  77. struct rds_transport *trans,
  78. u8 tos, int dev_if)
  79. {
  80. struct rds_connection *conn, *ret = NULL;
  81. hlist_for_each_entry_rcu(conn, head, c_hash_node) {
  82. if (ipv6_addr_equal(&conn->c_faddr, faddr) &&
  83. ipv6_addr_equal(&conn->c_laddr, laddr) &&
  84. conn->c_trans == trans &&
  85. conn->c_tos == tos &&
  86. net == rds_conn_net(conn) &&
  87. conn->c_dev_if == dev_if) {
  88. ret = conn;
  89. break;
  90. }
  91. }
  92. rdsdebug("returning conn %p for %pI6c -> %pI6c\n", ret,
  93. laddr, faddr);
  94. return ret;
  95. }
  96. /*
  97. * This is called by transports as they're bringing down a connection.
  98. * It clears partial message state so that the transport can start sending
  99. * and receiving over this connection again in the future. It is up to
  100. * the transport to have serialized this call with its send and recv.
  101. */
  102. static void rds_conn_path_reset(struct rds_conn_path *cp)
  103. {
  104. struct rds_connection *conn = cp->cp_conn;
  105. rdsdebug("connection %pI6c to %pI6c reset\n",
  106. &conn->c_laddr, &conn->c_faddr);
  107. rds_stats_inc(s_conn_reset);
  108. rds_send_path_reset(cp);
  109. cp->cp_flags = 0;
  110. /* Do not clear next_rx_seq here, else we cannot distinguish
  111. * retransmitted packets from new packets, and will hand all
  112. * of them to the application. That is not consistent with the
  113. * reliability guarantees of RDS. */
  114. }
  115. static void __rds_conn_path_init(struct rds_connection *conn,
  116. struct rds_conn_path *cp, bool is_outgoing)
  117. {
  118. spin_lock_init(&cp->cp_lock);
  119. cp->cp_next_tx_seq = 1;
  120. init_waitqueue_head(&cp->cp_waitq);
  121. INIT_LIST_HEAD(&cp->cp_send_queue);
  122. INIT_LIST_HEAD(&cp->cp_retrans);
  123. cp->cp_conn = conn;
  124. atomic_set(&cp->cp_state, RDS_CONN_DOWN);
  125. cp->cp_send_gen = 0;
  126. cp->cp_reconnect_jiffies = 0;
  127. cp->cp_conn->c_proposed_version = RDS_PROTOCOL_VERSION;
  128. INIT_DELAYED_WORK(&cp->cp_send_w, rds_send_worker);
  129. INIT_DELAYED_WORK(&cp->cp_recv_w, rds_recv_worker);
  130. INIT_DELAYED_WORK(&cp->cp_conn_w, rds_connect_worker);
  131. INIT_WORK(&cp->cp_down_w, rds_shutdown_worker);
  132. mutex_init(&cp->cp_cm_lock);
  133. cp->cp_flags = 0;
  134. }
  135. /*
  136. * There is only every one 'conn' for a given pair of addresses in the
  137. * system at a time. They contain messages to be retransmitted and so
  138. * span the lifetime of the actual underlying transport connections.
  139. *
  140. * For now they are not garbage collected once they're created. They
  141. * are torn down as the module is removed, if ever.
  142. */
  143. static struct rds_connection *__rds_conn_create(struct net *net,
  144. const struct in6_addr *laddr,
  145. const struct in6_addr *faddr,
  146. struct rds_transport *trans,
  147. gfp_t gfp, u8 tos,
  148. int is_outgoing,
  149. int dev_if)
  150. {
  151. struct rds_connection *conn, *parent = NULL;
  152. struct hlist_head *head = rds_conn_bucket(laddr, faddr);
  153. struct rds_transport *loop_trans;
  154. struct rds_conn_path *free_cp = NULL;
  155. unsigned long flags;
  156. int ret, i;
  157. int npaths = (trans->t_mp_capable ? RDS_MPATH_WORKERS : 1);
  158. rcu_read_lock();
  159. conn = rds_conn_lookup(net, head, laddr, faddr, trans, tos, dev_if);
  160. if (conn &&
  161. conn->c_loopback &&
  162. conn->c_trans != &rds_loop_transport &&
  163. ipv6_addr_equal(laddr, faddr) &&
  164. !is_outgoing) {
  165. /* This is a looped back IB connection, and we're
  166. * called by the code handling the incoming connect.
  167. * We need a second connection object into which we
  168. * can stick the other QP. */
  169. parent = conn;
  170. conn = parent->c_passive;
  171. }
  172. rcu_read_unlock();
  173. if (conn)
  174. goto out;
  175. conn = kmem_cache_zalloc(rds_conn_slab, gfp);
  176. if (!conn) {
  177. conn = ERR_PTR(-ENOMEM);
  178. goto out;
  179. }
  180. conn->c_path = kzalloc_objs(struct rds_conn_path, npaths, gfp);
  181. if (!conn->c_path) {
  182. kmem_cache_free(rds_conn_slab, conn);
  183. conn = ERR_PTR(-ENOMEM);
  184. goto out;
  185. }
  186. INIT_HLIST_NODE(&conn->c_hash_node);
  187. conn->c_laddr = *laddr;
  188. conn->c_isv6 = !ipv6_addr_v4mapped(laddr);
  189. conn->c_faddr = *faddr;
  190. conn->c_dev_if = dev_if;
  191. conn->c_tos = tos;
  192. #if IS_ENABLED(CONFIG_IPV6)
  193. /* If the local address is link local, set c_bound_if to be the
  194. * index used for this connection. Otherwise, set it to 0 as
  195. * the socket is not bound to an interface. c_bound_if is used
  196. * to look up a socket when a packet is received
  197. */
  198. if (ipv6_addr_type(laddr) & IPV6_ADDR_LINKLOCAL)
  199. conn->c_bound_if = dev_if;
  200. else
  201. #endif
  202. conn->c_bound_if = 0;
  203. rds_conn_net_set(conn, net);
  204. ret = rds_cong_get_maps(conn);
  205. if (ret) {
  206. kfree(conn->c_path);
  207. kmem_cache_free(rds_conn_slab, conn);
  208. conn = ERR_PTR(ret);
  209. goto out;
  210. }
  211. /*
  212. * This is where a connection becomes loopback. If *any* RDS sockets
  213. * can bind to the destination address then we'd rather the messages
  214. * flow through loopback rather than either transport.
  215. */
  216. loop_trans = rds_trans_get_preferred(net, faddr, conn->c_dev_if);
  217. if (loop_trans) {
  218. rds_trans_put(loop_trans);
  219. conn->c_loopback = 1;
  220. if (trans->t_prefer_loopback) {
  221. if (likely(is_outgoing)) {
  222. /* "outgoing" connection to local address.
  223. * Protocol says it wants the connection
  224. * handled by the loopback transport.
  225. * This is what TCP does.
  226. */
  227. trans = &rds_loop_transport;
  228. } else {
  229. /* No transport currently in use
  230. * should end up here, but if it
  231. * does, reset/destroy the connection.
  232. */
  233. kfree(conn->c_path);
  234. kmem_cache_free(rds_conn_slab, conn);
  235. conn = ERR_PTR(-EOPNOTSUPP);
  236. goto out;
  237. }
  238. }
  239. }
  240. conn->c_trans = trans;
  241. init_waitqueue_head(&conn->c_hs_waitq);
  242. for (i = 0; i < npaths; i++) {
  243. __rds_conn_path_init(conn, &conn->c_path[i],
  244. is_outgoing);
  245. conn->c_path[i].cp_index = i;
  246. conn->c_path[i].cp_wq =
  247. alloc_ordered_workqueue("krds_cp_wq#%lu/%d", 0,
  248. rds_conn_count, i);
  249. if (!conn->c_path[i].cp_wq)
  250. conn->c_path[i].cp_wq = rds_wq;
  251. }
  252. rcu_read_lock();
  253. if (rds_destroy_pending(conn))
  254. ret = -ENETDOWN;
  255. else
  256. ret = trans->conn_alloc(conn, GFP_ATOMIC);
  257. if (ret) {
  258. rcu_read_unlock();
  259. free_cp = conn->c_path;
  260. kmem_cache_free(rds_conn_slab, conn);
  261. conn = ERR_PTR(ret);
  262. goto out;
  263. }
  264. rdsdebug("allocated conn %p for %pI6c -> %pI6c over %s %s\n",
  265. conn, laddr, faddr,
  266. strnlen(trans->t_name, sizeof(trans->t_name)) ?
  267. trans->t_name : "[unknown]", is_outgoing ? "(outgoing)" : "");
  268. /*
  269. * Since we ran without holding the conn lock, someone could
  270. * have created the same conn (either normal or passive) in the
  271. * interim. We check while holding the lock. If we won, we complete
  272. * init and return our conn. If we lost, we rollback and return the
  273. * other one.
  274. */
  275. spin_lock_irqsave(&rds_conn_lock, flags);
  276. if (parent) {
  277. /* Creating passive conn */
  278. if (parent->c_passive) {
  279. trans->conn_free(conn->c_path[0].cp_transport_data);
  280. free_cp = conn->c_path;
  281. kmem_cache_free(rds_conn_slab, conn);
  282. conn = parent->c_passive;
  283. } else {
  284. parent->c_passive = conn;
  285. rds_cong_add_conn(conn);
  286. rds_conn_count++;
  287. }
  288. } else {
  289. /* Creating normal conn */
  290. struct rds_connection *found;
  291. found = rds_conn_lookup(net, head, laddr, faddr, trans,
  292. tos, dev_if);
  293. if (found) {
  294. struct rds_conn_path *cp;
  295. int i;
  296. for (i = 0; i < npaths; i++) {
  297. cp = &conn->c_path[i];
  298. /* The ->conn_alloc invocation may have
  299. * allocated resource for all paths, so all
  300. * of them may have to be freed here.
  301. */
  302. if (cp->cp_transport_data)
  303. trans->conn_free(cp->cp_transport_data);
  304. }
  305. free_cp = conn->c_path;
  306. kmem_cache_free(rds_conn_slab, conn);
  307. conn = found;
  308. } else {
  309. conn->c_my_gen_num = rds_gen_num;
  310. conn->c_peer_gen_num = 0;
  311. hlist_add_head_rcu(&conn->c_hash_node, head);
  312. rds_cong_add_conn(conn);
  313. rds_conn_count++;
  314. }
  315. }
  316. spin_unlock_irqrestore(&rds_conn_lock, flags);
  317. rcu_read_unlock();
  318. out:
  319. if (free_cp) {
  320. for (i = 0; i < npaths; i++)
  321. if (free_cp[i].cp_wq != rds_wq)
  322. destroy_workqueue(free_cp[i].cp_wq);
  323. kfree(free_cp);
  324. }
  325. return conn;
  326. }
  327. struct rds_connection *rds_conn_create(struct net *net,
  328. const struct in6_addr *laddr,
  329. const struct in6_addr *faddr,
  330. struct rds_transport *trans, u8 tos,
  331. gfp_t gfp, int dev_if)
  332. {
  333. return __rds_conn_create(net, laddr, faddr, trans, gfp, tos, 0, dev_if);
  334. }
  335. EXPORT_SYMBOL_GPL(rds_conn_create);
  336. struct rds_connection *rds_conn_create_outgoing(struct net *net,
  337. const struct in6_addr *laddr,
  338. const struct in6_addr *faddr,
  339. struct rds_transport *trans,
  340. u8 tos, gfp_t gfp, int dev_if)
  341. {
  342. return __rds_conn_create(net, laddr, faddr, trans, gfp, tos, 1, dev_if);
  343. }
  344. EXPORT_SYMBOL_GPL(rds_conn_create_outgoing);
  345. void rds_conn_shutdown(struct rds_conn_path *cp)
  346. {
  347. struct rds_connection *conn = cp->cp_conn;
  348. /* shut it down unless it's down already */
  349. if (!rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_DOWN)) {
  350. /*
  351. * Quiesce the connection mgmt handlers before we start tearing
  352. * things down. We don't hold the mutex for the entire
  353. * duration of the shutdown operation, else we may be
  354. * deadlocking with the CM handler. Instead, the CM event
  355. * handler is supposed to check for state DISCONNECTING
  356. */
  357. mutex_lock(&cp->cp_cm_lock);
  358. if (!rds_conn_path_transition(cp, RDS_CONN_UP,
  359. RDS_CONN_DISCONNECTING) &&
  360. !rds_conn_path_transition(cp, RDS_CONN_ERROR,
  361. RDS_CONN_DISCONNECTING) &&
  362. !rds_conn_path_transition(cp, RDS_CONN_RESETTING,
  363. RDS_CONN_DISCONNECTING)) {
  364. rds_conn_path_error(cp,
  365. "shutdown called in state %d\n",
  366. atomic_read(&cp->cp_state));
  367. mutex_unlock(&cp->cp_cm_lock);
  368. return;
  369. }
  370. mutex_unlock(&cp->cp_cm_lock);
  371. wait_event(cp->cp_waitq,
  372. !test_bit(RDS_IN_XMIT, &cp->cp_flags));
  373. wait_event(cp->cp_waitq,
  374. !test_bit(RDS_RECV_REFILL, &cp->cp_flags));
  375. conn->c_trans->conn_path_shutdown(cp);
  376. rds_conn_path_reset(cp);
  377. if (!rds_conn_path_transition(cp, RDS_CONN_DISCONNECTING,
  378. RDS_CONN_DOWN) &&
  379. !rds_conn_path_transition(cp, RDS_CONN_ERROR,
  380. RDS_CONN_DOWN)) {
  381. /* This can happen - eg when we're in the middle of tearing
  382. * down the connection, and someone unloads the rds module.
  383. * Quite reproducible with loopback connections.
  384. * Mostly harmless.
  385. *
  386. * Note that this also happens with rds-tcp because
  387. * we could have triggered rds_conn_path_drop in irq
  388. * mode from rds_tcp_state change on the receipt of
  389. * a FIN, thus we need to recheck for RDS_CONN_ERROR
  390. * here.
  391. */
  392. rds_conn_path_error(cp, "%s: failed to transition "
  393. "to state DOWN, current state "
  394. "is %d\n", __func__,
  395. atomic_read(&cp->cp_state));
  396. return;
  397. }
  398. }
  399. /* Then reconnect if it's still live.
  400. * The passive side of an IB loopback connection is never added
  401. * to the conn hash, so we never trigger a reconnect on this
  402. * conn - the reconnect is always triggered by the active peer. */
  403. cancel_delayed_work_sync(&cp->cp_conn_w);
  404. clear_bit(RDS_RECONNECT_PENDING, &cp->cp_flags);
  405. rcu_read_lock();
  406. if (!hlist_unhashed(&conn->c_hash_node)) {
  407. rcu_read_unlock();
  408. if (conn->c_trans->t_mp_capable &&
  409. cp->cp_index == 0)
  410. rds_send_ping(conn, 0);
  411. rds_queue_reconnect(cp);
  412. } else {
  413. rcu_read_unlock();
  414. }
  415. /* we do not hold the socket lock here but it is safe because
  416. * fan-out is disabled when calling conn_slots_available()
  417. */
  418. if (conn->c_trans->conn_slots_available)
  419. conn->c_trans->conn_slots_available(conn, false);
  420. }
  421. /* destroy a single rds_conn_path. rds_conn_destroy() iterates over
  422. * all paths using rds_conn_path_destroy()
  423. */
  424. static void rds_conn_path_destroy(struct rds_conn_path *cp)
  425. {
  426. struct rds_message *rm, *rtmp;
  427. if (!cp->cp_transport_data)
  428. return;
  429. /* make sure lingering queued work won't try to ref the conn */
  430. cancel_delayed_work_sync(&cp->cp_send_w);
  431. cancel_delayed_work_sync(&cp->cp_recv_w);
  432. rds_conn_path_drop(cp, true);
  433. flush_work(&cp->cp_down_w);
  434. /* tear down queued messages */
  435. list_for_each_entry_safe(rm, rtmp,
  436. &cp->cp_send_queue,
  437. m_conn_item) {
  438. list_del_init(&rm->m_conn_item);
  439. BUG_ON(!list_empty(&rm->m_sock_item));
  440. rds_message_put(rm);
  441. }
  442. if (cp->cp_xmit_rm)
  443. rds_message_put(cp->cp_xmit_rm);
  444. WARN_ON(delayed_work_pending(&cp->cp_send_w));
  445. WARN_ON(delayed_work_pending(&cp->cp_recv_w));
  446. WARN_ON(delayed_work_pending(&cp->cp_conn_w));
  447. WARN_ON(work_pending(&cp->cp_down_w));
  448. if (cp->cp_wq != rds_wq) {
  449. destroy_workqueue(cp->cp_wq);
  450. cp->cp_wq = NULL;
  451. }
  452. cp->cp_conn->c_trans->conn_free(cp->cp_transport_data);
  453. }
  454. /*
  455. * Stop and free a connection.
  456. *
  457. * This can only be used in very limited circumstances. It assumes that once
  458. * the conn has been shutdown that no one else is referencing the connection.
  459. * We can only ensure this in the rmmod path in the current code.
  460. */
  461. void rds_conn_destroy(struct rds_connection *conn)
  462. {
  463. unsigned long flags;
  464. int i;
  465. struct rds_conn_path *cp;
  466. int npaths = (conn->c_trans->t_mp_capable ? RDS_MPATH_WORKERS : 1);
  467. rdsdebug("freeing conn %p for %pI4 -> "
  468. "%pI4\n", conn, &conn->c_laddr,
  469. &conn->c_faddr);
  470. /* Ensure conn will not be scheduled for reconnect */
  471. spin_lock_irq(&rds_conn_lock);
  472. hlist_del_init_rcu(&conn->c_hash_node);
  473. spin_unlock_irq(&rds_conn_lock);
  474. synchronize_rcu();
  475. /* shut the connection down */
  476. for (i = 0; i < npaths; i++) {
  477. cp = &conn->c_path[i];
  478. rds_conn_path_destroy(cp);
  479. BUG_ON(!list_empty(&cp->cp_retrans));
  480. }
  481. /*
  482. * The congestion maps aren't freed up here. They're
  483. * freed by rds_cong_exit() after all the connections
  484. * have been freed.
  485. */
  486. rds_cong_remove_conn(conn);
  487. kfree(conn->c_path);
  488. kmem_cache_free(rds_conn_slab, conn);
  489. spin_lock_irqsave(&rds_conn_lock, flags);
  490. rds_conn_count--;
  491. spin_unlock_irqrestore(&rds_conn_lock, flags);
  492. }
  493. EXPORT_SYMBOL_GPL(rds_conn_destroy);
  494. static void __rds_inc_msg_cp(struct rds_incoming *inc,
  495. struct rds_info_iterator *iter,
  496. void *saddr, void *daddr, int flip, bool isv6)
  497. {
  498. #if IS_ENABLED(CONFIG_IPV6)
  499. if (isv6)
  500. rds6_inc_info_copy(inc, iter, saddr, daddr, flip);
  501. else
  502. #endif
  503. rds_inc_info_copy(inc, iter, *(__be32 *)saddr,
  504. *(__be32 *)daddr, flip);
  505. }
  506. static void rds_conn_message_info_cmn(struct socket *sock, unsigned int len,
  507. struct rds_info_iterator *iter,
  508. struct rds_info_lengths *lens,
  509. int want_send, bool isv6)
  510. {
  511. struct hlist_head *head;
  512. struct list_head *list;
  513. struct rds_connection *conn;
  514. struct rds_message *rm;
  515. unsigned int total = 0;
  516. unsigned long flags;
  517. size_t i;
  518. int j;
  519. if (isv6)
  520. len /= sizeof(struct rds6_info_message);
  521. else
  522. len /= sizeof(struct rds_info_message);
  523. rcu_read_lock();
  524. for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
  525. i++, head++) {
  526. hlist_for_each_entry_rcu(conn, head, c_hash_node) {
  527. struct rds_conn_path *cp;
  528. int npaths;
  529. if (!isv6 && conn->c_isv6)
  530. continue;
  531. npaths = (conn->c_trans->t_mp_capable ?
  532. RDS_MPATH_WORKERS : 1);
  533. for (j = 0; j < npaths; j++) {
  534. cp = &conn->c_path[j];
  535. if (want_send)
  536. list = &cp->cp_send_queue;
  537. else
  538. list = &cp->cp_retrans;
  539. spin_lock_irqsave(&cp->cp_lock, flags);
  540. /* XXX too lazy to maintain counts.. */
  541. list_for_each_entry(rm, list, m_conn_item) {
  542. total++;
  543. if (total <= len)
  544. __rds_inc_msg_cp(&rm->m_inc,
  545. iter,
  546. &conn->c_laddr,
  547. &conn->c_faddr,
  548. 0, isv6);
  549. }
  550. spin_unlock_irqrestore(&cp->cp_lock, flags);
  551. }
  552. }
  553. }
  554. rcu_read_unlock();
  555. lens->nr = total;
  556. if (isv6)
  557. lens->each = sizeof(struct rds6_info_message);
  558. else
  559. lens->each = sizeof(struct rds_info_message);
  560. }
  561. static void rds_conn_message_info(struct socket *sock, unsigned int len,
  562. struct rds_info_iterator *iter,
  563. struct rds_info_lengths *lens,
  564. int want_send)
  565. {
  566. rds_conn_message_info_cmn(sock, len, iter, lens, want_send, false);
  567. }
  568. #if IS_ENABLED(CONFIG_IPV6)
  569. static void rds6_conn_message_info(struct socket *sock, unsigned int len,
  570. struct rds_info_iterator *iter,
  571. struct rds_info_lengths *lens,
  572. int want_send)
  573. {
  574. rds_conn_message_info_cmn(sock, len, iter, lens, want_send, true);
  575. }
  576. #endif
  /* Info handler registered for RDS_INFO_SEND_MESSAGES: reports send queues. */
  static void rds_conn_message_info_send(struct socket *sock, unsigned int len,
  				       struct rds_info_iterator *iter,
  				       struct rds_info_lengths *lens)
  {
  	const int want_send = 1;

  	rds_conn_message_info(sock, len, iter, lens, want_send);
  }
  583. #if IS_ENABLED(CONFIG_IPV6)
  584. static void rds6_conn_message_info_send(struct socket *sock, unsigned int len,
  585. struct rds_info_iterator *iter,
  586. struct rds_info_lengths *lens)
  587. {
  588. rds6_conn_message_info(sock, len, iter, lens, 1);
  589. }
  590. #endif
  /*
   * Info handler registered for RDS_INFO_RETRANS_MESSAGES: reports the
   * retransmit queues.
   */
  static void rds_conn_message_info_retrans(struct socket *sock,
  					  unsigned int len,
  					  struct rds_info_iterator *iter,
  					  struct rds_info_lengths *lens)
  {
  	const int want_send = 0;

  	rds_conn_message_info(sock, len, iter, lens, want_send);
  }
  598. #if IS_ENABLED(CONFIG_IPV6)
  599. static void rds6_conn_message_info_retrans(struct socket *sock,
  600. unsigned int len,
  601. struct rds_info_iterator *iter,
  602. struct rds_info_lengths *lens)
  603. {
  604. rds6_conn_message_info(sock, len, iter, lens, 0);
  605. }
  606. #endif
  607. void rds_for_each_conn_info(struct socket *sock, unsigned int len,
  608. struct rds_info_iterator *iter,
  609. struct rds_info_lengths *lens,
  610. int (*visitor)(struct rds_connection *, void *),
  611. u64 *buffer,
  612. size_t item_len)
  613. {
  614. struct hlist_head *head;
  615. struct rds_connection *conn;
  616. size_t i;
  617. rcu_read_lock();
  618. lens->nr = 0;
  619. lens->each = item_len;
  620. for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
  621. i++, head++) {
  622. hlist_for_each_entry_rcu(conn, head, c_hash_node) {
  623. /* XXX no c_lock usage.. */
  624. if (!visitor(conn, buffer))
  625. continue;
  626. /* We copy as much as we can fit in the buffer,
  627. * but we count all items so that the caller
  628. * can resize the buffer. */
  629. if (len >= item_len) {
  630. rds_info_copy(iter, buffer, item_len);
  631. len -= item_len;
  632. }
  633. lens->nr++;
  634. }
  635. }
  636. rcu_read_unlock();
  637. }
  638. EXPORT_SYMBOL_GPL(rds_for_each_conn_info);
  639. static void rds_walk_conn_path_info(struct socket *sock, unsigned int len,
  640. struct rds_info_iterator *iter,
  641. struct rds_info_lengths *lens,
  642. int (*visitor)(struct rds_conn_path *, void *),
  643. u64 *buffer,
  644. size_t item_len)
  645. {
  646. struct hlist_head *head;
  647. struct rds_connection *conn;
  648. size_t i;
  649. rcu_read_lock();
  650. lens->nr = 0;
  651. lens->each = item_len;
  652. for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
  653. i++, head++) {
  654. hlist_for_each_entry_rcu(conn, head, c_hash_node) {
  655. struct rds_conn_path *cp;
  656. /* XXX We only copy the information from the first
  657. * path for now. The problem is that if there are
  658. * more than one underlying paths, we cannot report
  659. * information of all of them using the existing
  660. * API. For example, there is only one next_tx_seq,
  661. * which path's next_tx_seq should we report? It is
  662. * a bug in the design of MPRDS.
  663. */
  664. cp = conn->c_path;
  665. /* XXX no cp_lock usage.. */
  666. if (!visitor(cp, buffer))
  667. continue;
  668. /* We copy as much as we can fit in the buffer,
  669. * but we count all items so that the caller
  670. * can resize the buffer.
  671. */
  672. if (len >= item_len) {
  673. rds_info_copy(iter, buffer, item_len);
  674. len -= item_len;
  675. }
  676. lens->nr++;
  677. }
  678. }
  679. rcu_read_unlock();
  680. }
  681. static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer)
  682. {
  683. struct rds_info_connection *cinfo = buffer;
  684. struct rds_connection *conn = cp->cp_conn;
  685. if (conn->c_isv6)
  686. return 0;
  687. cinfo->next_tx_seq = cp->cp_next_tx_seq;
  688. cinfo->next_rx_seq = cp->cp_next_rx_seq;
  689. cinfo->laddr = conn->c_laddr.s6_addr32[3];
  690. cinfo->faddr = conn->c_faddr.s6_addr32[3];
  691. cinfo->tos = conn->c_tos;
  692. strscpy_pad(cinfo->transport, conn->c_trans->t_name);
  693. cinfo->flags = 0;
  694. rds_conn_info_set(cinfo->flags, test_bit(RDS_IN_XMIT, &cp->cp_flags),
  695. SENDING);
  696. /* XXX Future: return the state rather than these funky bits */
  697. rds_conn_info_set(cinfo->flags,
  698. atomic_read(&cp->cp_state) == RDS_CONN_CONNECTING,
  699. CONNECTING);
  700. rds_conn_info_set(cinfo->flags,
  701. atomic_read(&cp->cp_state) == RDS_CONN_UP,
  702. CONNECTED);
  703. return 1;
  704. }
  705. #if IS_ENABLED(CONFIG_IPV6)
  706. static int rds6_conn_info_visitor(struct rds_conn_path *cp, void *buffer)
  707. {
  708. struct rds6_info_connection *cinfo6 = buffer;
  709. struct rds_connection *conn = cp->cp_conn;
  710. cinfo6->next_tx_seq = cp->cp_next_tx_seq;
  711. cinfo6->next_rx_seq = cp->cp_next_rx_seq;
  712. cinfo6->laddr = conn->c_laddr;
  713. cinfo6->faddr = conn->c_faddr;
  714. strscpy_pad(cinfo6->transport, conn->c_trans->t_name);
  715. cinfo6->flags = 0;
  716. rds_conn_info_set(cinfo6->flags, test_bit(RDS_IN_XMIT, &cp->cp_flags),
  717. SENDING);
  718. /* XXX Future: return the state rather than these funky bits */
  719. rds_conn_info_set(cinfo6->flags,
  720. atomic_read(&cp->cp_state) == RDS_CONN_CONNECTING,
  721. CONNECTING);
  722. rds_conn_info_set(cinfo6->flags,
  723. atomic_read(&cp->cp_state) == RDS_CONN_UP,
  724. CONNECTED);
  725. /* Just return 1 as there is no error case. This is a helper function
  726. * for rds_walk_conn_path_info() and it wants a return value.
  727. */
  728. return 1;
  729. }
  730. #endif
  731. static void rds_conn_info(struct socket *sock, unsigned int len,
  732. struct rds_info_iterator *iter,
  733. struct rds_info_lengths *lens)
  734. {
  735. u64 buffer[(sizeof(struct rds_info_connection) + 7) / 8];
  736. rds_walk_conn_path_info(sock, len, iter, lens,
  737. rds_conn_info_visitor,
  738. buffer,
  739. sizeof(struct rds_info_connection));
  740. }
  741. #if IS_ENABLED(CONFIG_IPV6)
  742. static void rds6_conn_info(struct socket *sock, unsigned int len,
  743. struct rds_info_iterator *iter,
  744. struct rds_info_lengths *lens)
  745. {
  746. u64 buffer[(sizeof(struct rds6_info_connection) + 7) / 8];
  747. rds_walk_conn_path_info(sock, len, iter, lens,
  748. rds6_conn_info_visitor,
  749. buffer,
  750. sizeof(struct rds6_info_connection));
  751. }
  752. #endif
  753. int rds_conn_init(void)
  754. {
  755. int ret;
  756. ret = rds_loop_net_init(); /* register pernet callback */
  757. if (ret)
  758. return ret;
  759. rds_conn_slab = KMEM_CACHE(rds_connection, 0);
  760. if (!rds_conn_slab) {
  761. rds_loop_net_exit();
  762. return -ENOMEM;
  763. }
  764. rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info);
  765. rds_info_register_func(RDS_INFO_SEND_MESSAGES,
  766. rds_conn_message_info_send);
  767. rds_info_register_func(RDS_INFO_RETRANS_MESSAGES,
  768. rds_conn_message_info_retrans);
  769. #if IS_ENABLED(CONFIG_IPV6)
  770. rds_info_register_func(RDS6_INFO_CONNECTIONS, rds6_conn_info);
  771. rds_info_register_func(RDS6_INFO_SEND_MESSAGES,
  772. rds6_conn_message_info_send);
  773. rds_info_register_func(RDS6_INFO_RETRANS_MESSAGES,
  774. rds6_conn_message_info_retrans);
  775. #endif
  776. return 0;
  777. }
  778. void rds_conn_exit(void)
  779. {
  780. rds_loop_net_exit(); /* unregister pernet callback */
  781. rds_loop_exit();
  782. WARN_ON(!hlist_empty(rds_conn_hash));
  783. kmem_cache_destroy(rds_conn_slab);
  784. rds_info_deregister_func(RDS_INFO_CONNECTIONS, rds_conn_info);
  785. rds_info_deregister_func(RDS_INFO_SEND_MESSAGES,
  786. rds_conn_message_info_send);
  787. rds_info_deregister_func(RDS_INFO_RETRANS_MESSAGES,
  788. rds_conn_message_info_retrans);
  789. #if IS_ENABLED(CONFIG_IPV6)
  790. rds_info_deregister_func(RDS6_INFO_CONNECTIONS, rds6_conn_info);
  791. rds_info_deregister_func(RDS6_INFO_SEND_MESSAGES,
  792. rds6_conn_message_info_send);
  793. rds_info_deregister_func(RDS6_INFO_RETRANS_MESSAGES,
  794. rds6_conn_message_info_retrans);
  795. #endif
  796. }
  797. /*
  798. * Force a disconnect
  799. */
  800. void rds_conn_path_drop(struct rds_conn_path *cp, bool destroy)
  801. {
  802. atomic_set(&cp->cp_state, RDS_CONN_ERROR);
  803. rcu_read_lock();
  804. if (!destroy && rds_destroy_pending(cp->cp_conn)) {
  805. rcu_read_unlock();
  806. return;
  807. }
  808. queue_work(cp->cp_wq, &cp->cp_down_w);
  809. rcu_read_unlock();
  810. }
  811. EXPORT_SYMBOL_GPL(rds_conn_path_drop);
  812. void rds_conn_drop(struct rds_connection *conn)
  813. {
  814. WARN_ON(conn->c_trans->t_mp_capable);
  815. rds_conn_path_drop(&conn->c_path[0], false);
  816. }
  817. EXPORT_SYMBOL_GPL(rds_conn_drop);
  818. /*
  819. * If the connection is down, trigger a connect. We may have scheduled a
  820. * delayed reconnect however - in this case we should not interfere.
  821. */
  822. void rds_conn_path_connect_if_down(struct rds_conn_path *cp)
  823. {
  824. rcu_read_lock();
  825. if (rds_destroy_pending(cp->cp_conn)) {
  826. rcu_read_unlock();
  827. return;
  828. }
  829. if (rds_conn_path_state(cp) == RDS_CONN_DOWN &&
  830. !test_and_set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags))
  831. queue_delayed_work(cp->cp_wq, &cp->cp_conn_w, 0);
  832. rcu_read_unlock();
  833. }
  834. EXPORT_SYMBOL_GPL(rds_conn_path_connect_if_down);
  835. /* Check connectivity of all paths
  836. */
  837. void rds_check_all_paths(struct rds_connection *conn)
  838. {
  839. int i = 0;
  840. do {
  841. rds_conn_path_connect_if_down(&conn->c_path[i]);
  842. } while (++i < conn->c_npaths);
  843. }
  844. void rds_conn_connect_if_down(struct rds_connection *conn)
  845. {
  846. WARN_ON(conn->c_trans->t_mp_capable);
  847. rds_conn_path_connect_if_down(&conn->c_path[0]);
  848. }
  849. EXPORT_SYMBOL_GPL(rds_conn_connect_if_down);
  850. void
  851. __rds_conn_path_error(struct rds_conn_path *cp, const char *fmt, ...)
  852. {
  853. va_list ap;
  854. va_start(ap, fmt);
  855. vprintk(fmt, ap);
  856. va_end(ap);
  857. rds_conn_path_drop(cp, false);
  858. }