ib_rdma.c 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703
  1. /*
  2. * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
  3. *
  4. * This software is available to you under a choice of one of two
  5. * licenses. You may choose to be licensed under the terms of the GNU
  6. * General Public License (GPL) Version 2, available from the file
  7. * COPYING in the main directory of this source tree, or the
  8. * OpenIB.org BSD license below:
  9. *
  10. * Redistribution and use in source and binary forms, with or
  11. * without modification, are permitted provided that the following
  12. * conditions are met:
  13. *
  14. * - Redistributions of source code must retain the above
  15. * copyright notice, this list of conditions and the following
  16. * disclaimer.
  17. *
  18. * - Redistributions in binary form must reproduce the above
  19. * copyright notice, this list of conditions and the following
  20. * disclaimer in the documentation and/or other materials
  21. * provided with the distribution.
  22. *
  23. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  24. * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  25. * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  26. * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  27. * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  28. * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  29. * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  30. * SOFTWARE.
  31. *
  32. */
  33. #include <linux/kernel.h>
  34. #include <linux/slab.h>
  35. #include <linux/rculist.h>
  36. #include <linux/llist.h>
  37. #include "rds_single_path.h"
  38. #include "ib_mr.h"
  39. #include "rds.h"
  40. struct workqueue_struct *rds_ib_mr_wq;
  41. static void rds_ib_odp_mr_worker(struct work_struct *work);
  42. static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr)
  43. {
  44. struct rds_ib_device *rds_ibdev;
  45. struct rds_ib_ipaddr *i_ipaddr;
  46. rcu_read_lock();
  47. list_for_each_entry_rcu(rds_ibdev, &rds_ib_devices, list) {
  48. list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
  49. if (i_ipaddr->ipaddr == ipaddr) {
  50. refcount_inc(&rds_ibdev->refcount);
  51. rcu_read_unlock();
  52. return rds_ibdev;
  53. }
  54. }
  55. }
  56. rcu_read_unlock();
  57. return NULL;
  58. }
  59. static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
  60. {
  61. struct rds_ib_ipaddr *i_ipaddr;
  62. i_ipaddr = kmalloc_obj(*i_ipaddr);
  63. if (!i_ipaddr)
  64. return -ENOMEM;
  65. i_ipaddr->ipaddr = ipaddr;
  66. spin_lock_irq(&rds_ibdev->spinlock);
  67. list_add_tail_rcu(&i_ipaddr->list, &rds_ibdev->ipaddr_list);
  68. spin_unlock_irq(&rds_ibdev->spinlock);
  69. return 0;
  70. }
  71. static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
  72. {
  73. struct rds_ib_ipaddr *i_ipaddr;
  74. struct rds_ib_ipaddr *to_free = NULL;
  75. spin_lock_irq(&rds_ibdev->spinlock);
  76. list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
  77. if (i_ipaddr->ipaddr == ipaddr) {
  78. list_del_rcu(&i_ipaddr->list);
  79. to_free = i_ipaddr;
  80. break;
  81. }
  82. }
  83. spin_unlock_irq(&rds_ibdev->spinlock);
  84. if (to_free)
  85. kfree_rcu(to_free, rcu);
  86. }
  87. int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev,
  88. struct in6_addr *ipaddr)
  89. {
  90. struct rds_ib_device *rds_ibdev_old;
  91. rds_ibdev_old = rds_ib_get_device(ipaddr->s6_addr32[3]);
  92. if (!rds_ibdev_old)
  93. return rds_ib_add_ipaddr(rds_ibdev, ipaddr->s6_addr32[3]);
  94. if (rds_ibdev_old != rds_ibdev) {
  95. rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr->s6_addr32[3]);
  96. rds_ib_dev_put(rds_ibdev_old);
  97. return rds_ib_add_ipaddr(rds_ibdev, ipaddr->s6_addr32[3]);
  98. }
  99. rds_ib_dev_put(rds_ibdev_old);
  100. return 0;
  101. }
  102. void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn)
  103. {
  104. struct rds_ib_connection *ic = conn->c_transport_data;
  105. /* conn was previously on the nodev_conns_list */
  106. spin_lock_irq(&ib_nodev_conns_lock);
  107. BUG_ON(list_empty(&ib_nodev_conns));
  108. BUG_ON(list_empty(&ic->ib_node));
  109. list_del(&ic->ib_node);
  110. spin_lock(&rds_ibdev->spinlock);
  111. list_add_tail(&ic->ib_node, &rds_ibdev->conn_list);
  112. spin_unlock(&rds_ibdev->spinlock);
  113. spin_unlock_irq(&ib_nodev_conns_lock);
  114. ic->rds_ibdev = rds_ibdev;
  115. refcount_inc(&rds_ibdev->refcount);
  116. }
  117. void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn)
  118. {
  119. struct rds_ib_connection *ic = conn->c_transport_data;
  120. /* place conn on nodev_conns_list */
  121. spin_lock(&ib_nodev_conns_lock);
  122. spin_lock_irq(&rds_ibdev->spinlock);
  123. BUG_ON(list_empty(&ic->ib_node));
  124. list_del(&ic->ib_node);
  125. spin_unlock_irq(&rds_ibdev->spinlock);
  126. list_add_tail(&ic->ib_node, &ib_nodev_conns);
  127. spin_unlock(&ib_nodev_conns_lock);
  128. ic->rds_ibdev = NULL;
  129. rds_ib_dev_put(rds_ibdev);
  130. }
  131. void rds_ib_destroy_nodev_conns(void)
  132. {
  133. struct rds_ib_connection *ic, *_ic;
  134. LIST_HEAD(tmp_list);
  135. /* avoid calling conn_destroy with irqs off */
  136. spin_lock_irq(&ib_nodev_conns_lock);
  137. list_splice(&ib_nodev_conns, &tmp_list);
  138. spin_unlock_irq(&ib_nodev_conns_lock);
  139. list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node)
  140. rds_conn_destroy(ic->conn);
  141. }
  142. void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo)
  143. {
  144. struct rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool;
  145. iinfo->rdma_mr_max = pool_1m->max_items;
  146. iinfo->rdma_mr_size = pool_1m->max_pages;
  147. }
  148. #if IS_ENABLED(CONFIG_IPV6)
  149. void rds6_ib_get_mr_info(struct rds_ib_device *rds_ibdev,
  150. struct rds6_info_rdma_connection *iinfo6)
  151. {
  152. struct rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool;
  153. iinfo6->rdma_mr_max = pool_1m->max_items;
  154. iinfo6->rdma_mr_size = pool_1m->max_pages;
  155. }
  156. #endif
  157. struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool)
  158. {
  159. struct rds_ib_mr *ibmr = NULL;
  160. struct llist_node *ret;
  161. unsigned long flags;
  162. spin_lock_irqsave(&pool->clean_lock, flags);
  163. ret = llist_del_first(&pool->clean_list);
  164. spin_unlock_irqrestore(&pool->clean_lock, flags);
  165. if (ret) {
  166. ibmr = llist_entry(ret, struct rds_ib_mr, llnode);
  167. if (pool->pool_type == RDS_IB_MR_8K_POOL)
  168. rds_ib_stats_inc(s_ib_rdma_mr_8k_reused);
  169. else
  170. rds_ib_stats_inc(s_ib_rdma_mr_1m_reused);
  171. }
  172. return ibmr;
  173. }
  174. void rds_ib_sync_mr(void *trans_private, int direction)
  175. {
  176. struct rds_ib_mr *ibmr = trans_private;
  177. struct rds_ib_device *rds_ibdev = ibmr->device;
  178. if (ibmr->odp)
  179. return;
  180. switch (direction) {
  181. case DMA_FROM_DEVICE:
  182. ib_dma_sync_sg_for_cpu(rds_ibdev->dev, ibmr->sg,
  183. ibmr->sg_dma_len, DMA_BIDIRECTIONAL);
  184. break;
  185. case DMA_TO_DEVICE:
  186. ib_dma_sync_sg_for_device(rds_ibdev->dev, ibmr->sg,
  187. ibmr->sg_dma_len, DMA_BIDIRECTIONAL);
  188. break;
  189. }
  190. }
  191. void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
  192. {
  193. struct rds_ib_device *rds_ibdev = ibmr->device;
  194. if (ibmr->sg_dma_len) {
  195. ib_dma_unmap_sg(rds_ibdev->dev,
  196. ibmr->sg, ibmr->sg_len,
  197. DMA_BIDIRECTIONAL);
  198. ibmr->sg_dma_len = 0;
  199. }
  200. /* Release the s/g list */
  201. if (ibmr->sg_len) {
  202. unsigned int i;
  203. for (i = 0; i < ibmr->sg_len; ++i) {
  204. struct page *page = sg_page(&ibmr->sg[i]);
  205. /* FIXME we need a way to tell a r/w MR
  206. * from a r/o MR */
  207. WARN_ON(!page->mapping && irqs_disabled());
  208. set_page_dirty(page);
  209. put_page(page);
  210. }
  211. kfree(ibmr->sg);
  212. ibmr->sg = NULL;
  213. ibmr->sg_len = 0;
  214. }
  215. }
  216. void rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
  217. {
  218. unsigned int pinned = ibmr->sg_len;
  219. __rds_ib_teardown_mr(ibmr);
  220. if (pinned) {
  221. struct rds_ib_mr_pool *pool = ibmr->pool;
  222. atomic_sub(pinned, &pool->free_pinned);
  223. }
  224. }
  225. static inline unsigned int rds_ib_flush_goal(struct rds_ib_mr_pool *pool, int free_all)
  226. {
  227. unsigned int item_count;
  228. item_count = atomic_read(&pool->item_count);
  229. if (free_all)
  230. return item_count;
  231. return 0;
  232. }
  233. /*
  234. * given an llist of mrs, put them all into the list_head for more processing
  235. */
  236. static unsigned int llist_append_to_list(struct llist_head *llist,
  237. struct list_head *list)
  238. {
  239. struct rds_ib_mr *ibmr;
  240. struct llist_node *node;
  241. struct llist_node *next;
  242. unsigned int count = 0;
  243. node = llist_del_all(llist);
  244. while (node) {
  245. next = node->next;
  246. ibmr = llist_entry(node, struct rds_ib_mr, llnode);
  247. list_add_tail(&ibmr->unmap_list, list);
  248. node = next;
  249. count++;
  250. }
  251. return count;
  252. }
  253. /*
  254. * this takes a list head of mrs and turns it into linked llist nodes
  255. * of clusters. Each cluster has linked llist nodes of
  256. * MR_CLUSTER_SIZE mrs that are ready for reuse.
  257. */
  258. static void list_to_llist_nodes(struct list_head *list,
  259. struct llist_node **nodes_head,
  260. struct llist_node **nodes_tail)
  261. {
  262. struct rds_ib_mr *ibmr;
  263. struct llist_node *cur = NULL;
  264. struct llist_node **next = nodes_head;
  265. list_for_each_entry(ibmr, list, unmap_list) {
  266. cur = &ibmr->llnode;
  267. *next = cur;
  268. next = &cur->next;
  269. }
  270. *next = NULL;
  271. *nodes_tail = cur;
  272. }
  273. /*
  274. * Flush our pool of MRs.
  275. * At a minimum, all currently unused MRs are unmapped.
  276. * If the number of MRs allocated exceeds the limit, we also try
  277. * to free as many MRs as needed to get back to this limit.
  278. */
  279. int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
  280. int free_all, struct rds_ib_mr **ibmr_ret)
  281. {
  282. struct rds_ib_mr *ibmr;
  283. struct llist_node *clean_nodes;
  284. struct llist_node *clean_tail;
  285. LIST_HEAD(unmap_list);
  286. unsigned long unpinned = 0;
  287. unsigned int nfreed = 0, dirty_to_clean = 0, free_goal;
  288. if (pool->pool_type == RDS_IB_MR_8K_POOL)
  289. rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_flush);
  290. else
  291. rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_flush);
  292. if (ibmr_ret) {
  293. DEFINE_WAIT(wait);
  294. while (!mutex_trylock(&pool->flush_lock)) {
  295. ibmr = rds_ib_reuse_mr(pool);
  296. if (ibmr) {
  297. *ibmr_ret = ibmr;
  298. finish_wait(&pool->flush_wait, &wait);
  299. goto out_nolock;
  300. }
  301. prepare_to_wait(&pool->flush_wait, &wait,
  302. TASK_UNINTERRUPTIBLE);
  303. if (llist_empty(&pool->clean_list))
  304. schedule();
  305. ibmr = rds_ib_reuse_mr(pool);
  306. if (ibmr) {
  307. *ibmr_ret = ibmr;
  308. finish_wait(&pool->flush_wait, &wait);
  309. goto out_nolock;
  310. }
  311. }
  312. finish_wait(&pool->flush_wait, &wait);
  313. } else
  314. mutex_lock(&pool->flush_lock);
  315. if (ibmr_ret) {
  316. ibmr = rds_ib_reuse_mr(pool);
  317. if (ibmr) {
  318. *ibmr_ret = ibmr;
  319. goto out;
  320. }
  321. }
  322. /* Get the list of all MRs to be dropped. Ordering matters -
  323. * we want to put drop_list ahead of free_list.
  324. */
  325. dirty_to_clean = llist_append_to_list(&pool->drop_list, &unmap_list);
  326. dirty_to_clean += llist_append_to_list(&pool->free_list, &unmap_list);
  327. if (free_all) {
  328. unsigned long flags;
  329. spin_lock_irqsave(&pool->clean_lock, flags);
  330. llist_append_to_list(&pool->clean_list, &unmap_list);
  331. spin_unlock_irqrestore(&pool->clean_lock, flags);
  332. }
  333. free_goal = rds_ib_flush_goal(pool, free_all);
  334. if (list_empty(&unmap_list))
  335. goto out;
  336. rds_ib_unreg_frmr(&unmap_list, &nfreed, &unpinned, free_goal);
  337. if (!list_empty(&unmap_list)) {
  338. unsigned long flags;
  339. list_to_llist_nodes(&unmap_list, &clean_nodes, &clean_tail);
  340. if (ibmr_ret) {
  341. *ibmr_ret = llist_entry(clean_nodes, struct rds_ib_mr, llnode);
  342. clean_nodes = clean_nodes->next;
  343. }
  344. /* more than one entry in llist nodes */
  345. if (clean_nodes) {
  346. spin_lock_irqsave(&pool->clean_lock, flags);
  347. llist_add_batch(clean_nodes, clean_tail,
  348. &pool->clean_list);
  349. spin_unlock_irqrestore(&pool->clean_lock, flags);
  350. }
  351. }
  352. atomic_sub(unpinned, &pool->free_pinned);
  353. atomic_sub(dirty_to_clean, &pool->dirty_count);
  354. atomic_sub(nfreed, &pool->item_count);
  355. out:
  356. mutex_unlock(&pool->flush_lock);
  357. if (waitqueue_active(&pool->flush_wait))
  358. wake_up(&pool->flush_wait);
  359. out_nolock:
  360. return 0;
  361. }
  362. struct rds_ib_mr *rds_ib_try_reuse_ibmr(struct rds_ib_mr_pool *pool)
  363. {
  364. struct rds_ib_mr *ibmr = NULL;
  365. int iter = 0;
  366. while (1) {
  367. ibmr = rds_ib_reuse_mr(pool);
  368. if (ibmr)
  369. return ibmr;
  370. if (atomic_inc_return(&pool->item_count) <= pool->max_items)
  371. break;
  372. atomic_dec(&pool->item_count);
  373. if (++iter > 2) {
  374. if (pool->pool_type == RDS_IB_MR_8K_POOL)
  375. rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_depleted);
  376. else
  377. rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_depleted);
  378. break;
  379. }
  380. /* We do have some empty MRs. Flush them out. */
  381. if (pool->pool_type == RDS_IB_MR_8K_POOL)
  382. rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_wait);
  383. else
  384. rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_wait);
  385. rds_ib_flush_mr_pool(pool, 0, &ibmr);
  386. if (ibmr)
  387. return ibmr;
  388. }
  389. return NULL;
  390. }
  391. static void rds_ib_mr_pool_flush_worker(struct work_struct *work)
  392. {
  393. struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker.work);
  394. rds_ib_flush_mr_pool(pool, 0, NULL);
  395. }
  396. void rds_ib_free_mr(void *trans_private, int invalidate)
  397. {
  398. struct rds_ib_mr *ibmr = trans_private;
  399. struct rds_ib_mr_pool *pool = ibmr->pool;
  400. struct rds_ib_device *rds_ibdev = ibmr->device;
  401. rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len);
  402. if (ibmr->odp) {
  403. /* A MR created and marked as use_once. We use delayed work,
  404. * because there is a change that we are in interrupt and can't
  405. * call to ib_dereg_mr() directly.
  406. */
  407. INIT_DELAYED_WORK(&ibmr->work, rds_ib_odp_mr_worker);
  408. queue_delayed_work(rds_ib_mr_wq, &ibmr->work, 0);
  409. return;
  410. }
  411. /* Return it to the pool's free list */
  412. rds_ib_free_frmr_list(ibmr);
  413. atomic_add(ibmr->sg_len, &pool->free_pinned);
  414. atomic_inc(&pool->dirty_count);
  415. /* If we've pinned too many pages, request a flush */
  416. if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned ||
  417. atomic_read(&pool->dirty_count) >= pool->max_items / 5)
  418. queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10);
  419. if (invalidate) {
  420. if (likely(!in_interrupt())) {
  421. rds_ib_flush_mr_pool(pool, 0, NULL);
  422. } else {
  423. /* We get here if the user created a MR marked
  424. * as use_once and invalidate at the same time.
  425. */
  426. queue_delayed_work(rds_ib_mr_wq,
  427. &pool->flush_worker, 10);
  428. }
  429. }
  430. rds_ib_dev_put(rds_ibdev);
  431. }
  432. void rds_ib_flush_mrs(void)
  433. {
  434. struct rds_ib_device *rds_ibdev;
  435. down_read(&rds_ib_devices_lock);
  436. list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
  437. if (rds_ibdev->mr_8k_pool)
  438. rds_ib_flush_mr_pool(rds_ibdev->mr_8k_pool, 0, NULL);
  439. if (rds_ibdev->mr_1m_pool)
  440. rds_ib_flush_mr_pool(rds_ibdev->mr_1m_pool, 0, NULL);
  441. }
  442. up_read(&rds_ib_devices_lock);
  443. }
  444. u32 rds_ib_get_lkey(void *trans_private)
  445. {
  446. struct rds_ib_mr *ibmr = trans_private;
  447. return ibmr->u.mr->lkey;
  448. }
  449. void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
  450. struct rds_sock *rs, u32 *key_ret,
  451. struct rds_connection *conn,
  452. u64 start, u64 length, int need_odp)
  453. {
  454. struct rds_ib_device *rds_ibdev;
  455. struct rds_ib_mr *ibmr = NULL;
  456. struct rds_ib_connection *ic = NULL;
  457. int ret;
  458. rds_ibdev = rds_ib_get_device(rs->rs_bound_addr.s6_addr32[3]);
  459. if (!rds_ibdev) {
  460. ret = -ENODEV;
  461. goto out;
  462. }
  463. if (need_odp == ODP_ZEROBASED || need_odp == ODP_VIRTUAL) {
  464. u64 virt_addr = need_odp == ODP_ZEROBASED ? 0 : start;
  465. int access_flags =
  466. (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ |
  467. IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_ATOMIC |
  468. IB_ACCESS_ON_DEMAND);
  469. struct ib_sge sge = {};
  470. struct ib_mr *ib_mr;
  471. if (!rds_ibdev->odp_capable) {
  472. ret = -EOPNOTSUPP;
  473. goto out;
  474. }
  475. ib_mr = ib_reg_user_mr(rds_ibdev->pd, start, length, virt_addr,
  476. access_flags);
  477. if (IS_ERR(ib_mr)) {
  478. rdsdebug("rds_ib_get_user_mr returned %d\n",
  479. IS_ERR(ib_mr));
  480. ret = PTR_ERR(ib_mr);
  481. goto out;
  482. }
  483. if (key_ret)
  484. *key_ret = ib_mr->rkey;
  485. ibmr = kzalloc_obj(*ibmr);
  486. if (!ibmr) {
  487. ib_dereg_mr(ib_mr);
  488. ret = -ENOMEM;
  489. goto out;
  490. }
  491. ibmr->u.mr = ib_mr;
  492. ibmr->odp = 1;
  493. sge.addr = virt_addr;
  494. sge.length = length;
  495. sge.lkey = ib_mr->lkey;
  496. ib_advise_mr(rds_ibdev->pd,
  497. IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE,
  498. IB_UVERBS_ADVISE_MR_FLAG_FLUSH, &sge, 1);
  499. return ibmr;
  500. }
  501. if (conn) {
  502. ic = conn->c_transport_data;
  503. if (!ic || !ic->i_cm_id || !ic->i_cm_id->qp) {
  504. ret = -ENODEV;
  505. goto out;
  506. }
  507. }
  508. if (!rds_ibdev->mr_8k_pool || !rds_ibdev->mr_1m_pool) {
  509. ret = -ENODEV;
  510. goto out;
  511. }
  512. ibmr = rds_ib_reg_frmr(rds_ibdev, ic, sg, nents, key_ret);
  513. if (IS_ERR(ibmr)) {
  514. ret = PTR_ERR(ibmr);
  515. pr_warn("RDS/IB: rds_ib_get_mr failed (errno=%d)\n", ret);
  516. } else {
  517. return ibmr;
  518. }
  519. out:
  520. if (rds_ibdev)
  521. rds_ib_dev_put(rds_ibdev);
  522. return ERR_PTR(ret);
  523. }
  524. void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool)
  525. {
  526. cancel_delayed_work_sync(&pool->flush_worker);
  527. rds_ib_flush_mr_pool(pool, 1, NULL);
  528. WARN_ON(atomic_read(&pool->item_count));
  529. WARN_ON(atomic_read(&pool->free_pinned));
  530. kfree(pool);
  531. }
  532. struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev,
  533. int pool_type)
  534. {
  535. struct rds_ib_mr_pool *pool;
  536. pool = kzalloc_obj(*pool);
  537. if (!pool)
  538. return ERR_PTR(-ENOMEM);
  539. pool->pool_type = pool_type;
  540. init_llist_head(&pool->free_list);
  541. init_llist_head(&pool->drop_list);
  542. init_llist_head(&pool->clean_list);
  543. spin_lock_init(&pool->clean_lock);
  544. mutex_init(&pool->flush_lock);
  545. init_waitqueue_head(&pool->flush_wait);
  546. INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
  547. if (pool_type == RDS_IB_MR_1M_POOL) {
  548. /* +1 allows for unaligned MRs */
  549. pool->max_pages = RDS_MR_1M_MSG_SIZE + 1;
  550. pool->max_items = rds_ibdev->max_1m_mrs;
  551. } else {
  552. /* pool_type == RDS_IB_MR_8K_POOL */
  553. pool->max_pages = RDS_MR_8K_MSG_SIZE + 1;
  554. pool->max_items = rds_ibdev->max_8k_mrs;
  555. }
  556. pool->max_free_pinned = pool->max_items * pool->max_pages / 4;
  557. pool->max_items_soft = rds_ibdev->max_mrs * 3 / 4;
  558. return pool;
  559. }
  560. int rds_ib_mr_init(void)
  561. {
  562. rds_ib_mr_wq = alloc_workqueue("rds_mr_flushd",
  563. WQ_MEM_RECLAIM | WQ_PERCPU, 0);
  564. if (!rds_ib_mr_wq)
  565. return -ENOMEM;
  566. return 0;
  567. }
  568. /* By the time this is called all the IB devices should have been torn down and
  569. * had their pools freed. As each pool is freed its work struct is waited on,
  570. * so the pool flushing work queue should be idle by the time we get here.
  571. */
  572. void rds_ib_mr_exit(void)
  573. {
  574. destroy_workqueue(rds_ib_mr_wq);
  575. }
  576. static void rds_ib_odp_mr_worker(struct work_struct *work)
  577. {
  578. struct rds_ib_mr *ibmr;
  579. ibmr = container_of(work, struct rds_ib_mr, work.work);
  580. ib_dereg_mr(ibmr->u.mr);
  581. kfree(ibmr);
  582. }