umem_odp.c 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469
  1. /*
  2. * Copyright (c) 2014 Mellanox Technologies. All rights reserved.
  3. *
  4. * This software is available to you under a choice of one of two
  5. * licenses. You may choose to be licensed under the terms of the GNU
  6. * General Public License (GPL) Version 2, available from the file
  7. * COPYING in the main directory of this source tree, or the
  8. * OpenIB.org BSD license below:
  9. *
  10. * Redistribution and use in source and binary forms, with or
  11. * without modification, are permitted provided that the following
  12. * conditions are met:
  13. *
  14. * - Redistributions of source code must retain the above
  15. * copyright notice, this list of conditions and the following
  16. * disclaimer.
  17. *
  18. * - Redistributions in binary form must reproduce the above
  19. * copyright notice, this list of conditions and the following
  20. * disclaimer in the documentation and/or other materials
  21. * provided with the distribution.
  22. *
  23. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  24. * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  25. * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  26. * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  27. * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  28. * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  29. * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  30. * SOFTWARE.
  31. */
  32. #include <linux/types.h>
  33. #include <linux/sched.h>
  34. #include <linux/sched/mm.h>
  35. #include <linux/sched/task.h>
  36. #include <linux/pid.h>
  37. #include <linux/slab.h>
  38. #include <linux/export.h>
  39. #include <linux/vmalloc.h>
  40. #include <linux/hugetlb.h>
  41. #include <linux/interval_tree.h>
  42. #include <linux/hmm.h>
  43. #include <linux/hmm-dma.h>
  44. #include <linux/pagemap.h>
  45. #include <rdma/ib_umem_odp.h>
  46. #include "uverbs.h"
  47. static void ib_init_umem_implicit_odp(struct ib_umem_odp *umem_odp)
  48. {
  49. umem_odp->is_implicit_odp = 1;
  50. umem_odp->umem.is_odp = 1;
  51. mutex_init(&umem_odp->umem_mutex);
  52. }
  53. static int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
  54. const struct mmu_interval_notifier_ops *ops)
  55. {
  56. struct ib_device *dev = umem_odp->umem.ibdev;
  57. size_t page_size = 1UL << umem_odp->page_shift;
  58. struct hmm_dma_map *map;
  59. unsigned long start;
  60. unsigned long end;
  61. size_t nr_entries;
  62. int ret = 0;
  63. umem_odp->umem.is_odp = 1;
  64. mutex_init(&umem_odp->umem_mutex);
  65. start = ALIGN_DOWN(umem_odp->umem.address, page_size);
  66. if (check_add_overflow(umem_odp->umem.address,
  67. (unsigned long)umem_odp->umem.length, &end))
  68. return -EOVERFLOW;
  69. end = ALIGN(end, page_size);
  70. if (unlikely(end < page_size))
  71. return -EOVERFLOW;
  72. /*
  73. * The mmu notifier can be called within reclaim contexts and takes the
  74. * umem_mutex. This is rare to trigger in testing, teach lockdep about
  75. * it.
  76. */
  77. if (IS_ENABLED(CONFIG_LOCKDEP)) {
  78. fs_reclaim_acquire(GFP_KERNEL);
  79. mutex_lock(&umem_odp->umem_mutex);
  80. mutex_unlock(&umem_odp->umem_mutex);
  81. fs_reclaim_release(GFP_KERNEL);
  82. }
  83. nr_entries = (end - start) >> PAGE_SHIFT;
  84. if (!(nr_entries * PAGE_SIZE / page_size))
  85. return -EINVAL;
  86. map = &umem_odp->map;
  87. if (ib_uses_virt_dma(dev)) {
  88. map->pfn_list = kvcalloc(nr_entries, sizeof(*map->pfn_list),
  89. GFP_KERNEL | __GFP_NOWARN);
  90. if (!map->pfn_list)
  91. ret = -ENOMEM;
  92. } else
  93. ret = hmm_dma_map_alloc(dev->dma_device, map,
  94. (end - start) >> PAGE_SHIFT,
  95. 1 << umem_odp->page_shift);
  96. if (ret)
  97. return ret;
  98. ret = mmu_interval_notifier_insert(&umem_odp->notifier,
  99. umem_odp->umem.owning_mm, start,
  100. end - start, ops);
  101. if (ret)
  102. goto out_free_map;
  103. return 0;
  104. out_free_map:
  105. if (ib_uses_virt_dma(dev))
  106. kvfree(map->pfn_list);
  107. else
  108. hmm_dma_map_free(dev->dma_device, map);
  109. return ret;
  110. }
  111. /**
  112. * ib_umem_odp_alloc_implicit - Allocate a parent implicit ODP umem
  113. *
  114. * Implicit ODP umems do not have a VA range and do not have any page lists.
  115. * They exist only to hold the per_mm reference to help the driver create
  116. * children umems.
  117. *
  118. * @device: IB device to create UMEM
  119. * @access: ib_reg_mr access flags
  120. */
  121. struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_device *device,
  122. int access)
  123. {
  124. struct ib_umem *umem;
  125. struct ib_umem_odp *umem_odp;
  126. if (access & IB_ACCESS_HUGETLB)
  127. return ERR_PTR(-EINVAL);
  128. umem_odp = kzalloc_obj(*umem_odp);
  129. if (!umem_odp)
  130. return ERR_PTR(-ENOMEM);
  131. umem = &umem_odp->umem;
  132. umem->ibdev = device;
  133. umem->writable = ib_access_writable(access);
  134. umem->owning_mm = current->mm;
  135. umem_odp->page_shift = PAGE_SHIFT;
  136. umem_odp->tgid = get_task_pid(current, PIDTYPE_TGID);
  137. ib_init_umem_implicit_odp(umem_odp);
  138. return umem_odp;
  139. }
  140. EXPORT_SYMBOL(ib_umem_odp_alloc_implicit);
  141. /**
  142. * ib_umem_odp_alloc_child - Allocate a child ODP umem under an implicit
  143. * parent ODP umem
  144. *
  145. * @root: The parent umem enclosing the child. This must be allocated using
  146. * ib_alloc_implicit_odp_umem()
  147. * @addr: The starting userspace VA
  148. * @size: The length of the userspace VA
  149. * @ops: MMU interval ops, currently only @invalidate
  150. */
  151. struct ib_umem_odp *
  152. ib_umem_odp_alloc_child(struct ib_umem_odp *root, unsigned long addr,
  153. size_t size,
  154. const struct mmu_interval_notifier_ops *ops)
  155. {
  156. /*
  157. * Caller must ensure that root cannot be freed during the call to
  158. * ib_alloc_odp_umem.
  159. */
  160. struct ib_umem_odp *odp_data;
  161. struct ib_umem *umem;
  162. int ret;
  163. if (WARN_ON(!root->is_implicit_odp))
  164. return ERR_PTR(-EINVAL);
  165. odp_data = kzalloc_obj(*odp_data);
  166. if (!odp_data)
  167. return ERR_PTR(-ENOMEM);
  168. umem = &odp_data->umem;
  169. umem->ibdev = root->umem.ibdev;
  170. umem->length = size;
  171. umem->address = addr;
  172. umem->writable = root->umem.writable;
  173. umem->owning_mm = root->umem.owning_mm;
  174. odp_data->page_shift = PAGE_SHIFT;
  175. odp_data->notifier.ops = ops;
  176. /*
  177. * A mmget must be held when registering a notifier, the owming_mm only
  178. * has a mm_grab at this point.
  179. */
  180. if (!mmget_not_zero(umem->owning_mm)) {
  181. ret = -EFAULT;
  182. goto out_free;
  183. }
  184. odp_data->tgid = get_pid(root->tgid);
  185. ret = ib_init_umem_odp(odp_data, ops);
  186. if (ret)
  187. goto out_tgid;
  188. mmput(umem->owning_mm);
  189. return odp_data;
  190. out_tgid:
  191. put_pid(odp_data->tgid);
  192. mmput(umem->owning_mm);
  193. out_free:
  194. kfree(odp_data);
  195. return ERR_PTR(ret);
  196. }
  197. EXPORT_SYMBOL(ib_umem_odp_alloc_child);
  198. /**
  199. * ib_umem_odp_get - Create a umem_odp for a userspace va
  200. *
  201. * @device: IB device struct to get UMEM
  202. * @addr: userspace virtual address to start at
  203. * @size: length of region to pin
  204. * @access: IB_ACCESS_xxx flags for memory being pinned
  205. * @ops: MMU interval ops, currently only @invalidate
  206. *
  207. * The driver should use when the access flags indicate ODP memory. It avoids
  208. * pinning, instead, stores the mm for future page fault handling in
  209. * conjunction with MMU notifiers.
  210. */
  211. struct ib_umem_odp *ib_umem_odp_get(struct ib_device *device,
  212. unsigned long addr, size_t size, int access,
  213. const struct mmu_interval_notifier_ops *ops)
  214. {
  215. struct ib_umem_odp *umem_odp;
  216. int ret;
  217. if (WARN_ON_ONCE(!(access & IB_ACCESS_ON_DEMAND)))
  218. return ERR_PTR(-EINVAL);
  219. umem_odp = kzalloc_obj(struct ib_umem_odp);
  220. if (!umem_odp)
  221. return ERR_PTR(-ENOMEM);
  222. umem_odp->umem.ibdev = device;
  223. umem_odp->umem.length = size;
  224. umem_odp->umem.address = addr;
  225. umem_odp->umem.writable = ib_access_writable(access);
  226. umem_odp->umem.owning_mm = current->mm;
  227. umem_odp->notifier.ops = ops;
  228. umem_odp->page_shift = PAGE_SHIFT;
  229. #ifdef CONFIG_HUGETLB_PAGE
  230. if (access & IB_ACCESS_HUGETLB)
  231. umem_odp->page_shift = HPAGE_SHIFT;
  232. #endif
  233. umem_odp->tgid = get_task_pid(current, PIDTYPE_TGID);
  234. ret = ib_init_umem_odp(umem_odp, ops);
  235. if (ret)
  236. goto err_put_pid;
  237. return umem_odp;
  238. err_put_pid:
  239. put_pid(umem_odp->tgid);
  240. kfree(umem_odp);
  241. return ERR_PTR(ret);
  242. }
  243. EXPORT_SYMBOL(ib_umem_odp_get);
  244. static void ib_umem_odp_free(struct ib_umem_odp *umem_odp)
  245. {
  246. struct ib_device *dev = umem_odp->umem.ibdev;
  247. /*
  248. * Ensure that no more pages are mapped in the umem.
  249. *
  250. * It is the driver's responsibility to ensure, before calling us,
  251. * that the hardware will not attempt to access the MR any more.
  252. */
  253. mutex_lock(&umem_odp->umem_mutex);
  254. ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
  255. ib_umem_end(umem_odp));
  256. mutex_unlock(&umem_odp->umem_mutex);
  257. mmu_interval_notifier_remove(&umem_odp->notifier);
  258. if (ib_uses_virt_dma(dev))
  259. kvfree(umem_odp->map.pfn_list);
  260. else
  261. hmm_dma_map_free(dev->dma_device, &umem_odp->map);
  262. }
  263. void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
  264. {
  265. if (!umem_odp->is_implicit_odp)
  266. ib_umem_odp_free(umem_odp);
  267. put_pid(umem_odp->tgid);
  268. kfree(umem_odp);
  269. }
  270. EXPORT_SYMBOL(ib_umem_odp_release);
  271. /**
  272. * ib_umem_odp_map_dma_and_lock - DMA map userspace memory in an ODP MR and lock it.
  273. *
  274. * Maps the range passed in the argument to DMA addresses.
  275. * Upon success the ODP MR will be locked to let caller complete its device
  276. * page table update.
  277. *
  278. * Returns the number of pages mapped in success, negative error code
  279. * for failure.
  280. * @umem_odp: the umem to map and pin
  281. * @user_virt: the address from which we need to map.
  282. * @bcnt: the minimal number of bytes to pin and map. The mapping might be
  283. * bigger due to alignment, and may also be smaller in case of an error
  284. * pinning or mapping a page. The actual pages mapped is returned in
  285. * the return value.
  286. * @access_mask: bit mask of the requested access permissions for the given
  287. * range.
  288. * @fault: is faulting required for the given range
  289. */
  290. int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
  291. u64 bcnt, u64 access_mask, bool fault)
  292. __acquires(&umem_odp->umem_mutex)
  293. {
  294. struct task_struct *owning_process = NULL;
  295. struct mm_struct *owning_mm = umem_odp->umem.owning_mm;
  296. int pfn_index, dma_index, ret = 0, start_idx;
  297. unsigned int page_shift, hmm_order, pfn_start_idx;
  298. unsigned long num_pfns, current_seq;
  299. struct hmm_range range = {};
  300. unsigned long timeout;
  301. if (user_virt < ib_umem_start(umem_odp) ||
  302. user_virt + bcnt > ib_umem_end(umem_odp))
  303. return -EFAULT;
  304. page_shift = umem_odp->page_shift;
  305. /*
  306. * owning_process is allowed to be NULL, this means somehow the mm is
  307. * existing beyond the lifetime of the originating process.. Presumably
  308. * mmget_not_zero will fail in this case.
  309. */
  310. owning_process = get_pid_task(umem_odp->tgid, PIDTYPE_PID);
  311. if (!owning_process || !mmget_not_zero(owning_mm)) {
  312. ret = -EINVAL;
  313. goto out_put_task;
  314. }
  315. range.notifier = &umem_odp->notifier;
  316. range.start = ALIGN_DOWN(user_virt, 1UL << page_shift);
  317. range.end = ALIGN(user_virt + bcnt, 1UL << page_shift);
  318. pfn_start_idx = (range.start - ib_umem_start(umem_odp)) >> PAGE_SHIFT;
  319. num_pfns = (range.end - range.start) >> PAGE_SHIFT;
  320. if (fault) {
  321. range.default_flags = HMM_PFN_REQ_FAULT;
  322. if (access_mask & HMM_PFN_WRITE)
  323. range.default_flags |= HMM_PFN_REQ_WRITE;
  324. }
  325. range.hmm_pfns = &(umem_odp->map.pfn_list[pfn_start_idx]);
  326. timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
  327. retry:
  328. current_seq = range.notifier_seq =
  329. mmu_interval_read_begin(&umem_odp->notifier);
  330. mmap_read_lock(owning_mm);
  331. ret = hmm_range_fault(&range);
  332. mmap_read_unlock(owning_mm);
  333. if (unlikely(ret)) {
  334. if (ret == -EBUSY && !time_after(jiffies, timeout))
  335. goto retry;
  336. goto out_put_mm;
  337. }
  338. start_idx = (range.start - ib_umem_start(umem_odp)) >> page_shift;
  339. dma_index = start_idx;
  340. mutex_lock(&umem_odp->umem_mutex);
  341. if (mmu_interval_read_retry(&umem_odp->notifier, current_seq)) {
  342. mutex_unlock(&umem_odp->umem_mutex);
  343. goto retry;
  344. }
  345. for (pfn_index = 0; pfn_index < num_pfns;
  346. pfn_index += 1 << (page_shift - PAGE_SHIFT), dma_index++) {
  347. /*
  348. * Since we asked for hmm_range_fault() to populate
  349. * pages it shouldn't return an error entry on success.
  350. */
  351. WARN_ON(fault && range.hmm_pfns[pfn_index] & HMM_PFN_ERROR);
  352. WARN_ON(fault && !(range.hmm_pfns[pfn_index] & HMM_PFN_VALID));
  353. if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID))
  354. continue;
  355. if (range.hmm_pfns[pfn_index] & HMM_PFN_DMA_MAPPED)
  356. continue;
  357. hmm_order = hmm_pfn_to_map_order(range.hmm_pfns[pfn_index]);
  358. /* If a hugepage was detected and ODP wasn't set for, the umem
  359. * page_shift will be used, the opposite case is an error.
  360. */
  361. if (hmm_order + PAGE_SHIFT < page_shift) {
  362. ret = -EINVAL;
  363. ibdev_dbg(umem_odp->umem.ibdev,
  364. "%s: un-expected hmm_order %u, page_shift %u\n",
  365. __func__, hmm_order, page_shift);
  366. break;
  367. }
  368. }
  369. /* upon success lock should stay on hold for the callee */
  370. if (!ret)
  371. ret = dma_index - start_idx;
  372. else
  373. mutex_unlock(&umem_odp->umem_mutex);
  374. out_put_mm:
  375. mmput_async(owning_mm);
  376. out_put_task:
  377. if (owning_process)
  378. put_task_struct(owning_process);
  379. return ret;
  380. }
  381. EXPORT_SYMBOL(ib_umem_odp_map_dma_and_lock);
  382. void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
  383. u64 bound)
  384. {
  385. struct ib_device *dev = umem_odp->umem.ibdev;
  386. u64 addr;
  387. lockdep_assert_held(&umem_odp->umem_mutex);
  388. virt = max_t(u64, virt, ib_umem_start(umem_odp));
  389. bound = min_t(u64, bound, ib_umem_end(umem_odp));
  390. for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) {
  391. u64 offset = addr - ib_umem_start(umem_odp);
  392. size_t idx = offset >> umem_odp->page_shift;
  393. unsigned long pfn = umem_odp->map.pfn_list[idx];
  394. if (!hmm_dma_unmap_pfn(dev->dma_device, &umem_odp->map, idx))
  395. goto clear;
  396. if (pfn & HMM_PFN_WRITE) {
  397. struct page *page = hmm_pfn_to_page(pfn);
  398. struct page *head_page = compound_head(page);
  399. /*
  400. * set_page_dirty prefers being called with
  401. * the page lock. However, MMU notifiers are
  402. * called sometimes with and sometimes without
  403. * the lock. We rely on the umem_mutex instead
  404. * to prevent other mmu notifiers from
  405. * continuing and allowing the page mapping to
  406. * be removed.
  407. */
  408. set_page_dirty(head_page);
  409. }
  410. umem_odp->npages--;
  411. clear:
  412. umem_odp->map.pfn_list[idx] &= ~HMM_PFN_FLAGS;
  413. }
  414. }
  415. EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);