mshv_eventfd.c 20 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839
  1. // SPDX-License-Identifier: GPL-2.0-only
/*
 * eventfd support for mshv
 *
 * Heavily inspired by the KVM implementation of irqfd/ioeventfd. The basic
 * framework code is taken from the KVM implementation.
 *
 * All credit goes to the KVM developers.
 */
  10. #include <linux/syscalls.h>
  11. #include <linux/wait.h>
  12. #include <linux/poll.h>
  13. #include <linux/file.h>
  14. #include <linux/list.h>
  15. #include <linux/workqueue.h>
  16. #include <linux/eventfd.h>
  17. #if IS_ENABLED(CONFIG_X86_64)
  18. #include <asm/apic.h>
  19. #endif
  20. #include <asm/mshyperv.h>
  21. #include "mshv_eventfd.h"
  22. #include "mshv.h"
  23. #include "mshv_root.h"
  24. static struct workqueue_struct *irqfd_cleanup_wq;
  25. void mshv_register_irq_ack_notifier(struct mshv_partition *partition,
  26. struct mshv_irq_ack_notifier *mian)
  27. {
  28. mutex_lock(&partition->pt_irq_lock);
  29. hlist_add_head_rcu(&mian->link, &partition->irq_ack_notifier_list);
  30. mutex_unlock(&partition->pt_irq_lock);
  31. }
  32. void mshv_unregister_irq_ack_notifier(struct mshv_partition *partition,
  33. struct mshv_irq_ack_notifier *mian)
  34. {
  35. mutex_lock(&partition->pt_irq_lock);
  36. hlist_del_init_rcu(&mian->link);
  37. mutex_unlock(&partition->pt_irq_lock);
  38. synchronize_rcu();
  39. }
  40. bool mshv_notify_acked_gsi(struct mshv_partition *partition, int gsi)
  41. {
  42. struct mshv_irq_ack_notifier *mian;
  43. bool acked = false;
  44. rcu_read_lock();
  45. hlist_for_each_entry_rcu(mian, &partition->irq_ack_notifier_list,
  46. link) {
  47. if (mian->irq_ack_gsi == gsi) {
  48. mian->irq_acked(mian);
  49. acked = true;
  50. }
  51. }
  52. rcu_read_unlock();
  53. return acked;
  54. }
  55. #if IS_ENABLED(CONFIG_ARM64)
  56. static inline bool hv_should_clear_interrupt(enum hv_interrupt_type type)
  57. {
  58. return false;
  59. }
  60. #elif IS_ENABLED(CONFIG_X86_64)
  61. static inline bool hv_should_clear_interrupt(enum hv_interrupt_type type)
  62. {
  63. return type == HV_X64_INTERRUPT_TYPE_EXTINT;
  64. }
  65. #endif
  66. static void mshv_irqfd_resampler_ack(struct mshv_irq_ack_notifier *mian)
  67. {
  68. struct mshv_irqfd_resampler *resampler;
  69. struct mshv_partition *partition;
  70. struct mshv_irqfd *irqfd;
  71. int idx;
  72. resampler = container_of(mian, struct mshv_irqfd_resampler,
  73. rsmplr_notifier);
  74. partition = resampler->rsmplr_partn;
  75. idx = srcu_read_lock(&partition->pt_irq_srcu);
  76. hlist_for_each_entry_srcu(irqfd, &resampler->rsmplr_irqfd_list,
  77. irqfd_resampler_hnode,
  78. srcu_read_lock_held(&partition->pt_irq_srcu)) {
  79. if (hv_should_clear_interrupt(irqfd->irqfd_lapic_irq.lapic_control.interrupt_type))
  80. hv_call_clear_virtual_interrupt(partition->pt_id);
  81. eventfd_signal(irqfd->irqfd_resamplefd);
  82. }
  83. srcu_read_unlock(&partition->pt_irq_srcu, idx);
  84. }
  85. #if IS_ENABLED(CONFIG_X86_64)
  86. static bool
  87. mshv_vp_irq_vector_injected(union hv_vp_register_page_interrupt_vectors iv,
  88. u32 vector)
  89. {
  90. int i;
  91. for (i = 0; i < iv.vector_count; i++) {
  92. if (iv.vector[i] == vector)
  93. return true;
  94. }
  95. return false;
  96. }
  97. static int mshv_vp_irq_try_set_vector(struct mshv_vp *vp, u32 vector)
  98. {
  99. union hv_vp_register_page_interrupt_vectors iv, new_iv;
  100. iv = vp->vp_register_page->interrupt_vectors;
  101. new_iv = iv;
  102. if (mshv_vp_irq_vector_injected(iv, vector))
  103. return 0;
  104. if (iv.vector_count >= HV_VP_REGISTER_PAGE_MAX_VECTOR_COUNT)
  105. return -ENOSPC;
  106. new_iv.vector[new_iv.vector_count++] = vector;
  107. if (!try_cmpxchg(&vp->vp_register_page->interrupt_vectors.as_uint64,
  108. &iv.as_uint64, new_iv.as_uint64))
  109. return -EAGAIN;
  110. return 0;
  111. }
  112. static int mshv_vp_irq_set_vector(struct mshv_vp *vp, u32 vector)
  113. {
  114. int ret;
  115. do {
  116. ret = mshv_vp_irq_try_set_vector(vp, vector);
  117. } while (ret == -EAGAIN && !need_resched());
  118. return ret;
  119. }
  120. /*
  121. * Try to raise irq for guest via shared vector array. hyp does the actual
  122. * inject of the interrupt.
  123. */
  124. static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd)
  125. {
  126. struct mshv_partition *partition = irqfd->irqfd_partn;
  127. struct mshv_lapic_irq *irq = &irqfd->irqfd_lapic_irq;
  128. struct mshv_vp *vp;
  129. if (!(ms_hyperv.ext_features &
  130. HV_VP_DISPATCH_INTERRUPT_INJECTION_AVAILABLE))
  131. return -EOPNOTSUPP;
  132. if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT)
  133. return -EOPNOTSUPP;
  134. #if IS_ENABLED(CONFIG_X86)
  135. if (irq->lapic_control.logical_dest_mode)
  136. return -EOPNOTSUPP;
  137. #endif
  138. vp = partition->pt_vp_array[irq->lapic_apic_id];
  139. if (!vp->vp_register_page)
  140. return -EOPNOTSUPP;
  141. if (mshv_vp_irq_set_vector(vp, irq->lapic_vector))
  142. return -EINVAL;
  143. if (vp->run.flags.root_sched_dispatched &&
  144. vp->vp_register_page->interrupt_vectors.as_uint64)
  145. return -EBUSY;
  146. wake_up(&vp->run.vp_suspend_queue);
  147. return 0;
  148. }
  149. #else /* CONFIG_X86_64 */
  150. static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd)
  151. {
  152. return -EOPNOTSUPP;
  153. }
  154. #endif
  155. static void mshv_assert_irq_slow(struct mshv_irqfd *irqfd)
  156. {
  157. struct mshv_partition *partition = irqfd->irqfd_partn;
  158. struct mshv_lapic_irq *irq = &irqfd->irqfd_lapic_irq;
  159. unsigned int seq;
  160. int idx;
  161. #if IS_ENABLED(CONFIG_X86)
  162. WARN_ON(irqfd->irqfd_resampler &&
  163. !irq->lapic_control.level_triggered);
  164. #endif
  165. idx = srcu_read_lock(&partition->pt_irq_srcu);
  166. if (irqfd->irqfd_girq_ent.guest_irq_num) {
  167. if (!irqfd->irqfd_girq_ent.girq_entry_valid) {
  168. srcu_read_unlock(&partition->pt_irq_srcu, idx);
  169. return;
  170. }
  171. do {
  172. seq = read_seqcount_begin(&irqfd->irqfd_irqe_sc);
  173. } while (read_seqcount_retry(&irqfd->irqfd_irqe_sc, seq));
  174. }
  175. hv_call_assert_virtual_interrupt(irqfd->irqfd_partn->pt_id,
  176. irq->lapic_vector, irq->lapic_apic_id,
  177. irq->lapic_control);
  178. srcu_read_unlock(&partition->pt_irq_srcu, idx);
  179. }
  180. static void mshv_irqfd_resampler_shutdown(struct mshv_irqfd *irqfd)
  181. {
  182. struct mshv_irqfd_resampler *rp = irqfd->irqfd_resampler;
  183. struct mshv_partition *pt = rp->rsmplr_partn;
  184. mutex_lock(&pt->irqfds_resampler_lock);
  185. hlist_del_rcu(&irqfd->irqfd_resampler_hnode);
  186. synchronize_srcu(&pt->pt_irq_srcu);
  187. if (hlist_empty(&rp->rsmplr_irqfd_list)) {
  188. hlist_del(&rp->rsmplr_hnode);
  189. mshv_unregister_irq_ack_notifier(pt, &rp->rsmplr_notifier);
  190. kfree(rp);
  191. }
  192. mutex_unlock(&pt->irqfds_resampler_lock);
  193. }
  194. /*
  195. * Race-free decouple logic (ordering is critical)
  196. */
  197. static void mshv_irqfd_shutdown(struct work_struct *work)
  198. {
  199. struct mshv_irqfd *irqfd =
  200. container_of(work, struct mshv_irqfd, irqfd_shutdown);
  201. u64 cnt;
  202. /*
  203. * Synchronize with the wait-queue and unhook ourselves to prevent
  204. * further events.
  205. */
  206. eventfd_ctx_remove_wait_queue(irqfd->irqfd_eventfd_ctx, &irqfd->irqfd_wait, &cnt);
  207. if (irqfd->irqfd_resampler) {
  208. mshv_irqfd_resampler_shutdown(irqfd);
  209. eventfd_ctx_put(irqfd->irqfd_resamplefd);
  210. }
  211. /*
  212. * It is now safe to release the object's resources
  213. */
  214. eventfd_ctx_put(irqfd->irqfd_eventfd_ctx);
  215. kfree(irqfd);
  216. }
  217. /* assumes partition->pt_irqfds_lock is held */
  218. static bool mshv_irqfd_is_active(struct mshv_irqfd *irqfd)
  219. {
  220. return !hlist_unhashed(&irqfd->irqfd_hnode);
  221. }
  222. /*
  223. * Mark the irqfd as inactive and schedule it for removal
  224. *
  225. * assumes partition->pt_irqfds_lock is held
  226. */
  227. static void mshv_irqfd_deactivate(struct mshv_irqfd *irqfd)
  228. {
  229. if (!mshv_irqfd_is_active(irqfd))
  230. return;
  231. hlist_del(&irqfd->irqfd_hnode);
  232. queue_work(irqfd_cleanup_wq, &irqfd->irqfd_shutdown);
  233. }
  234. /*
  235. * Called with wqh->lock held and interrupts disabled
  236. */
  237. static int mshv_irqfd_wakeup(wait_queue_entry_t *wait, unsigned int mode,
  238. int sync, void *key)
  239. {
  240. struct mshv_irqfd *irqfd = container_of(wait, struct mshv_irqfd,
  241. irqfd_wait);
  242. __poll_t flags = key_to_poll(key);
  243. int idx;
  244. unsigned int seq;
  245. struct mshv_partition *pt = irqfd->irqfd_partn;
  246. int ret = 0;
  247. if (flags & EPOLLIN) {
  248. u64 cnt;
  249. eventfd_ctx_do_read(irqfd->irqfd_eventfd_ctx, &cnt);
  250. idx = srcu_read_lock(&pt->pt_irq_srcu);
  251. do {
  252. seq = read_seqcount_begin(&irqfd->irqfd_irqe_sc);
  253. } while (read_seqcount_retry(&irqfd->irqfd_irqe_sc, seq));
  254. /* An event has been signaled, raise an interrupt */
  255. ret = mshv_try_assert_irq_fast(irqfd);
  256. if (ret)
  257. mshv_assert_irq_slow(irqfd);
  258. srcu_read_unlock(&pt->pt_irq_srcu, idx);
  259. ret = 1;
  260. }
  261. if (flags & EPOLLHUP) {
  262. /* The eventfd is closing, detach from the partition */
  263. unsigned long flags;
  264. spin_lock_irqsave(&pt->pt_irqfds_lock, flags);
  265. /*
  266. * We must check if someone deactivated the irqfd before
  267. * we could acquire the pt_irqfds_lock since the item is
  268. * deactivated from the mshv side before it is unhooked from
  269. * the wait-queue. If it is already deactivated, we can
  270. * simply return knowing the other side will cleanup for us.
  271. * We cannot race against the irqfd going away since the
  272. * other side is required to acquire wqh->lock, which we hold
  273. */
  274. if (mshv_irqfd_is_active(irqfd))
  275. mshv_irqfd_deactivate(irqfd);
  276. spin_unlock_irqrestore(&pt->pt_irqfds_lock, flags);
  277. }
  278. return ret;
  279. }
  280. /* Must be called under pt_irqfds_lock */
  281. static void mshv_irqfd_update(struct mshv_partition *pt,
  282. struct mshv_irqfd *irqfd)
  283. {
  284. write_seqcount_begin(&irqfd->irqfd_irqe_sc);
  285. irqfd->irqfd_girq_ent = mshv_ret_girq_entry(pt,
  286. irqfd->irqfd_irqnum);
  287. mshv_copy_girq_info(&irqfd->irqfd_girq_ent, &irqfd->irqfd_lapic_irq);
  288. write_seqcount_end(&irqfd->irqfd_irqe_sc);
  289. }
  290. void mshv_irqfd_routing_update(struct mshv_partition *pt)
  291. {
  292. struct mshv_irqfd *irqfd;
  293. spin_lock_irq(&pt->pt_irqfds_lock);
  294. hlist_for_each_entry(irqfd, &pt->pt_irqfds_list, irqfd_hnode)
  295. mshv_irqfd_update(pt, irqfd);
  296. spin_unlock_irq(&pt->pt_irqfds_lock);
  297. }
  298. static void mshv_irqfd_queue_proc(struct file *file, wait_queue_head_t *wqh,
  299. poll_table *polltbl)
  300. {
  301. struct mshv_irqfd *irqfd =
  302. container_of(polltbl, struct mshv_irqfd, irqfd_polltbl);
  303. /*
  304. * TODO: Ensure there isn't already an exclusive, priority waiter, e.g.
  305. * that the irqfd isn't already bound to another partition. Only the
  306. * first exclusive waiter encountered will be notified, and
  307. * add_wait_queue_priority() doesn't enforce exclusivity.
  308. */
  309. irqfd->irqfd_wait.flags |= WQ_FLAG_EXCLUSIVE;
  310. add_wait_queue_priority(wqh, &irqfd->irqfd_wait);
  311. }
  312. static int mshv_irqfd_assign(struct mshv_partition *pt,
  313. struct mshv_user_irqfd *args)
  314. {
  315. struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;
  316. struct mshv_irqfd *irqfd, *tmp;
  317. __poll_t events;
  318. int ret;
  319. int idx;
  320. CLASS(fd, f)(args->fd);
  321. irqfd = kzalloc_obj(*irqfd);
  322. if (!irqfd)
  323. return -ENOMEM;
  324. irqfd->irqfd_partn = pt;
  325. irqfd->irqfd_irqnum = args->gsi;
  326. INIT_WORK(&irqfd->irqfd_shutdown, mshv_irqfd_shutdown);
  327. seqcount_spinlock_init(&irqfd->irqfd_irqe_sc, &pt->pt_irqfds_lock);
  328. if (fd_empty(f)) {
  329. ret = -EBADF;
  330. goto out;
  331. }
  332. eventfd = eventfd_ctx_fileget(fd_file(f));
  333. if (IS_ERR(eventfd)) {
  334. ret = PTR_ERR(eventfd);
  335. goto fail;
  336. }
  337. irqfd->irqfd_eventfd_ctx = eventfd;
  338. if (args->flags & BIT(MSHV_IRQFD_BIT_RESAMPLE)) {
  339. struct mshv_irqfd_resampler *rp;
  340. resamplefd = eventfd_ctx_fdget(args->resamplefd);
  341. if (IS_ERR(resamplefd)) {
  342. ret = PTR_ERR(resamplefd);
  343. goto fail;
  344. }
  345. irqfd->irqfd_resamplefd = resamplefd;
  346. mutex_lock(&pt->irqfds_resampler_lock);
  347. hlist_for_each_entry(rp, &pt->irqfds_resampler_list,
  348. rsmplr_hnode) {
  349. if (rp->rsmplr_notifier.irq_ack_gsi ==
  350. irqfd->irqfd_irqnum) {
  351. irqfd->irqfd_resampler = rp;
  352. break;
  353. }
  354. }
  355. if (!irqfd->irqfd_resampler) {
  356. rp = kzalloc_obj(*rp, GFP_KERNEL_ACCOUNT);
  357. if (!rp) {
  358. ret = -ENOMEM;
  359. mutex_unlock(&pt->irqfds_resampler_lock);
  360. goto fail;
  361. }
  362. rp->rsmplr_partn = pt;
  363. INIT_HLIST_HEAD(&rp->rsmplr_irqfd_list);
  364. rp->rsmplr_notifier.irq_ack_gsi = irqfd->irqfd_irqnum;
  365. rp->rsmplr_notifier.irq_acked =
  366. mshv_irqfd_resampler_ack;
  367. hlist_add_head(&rp->rsmplr_hnode,
  368. &pt->irqfds_resampler_list);
  369. mshv_register_irq_ack_notifier(pt,
  370. &rp->rsmplr_notifier);
  371. irqfd->irqfd_resampler = rp;
  372. }
  373. hlist_add_head_rcu(&irqfd->irqfd_resampler_hnode,
  374. &irqfd->irqfd_resampler->rsmplr_irqfd_list);
  375. mutex_unlock(&pt->irqfds_resampler_lock);
  376. }
  377. /*
  378. * Install our own custom wake-up handling so we are notified via
  379. * a callback whenever someone signals the underlying eventfd
  380. */
  381. init_waitqueue_func_entry(&irqfd->irqfd_wait, mshv_irqfd_wakeup);
  382. init_poll_funcptr(&irqfd->irqfd_polltbl, mshv_irqfd_queue_proc);
  383. spin_lock_irq(&pt->pt_irqfds_lock);
  384. #if IS_ENABLED(CONFIG_X86)
  385. if (args->flags & BIT(MSHV_IRQFD_BIT_RESAMPLE) &&
  386. !irqfd->irqfd_lapic_irq.lapic_control.level_triggered) {
  387. /*
  388. * Resample Fd must be for level triggered interrupt
  389. * Otherwise return with failure
  390. */
  391. spin_unlock_irq(&pt->pt_irqfds_lock);
  392. ret = -EINVAL;
  393. goto fail;
  394. }
  395. #endif
  396. ret = 0;
  397. hlist_for_each_entry(tmp, &pt->pt_irqfds_list, irqfd_hnode) {
  398. if (irqfd->irqfd_eventfd_ctx != tmp->irqfd_eventfd_ctx)
  399. continue;
  400. /* This fd is used for another irq already. */
  401. ret = -EBUSY;
  402. spin_unlock_irq(&pt->pt_irqfds_lock);
  403. goto fail;
  404. }
  405. idx = srcu_read_lock(&pt->pt_irq_srcu);
  406. mshv_irqfd_update(pt, irqfd);
  407. hlist_add_head(&irqfd->irqfd_hnode, &pt->pt_irqfds_list);
  408. spin_unlock_irq(&pt->pt_irqfds_lock);
  409. /*
  410. * Check if there was an event already pending on the eventfd
  411. * before we registered, and trigger it as if we didn't miss it.
  412. */
  413. events = vfs_poll(fd_file(f), &irqfd->irqfd_polltbl);
  414. if (events & EPOLLIN)
  415. mshv_assert_irq_slow(irqfd);
  416. srcu_read_unlock(&pt->pt_irq_srcu, idx);
  417. return 0;
  418. fail:
  419. if (irqfd->irqfd_resampler)
  420. mshv_irqfd_resampler_shutdown(irqfd);
  421. if (resamplefd && !IS_ERR(resamplefd))
  422. eventfd_ctx_put(resamplefd);
  423. if (eventfd && !IS_ERR(eventfd))
  424. eventfd_ctx_put(eventfd);
  425. out:
  426. kfree(irqfd);
  427. return ret;
  428. }
  429. /*
  430. * shutdown any irqfd's that match fd+gsi
  431. */
  432. static int mshv_irqfd_deassign(struct mshv_partition *pt,
  433. struct mshv_user_irqfd *args)
  434. {
  435. struct mshv_irqfd *irqfd;
  436. struct hlist_node *n;
  437. struct eventfd_ctx *eventfd;
  438. eventfd = eventfd_ctx_fdget(args->fd);
  439. if (IS_ERR(eventfd))
  440. return PTR_ERR(eventfd);
  441. hlist_for_each_entry_safe(irqfd, n, &pt->pt_irqfds_list,
  442. irqfd_hnode) {
  443. if (irqfd->irqfd_eventfd_ctx == eventfd &&
  444. irqfd->irqfd_irqnum == args->gsi)
  445. mshv_irqfd_deactivate(irqfd);
  446. }
  447. eventfd_ctx_put(eventfd);
  448. /*
  449. * Block until we know all outstanding shutdown jobs have completed
  450. * so that we guarantee there will not be any more interrupts on this
  451. * gsi once this deassign function returns.
  452. */
  453. flush_workqueue(irqfd_cleanup_wq);
  454. return 0;
  455. }
  456. int mshv_set_unset_irqfd(struct mshv_partition *pt,
  457. struct mshv_user_irqfd *args)
  458. {
  459. if (args->flags & ~MSHV_IRQFD_FLAGS_MASK)
  460. return -EINVAL;
  461. if (args->flags & BIT(MSHV_IRQFD_BIT_DEASSIGN))
  462. return mshv_irqfd_deassign(pt, args);
  463. return mshv_irqfd_assign(pt, args);
  464. }
  465. /*
  466. * This function is called as the mshv VM fd is being released.
  467. * Shutdown all irqfds that still remain open
  468. */
  469. static void mshv_irqfd_release(struct mshv_partition *pt)
  470. {
  471. struct mshv_irqfd *irqfd;
  472. struct hlist_node *n;
  473. spin_lock_irq(&pt->pt_irqfds_lock);
  474. hlist_for_each_entry_safe(irqfd, n, &pt->pt_irqfds_list, irqfd_hnode)
  475. mshv_irqfd_deactivate(irqfd);
  476. spin_unlock_irq(&pt->pt_irqfds_lock);
  477. /*
  478. * Block until we know all outstanding shutdown jobs have completed
  479. * since we do not take a mshv_partition* reference.
  480. */
  481. flush_workqueue(irqfd_cleanup_wq);
  482. }
  483. int mshv_irqfd_wq_init(void)
  484. {
  485. irqfd_cleanup_wq = alloc_workqueue("mshv-irqfd-cleanup", WQ_PERCPU, 0);
  486. if (!irqfd_cleanup_wq)
  487. return -ENOMEM;
  488. return 0;
  489. }
  490. void mshv_irqfd_wq_cleanup(void)
  491. {
  492. destroy_workqueue(irqfd_cleanup_wq);
  493. }
/*
 * --------------------------------------------------------------------
 * ioeventfd: translate an MMIO memory write into an eventfd signal.
 *
 * Userspace can register an MMIO address with an eventfd in order to
 * receive a notification when that memory has been written.
 * --------------------------------------------------------------------
 */
  502. static void ioeventfd_release(struct mshv_ioeventfd *p, u64 partition_id)
  503. {
  504. if (p->iovntfd_doorbell_id > 0)
  505. mshv_unregister_doorbell(partition_id, p->iovntfd_doorbell_id);
  506. eventfd_ctx_put(p->iovntfd_eventfd);
  507. kfree(p);
  508. }
  509. /* MMIO writes trigger an event if the addr/val match */
  510. static void ioeventfd_mmio_write(int doorbell_id, void *data)
  511. {
  512. struct mshv_partition *partition = (struct mshv_partition *)data;
  513. struct mshv_ioeventfd *p;
  514. rcu_read_lock();
  515. hlist_for_each_entry_rcu(p, &partition->ioeventfds_list, iovntfd_hnode)
  516. if (p->iovntfd_doorbell_id == doorbell_id) {
  517. eventfd_signal(p->iovntfd_eventfd);
  518. break;
  519. }
  520. rcu_read_unlock();
  521. }
  522. static bool ioeventfd_check_collision(struct mshv_partition *pt,
  523. struct mshv_ioeventfd *p)
  524. __must_hold(&pt->mutex)
  525. {
  526. struct mshv_ioeventfd *_p;
  527. hlist_for_each_entry(_p, &pt->ioeventfds_list, iovntfd_hnode)
  528. if (_p->iovntfd_addr == p->iovntfd_addr &&
  529. _p->iovntfd_length == p->iovntfd_length &&
  530. (_p->iovntfd_wildcard || p->iovntfd_wildcard ||
  531. _p->iovntfd_datamatch == p->iovntfd_datamatch))
  532. return true;
  533. return false;
  534. }
  535. static int mshv_assign_ioeventfd(struct mshv_partition *pt,
  536. struct mshv_user_ioeventfd *args)
  537. __must_hold(&pt->mutex)
  538. {
  539. struct mshv_ioeventfd *p;
  540. struct eventfd_ctx *eventfd;
  541. u64 doorbell_flags = 0;
  542. int ret;
  543. /* This mutex is currently protecting ioeventfd.items list */
  544. WARN_ON_ONCE(!mutex_is_locked(&pt->pt_mutex));
  545. if (args->flags & BIT(MSHV_IOEVENTFD_BIT_PIO))
  546. return -EOPNOTSUPP;
  547. /* must be natural-word sized */
  548. switch (args->len) {
  549. case 0:
  550. doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_ANY;
  551. break;
  552. case 1:
  553. doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_BYTE;
  554. break;
  555. case 2:
  556. doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_WORD;
  557. break;
  558. case 4:
  559. doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_DWORD;
  560. break;
  561. case 8:
  562. doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_QWORD;
  563. break;
  564. default:
  565. return -EINVAL;
  566. }
  567. /* check for range overflow */
  568. if (args->addr + args->len < args->addr)
  569. return -EINVAL;
  570. /* check for extra flags that we don't understand */
  571. if (args->flags & ~MSHV_IOEVENTFD_FLAGS_MASK)
  572. return -EINVAL;
  573. eventfd = eventfd_ctx_fdget(args->fd);
  574. if (IS_ERR(eventfd))
  575. return PTR_ERR(eventfd);
  576. p = kzalloc_obj(*p);
  577. if (!p) {
  578. ret = -ENOMEM;
  579. goto fail;
  580. }
  581. p->iovntfd_addr = args->addr;
  582. p->iovntfd_length = args->len;
  583. p->iovntfd_eventfd = eventfd;
  584. /* The datamatch feature is optional, otherwise this is a wildcard */
  585. if (args->flags & BIT(MSHV_IOEVENTFD_BIT_DATAMATCH)) {
  586. p->iovntfd_datamatch = args->datamatch;
  587. } else {
  588. p->iovntfd_wildcard = true;
  589. doorbell_flags |= HV_DOORBELL_FLAG_TRIGGER_ANY_VALUE;
  590. }
  591. if (ioeventfd_check_collision(pt, p)) {
  592. ret = -EEXIST;
  593. goto unlock_fail;
  594. }
  595. ret = mshv_register_doorbell(pt->pt_id, ioeventfd_mmio_write,
  596. (void *)pt, p->iovntfd_addr,
  597. p->iovntfd_datamatch, doorbell_flags);
  598. if (ret < 0)
  599. goto unlock_fail;
  600. p->iovntfd_doorbell_id = ret;
  601. hlist_add_head_rcu(&p->iovntfd_hnode, &pt->ioeventfds_list);
  602. return 0;
  603. unlock_fail:
  604. kfree(p);
  605. fail:
  606. eventfd_ctx_put(eventfd);
  607. return ret;
  608. }
  609. static int mshv_deassign_ioeventfd(struct mshv_partition *pt,
  610. struct mshv_user_ioeventfd *args)
  611. __must_hold(&pt->mutex)
  612. {
  613. struct mshv_ioeventfd *p;
  614. struct eventfd_ctx *eventfd;
  615. struct hlist_node *n;
  616. int ret = -ENOENT;
  617. /* This mutex is currently protecting ioeventfd.items list */
  618. WARN_ON_ONCE(!mutex_is_locked(&pt->pt_mutex));
  619. eventfd = eventfd_ctx_fdget(args->fd);
  620. if (IS_ERR(eventfd))
  621. return PTR_ERR(eventfd);
  622. hlist_for_each_entry_safe(p, n, &pt->ioeventfds_list, iovntfd_hnode) {
  623. bool wildcard = !(args->flags & BIT(MSHV_IOEVENTFD_BIT_DATAMATCH));
  624. if (p->iovntfd_eventfd != eventfd ||
  625. p->iovntfd_addr != args->addr ||
  626. p->iovntfd_length != args->len ||
  627. p->iovntfd_wildcard != wildcard)
  628. continue;
  629. if (!p->iovntfd_wildcard &&
  630. p->iovntfd_datamatch != args->datamatch)
  631. continue;
  632. hlist_del_rcu(&p->iovntfd_hnode);
  633. synchronize_rcu();
  634. ioeventfd_release(p, pt->pt_id);
  635. ret = 0;
  636. break;
  637. }
  638. eventfd_ctx_put(eventfd);
  639. return ret;
  640. }
  641. int mshv_set_unset_ioeventfd(struct mshv_partition *pt,
  642. struct mshv_user_ioeventfd *args)
  643. __must_hold(&pt->mutex)
  644. {
  645. if ((args->flags & ~MSHV_IOEVENTFD_FLAGS_MASK) ||
  646. mshv_field_nonzero(*args, rsvd))
  647. return -EINVAL;
  648. /* PIO not yet implemented */
  649. if (args->flags & BIT(MSHV_IOEVENTFD_BIT_PIO))
  650. return -EOPNOTSUPP;
  651. if (args->flags & BIT(MSHV_IOEVENTFD_BIT_DEASSIGN))
  652. return mshv_deassign_ioeventfd(pt, args);
  653. return mshv_assign_ioeventfd(pt, args);
  654. }
  655. void mshv_eventfd_init(struct mshv_partition *pt)
  656. {
  657. spin_lock_init(&pt->pt_irqfds_lock);
  658. INIT_HLIST_HEAD(&pt->pt_irqfds_list);
  659. INIT_HLIST_HEAD(&pt->irqfds_resampler_list);
  660. mutex_init(&pt->irqfds_resampler_lock);
  661. INIT_HLIST_HEAD(&pt->ioeventfds_list);
  662. }
  663. void mshv_eventfd_release(struct mshv_partition *pt)
  664. {
  665. struct hlist_head items;
  666. struct hlist_node *n;
  667. struct mshv_ioeventfd *p;
  668. hlist_move_list(&pt->ioeventfds_list, &items);
  669. synchronize_rcu();
  670. hlist_for_each_entry_safe(p, n, &items, iovntfd_hnode) {
  671. hlist_del(&p->iovntfd_hnode);
  672. ioeventfd_release(p, pt->pt_id);
  673. }
  674. mshv_irqfd_release(pt);
  675. }