vfio_kern.c 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Copyright (C) 2025 Ant Group
  4. * Author: Tiwei Bie <tiwei.btw@antgroup.com>
  5. */
  6. #define pr_fmt(fmt) "vfio-uml: " fmt
  7. #include <linux/module.h>
  8. #include <linux/logic_iomem.h>
  9. #include <linux/mutex.h>
  10. #include <linux/list.h>
  11. #include <linux/string.h>
  12. #include <linux/unaligned.h>
  13. #include <irq_kern.h>
  14. #include <init.h>
  15. #include <os.h>
  16. #include "mconsole_kern.h"
  17. #include "virt-pci.h"
  18. #include "vfio_user.h"
  19. #define to_vdev(_pdev) container_of(_pdev, struct uml_vfio_device, pdev)
  20. struct uml_vfio_intr_ctx {
  21. struct uml_vfio_device *dev;
  22. int irq;
  23. };
  24. struct uml_vfio_device {
  25. const char *name;
  26. int group;
  27. struct um_pci_device pdev;
  28. struct uml_vfio_user_device udev;
  29. struct uml_vfio_intr_ctx *intr_ctx;
  30. int msix_cap;
  31. int msix_bar;
  32. int msix_offset;
  33. int msix_size;
  34. u32 *msix_data;
  35. struct list_head list;
  36. };
  37. struct uml_vfio_group {
  38. int id;
  39. int fd;
  40. int users;
  41. struct list_head list;
  42. };
  43. static struct {
  44. int fd;
  45. int users;
  46. } uml_vfio_container = { .fd = -1 };
  47. static DEFINE_MUTEX(uml_vfio_container_mtx);
  48. static LIST_HEAD(uml_vfio_groups);
  49. static DEFINE_MUTEX(uml_vfio_groups_mtx);
  50. static LIST_HEAD(uml_vfio_devices);
  51. static DEFINE_MUTEX(uml_vfio_devices_mtx);
  52. static int uml_vfio_set_container(int group_fd)
  53. {
  54. int err;
  55. guard(mutex)(&uml_vfio_container_mtx);
  56. err = uml_vfio_user_set_container(uml_vfio_container.fd, group_fd);
  57. if (err)
  58. return err;
  59. uml_vfio_container.users++;
  60. if (uml_vfio_container.users > 1)
  61. return 0;
  62. err = uml_vfio_user_setup_iommu(uml_vfio_container.fd);
  63. if (err) {
  64. uml_vfio_user_unset_container(uml_vfio_container.fd, group_fd);
  65. uml_vfio_container.users--;
  66. }
  67. return err;
  68. }
  69. static void uml_vfio_unset_container(int group_fd)
  70. {
  71. guard(mutex)(&uml_vfio_container_mtx);
  72. uml_vfio_user_unset_container(uml_vfio_container.fd, group_fd);
  73. uml_vfio_container.users--;
  74. }
  75. static int uml_vfio_open_group(int group_id)
  76. {
  77. struct uml_vfio_group *group;
  78. int err;
  79. guard(mutex)(&uml_vfio_groups_mtx);
  80. list_for_each_entry(group, &uml_vfio_groups, list) {
  81. if (group->id == group_id) {
  82. group->users++;
  83. return group->fd;
  84. }
  85. }
  86. group = kzalloc_obj(*group);
  87. if (!group)
  88. return -ENOMEM;
  89. group->fd = uml_vfio_user_open_group(group_id);
  90. if (group->fd < 0) {
  91. err = group->fd;
  92. goto free_group;
  93. }
  94. err = uml_vfio_set_container(group->fd);
  95. if (err)
  96. goto close_group;
  97. group->id = group_id;
  98. group->users = 1;
  99. list_add(&group->list, &uml_vfio_groups);
  100. return group->fd;
  101. close_group:
  102. os_close_file(group->fd);
  103. free_group:
  104. kfree(group);
  105. return err;
  106. }
  107. static int uml_vfio_release_group(int group_fd)
  108. {
  109. struct uml_vfio_group *group;
  110. guard(mutex)(&uml_vfio_groups_mtx);
  111. list_for_each_entry(group, &uml_vfio_groups, list) {
  112. if (group->fd == group_fd) {
  113. group->users--;
  114. if (group->users == 0) {
  115. uml_vfio_unset_container(group_fd);
  116. os_close_file(group_fd);
  117. list_del(&group->list);
  118. kfree(group);
  119. }
  120. return 0;
  121. }
  122. }
  123. return -ENOENT;
  124. }
  125. static irqreturn_t uml_vfio_interrupt(int unused, void *opaque)
  126. {
  127. struct uml_vfio_intr_ctx *ctx = opaque;
  128. struct uml_vfio_device *dev = ctx->dev;
  129. int index = ctx - dev->intr_ctx;
  130. int irqfd = dev->udev.irqfd[index];
  131. int irq = dev->msix_data[index];
  132. uint64_t v;
  133. int r;
  134. do {
  135. r = os_read_file(irqfd, &v, sizeof(v));
  136. if (r == sizeof(v))
  137. generic_handle_irq(irq);
  138. } while (r == sizeof(v) || r == -EINTR);
  139. WARN(r != -EAGAIN, "read returned %d\n", r);
  140. return IRQ_HANDLED;
  141. }
  142. static int uml_vfio_activate_irq(struct uml_vfio_device *dev, int index)
  143. {
  144. struct uml_vfio_intr_ctx *ctx = &dev->intr_ctx[index];
  145. int err, irqfd;
  146. if (ctx->irq >= 0)
  147. return 0;
  148. irqfd = uml_vfio_user_activate_irq(&dev->udev, index);
  149. if (irqfd < 0)
  150. return irqfd;
  151. ctx->irq = um_request_irq(UM_IRQ_ALLOC, irqfd, IRQ_READ,
  152. uml_vfio_interrupt, 0,
  153. "vfio-uml", ctx);
  154. if (ctx->irq < 0) {
  155. err = ctx->irq;
  156. goto deactivate;
  157. }
  158. err = add_sigio_fd(irqfd);
  159. if (err)
  160. goto free_irq;
  161. return 0;
  162. free_irq:
  163. um_free_irq(ctx->irq, ctx);
  164. ctx->irq = -1;
  165. deactivate:
  166. uml_vfio_user_deactivate_irq(&dev->udev, index);
  167. return err;
  168. }
  169. static int uml_vfio_deactivate_irq(struct uml_vfio_device *dev, int index)
  170. {
  171. struct uml_vfio_intr_ctx *ctx = &dev->intr_ctx[index];
  172. if (ctx->irq >= 0) {
  173. ignore_sigio_fd(dev->udev.irqfd[index]);
  174. um_free_irq(ctx->irq, ctx);
  175. uml_vfio_user_deactivate_irq(&dev->udev, index);
  176. ctx->irq = -1;
  177. }
  178. return 0;
  179. }
  180. static int uml_vfio_update_msix_cap(struct uml_vfio_device *dev,
  181. unsigned int offset, int size,
  182. unsigned long val)
  183. {
  184. /*
  185. * Here, we handle only the operations we care about,
  186. * ignoring the rest.
  187. */
  188. if (size == 2 && offset == dev->msix_cap + PCI_MSIX_FLAGS) {
  189. switch (val & ~PCI_MSIX_FLAGS_QSIZE) {
  190. case PCI_MSIX_FLAGS_ENABLE:
  191. case 0:
  192. return uml_vfio_user_update_irqs(&dev->udev);
  193. }
  194. }
  195. return 0;
  196. }
  197. static int uml_vfio_update_msix_table(struct uml_vfio_device *dev,
  198. unsigned int offset, int size,
  199. unsigned long val)
  200. {
  201. int index;
  202. /*
  203. * Here, we handle only the operations we care about,
  204. * ignoring the rest.
  205. */
  206. offset -= dev->msix_offset + PCI_MSIX_ENTRY_DATA;
  207. if (size != 4 || offset % PCI_MSIX_ENTRY_SIZE != 0)
  208. return 0;
  209. index = offset / PCI_MSIX_ENTRY_SIZE;
  210. if (index >= dev->udev.irq_count)
  211. return -EINVAL;
  212. dev->msix_data[index] = val;
  213. return val ? uml_vfio_activate_irq(dev, index) :
  214. uml_vfio_deactivate_irq(dev, index);
  215. }
  216. static unsigned long __uml_vfio_cfgspace_read(struct uml_vfio_device *dev,
  217. unsigned int offset, int size)
  218. {
  219. u8 data[8];
  220. memset(data, 0xff, sizeof(data));
  221. if (uml_vfio_user_cfgspace_read(&dev->udev, offset, data, size))
  222. return ULONG_MAX;
  223. switch (size) {
  224. case 1:
  225. return data[0];
  226. case 2:
  227. return le16_to_cpup((void *)data);
  228. case 4:
  229. return le32_to_cpup((void *)data);
  230. #ifdef CONFIG_64BIT
  231. case 8:
  232. return le64_to_cpup((void *)data);
  233. #endif
  234. default:
  235. return ULONG_MAX;
  236. }
  237. }
  238. static unsigned long uml_vfio_cfgspace_read(struct um_pci_device *pdev,
  239. unsigned int offset, int size)
  240. {
  241. struct uml_vfio_device *dev = to_vdev(pdev);
  242. return __uml_vfio_cfgspace_read(dev, offset, size);
  243. }
  244. static void __uml_vfio_cfgspace_write(struct uml_vfio_device *dev,
  245. unsigned int offset, int size,
  246. unsigned long val)
  247. {
  248. u8 data[8];
  249. switch (size) {
  250. case 1:
  251. data[0] = (u8)val;
  252. break;
  253. case 2:
  254. put_unaligned_le16(val, (void *)data);
  255. break;
  256. case 4:
  257. put_unaligned_le32(val, (void *)data);
  258. break;
  259. #ifdef CONFIG_64BIT
  260. case 8:
  261. put_unaligned_le64(val, (void *)data);
  262. break;
  263. #endif
  264. }
  265. WARN_ON(uml_vfio_user_cfgspace_write(&dev->udev, offset, data, size));
  266. }
  267. static void uml_vfio_cfgspace_write(struct um_pci_device *pdev,
  268. unsigned int offset, int size,
  269. unsigned long val)
  270. {
  271. struct uml_vfio_device *dev = to_vdev(pdev);
  272. if (offset < dev->msix_cap + PCI_CAP_MSIX_SIZEOF &&
  273. offset + size > dev->msix_cap)
  274. WARN_ON(uml_vfio_update_msix_cap(dev, offset, size, val));
  275. __uml_vfio_cfgspace_write(dev, offset, size, val);
  276. }
  277. static void uml_vfio_bar_copy_from(struct um_pci_device *pdev, int bar,
  278. void *buffer, unsigned int offset, int size)
  279. {
  280. struct uml_vfio_device *dev = to_vdev(pdev);
  281. memset(buffer, 0xff, size);
  282. uml_vfio_user_bar_read(&dev->udev, bar, offset, buffer, size);
  283. }
  284. static unsigned long uml_vfio_bar_read(struct um_pci_device *pdev, int bar,
  285. unsigned int offset, int size)
  286. {
  287. u8 data[8];
  288. uml_vfio_bar_copy_from(pdev, bar, data, offset, size);
  289. switch (size) {
  290. case 1:
  291. return data[0];
  292. case 2:
  293. return le16_to_cpup((void *)data);
  294. case 4:
  295. return le32_to_cpup((void *)data);
  296. #ifdef CONFIG_64BIT
  297. case 8:
  298. return le64_to_cpup((void *)data);
  299. #endif
  300. default:
  301. return ULONG_MAX;
  302. }
  303. }
  304. static void uml_vfio_bar_copy_to(struct um_pci_device *pdev, int bar,
  305. unsigned int offset, const void *buffer,
  306. int size)
  307. {
  308. struct uml_vfio_device *dev = to_vdev(pdev);
  309. uml_vfio_user_bar_write(&dev->udev, bar, offset, buffer, size);
  310. }
  311. static void uml_vfio_bar_write(struct um_pci_device *pdev, int bar,
  312. unsigned int offset, int size,
  313. unsigned long val)
  314. {
  315. struct uml_vfio_device *dev = to_vdev(pdev);
  316. u8 data[8];
  317. if (bar == dev->msix_bar && offset + size > dev->msix_offset &&
  318. offset < dev->msix_offset + dev->msix_size)
  319. WARN_ON(uml_vfio_update_msix_table(dev, offset, size, val));
  320. switch (size) {
  321. case 1:
  322. data[0] = (u8)val;
  323. break;
  324. case 2:
  325. put_unaligned_le16(val, (void *)data);
  326. break;
  327. case 4:
  328. put_unaligned_le32(val, (void *)data);
  329. break;
  330. #ifdef CONFIG_64BIT
  331. case 8:
  332. put_unaligned_le64(val, (void *)data);
  333. break;
  334. #endif
  335. }
  336. uml_vfio_bar_copy_to(pdev, bar, offset, data, size);
  337. }
  338. static void uml_vfio_bar_set(struct um_pci_device *pdev, int bar,
  339. unsigned int offset, u8 value, int size)
  340. {
  341. struct uml_vfio_device *dev = to_vdev(pdev);
  342. int i;
  343. for (i = 0; i < size; i++)
  344. uml_vfio_user_bar_write(&dev->udev, bar, offset + i, &value, 1);
  345. }
  346. static const struct um_pci_ops uml_vfio_um_pci_ops = {
  347. .cfgspace_read = uml_vfio_cfgspace_read,
  348. .cfgspace_write = uml_vfio_cfgspace_write,
  349. .bar_read = uml_vfio_bar_read,
  350. .bar_write = uml_vfio_bar_write,
  351. .bar_copy_from = uml_vfio_bar_copy_from,
  352. .bar_copy_to = uml_vfio_bar_copy_to,
  353. .bar_set = uml_vfio_bar_set,
  354. };
  355. static u8 uml_vfio_find_capability(struct uml_vfio_device *dev, u8 cap)
  356. {
  357. u8 id, pos;
  358. u16 ent;
  359. int ttl = 48; /* PCI_FIND_CAP_TTL */
  360. pos = __uml_vfio_cfgspace_read(dev, PCI_CAPABILITY_LIST, sizeof(pos));
  361. while (pos && ttl--) {
  362. ent = __uml_vfio_cfgspace_read(dev, pos, sizeof(ent));
  363. id = ent & 0xff;
  364. if (id == 0xff)
  365. break;
  366. if (id == cap)
  367. return pos;
  368. pos = ent >> 8;
  369. }
  370. return 0;
  371. }
  372. static int uml_vfio_read_msix_table(struct uml_vfio_device *dev)
  373. {
  374. unsigned int off;
  375. u16 flags;
  376. u32 tbl;
  377. off = uml_vfio_find_capability(dev, PCI_CAP_ID_MSIX);
  378. if (!off)
  379. return -ENOTSUPP;
  380. dev->msix_cap = off;
  381. tbl = __uml_vfio_cfgspace_read(dev, off + PCI_MSIX_TABLE, sizeof(tbl));
  382. flags = __uml_vfio_cfgspace_read(dev, off + PCI_MSIX_FLAGS, sizeof(flags));
  383. dev->msix_bar = tbl & PCI_MSIX_TABLE_BIR;
  384. dev->msix_offset = tbl & PCI_MSIX_TABLE_OFFSET;
  385. dev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * PCI_MSIX_ENTRY_SIZE;
  386. dev->msix_data = kzalloc(dev->msix_size, GFP_KERNEL);
  387. if (!dev->msix_data)
  388. return -ENOMEM;
  389. return 0;
  390. }
  391. static void uml_vfio_open_device(struct uml_vfio_device *dev)
  392. {
  393. struct uml_vfio_intr_ctx *ctx;
  394. int err, group_id, i;
  395. group_id = uml_vfio_user_get_group_id(dev->name);
  396. if (group_id < 0) {
  397. pr_err("Failed to get group id (%s), error %d\n",
  398. dev->name, group_id);
  399. goto free_dev;
  400. }
  401. dev->group = uml_vfio_open_group(group_id);
  402. if (dev->group < 0) {
  403. pr_err("Failed to open group %d (%s), error %d\n",
  404. group_id, dev->name, dev->group);
  405. goto free_dev;
  406. }
  407. err = uml_vfio_user_setup_device(&dev->udev, dev->group, dev->name);
  408. if (err) {
  409. pr_err("Failed to setup device (%s), error %d\n",
  410. dev->name, err);
  411. goto release_group;
  412. }
  413. err = uml_vfio_read_msix_table(dev);
  414. if (err) {
  415. pr_err("Failed to read MSI-X table (%s), error %d\n",
  416. dev->name, err);
  417. goto teardown_udev;
  418. }
  419. dev->intr_ctx = kmalloc_objs(struct uml_vfio_intr_ctx,
  420. dev->udev.irq_count);
  421. if (!dev->intr_ctx) {
  422. pr_err("Failed to allocate interrupt context (%s)\n",
  423. dev->name);
  424. goto free_msix;
  425. }
  426. for (i = 0; i < dev->udev.irq_count; i++) {
  427. ctx = &dev->intr_ctx[i];
  428. ctx->dev = dev;
  429. ctx->irq = -1;
  430. }
  431. dev->pdev.ops = &uml_vfio_um_pci_ops;
  432. err = um_pci_device_register(&dev->pdev);
  433. if (err) {
  434. pr_err("Failed to register UM PCI device (%s), error %d\n",
  435. dev->name, err);
  436. goto free_intr_ctx;
  437. }
  438. return;
  439. free_intr_ctx:
  440. kfree(dev->intr_ctx);
  441. free_msix:
  442. kfree(dev->msix_data);
  443. teardown_udev:
  444. uml_vfio_user_teardown_device(&dev->udev);
  445. release_group:
  446. uml_vfio_release_group(dev->group);
  447. free_dev:
  448. list_del(&dev->list);
  449. kfree(dev->name);
  450. kfree(dev);
  451. }
  452. static void uml_vfio_release_device(struct uml_vfio_device *dev)
  453. {
  454. int i;
  455. for (i = 0; i < dev->udev.irq_count; i++)
  456. uml_vfio_deactivate_irq(dev, i);
  457. uml_vfio_user_update_irqs(&dev->udev);
  458. um_pci_device_unregister(&dev->pdev);
  459. kfree(dev->intr_ctx);
  460. kfree(dev->msix_data);
  461. uml_vfio_user_teardown_device(&dev->udev);
  462. uml_vfio_release_group(dev->group);
  463. list_del(&dev->list);
  464. kfree(dev->name);
  465. kfree(dev);
  466. }
  467. static struct uml_vfio_device *uml_vfio_find_device(const char *device)
  468. {
  469. struct uml_vfio_device *dev;
  470. list_for_each_entry(dev, &uml_vfio_devices, list) {
  471. if (!strcmp(dev->name, device))
  472. return dev;
  473. }
  474. return NULL;
  475. }
  476. static struct uml_vfio_device *uml_vfio_add_device(const char *device)
  477. {
  478. struct uml_vfio_device *dev;
  479. int fd;
  480. guard(mutex)(&uml_vfio_devices_mtx);
  481. if (uml_vfio_container.fd < 0) {
  482. fd = uml_vfio_user_open_container();
  483. if (fd < 0)
  484. return ERR_PTR(fd);
  485. uml_vfio_container.fd = fd;
  486. }
  487. if (uml_vfio_find_device(device))
  488. return ERR_PTR(-EEXIST);
  489. dev = kzalloc_obj(*dev);
  490. if (!dev)
  491. return ERR_PTR(-ENOMEM);
  492. dev->name = kstrdup(device, GFP_KERNEL);
  493. if (!dev->name) {
  494. kfree(dev);
  495. return ERR_PTR(-ENOMEM);
  496. }
  497. list_add_tail(&dev->list, &uml_vfio_devices);
  498. return dev;
  499. }
  500. static int uml_vfio_cmdline_set(const char *device, const struct kernel_param *kp)
  501. {
  502. struct uml_vfio_device *dev;
  503. dev = uml_vfio_add_device(device);
  504. if (IS_ERR(dev))
  505. return PTR_ERR(dev);
  506. return 0;
  507. }
  508. static int uml_vfio_cmdline_get(char *buffer, const struct kernel_param *kp)
  509. {
  510. return 0;
  511. }
  512. static const struct kernel_param_ops uml_vfio_cmdline_param_ops = {
  513. .set = uml_vfio_cmdline_set,
  514. .get = uml_vfio_cmdline_get,
  515. };
  516. device_param_cb(device, &uml_vfio_cmdline_param_ops, NULL, 0400);
  517. __uml_help(uml_vfio_cmdline_param_ops,
  518. "vfio_uml.device=<domain:bus:slot.function>\n"
  519. " Pass through a PCI device to UML via VFIO. Currently, only MSI-X\n"
  520. " capable devices are supported, and it is assumed that drivers will\n"
  521. " use MSI-X. This parameter can be specified multiple times to pass\n"
  522. " through multiple PCI devices to UML.\n\n"
  523. );
  524. static int uml_vfio_mc_config(char *str, char **error_out)
  525. {
  526. struct uml_vfio_device *dev;
  527. if (*str != '=') {
  528. *error_out = "Invalid config";
  529. return -EINVAL;
  530. }
  531. str += 1;
  532. dev = uml_vfio_add_device(str);
  533. if (IS_ERR(dev))
  534. return PTR_ERR(dev);
  535. uml_vfio_open_device(dev);
  536. return 0;
  537. }
  538. static int uml_vfio_mc_id(char **str, int *start_out, int *end_out)
  539. {
  540. return -EOPNOTSUPP;
  541. }
  542. static int uml_vfio_mc_remove(int n, char **error_out)
  543. {
  544. return -EOPNOTSUPP;
  545. }
  546. static struct mc_device uml_vfio_mc = {
  547. .list = LIST_HEAD_INIT(uml_vfio_mc.list),
  548. .name = "vfio_uml.device",
  549. .config = uml_vfio_mc_config,
  550. .get_config = NULL,
  551. .id = uml_vfio_mc_id,
  552. .remove = uml_vfio_mc_remove,
  553. };
  554. static int __init uml_vfio_init(void)
  555. {
  556. struct uml_vfio_device *dev, *n;
  557. sigio_broken();
  558. /* If the opening fails, the device will be released. */
  559. list_for_each_entry_safe(dev, n, &uml_vfio_devices, list)
  560. uml_vfio_open_device(dev);
  561. mconsole_register_dev(&uml_vfio_mc);
  562. return 0;
  563. }
  564. late_initcall(uml_vfio_init);
  565. static void __exit uml_vfio_exit(void)
  566. {
  567. struct uml_vfio_device *dev, *n;
  568. list_for_each_entry_safe(dev, n, &uml_vfio_devices, list)
  569. uml_vfio_release_device(dev);
  570. if (uml_vfio_container.fd >= 0)
  571. os_close_file(uml_vfio_container.fd);
  572. }
  573. module_exit(uml_vfio_exit);