vsock.c 25 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * vhost transport for vsock
  4. *
  5. * Copyright (C) 2013-2015 Red Hat, Inc.
  6. * Author: Asias He <asias@redhat.com>
  7. * Stefan Hajnoczi <stefanha@redhat.com>
  8. */
  9. #include <linux/miscdevice.h>
  10. #include <linux/atomic.h>
  11. #include <linux/module.h>
  12. #include <linux/mutex.h>
  13. #include <linux/vmalloc.h>
  14. #include <net/sock.h>
  15. #include <linux/virtio_vsock.h>
  16. #include <linux/vhost.h>
  17. #include <linux/hashtable.h>
  18. #include <net/af_vsock.h>
  19. #include "vhost.h"
/* CID reported as this transport's local CID; it is the value returned by
 * vhost_transport_get_local_cid().
 */
#define VHOST_VSOCK_DEFAULT_HOST_CID 2

/* Max number of bytes transferred before requeueing the job.
 * Using this limit prevents one virtqueue from starving others.
 * Passed to vhost_dev_init() together with VHOST_VSOCK_PKT_WEIGHT.
 */
#define VHOST_VSOCK_WEIGHT 0x80000

/* Max number of packets transferred before requeueing the job.
 * Using this limit prevents one virtqueue from starving others with
 * small pkts.
 */
#define VHOST_VSOCK_PKT_WEIGHT 256
/* Feature bits this device offers: the generic VHOST_FEATURES set plus
 * VIRTIO_F_ACCESS_PLATFORM and VIRTIO_VSOCK_F_SEQPACKET.
 */
static const int vhost_vsock_bits[] = {
	VHOST_FEATURES,
	VIRTIO_F_ACCESS_PLATFORM,
	VIRTIO_VSOCK_F_SEQPACKET
};

/* Feature mask derived from vhost_vsock_bits. It is returned by the
 * VHOST_GET_FEATURES ioctl and used to reject unknown bits in
 * vhost_vsock_set_features().
 * NOTE(review): the meaning of the second macro argument (0) is defined by
 * VHOST_FEATURES_U64() outside this file - confirm there.
 */
#define VHOST_VSOCK_FEATURES VHOST_FEATURES_U64(vhost_vsock_bits, 0)

/* Backend features reported by VHOST_GET_BACKEND_FEATURES and accepted by
 * VHOST_SET_BACKEND_FEATURES.
 */
enum {
	VHOST_VSOCK_BACKEND_FEATURES = (1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2)
};

/* Used to track all the vhost_vsock instances on the system.
 * vhost_vsock_hash is keyed by guest CID; writers take vhost_vsock_mutex,
 * readers use RCU (see vhost_vsock_get()).
 */
static DEFINE_MUTEX(vhost_vsock_mutex);
static DEFINE_READ_MOSTLY_HASHTABLE(vhost_vsock_hash, 8);
/* Per-open state of one /dev/vhost-vsock file: allocated in
 * vhost_vsock_dev_open() and freed in vhost_vsock_dev_release().
 */
struct vhost_vsock {
	/* Generic vhost device state, initialised by vhost_dev_init() */
	struct vhost_dev dev;
	/* Virtqueues, indexed by VSOCK_VQ_TX and VSOCK_VQ_RX */
	struct vhost_virtqueue vqs[2];
	/* Network namespace of the opener; reference held via ns_tracker */
	struct net *net;
	netns_tracker ns_tracker;

	/* Link to global vhost_vsock_hash, writes use vhost_vsock_mutex */
	struct hlist_node hash;

	/* Work item that runs vhost_transport_send_pkt_work() */
	struct vhost_work send_pkt_work;
	struct sk_buff_head send_pkt_queue; /* host->guest pending packets */

	/* Number of reply skbs currently sitting in send_pkt_queue; compared
	 * against the TX virtqueue size in vhost_vsock_more_replies().
	 */
	atomic_t queued_replies;

	/* Guest CID set by VHOST_VSOCK_SET_GUEST_CID; 0 means "not assigned" */
	u32 guest_cid;
	/* True once VIRTIO_VSOCK_F_SEQPACKET was set via VHOST_SET_FEATURES */
	bool seqpacket_allow;
};
/* .get_local_cid callback of vhost_transport: always reports
 * VHOST_VSOCK_DEFAULT_HOST_CID.
 */
static u32 vhost_transport_get_local_cid(void)
{
	return VHOST_VSOCK_DEFAULT_HOST_CID;
}
  58. /* Callers must be in an RCU read section or hold the vhost_vsock_mutex.
  59. * The return value can only be dereferenced while within the section.
  60. */
  61. static struct vhost_vsock *vhost_vsock_get(u32 guest_cid, struct net *net)
  62. {
  63. struct vhost_vsock *vsock;
  64. hash_for_each_possible_rcu(vhost_vsock_hash, vsock, hash, guest_cid,
  65. lockdep_is_held(&vhost_vsock_mutex)) {
  66. u32 other_cid = vsock->guest_cid;
  67. /* Skip instances that have no CID yet */
  68. if (other_cid == 0)
  69. continue;
  70. if (other_cid == guest_cid &&
  71. vsock_net_check_mode(net, vsock->net))
  72. return vsock;
  73. }
  74. return NULL;
  75. }
/* Move packets from vsock->send_pkt_queue into the guest buffers of @vq.
 *
 * Both callers in this file pass the VSOCK_VQ_RX virtqueue. Runs under
 * vq->mutex and stops when the queue is empty, no buffer is available, an
 * error occurs, or vhost_exceeds_weight() reports the weight limit was hit.
 * An skb larger than the available buffer is sent in pieces: the skb is put
 * back at the head of the queue with its offset advanced.
 */
static void
vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
			    struct vhost_virtqueue *vq)
{
	struct vhost_virtqueue *tx_vq = &vsock->vqs[VSOCK_VQ_TX];
	int pkts = 0, total_len = 0;
	bool added = false;
	bool restart_tx = false;

	mutex_lock(&vq->mutex);

	/* Nothing to do while the device has no backend (not started) */
	if (!vhost_vq_get_backend(vq))
		goto out;

	if (!vq_meta_prefetch(vq))
		goto out;

	/* Avoid further vmexits, we're already processing the virtqueue */
	vhost_disable_notify(&vsock->dev, vq);

	do {
		struct virtio_vsock_hdr *hdr;
		size_t iov_len, payload_len;
		struct iov_iter iov_iter;
		u32 flags_to_restore = 0;
		struct sk_buff *skb;
		unsigned out, in;
		size_t nbytes;
		u32 offset;
		int head;

		skb = virtio_vsock_skb_dequeue(&vsock->send_pkt_queue);

		if (!skb) {
			/* Queue drained: turn guest notifications back on */
			vhost_enable_notify(&vsock->dev, vq);
			break;
		}

		head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
					 &out, &in, NULL, NULL);
		if (head < 0) {
			/* Keep the packet for a later attempt */
			virtio_vsock_skb_queue_head(&vsock->send_pkt_queue, skb);
			break;
		}

		/* head == vq->num: no descriptor available right now */
		if (head == vq->num) {
			virtio_vsock_skb_queue_head(&vsock->send_pkt_queue, skb);
			/* We cannot finish yet if more buffers snuck in while
			 * re-enabling notify.
			 */
			if (unlikely(vhost_enable_notify(&vsock->dev, vq))) {
				vhost_disable_notify(&vsock->dev, vq);
				continue;
			}
			break;
		}

		/* Only input (device-writable) buffers are accepted here;
		 * the skb is dropped otherwise.
		 */
		if (out) {
			kfree_skb(skb);
			vq_err(vq, "Expected 0 output buffers, got %u\n", out);
			break;
		}

		iov_len = iov_length(&vq->iov[out], in);
		if (iov_len < sizeof(*hdr)) {
			kfree_skb(skb);
			vq_err(vq, "Buffer len [%zu] too small\n", iov_len);
			break;
		}

		iov_iter_init(&iov_iter, ITER_DEST, &vq->iov[out], in, iov_len);
		/* offset: how much of this skb was already sent earlier */
		offset = VIRTIO_VSOCK_SKB_CB(skb)->offset;
		payload_len = skb->len - offset;
		hdr = virtio_vsock_hdr(skb);

		/* If the packet is greater than the space available in the
		 * buffer, we split it using multiple buffers.
		 */
		if (payload_len > iov_len - sizeof(*hdr)) {
			payload_len = iov_len - sizeof(*hdr);

			/* As we are copying pieces of large packet's buffer to
			 * small rx buffers, headers of packets in rx queue are
			 * created dynamically and are initialized with header
			 * of current packet(except length). But in case of
			 * SOCK_SEQPACKET, we also must clear message delimiter
			 * bit (VIRTIO_VSOCK_SEQ_EOM) and MSG_EOR bit
			 * (VIRTIO_VSOCK_SEQ_EOR) if set. Otherwise,
			 * there will be sequence of packets with these
			 * bits set. After initialized header will be copied to
			 * rx buffer, these required bits will be restored.
			 */
			if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM) {
				hdr->flags &= ~cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM);
				flags_to_restore |= VIRTIO_VSOCK_SEQ_EOM;

				if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOR) {
					hdr->flags &= ~cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR);
					flags_to_restore |= VIRTIO_VSOCK_SEQ_EOR;
				}
			}
		}

		/* Set the correct length in the header */
		hdr->len = cpu_to_le32(payload_len);

		nbytes = copy_to_iter(hdr, sizeof(*hdr), &iov_iter);
		if (nbytes != sizeof(*hdr)) {
			kfree_skb(skb);
			vq_err(vq, "Faulted on copying pkt hdr\n");
			break;
		}

		if (skb_copy_datagram_iter(skb,
					   offset,
					   &iov_iter,
					   payload_len)) {
			kfree_skb(skb);
			vq_err(vq, "Faulted on copying pkt buf\n");
			break;
		}

		/* Deliver to monitoring devices all packets that we
		 * will transmit.
		 */
		virtio_transport_deliver_tap_pkt(skb);

		vhost_add_used(vq, head, sizeof(*hdr) + payload_len);
		added = true;

		VIRTIO_VSOCK_SKB_CB(skb)->offset += payload_len;
		total_len += payload_len;

		/* If we didn't send all the payload we can requeue the packet
		 * to send it with the next available buffer.
		 */
		if (VIRTIO_VSOCK_SKB_CB(skb)->offset < skb->len) {
			hdr->flags |= cpu_to_le32(flags_to_restore);

			/* We are queueing the same skb to handle
			 * the remaining bytes, and we want to deliver it
			 * to monitoring devices in the next iteration.
			 */
			virtio_vsock_skb_clear_tap_delivered(skb);
			virtio_vsock_skb_queue_head(&vsock->send_pkt_queue, skb);
		} else {
			if (virtio_vsock_skb_reply(skb)) {
				int val;

				val = atomic_dec_return(&vsock->queued_replies);

				/* Do we have resources to resume tx
				 * processing? (val + 1 is the count before
				 * this decrement.)
				 */
				if (val + 1 == tx_vq->num)
					restart_tx = true;
			}

			virtio_transport_consume_skb_sent(skb, true);
		}
	} while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len)));
	/* Notify the guest only if at least one buffer was filled */
	if (added)
		vhost_signal(&vsock->dev, vq);

out:
	mutex_unlock(&vq->mutex);

	/* Re-queue TX processing after dropping vq->mutex */
	if (restart_tx)
		vhost_poll_queue(&tx_vq->poll);
}
  218. static void vhost_transport_send_pkt_work(struct vhost_work *work)
  219. {
  220. struct vhost_virtqueue *vq;
  221. struct vhost_vsock *vsock;
  222. vsock = container_of(work, struct vhost_vsock, send_pkt_work);
  223. vq = &vsock->vqs[VSOCK_VQ_RX];
  224. vhost_transport_do_send_pkt(vsock, vq);
  225. }
  226. static int
  227. vhost_transport_send_pkt(struct sk_buff *skb, struct net *net)
  228. {
  229. struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb);
  230. struct vhost_vsock *vsock;
  231. int len = skb->len;
  232. rcu_read_lock();
  233. /* Find the vhost_vsock according to guest context id */
  234. vsock = vhost_vsock_get(le64_to_cpu(hdr->dst_cid), net);
  235. if (!vsock) {
  236. rcu_read_unlock();
  237. kfree_skb(skb);
  238. return -ENODEV;
  239. }
  240. if (virtio_vsock_skb_reply(skb))
  241. atomic_inc(&vsock->queued_replies);
  242. virtio_vsock_skb_queue_tail(&vsock->send_pkt_queue, skb);
  243. vhost_vq_work_queue(&vsock->vqs[VSOCK_VQ_RX], &vsock->send_pkt_work);
  244. rcu_read_unlock();
  245. return len;
  246. }
  247. static int
  248. vhost_transport_cancel_pkt(struct vsock_sock *vsk)
  249. {
  250. struct vhost_vsock *vsock;
  251. int cnt = 0;
  252. int ret = -ENODEV;
  253. rcu_read_lock();
  254. /* Find the vhost_vsock according to guest context id */
  255. vsock = vhost_vsock_get(vsk->remote_addr.svm_cid,
  256. sock_net(sk_vsock(vsk)));
  257. if (!vsock)
  258. goto out;
  259. cnt = virtio_transport_purge_skbs(vsk, &vsock->send_pkt_queue);
  260. if (cnt) {
  261. struct vhost_virtqueue *tx_vq = &vsock->vqs[VSOCK_VQ_TX];
  262. int new_cnt;
  263. new_cnt = atomic_sub_return(cnt, &vsock->queued_replies);
  264. if (new_cnt + cnt >= tx_vq->num && new_cnt < tx_vq->num)
  265. vhost_poll_queue(&tx_vq->poll);
  266. }
  267. ret = 0;
  268. out:
  269. rcu_read_unlock();
  270. return ret;
  271. }
  272. static struct sk_buff *
  273. vhost_vsock_alloc_skb(struct vhost_virtqueue *vq,
  274. unsigned int out, unsigned int in)
  275. {
  276. struct virtio_vsock_hdr *hdr;
  277. struct iov_iter iov_iter;
  278. struct sk_buff *skb;
  279. size_t payload_len;
  280. size_t nbytes;
  281. size_t len;
  282. if (in != 0) {
  283. vq_err(vq, "Expected 0 input buffers, got %u\n", in);
  284. return NULL;
  285. }
  286. len = iov_length(vq->iov, out);
  287. if (len < VIRTIO_VSOCK_SKB_HEADROOM ||
  288. len > VIRTIO_VSOCK_MAX_PKT_BUF_SIZE + VIRTIO_VSOCK_SKB_HEADROOM)
  289. return NULL;
  290. /* len contains both payload and hdr */
  291. skb = virtio_vsock_alloc_skb(len, GFP_KERNEL);
  292. if (!skb)
  293. return NULL;
  294. iov_iter_init(&iov_iter, ITER_SOURCE, vq->iov, out, len);
  295. hdr = virtio_vsock_hdr(skb);
  296. nbytes = copy_from_iter(hdr, sizeof(*hdr), &iov_iter);
  297. if (nbytes != sizeof(*hdr)) {
  298. vq_err(vq, "Expected %zu bytes for pkt->hdr, got %zu bytes\n",
  299. sizeof(*hdr), nbytes);
  300. kfree_skb(skb);
  301. return NULL;
  302. }
  303. payload_len = le32_to_cpu(hdr->len);
  304. /* No payload */
  305. if (!payload_len)
  306. return skb;
  307. /* The pkt is too big or the length in the header is invalid */
  308. if (payload_len + sizeof(*hdr) > len) {
  309. kfree_skb(skb);
  310. return NULL;
  311. }
  312. virtio_vsock_skb_put(skb, payload_len);
  313. if (skb_copy_datagram_from_iter(skb, 0, &iov_iter, payload_len)) {
  314. vq_err(vq, "Failed to copy %zu byte payload\n", payload_len);
  315. kfree_skb(skb);
  316. return NULL;
  317. }
  318. return skb;
  319. }
  320. /* Is there space left for replies to rx packets? */
  321. static bool vhost_vsock_more_replies(struct vhost_vsock *vsock)
  322. {
  323. struct vhost_virtqueue *vq = &vsock->vqs[VSOCK_VQ_TX];
  324. int val;
  325. smp_rmb(); /* paired with atomic_inc() and atomic_dec_return() */
  326. val = atomic_read(&vsock->queued_replies);
  327. return val < vq->num;
  328. }
/* .msgzerocopy_allow callback of vhost_transport: unconditionally true. */
static bool vhost_transport_msgzerocopy_allow(void)
{
	return true;
}
/* Forward declaration: the vhost_transport initializer refers to this
 * function before its definition further down in this file.
 */
static bool vhost_transport_seqpacket_allow(struct vsock_sock *vsk,
					    u32 remote_cid);
/* .stream_allow callback of vhost_transport: accepts every socket, CID and
 * port (all arguments are ignored).
 */
static bool
vhost_transport_stream_allow(struct vsock_sock *vsk, u32 cid, u32 port)
{
	return true;
}
/* Transport registered with the vsock core in vhost_vsock_init().
 *
 * Callbacks named vhost_transport_* are implemented in this file; the
 * virtio_transport_* ones are helpers defined outside this file.
 */
static struct virtio_transport vhost_transport = {
	.transport = {
		.module                   = THIS_MODULE,

		.get_local_cid            = vhost_transport_get_local_cid,

		.init                     = virtio_transport_do_socket_init,
		.destruct                 = virtio_transport_destruct,
		.release                  = virtio_transport_release,
		.connect                  = virtio_transport_connect,
		.shutdown                 = virtio_transport_shutdown,
		.cancel_pkt               = vhost_transport_cancel_pkt,

		.dgram_enqueue            = virtio_transport_dgram_enqueue,
		.dgram_dequeue            = virtio_transport_dgram_dequeue,
		.dgram_bind               = virtio_transport_dgram_bind,
		.dgram_allow              = virtio_transport_dgram_allow,

		.stream_enqueue           = virtio_transport_stream_enqueue,
		.stream_dequeue           = virtio_transport_stream_dequeue,
		.stream_has_data          = virtio_transport_stream_has_data,
		.stream_has_space         = virtio_transport_stream_has_space,
		.stream_rcvhiwat          = virtio_transport_stream_rcvhiwat,
		.stream_is_active         = virtio_transport_stream_is_active,
		.stream_allow             = vhost_transport_stream_allow,

		.seqpacket_dequeue        = virtio_transport_seqpacket_dequeue,
		.seqpacket_enqueue        = virtio_transport_seqpacket_enqueue,
		.seqpacket_allow          = vhost_transport_seqpacket_allow,
		.seqpacket_has_data       = virtio_transport_seqpacket_has_data,

		.msgzerocopy_allow        = vhost_transport_msgzerocopy_allow,

		.notify_poll_in           = virtio_transport_notify_poll_in,
		.notify_poll_out          = virtio_transport_notify_poll_out,
		.notify_recv_init         = virtio_transport_notify_recv_init,
		.notify_recv_pre_block    = virtio_transport_notify_recv_pre_block,
		.notify_recv_pre_dequeue  = virtio_transport_notify_recv_pre_dequeue,
		.notify_recv_post_dequeue = virtio_transport_notify_recv_post_dequeue,
		.notify_send_init         = virtio_transport_notify_send_init,
		.notify_send_pre_block    = virtio_transport_notify_send_pre_block,
		.notify_send_pre_enqueue  = virtio_transport_notify_send_pre_enqueue,
		.notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue,
		.notify_buffer_size       = virtio_transport_notify_buffer_size,
		.notify_set_rcvlowat      = virtio_transport_notify_set_rcvlowat,

		.unsent_bytes             = virtio_transport_unsent_bytes,

		.read_skb = virtio_transport_read_skb,
	},

	/* Queues an skb on the destination device's send_pkt_queue */
	.send_pkt = vhost_transport_send_pkt,
};
  383. static bool vhost_transport_seqpacket_allow(struct vsock_sock *vsk,
  384. u32 remote_cid)
  385. {
  386. struct net *net = sock_net(sk_vsock(vsk));
  387. struct vhost_vsock *vsock;
  388. bool seqpacket_allow = false;
  389. rcu_read_lock();
  390. vsock = vhost_vsock_get(remote_cid, net);
  391. if (vsock)
  392. seqpacket_allow = vsock->seqpacket_allow;
  393. rcu_read_unlock();
  394. return seqpacket_allow;
  395. }
/* handle_kick callback of the VSOCK_VQ_TX virtqueue: read packets the guest
 * placed in the queue and hand them to virtio_transport_recv_pkt().
 *
 * Runs under vq->mutex. Processing stops when too many replies are pending
 * (see vhost_vsock_more_replies()), when no more descriptors are available,
 * or when vhost_exceeds_weight() reports the weight limit was hit.
 */
static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
{
	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
						  poll.work);
	struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock,
						 dev);
	int head, pkts = 0, total_len = 0;
	unsigned int out, in;
	struct sk_buff *skb;
	bool added = false;

	mutex_lock(&vq->mutex);

	/* Nothing to do while the device has no backend (not started) */
	if (!vhost_vq_get_backend(vq))
		goto out;

	if (!vq_meta_prefetch(vq))
		goto out;

	vhost_disable_notify(&vsock->dev, vq);
	do {
		struct virtio_vsock_hdr *hdr;

		if (!vhost_vsock_more_replies(vsock)) {
			/* Stop tx until the device processes already
			 * pending replies.  Leave tx virtqueue
			 * callbacks disabled.
			 */
			goto no_more_replies;
		}

		head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
					 &out, &in, NULL, NULL);
		if (head < 0)
			break;

		/* head == vq->num: no descriptor available; re-enable
		 * notifications and retry only if more buffers arrived
		 * in the meantime.
		 */
		if (head == vq->num) {
			if (unlikely(vhost_enable_notify(&vsock->dev, vq))) {
				vhost_disable_notify(&vsock->dev, vq);
				continue;
			}
			break;
		}

		skb = vhost_vsock_alloc_skb(vq, out, in);
		if (!skb) {
			/* NOTE(review): head is not passed to
			 * vhost_add_used() on this path - confirm intended.
			 */
			vq_err(vq, "Faulted on pkt\n");
			continue;
		}

		total_len += sizeof(*hdr) + skb->len;

		/* Deliver to monitoring devices all received packets */
		virtio_transport_deliver_tap_pkt(skb);

		hdr = virtio_vsock_hdr(skb);

		/* Only accept correctly addressed packets */
		if (le64_to_cpu(hdr->src_cid) == vsock->guest_cid &&
		    le64_to_cpu(hdr->dst_cid) ==
		    vhost_transport_get_local_cid())
			virtio_transport_recv_pkt(&vhost_transport, skb,
						  vsock->net);
		else
			kfree_skb(skb);

		/* Return the descriptor with a written length of 0 */
		vhost_add_used(vq, head, 0);
		added = true;
	} while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len)));

no_more_replies:
	/* Notify the guest only if at least one descriptor was returned */
	if (added)
		vhost_signal(&vsock->dev, vq);

out:
	mutex_unlock(&vq->mutex);
}
  458. static void vhost_vsock_handle_rx_kick(struct vhost_work *work)
  459. {
  460. struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
  461. poll.work);
  462. struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock,
  463. dev);
  464. vhost_transport_do_send_pkt(vsock, vq);
  465. }
  466. static int vhost_vsock_start(struct vhost_vsock *vsock)
  467. {
  468. struct vhost_virtqueue *vq;
  469. size_t i;
  470. int ret;
  471. mutex_lock(&vsock->dev.mutex);
  472. ret = vhost_dev_check_owner(&vsock->dev);
  473. if (ret)
  474. goto err;
  475. for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
  476. vq = &vsock->vqs[i];
  477. mutex_lock(&vq->mutex);
  478. if (!vhost_vq_access_ok(vq)) {
  479. ret = -EFAULT;
  480. goto err_vq;
  481. }
  482. if (!vhost_vq_get_backend(vq)) {
  483. vhost_vq_set_backend(vq, vsock);
  484. ret = vhost_vq_init_access(vq);
  485. if (ret)
  486. goto err_vq;
  487. }
  488. mutex_unlock(&vq->mutex);
  489. }
  490. /* Some packets may have been queued before the device was started,
  491. * let's kick the send worker to send them.
  492. */
  493. vhost_vq_work_queue(&vsock->vqs[VSOCK_VQ_RX], &vsock->send_pkt_work);
  494. mutex_unlock(&vsock->dev.mutex);
  495. return 0;
  496. err_vq:
  497. vhost_vq_set_backend(vq, NULL);
  498. mutex_unlock(&vq->mutex);
  499. for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
  500. vq = &vsock->vqs[i];
  501. mutex_lock(&vq->mutex);
  502. vhost_vq_set_backend(vq, NULL);
  503. mutex_unlock(&vq->mutex);
  504. }
  505. err:
  506. mutex_unlock(&vsock->dev.mutex);
  507. return ret;
  508. }
  509. static int vhost_vsock_stop(struct vhost_vsock *vsock, bool check_owner)
  510. {
  511. size_t i;
  512. int ret = 0;
  513. mutex_lock(&vsock->dev.mutex);
  514. if (check_owner) {
  515. ret = vhost_dev_check_owner(&vsock->dev);
  516. if (ret)
  517. goto err;
  518. }
  519. for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
  520. struct vhost_virtqueue *vq = &vsock->vqs[i];
  521. mutex_lock(&vq->mutex);
  522. vhost_vq_set_backend(vq, NULL);
  523. mutex_unlock(&vq->mutex);
  524. }
  525. err:
  526. mutex_unlock(&vsock->dev.mutex);
  527. return ret;
  528. }
/* Free a vhost_vsock. kvfree() is used because the structure is allocated
 * with kvmalloc_obj() in vhost_vsock_dev_open().
 */
static void vhost_vsock_free(struct vhost_vsock *vsock)
{
	kvfree(vsock);
}
  533. static int vhost_vsock_dev_open(struct inode *inode, struct file *file)
  534. {
  535. struct vhost_virtqueue **vqs;
  536. struct vhost_vsock *vsock;
  537. struct net *net;
  538. int ret;
  539. /* This struct is large and allocation could fail, fall back to vmalloc
  540. * if there is no other way.
  541. */
  542. vsock = kvmalloc_obj(*vsock, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
  543. if (!vsock)
  544. return -ENOMEM;
  545. vqs = kmalloc_objs(*vqs, ARRAY_SIZE(vsock->vqs));
  546. if (!vqs) {
  547. ret = -ENOMEM;
  548. goto out;
  549. }
  550. net = current->nsproxy->net_ns;
  551. vsock->net = get_net_track(net, &vsock->ns_tracker, GFP_KERNEL);
  552. vsock->guest_cid = 0; /* no CID assigned yet */
  553. vsock->seqpacket_allow = false;
  554. atomic_set(&vsock->queued_replies, 0);
  555. vqs[VSOCK_VQ_TX] = &vsock->vqs[VSOCK_VQ_TX];
  556. vqs[VSOCK_VQ_RX] = &vsock->vqs[VSOCK_VQ_RX];
  557. vsock->vqs[VSOCK_VQ_TX].handle_kick = vhost_vsock_handle_tx_kick;
  558. vsock->vqs[VSOCK_VQ_RX].handle_kick = vhost_vsock_handle_rx_kick;
  559. vhost_dev_init(&vsock->dev, vqs, ARRAY_SIZE(vsock->vqs),
  560. UIO_MAXIOV, VHOST_VSOCK_PKT_WEIGHT,
  561. VHOST_VSOCK_WEIGHT, true, NULL);
  562. file->private_data = vsock;
  563. skb_queue_head_init(&vsock->send_pkt_queue);
  564. vhost_work_init(&vsock->send_pkt_work, vhost_transport_send_pkt_work);
  565. return 0;
  566. out:
  567. vhost_vsock_free(vsock);
  568. return ret;
  569. }
/* Flush the device by delegating to vhost_dev_flush(). */
static void vhost_vsock_flush(struct vhost_vsock *vsock)
{
	vhost_dev_flush(&vsock->dev);
}
  574. static void vhost_vsock_reset_orphans(struct sock *sk)
  575. {
  576. struct vsock_sock *vsk = vsock_sk(sk);
  577. /* vmci_transport.c doesn't take sk_lock here either. At least we're
  578. * under vsock_table_lock so the sock cannot disappear while we're
  579. * executing.
  580. */
  581. rcu_read_lock();
  582. /* If the peer is still valid, no need to reset connection */
  583. if (vhost_vsock_get(vsk->remote_addr.svm_cid, sock_net(sk))) {
  584. rcu_read_unlock();
  585. return;
  586. }
  587. rcu_read_unlock();
  588. /* If the close timeout is pending, let it expire. This avoids races
  589. * with the timeout callback.
  590. */
  591. if (vsk->close_work_scheduled)
  592. return;
  593. sock_set_flag(sk, SOCK_DONE);
  594. vsk->peer_shutdown = SHUTDOWN_MASK;
  595. sk->sk_state = SS_UNCONNECTED;
  596. sk->sk_err = ECONNRESET;
  597. sk_error_report(sk);
  598. }
/* release() handler of the misc device: unpublish the device, reset the
 * sockets that were connected to it, stop and flush it, then free all of
 * its resources. Always returns 0.
 */
static int vhost_vsock_dev_release(struct inode *inode, struct file *file)
{
	struct vhost_vsock *vsock = file->private_data;

	/* Only instances with an assigned CID were added to the hash */
	mutex_lock(&vhost_vsock_mutex);
	if (vsock->guest_cid)
		hash_del_rcu(&vsock->hash);
	mutex_unlock(&vhost_vsock_mutex);

	/* Wait for other CPUs to finish using vsock */
	synchronize_rcu();

	/* Iterating over all connections for all CIDs to find orphans is
	 * inefficient.  Room for improvement here.
	 */
	vsock_for_each_connected_socket(&vhost_transport.transport,
					vhost_vsock_reset_orphans);

	/* Don't check the owner, because we are in the release path, so we
	 * need to stop the vsock device in any case.
	 * vhost_vsock_stop() can not fail in this case, so we don't need to
	 * check the return code.
	 */
	vhost_vsock_stop(vsock, false);
	vhost_vsock_flush(vsock);
	vhost_dev_stop(&vsock->dev);

	/* Drop the packets that were never delivered to the guest */
	virtio_vsock_skb_queue_purge(&vsock->send_pkt_queue);

	vhost_dev_cleanup(&vsock->dev);
	/* Release the namespace reference taken in vhost_vsock_dev_open() */
	put_net_track(vsock->net, &vsock->ns_tracker);
	/* dev.vqs is the pointer array allocated in vhost_vsock_dev_open() */
	kfree(vsock->dev.vqs);
	vhost_vsock_free(vsock);
	return 0;
}
  627. static int vhost_vsock_set_cid(struct vhost_vsock *vsock, u64 guest_cid)
  628. {
  629. struct vhost_vsock *other;
  630. /* Refuse reserved CIDs */
  631. if (guest_cid <= VMADDR_CID_HOST ||
  632. guest_cid == U32_MAX)
  633. return -EINVAL;
  634. /* 64-bit CIDs are not yet supported */
  635. if (guest_cid > U32_MAX)
  636. return -EINVAL;
  637. /* Refuse if CID is assigned to the guest->host transport (i.e. nested
  638. * VM), to make the loopback work.
  639. */
  640. if (vsock_find_cid(guest_cid))
  641. return -EADDRINUSE;
  642. /* Refuse if CID is already in use */
  643. mutex_lock(&vhost_vsock_mutex);
  644. other = vhost_vsock_get(guest_cid, vsock->net);
  645. if (other && other != vsock) {
  646. mutex_unlock(&vhost_vsock_mutex);
  647. return -EADDRINUSE;
  648. }
  649. if (vsock->guest_cid)
  650. hash_del_rcu(&vsock->hash);
  651. vsock->guest_cid = guest_cid;
  652. hash_add_rcu(vhost_vsock_hash, &vsock->hash, vsock->guest_cid);
  653. mutex_unlock(&vhost_vsock_mutex);
  654. return 0;
  655. }
  656. static int vhost_vsock_set_features(struct vhost_vsock *vsock, u64 features)
  657. {
  658. struct vhost_virtqueue *vq;
  659. int i;
  660. if (features & ~VHOST_VSOCK_FEATURES)
  661. return -EOPNOTSUPP;
  662. mutex_lock(&vsock->dev.mutex);
  663. if ((features & (1 << VHOST_F_LOG_ALL)) &&
  664. !vhost_log_access_ok(&vsock->dev)) {
  665. goto err;
  666. }
  667. if ((features & (1ULL << VIRTIO_F_ACCESS_PLATFORM))) {
  668. if (vhost_init_device_iotlb(&vsock->dev))
  669. goto err;
  670. }
  671. vsock->seqpacket_allow = features & (1ULL << VIRTIO_VSOCK_F_SEQPACKET);
  672. for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
  673. vq = &vsock->vqs[i];
  674. mutex_lock(&vq->mutex);
  675. vq->acked_features = features;
  676. mutex_unlock(&vq->mutex);
  677. }
  678. mutex_unlock(&vsock->dev.mutex);
  679. return 0;
  680. err:
  681. mutex_unlock(&vsock->dev.mutex);
  682. return -EFAULT;
  683. }
  684. static long vhost_vsock_dev_ioctl(struct file *f, unsigned int ioctl,
  685. unsigned long arg)
  686. {
  687. struct vhost_vsock *vsock = f->private_data;
  688. void __user *argp = (void __user *)arg;
  689. u64 guest_cid;
  690. u64 features;
  691. int start;
  692. int r;
  693. switch (ioctl) {
  694. case VHOST_VSOCK_SET_GUEST_CID:
  695. if (copy_from_user(&guest_cid, argp, sizeof(guest_cid)))
  696. return -EFAULT;
  697. return vhost_vsock_set_cid(vsock, guest_cid);
  698. case VHOST_VSOCK_SET_RUNNING:
  699. if (copy_from_user(&start, argp, sizeof(start)))
  700. return -EFAULT;
  701. if (start)
  702. return vhost_vsock_start(vsock);
  703. else
  704. return vhost_vsock_stop(vsock, true);
  705. case VHOST_GET_FEATURES:
  706. features = VHOST_VSOCK_FEATURES;
  707. if (copy_to_user(argp, &features, sizeof(features)))
  708. return -EFAULT;
  709. return 0;
  710. case VHOST_SET_FEATURES:
  711. if (copy_from_user(&features, argp, sizeof(features)))
  712. return -EFAULT;
  713. return vhost_vsock_set_features(vsock, features);
  714. case VHOST_GET_BACKEND_FEATURES:
  715. features = VHOST_VSOCK_BACKEND_FEATURES;
  716. if (copy_to_user(argp, &features, sizeof(features)))
  717. return -EFAULT;
  718. return 0;
  719. case VHOST_SET_BACKEND_FEATURES:
  720. if (copy_from_user(&features, argp, sizeof(features)))
  721. return -EFAULT;
  722. if (features & ~VHOST_VSOCK_BACKEND_FEATURES)
  723. return -EOPNOTSUPP;
  724. vhost_set_backend_features(&vsock->dev, features);
  725. return 0;
  726. default:
  727. mutex_lock(&vsock->dev.mutex);
  728. r = vhost_dev_ioctl(&vsock->dev, ioctl, argp);
  729. if (r == -ENOIOCTLCMD)
  730. r = vhost_vring_ioctl(&vsock->dev, ioctl, argp);
  731. else
  732. vhost_vsock_flush(vsock);
  733. mutex_unlock(&vsock->dev.mutex);
  734. return r;
  735. }
  736. }
  737. static ssize_t vhost_vsock_chr_read_iter(struct kiocb *iocb, struct iov_iter *to)
  738. {
  739. struct file *file = iocb->ki_filp;
  740. struct vhost_vsock *vsock = file->private_data;
  741. struct vhost_dev *dev = &vsock->dev;
  742. int noblock = file->f_flags & O_NONBLOCK;
  743. return vhost_chr_read_iter(dev, to, noblock);
  744. }
  745. static ssize_t vhost_vsock_chr_write_iter(struct kiocb *iocb,
  746. struct iov_iter *from)
  747. {
  748. struct file *file = iocb->ki_filp;
  749. struct vhost_vsock *vsock = file->private_data;
  750. struct vhost_dev *dev = &vsock->dev;
  751. return vhost_chr_write_iter(dev, from);
  752. }
  753. static __poll_t vhost_vsock_chr_poll(struct file *file, poll_table *wait)
  754. {
  755. struct vhost_vsock *vsock = file->private_data;
  756. struct vhost_dev *dev = &vsock->dev;
  757. return vhost_chr_poll(file, dev, wait);
  758. }
/* File operations of the vhost-vsock misc device. */
static const struct file_operations vhost_vsock_fops = {
	.owner          = THIS_MODULE,
	.open           = vhost_vsock_dev_open,
	.release        = vhost_vsock_dev_release,
	.llseek		= noop_llseek,
	.unlocked_ioctl = vhost_vsock_dev_ioctl,
	/* compat callers go through compat_ptr_ioctl */
	.compat_ioctl   = compat_ptr_ioctl,
	.read_iter      = vhost_vsock_chr_read_iter,
	.write_iter     = vhost_vsock_chr_write_iter,
	.poll           = vhost_vsock_chr_poll,
};
/* Misc device named "vhost-vsock", registered in vhost_vsock_init() and
 * removed in vhost_vsock_exit().
 */
static struct miscdevice vhost_vsock_misc = {
	.minor = VHOST_VSOCK_MINOR,
	.name = "vhost-vsock",
	.fops = &vhost_vsock_fops,
};
  775. static int __init vhost_vsock_init(void)
  776. {
  777. int ret;
  778. ret = vsock_core_register(&vhost_transport.transport,
  779. VSOCK_TRANSPORT_F_H2G);
  780. if (ret < 0)
  781. return ret;
  782. ret = misc_register(&vhost_vsock_misc);
  783. if (ret) {
  784. vsock_core_unregister(&vhost_transport.transport);
  785. return ret;
  786. }
  787. return 0;
  788. };
  789. static void __exit vhost_vsock_exit(void)
  790. {
  791. misc_deregister(&vhost_vsock_misc);
  792. vsock_core_unregister(&vhost_transport.transport);
  793. };
/* Module entry/exit points and metadata. The two aliases name the module by
 * its misc minor (VHOST_VSOCK_MINOR) and by the device name "vhost-vsock".
 */
module_init(vhost_vsock_init);
module_exit(vhost_vsock_exit);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Asias He");
MODULE_DESCRIPTION("vhost transport for vsock ");
MODULE_ALIAS_MISCDEV(VHOST_VSOCK_MINOR);
MODULE_ALIAS("devname:vhost-vsock");