af_vsock.c 76 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * VMware vSockets Driver
  4. *
  5. * Copyright (C) 2007-2013 VMware, Inc. All rights reserved.
  6. */
  7. /* Implementation notes:
  8. *
  9. * - There are two kinds of sockets: those created by user action (such as
  10. * calling socket(2)) and those created by incoming connection request packets.
  11. *
  12. * - There are two "global" tables, one for bound sockets (sockets that have
  13. * specified an address that they are responsible for) and one for connected
  14. * sockets (sockets that have established a connection with another socket).
  15. * These tables are "global" in that all sockets on the system are placed
  16. * within them. Note, though, that the bound table contains an extra entry
  17. * for a list of unbound sockets and SOCK_DGRAM sockets will always remain in
  18. * that list. The bound table is used solely for lookup of sockets when packets
  19. * are received and that's not necessary for SOCK_DGRAM sockets since we create
  20. * a datagram handle for each and need not perform a lookup. Keeping SOCK_DGRAM
  21. * sockets out of the bound hash buckets will reduce the chance of collisions
  22. * when looking for SOCK_STREAM sockets and prevents us from having to check the
  23. * socket type in the hash table lookups.
  24. *
  25. * - Sockets created by user action will either be "client" sockets that
  26. * initiate a connection or "server" sockets that listen for connections; we do
  27. * not support simultaneous connects (two "client" sockets connecting).
  28. *
  29. * - "Server" sockets are referred to as listener sockets throughout this
  30. * implementation because they are in the TCP_LISTEN state. When a
  31. * connection request is received (the second kind of socket mentioned above),
  32. * we create a new socket and refer to it as a pending socket. These pending
  33. * sockets are placed on the pending connection list of the listener socket.
  34. * When future packets are received for the address the listener socket is
  35. * bound to, we check if the source of the packet is from one that has an
  36. * existing pending connection. If it does, we process the packet for the
  37. * pending socket. When that socket reaches the connected state, it is removed
  38. * from the listener socket's pending list and enqueued in the listener
  39. * socket's accept queue. Callers of accept(2) will accept connected sockets
  40. * from the listener socket's accept queue. If the socket cannot be accepted
  41. * for some reason then it is marked rejected. Once the connection is
  42. * accepted, it is owned by the user process and the responsibility for cleanup
  43. * falls with that user process.
  44. *
  45. * - It is possible that these pending sockets will never reach the connected
  46. * state; in fact, we may never receive another packet after the connection
  47. * request. Because of this, we must schedule a cleanup function to run in the
  48. * future, after some amount of time passes where a connection should have been
  49. * established. This function ensures that the socket is off all lists so it
  50. * cannot be retrieved, then drops all references to the socket so it is cleaned
  51. * up (sock_put() -> sk_free() -> our sk_destruct implementation). Note this
  52. * function will also clean up rejected sockets, those that reach the connected
  53. * state but leave it before they have been accepted.
  54. *
  55. * - Lock ordering for pending or accept queue sockets is:
  56. *
  57. * lock_sock(listener);
  58. * lock_sock_nested(pending, SINGLE_DEPTH_NESTING);
  59. *
  60. * Using explicit nested locking keeps lockdep happy since normally only one
  61. * lock of a given class may be taken at a time.
  62. *
  63. * - Sockets created by user action will be cleaned up when the user process
  64. * calls close(2), causing our release implementation to be called. Our release
  65. * implementation will perform some cleanup then drop the last reference so our
  66. * sk_destruct implementation is invoked. Our sk_destruct implementation will
  67. * perform additional cleanup that's common for both types of sockets.
  68. *
  69. * - A socket's reference count is what ensures that the structure won't be
  70. * freed. Each entry in a list (such as the "global" bound and connected tables
  71. * and the listener socket's pending list and connected queue) ensures a
  72. * reference. When we defer work until process context and pass a socket as our
  73. * argument, we must ensure the reference count is increased to ensure the
  74. * socket isn't freed before the function is run; the deferred function will
  75. * then drop the reference.
  76. *
  77. * - sk->sk_state uses the TCP state constants because they are widely used by
  78. * other address families and exposed to userspace tools like ss(8):
  79. *
  80. * TCP_CLOSE - unconnected
  81. * TCP_SYN_SENT - connecting
  82. * TCP_ESTABLISHED - connected
  83. * TCP_CLOSING - disconnecting
  84. * TCP_LISTEN - listening
  85. *
  86. * - Namespaces in vsock support two different modes: "local" and "global".
  87. * Each mode defines how the namespace interacts with CIDs.
  88. * Each namespace exposes two sysctl files:
  89. *
  90. * - /proc/sys/net/vsock/ns_mode (read-only) reports the current namespace's
  91. * mode, which is set at namespace creation and immutable thereafter.
  92. * - /proc/sys/net/vsock/child_ns_mode (write-once) controls what mode future
  93. * child namespaces will inherit when created. The initial value matches
  94. * the namespace's own ns_mode.
  95. *
  96. * Changing child_ns_mode only affects newly created namespaces, not the
  97. * current namespace or existing children. A "local" namespace cannot set
  98. * child_ns_mode to "global". child_ns_mode is write-once, so that it may be
  99. * configured and locked down by a namespace manager. Writing a different
  100. * value after the first write returns -EBUSY. At namespace creation, ns_mode
  101. * is inherited from the parent's child_ns_mode.
  102. *
  103. * The init_net mode is "global" and cannot be modified. The init_net
  104. * child_ns_mode is also write-once, so an init process (e.g. systemd) can
  105. * set it to "local" to ensure all new namespaces inherit local mode.
  106. *
  107. * The modes affect the allocation and accessibility of CIDs as follows:
  108. *
  109. * - global - access and allocation are all system-wide
  110. * - all CID allocation from global namespaces draws from the same
  111. * system-wide pool.
  112. * - if one global namespace has already allocated some CID, another
  113. * global namespace will not be able to allocate the same CID.
  114. * - global mode AF_VSOCK sockets can reach any VM or socket in any global
  115. * namespace, they are not contained to only their own namespace.
  116. * - AF_VSOCK sockets in a global mode namespace cannot reach VMs or
  117. * sockets in any local mode namespace.
  118. * - local - access and allocation are contained within the namespace
  119. * - CID allocation draws only from a private pool local only to the
  120. * namespace, and does not affect the CIDs available for allocation in any
  121. * other namespace (global or local).
  122. * - VMs in a local namespace do not collide with CIDs in any other local
  123. * namespace or any global namespace. For example, if a VM in a local mode
  124. * namespace is given CID 10, then CID 10 is still available for
  125. * allocation in any other namespace, but not in the same namespace.
  126. * - AF_VSOCK sockets in a local mode namespace can connect only to VMs or
  127. * other sockets within their own namespace.
  128. * - sockets bound to VMADDR_CID_ANY in local namespaces will never resolve
  129. * to any transport that is not compatible with local mode. There is no
  130. * error that propagates to the user (as there is for connection attempts)
  131. * because it is possible for some packet to reach this socket from
  132. * a different transport that *does* support local mode. For
  133. * example, virtio-vsock may not support local mode, but the socket
  134. * may still accept a connection from vhost-vsock which does.
  135. */
  136. #include <linux/compat.h>
  137. #include <linux/types.h>
  138. #include <linux/bitops.h>
  139. #include <linux/cred.h>
  140. #include <linux/errqueue.h>
  141. #include <linux/init.h>
  142. #include <linux/io.h>
  143. #include <linux/kernel.h>
  144. #include <linux/sched/signal.h>
  145. #include <linux/kmod.h>
  146. #include <linux/list.h>
  147. #include <linux/miscdevice.h>
  148. #include <linux/module.h>
  149. #include <linux/mutex.h>
  150. #include <linux/net.h>
  151. #include <linux/proc_fs.h>
  152. #include <linux/poll.h>
  153. #include <linux/random.h>
  154. #include <linux/skbuff.h>
  155. #include <linux/smp.h>
  156. #include <linux/socket.h>
  157. #include <linux/stddef.h>
  158. #include <linux/sysctl.h>
  159. #include <linux/unistd.h>
  160. #include <linux/wait.h>
  161. #include <linux/workqueue.h>
  162. #include <net/sock.h>
  163. #include <net/af_vsock.h>
  164. #include <net/netns/vsock.h>
  165. #include <uapi/linux/vm_sockets.h>
  166. #include <uapi/asm-generic/ioctls.h>
  167. #define VSOCK_NET_MODE_STR_GLOBAL "global"
  168. #define VSOCK_NET_MODE_STR_LOCAL "local"
  169. /* 6 chars for "global", 1 for null-terminator, and 1 more for '\n'.
  170. * The newline is added by proc_dostring() for read operations.
  171. */
  172. #define VSOCK_NET_MODE_STR_MAX 8
  173. static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr);
  174. static void vsock_sk_destruct(struct sock *sk);
  175. static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
  176. static void vsock_close(struct sock *sk, long timeout);
  177. /* Protocol family.
   *
   * struct proto instance named "AF_VSOCK": socket objects are sized as a
   * struct vsock_sock and closed through vsock_close(). When
   * CONFIG_BPF_SYSCALL is enabled, vsock_bpf_update_proto() is additionally
   * installed as the psock_update_sk_prot callback.
   */
  178. struct proto vsock_proto = {
  179. .name = "AF_VSOCK",
  180. .owner = THIS_MODULE,
  181. .obj_size = sizeof(struct vsock_sock),
  182. .close = vsock_close,
  183. #ifdef CONFIG_BPF_SYSCALL
  184. .psock_update_sk_prot = vsock_bpf_update_proto,
  185. #endif
  186. };
  187. /* The default peer timeout indicates how long we will wait for a peer response
   * to a control message. It is defined as a multiple of HZ (2 * HZ).
   */
  190. #define VSOCK_DEFAULT_CONNECT_TIMEOUT (2 * HZ)
  /* Default, maximum and minimum buffer sizes. The default and the maximum are
   * both 1024 * 256; the minimum is 128. NOTE(review): the unit is presumably
   * bytes, but that is not stated in this part of the file -- confirm.
   */
  191. #define VSOCK_DEFAULT_BUFFER_SIZE (1024 * 256)
  192. #define VSOCK_DEFAULT_BUFFER_MAX_SIZE (1024 * 256)
  193. #define VSOCK_DEFAULT_BUFFER_MIN_SIZE 128
  194. /* Transport used for host->guest communication */
  195. static const struct vsock_transport *transport_h2g;
  196. /* Transport used for guest->host communication */
  197. static const struct vsock_transport *transport_g2h;
  198. /* Transport used for DGRAM communication */
  199. static const struct vsock_transport *transport_dgram;
  200. /* Transport used for local communication */
  201. static const struct vsock_transport *transport_local;
  /* vsock_use_local_transport() asserts (via lockdep) that this mutex is held
   * while it reads transport_local and transport_g2h.
   */
  202. static DEFINE_MUTEX(vsock_register_mutex);
  203. /**** UTILS ****/
  204. /* Each bound VSocket is stored in the bind hash table and each connected
  205. * VSocket is stored in the connected hash table.
  206. *
  207. * Unbound sockets are all put on the same list attached to the end of the hash
  208. * table (vsock_unbound_sockets). Bound sockets are added to the hash table in
  209. * the bucket that their local address hashes to (vsock_bound_sockets(addr)
  210. * represents the list that addr hashes to).
  211. *
  212. * Specifically, we initialize the vsock_bind_table array to a size of
  213. * VSOCK_HASH_SIZE + 1 so that vsock_bind_table[0] through
  214. * vsock_bind_table[VSOCK_HASH_SIZE - 1] are for bound sockets and
  215. * vsock_bind_table[VSOCK_HASH_SIZE] is for unbound sockets. The hash function
  216. * mods with VSOCK_HASH_SIZE to ensure this.
  217. */
  /* NOTE(review): MAX_PORT_RETRIES is not used in this part of the file;
   * presumably it bounds retries when picking a port automatically -- confirm.
   */
  218. #define MAX_PORT_RETRIES 24
  /* Bucket index for a bound address: its port modulo VSOCK_HASH_SIZE. */
  219. #define VSOCK_HASH(addr) ((addr)->svm_port % VSOCK_HASH_SIZE)
  /* List head of the bind-table bucket that @addr hashes to. */
  220. #define vsock_bound_sockets(addr) (&vsock_bind_table[VSOCK_HASH(addr)])
  /* List head of the extra, last bind-table entry, which holds unbound sockets. */
  221. #define vsock_unbound_sockets (&vsock_bind_table[VSOCK_HASH_SIZE])
  222. /* XXX This can probably be implemented in a better way.
   *
   * Bucket index for a connection: the source CID XORed with the destination
   * port, modulo VSOCK_HASH_SIZE.
   */
  223. #define VSOCK_CONN_HASH(src, dst) \
  224. (((src)->svm_cid ^ (dst)->svm_port) % VSOCK_HASH_SIZE)
  /* List head of the connected-table bucket for the (src, dst) address pair. */
  225. #define vsock_connected_sockets(src, dst) \
  226. (&vsock_connected_table[VSOCK_CONN_HASH(src, dst)])
  /* Same, for a socket: its remote address is used as src and its local
   * address as dst.
   */
  227. #define vsock_connected_sockets_vsk(vsk) \
  228. vsock_connected_sockets(&(vsk)->remote_addr, &(vsk)->local_addr)
  /* VSOCK_HASH_SIZE hash buckets for bound sockets plus one trailing list for
   * unbound sockets (see vsock_unbound_sockets).
   */
  229. struct list_head vsock_bind_table[VSOCK_HASH_SIZE + 1];
  230. EXPORT_SYMBOL_GPL(vsock_bind_table);
  /* VSOCK_HASH_SIZE hash buckets for connected sockets. */
  231. struct list_head vsock_connected_table[VSOCK_HASH_SIZE];
  232. EXPORT_SYMBOL_GPL(vsock_connected_table);
  /* Taken with spin_lock_bh() by the insert/remove/find helpers in this file
   * around their accesses to the two tables above.
   */
  233. DEFINE_SPINLOCK(vsock_table_lock);
  234. EXPORT_SYMBOL_GPL(vsock_table_lock);
  235. /* Autobind this socket to the local address if necessary. */
  236. static int vsock_auto_bind(struct vsock_sock *vsk)
  237. {
  238. struct sock *sk = sk_vsock(vsk);
  239. struct sockaddr_vm local_addr;
  240. if (vsock_addr_bound(&vsk->local_addr))
  241. return 0;
  242. vsock_addr_init(&local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
  243. return __vsock_bind(sk, &local_addr);
  244. }
  245. static void vsock_init_tables(void)
  246. {
  247. int i;
  248. for (i = 0; i < ARRAY_SIZE(vsock_bind_table); i++)
  249. INIT_LIST_HEAD(&vsock_bind_table[i]);
  250. for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++)
  251. INIT_LIST_HEAD(&vsock_connected_table[i]);
  252. }
  253. static void __vsock_insert_bound(struct list_head *list,
  254. struct vsock_sock *vsk)
  255. {
  256. sock_hold(&vsk->sk);
  257. list_add(&vsk->bound_table, list);
  258. }
  259. static void __vsock_insert_connected(struct list_head *list,
  260. struct vsock_sock *vsk)
  261. {
  262. sock_hold(&vsk->sk);
  263. list_add(&vsk->connected_table, list);
  264. }
  265. static void __vsock_remove_bound(struct vsock_sock *vsk)
  266. {
  267. list_del_init(&vsk->bound_table);
  268. sock_put(&vsk->sk);
  269. }
  270. static void __vsock_remove_connected(struct vsock_sock *vsk)
  271. {
  272. list_del_init(&vsk->connected_table);
  273. sock_put(&vsk->sk);
  274. }
  275. static struct sock *__vsock_find_bound_socket_net(struct sockaddr_vm *addr,
  276. struct net *net)
  277. {
  278. struct vsock_sock *vsk;
  279. list_for_each_entry(vsk, vsock_bound_sockets(addr), bound_table) {
  280. struct sock *sk = sk_vsock(vsk);
  281. if (vsock_addr_equals_addr(addr, &vsk->local_addr) &&
  282. vsock_net_check_mode(sock_net(sk), net))
  283. return sk;
  284. if (addr->svm_port == vsk->local_addr.svm_port &&
  285. (vsk->local_addr.svm_cid == VMADDR_CID_ANY ||
  286. addr->svm_cid == VMADDR_CID_ANY) &&
  287. vsock_net_check_mode(sock_net(sk), net))
  288. return sk;
  289. }
  290. return NULL;
  291. }
  292. static struct sock *
  293. __vsock_find_connected_socket_net(struct sockaddr_vm *src,
  294. struct sockaddr_vm *dst, struct net *net)
  295. {
  296. struct vsock_sock *vsk;
  297. list_for_each_entry(vsk, vsock_connected_sockets(src, dst),
  298. connected_table) {
  299. struct sock *sk = sk_vsock(vsk);
  300. if (vsock_addr_equals_addr(src, &vsk->remote_addr) &&
  301. dst->svm_port == vsk->local_addr.svm_port &&
  302. vsock_net_check_mode(sock_net(sk), net)) {
  303. return sk;
  304. }
  305. }
  306. return NULL;
  307. }
  308. static void vsock_insert_unbound(struct vsock_sock *vsk)
  309. {
  310. spin_lock_bh(&vsock_table_lock);
  311. __vsock_insert_bound(vsock_unbound_sockets, vsk);
  312. spin_unlock_bh(&vsock_table_lock);
  313. }
  314. void vsock_insert_connected(struct vsock_sock *vsk)
  315. {
  316. struct list_head *list = vsock_connected_sockets(
  317. &vsk->remote_addr, &vsk->local_addr);
  318. spin_lock_bh(&vsock_table_lock);
  319. __vsock_insert_connected(list, vsk);
  320. spin_unlock_bh(&vsock_table_lock);
  321. }
  322. EXPORT_SYMBOL_GPL(vsock_insert_connected);
  323. void vsock_remove_bound(struct vsock_sock *vsk)
  324. {
  325. spin_lock_bh(&vsock_table_lock);
  326. if (__vsock_in_bound_table(vsk))
  327. __vsock_remove_bound(vsk);
  328. spin_unlock_bh(&vsock_table_lock);
  329. }
  330. EXPORT_SYMBOL_GPL(vsock_remove_bound);
  331. void vsock_remove_connected(struct vsock_sock *vsk)
  332. {
  333. spin_lock_bh(&vsock_table_lock);
  334. if (__vsock_in_connected_table(vsk))
  335. __vsock_remove_connected(vsk);
  336. spin_unlock_bh(&vsock_table_lock);
  337. }
  338. EXPORT_SYMBOL_GPL(vsock_remove_connected);
  339. /* Find a bound socket, filtering by namespace and namespace mode.
  340. *
  341. * Use this in transports that are namespace-aware and can provide the
  342. * network namespace context.
  343. */
  344. struct sock *vsock_find_bound_socket_net(struct sockaddr_vm *addr,
  345. struct net *net)
  346. {
  347. struct sock *sk;
  348. spin_lock_bh(&vsock_table_lock);
  349. sk = __vsock_find_bound_socket_net(addr, net);
  350. if (sk)
  351. sock_hold(sk);
  352. spin_unlock_bh(&vsock_table_lock);
  353. return sk;
  354. }
  355. EXPORT_SYMBOL_GPL(vsock_find_bound_socket_net);
  356. /* Find a bound socket without namespace filtering.
  357. *
  358. * Use this in transports that lack namespace context. All sockets are
  359. * treated as if in global mode.
  360. */
  361. struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr)
  362. {
  363. return vsock_find_bound_socket_net(addr, NULL);
  364. }
  365. EXPORT_SYMBOL_GPL(vsock_find_bound_socket);
  366. /* Find a connected socket, filtering by namespace and namespace mode.
  367. *
  368. * Use this in transports that are namespace-aware and can provide the
  369. * network namespace context.
  370. */
  371. struct sock *vsock_find_connected_socket_net(struct sockaddr_vm *src,
  372. struct sockaddr_vm *dst,
  373. struct net *net)
  374. {
  375. struct sock *sk;
  376. spin_lock_bh(&vsock_table_lock);
  377. sk = __vsock_find_connected_socket_net(src, dst, net);
  378. if (sk)
  379. sock_hold(sk);
  380. spin_unlock_bh(&vsock_table_lock);
  381. return sk;
  382. }
  383. EXPORT_SYMBOL_GPL(vsock_find_connected_socket_net);
  384. /* Find a connected socket without namespace filtering.
  385. *
  386. * Use this in transports that lack namespace context. All sockets are
  387. * treated as if in global mode.
  388. */
  389. struct sock *vsock_find_connected_socket(struct sockaddr_vm *src,
  390. struct sockaddr_vm *dst)
  391. {
  392. return vsock_find_connected_socket_net(src, dst, NULL);
  393. }
  394. EXPORT_SYMBOL_GPL(vsock_find_connected_socket);
  395. void vsock_remove_sock(struct vsock_sock *vsk)
  396. {
  397. /* Transport reassignment must not remove the binding. */
  398. if (sock_flag(sk_vsock(vsk), SOCK_DEAD))
  399. vsock_remove_bound(vsk);
  400. vsock_remove_connected(vsk);
  401. }
  402. EXPORT_SYMBOL_GPL(vsock_remove_sock);
  403. void vsock_for_each_connected_socket(struct vsock_transport *transport,
  404. void (*fn)(struct sock *sk))
  405. {
  406. int i;
  407. spin_lock_bh(&vsock_table_lock);
  408. for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++) {
  409. struct vsock_sock *vsk;
  410. list_for_each_entry(vsk, &vsock_connected_table[i],
  411. connected_table) {
  412. if (vsk->transport != transport)
  413. continue;
  414. fn(sk_vsock(vsk));
  415. }
  416. }
  417. spin_unlock_bh(&vsock_table_lock);
  418. }
  419. EXPORT_SYMBOL_GPL(vsock_for_each_connected_socket);
  420. void vsock_add_pending(struct sock *listener, struct sock *pending)
  421. {
  422. struct vsock_sock *vlistener;
  423. struct vsock_sock *vpending;
  424. vlistener = vsock_sk(listener);
  425. vpending = vsock_sk(pending);
  426. sock_hold(pending);
  427. sock_hold(listener);
  428. list_add_tail(&vpending->pending_links, &vlistener->pending_links);
  429. }
  430. EXPORT_SYMBOL_GPL(vsock_add_pending);
  431. void vsock_remove_pending(struct sock *listener, struct sock *pending)
  432. {
  433. struct vsock_sock *vpending = vsock_sk(pending);
  434. list_del_init(&vpending->pending_links);
  435. sock_put(listener);
  436. sock_put(pending);
  437. }
  438. EXPORT_SYMBOL_GPL(vsock_remove_pending);
  439. void vsock_enqueue_accept(struct sock *listener, struct sock *connected)
  440. {
  441. struct vsock_sock *vlistener;
  442. struct vsock_sock *vconnected;
  443. vlistener = vsock_sk(listener);
  444. vconnected = vsock_sk(connected);
  445. sock_hold(connected);
  446. sock_hold(listener);
  447. list_add_tail(&vconnected->accept_queue, &vlistener->accept_queue);
  448. }
  449. EXPORT_SYMBOL_GPL(vsock_enqueue_accept);
  450. static bool vsock_use_local_transport(unsigned int remote_cid)
  451. {
  452. lockdep_assert_held(&vsock_register_mutex);
  453. if (!transport_local)
  454. return false;
  455. if (remote_cid == VMADDR_CID_LOCAL)
  456. return true;
  457. if (transport_g2h) {
  458. return remote_cid == transport_g2h->get_local_cid();
  459. } else {
  460. return remote_cid == VMADDR_CID_HOST;
  461. }
  462. }
  463. static void vsock_deassign_transport(struct vsock_sock *vsk)
  464. {
  465. if (!vsk->transport)
  466. return;
  467. vsk->transport->destruct(vsk);
  468. module_put(vsk->transport->module);
  469. vsk->transport = NULL;
  470. }
  471. /* Assign a transport to a socket and call the .init transport callback.
  472. *
  473. * Note: for connection oriented socket this must be called when vsk->remote_addr
  474. * is set (e.g. during the connect() or when a connection request on a listener
  475. * socket is received).
  476. * The vsk->remote_addr is used to decide which transport to use:
  477. * - remote CID == VMADDR_CID_LOCAL or g2h->local_cid or VMADDR_CID_HOST if
  478. * g2h is not loaded, will use local transport;
  479. * - remote CID <= VMADDR_CID_HOST or h2g is not loaded or remote flags field
  480. * includes VMADDR_FLAG_TO_HOST flag value, will use guest->host transport;
  481. * - remote CID > VMADDR_CID_HOST will use host->guest transport;
  482. */
  483. int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk)
  484. {
  485. const struct vsock_transport *new_transport;
  486. struct sock *sk = sk_vsock(vsk);
  487. unsigned int remote_cid = vsk->remote_addr.svm_cid;
  488. __u8 remote_flags;
  489. int ret;
  490. /* If the packet is coming with the source and destination CIDs higher
  491. * than VMADDR_CID_HOST, then a vsock channel where all the packets are
  492. * forwarded to the host should be established. Then the host will
  493. * need to forward the packets to the guest.
  494. *
  495. * The flag is set on the (listen) receive path (psk is not NULL). On
  496. * the connect path the flag can be set by the user space application.
  497. */
  498. if (psk && vsk->local_addr.svm_cid > VMADDR_CID_HOST &&
  499. vsk->remote_addr.svm_cid > VMADDR_CID_HOST)
  500. vsk->remote_addr.svm_flags |= VMADDR_FLAG_TO_HOST;
  501. remote_flags = vsk->remote_addr.svm_flags;
  502. mutex_lock(&vsock_register_mutex);
  503. switch (sk->sk_type) {
  504. case SOCK_DGRAM:
  505. new_transport = transport_dgram;
  506. break;
  507. case SOCK_STREAM:
  508. case SOCK_SEQPACKET:
  509. if (vsock_use_local_transport(remote_cid))
  510. new_transport = transport_local;
  511. else if (remote_cid <= VMADDR_CID_HOST || !transport_h2g ||
  512. (remote_flags & VMADDR_FLAG_TO_HOST))
  513. new_transport = transport_g2h;
  514. else
  515. new_transport = transport_h2g;
  516. break;
  517. default:
  518. ret = -ESOCKTNOSUPPORT;
  519. goto err;
  520. }
  521. if (vsk->transport && vsk->transport == new_transport) {
  522. ret = 0;
  523. goto err;
  524. }
  525. /* We increase the module refcnt to prevent the transport unloading
  526. * while there are open sockets assigned to it.
  527. */
  528. if (!new_transport || !try_module_get(new_transport->module)) {
  529. ret = -ENODEV;
  530. goto err;
  531. }
  532. /* It's safe to release the mutex after a successful try_module_get().
  533. * Whichever transport `new_transport` points at, it won't go away until
  534. * the last module_put() below or in vsock_deassign_transport().
  535. */
  536. mutex_unlock(&vsock_register_mutex);
  537. if (vsk->transport) {
  538. /* transport->release() must be called with sock lock acquired.
  539. * This path can only be taken during vsock_connect(), where we
  540. * have already held the sock lock. In the other cases, this
  541. * function is called on a new socket which is not assigned to
  542. * any transport.
  543. */
  544. vsk->transport->release(vsk);
  545. vsock_deassign_transport(vsk);
  546. /* transport's release() and destruct() can touch some socket
  547. * state, since we are reassigning the socket to a new transport
  548. * during vsock_connect(), let's reset these fields to have a
  549. * clean state.
  550. */
  551. sock_reset_flag(sk, SOCK_DONE);
  552. sk->sk_state = TCP_CLOSE;
  553. vsk->peer_shutdown = 0;
  554. }
  555. if (sk->sk_type == SOCK_SEQPACKET) {
  556. if (!new_transport->seqpacket_allow ||
  557. !new_transport->seqpacket_allow(vsk, remote_cid)) {
  558. module_put(new_transport->module);
  559. return -ESOCKTNOSUPPORT;
  560. }
  561. }
  562. ret = new_transport->init(vsk, psk);
  563. if (ret) {
  564. module_put(new_transport->module);
  565. return ret;
  566. }
  567. vsk->transport = new_transport;
  568. return 0;
  569. err:
  570. mutex_unlock(&vsock_register_mutex);
  571. return ret;
  572. }
  573. EXPORT_SYMBOL_GPL(vsock_assign_transport);
  574. /*
  575. * Provide safe access to static transport_{h2g,g2h,dgram,local} callbacks.
  576. * Otherwise we may race with module removal. Do not use on `vsk->transport`.
  577. */
  578. static u32 vsock_registered_transport_cid(const struct vsock_transport **transport)
  579. {
  580. u32 cid = VMADDR_CID_ANY;
  581. mutex_lock(&vsock_register_mutex);
  582. if (*transport)
  583. cid = (*transport)->get_local_cid();
  584. mutex_unlock(&vsock_register_mutex);
  585. return cid;
  586. }
  587. bool vsock_find_cid(unsigned int cid)
  588. {
  589. if (cid == vsock_registered_transport_cid(&transport_g2h))
  590. return true;
  591. if (transport_h2g && cid == VMADDR_CID_HOST)
  592. return true;
  593. if (transport_local && cid == VMADDR_CID_LOCAL)
  594. return true;
  595. return false;
  596. }
  597. EXPORT_SYMBOL_GPL(vsock_find_cid);
  598. static struct sock *vsock_dequeue_accept(struct sock *listener)
  599. {
  600. struct vsock_sock *vlistener;
  601. struct vsock_sock *vconnected;
  602. vlistener = vsock_sk(listener);
  603. if (list_empty(&vlistener->accept_queue))
  604. return NULL;
  605. vconnected = list_entry(vlistener->accept_queue.next,
  606. struct vsock_sock, accept_queue);
  607. list_del_init(&vconnected->accept_queue);
  608. sock_put(listener);
  609. /* The caller will need a reference on the connected socket so we let
  610. * it call sock_put().
  611. */
  612. return sk_vsock(vconnected);
  613. }
  614. static bool vsock_is_accept_queue_empty(struct sock *sk)
  615. {
  616. struct vsock_sock *vsk = vsock_sk(sk);
  617. return list_empty(&vsk->accept_queue);
  618. }
  619. static bool vsock_is_pending(struct sock *sk)
  620. {
  621. struct vsock_sock *vsk = vsock_sk(sk);
  622. return !list_empty(&vsk->pending_links);
  623. }
  624. static int vsock_send_shutdown(struct sock *sk, int mode)
  625. {
  626. struct vsock_sock *vsk = vsock_sk(sk);
  627. if (!vsk->transport)
  628. return -ENODEV;
  629. return vsk->transport->shutdown(vsk, mode);
  630. }
  631. static void vsock_pending_work(struct work_struct *work)
  632. {
  633. struct sock *sk;
  634. struct sock *listener;
  635. struct vsock_sock *vsk;
  636. bool cleanup;
  637. vsk = container_of(work, struct vsock_sock, pending_work.work);
  638. sk = sk_vsock(vsk);
  639. listener = vsk->listener;
  640. cleanup = true;
  641. lock_sock(listener);
  642. lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
  643. if (vsock_is_pending(sk)) {
  644. vsock_remove_pending(listener, sk);
  645. sk_acceptq_removed(listener);
  646. } else if (!vsk->rejected) {
  647. /* We are not on the pending list and accept() did not reject
  648. * us, so we must have been accepted by our user process. We
  649. * just need to drop our references to the sockets and be on
  650. * our way.
  651. */
  652. cleanup = false;
  653. goto out;
  654. }
  655. /* We need to remove ourself from the global connected sockets list so
  656. * incoming packets can't find this socket, and to reduce the reference
  657. * count.
  658. */
  659. vsock_remove_connected(vsk);
  660. sk->sk_state = TCP_CLOSE;
  661. out:
  662. release_sock(sk);
  663. release_sock(listener);
  664. if (cleanup)
  665. sock_put(sk);
  666. sock_put(sk);
  667. sock_put(listener);
  668. }
  669. /**** SOCKET OPERATIONS ****/
  670. static int __vsock_bind_connectible(struct vsock_sock *vsk,
  671. struct sockaddr_vm *addr)
  672. {
  673. struct net *net = sock_net(sk_vsock(vsk));
  674. struct sockaddr_vm new_addr;
  675. if (!net->vsock.port)
  676. net->vsock.port = get_random_u32_above(LAST_RESERVED_PORT);
  677. vsock_addr_init(&new_addr, addr->svm_cid, addr->svm_port);
  678. if (addr->svm_port == VMADDR_PORT_ANY) {
  679. bool found = false;
  680. unsigned int i;
  681. for (i = 0; i < MAX_PORT_RETRIES; i++) {
  682. if (net->vsock.port == VMADDR_PORT_ANY ||
  683. net->vsock.port <= LAST_RESERVED_PORT)
  684. net->vsock.port = LAST_RESERVED_PORT + 1;
  685. new_addr.svm_port = net->vsock.port++;
  686. if (!__vsock_find_bound_socket_net(&new_addr, net)) {
  687. found = true;
  688. break;
  689. }
  690. }
  691. if (!found)
  692. return -EADDRNOTAVAIL;
  693. } else {
  694. /* If port is in reserved range, ensure caller
  695. * has necessary privileges.
  696. */
  697. if (addr->svm_port <= LAST_RESERVED_PORT &&
  698. !capable(CAP_NET_BIND_SERVICE)) {
  699. return -EACCES;
  700. }
  701. if (__vsock_find_bound_socket_net(&new_addr, net))
  702. return -EADDRINUSE;
  703. }
  704. vsock_addr_init(&vsk->local_addr, new_addr.svm_cid, new_addr.svm_port);
  705. /* Remove connection oriented sockets from the unbound list and add them
  706. * to the hash table for easy lookup by its address. The unbound list
  707. * is simply an extra entry at the end of the hash table, a trick used
  708. * by AF_UNIX.
  709. */
  710. __vsock_remove_bound(vsk);
  711. __vsock_insert_bound(vsock_bound_sockets(&vsk->local_addr), vsk);
  712. return 0;
  713. }
  714. static int __vsock_bind_dgram(struct vsock_sock *vsk,
  715. struct sockaddr_vm *addr)
  716. {
  717. return vsk->transport->dgram_bind(vsk, addr);
  718. }
  719. static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr)
  720. {
  721. struct vsock_sock *vsk = vsock_sk(sk);
  722. int retval;
  723. /* First ensure this socket isn't already bound. */
  724. if (vsock_addr_bound(&vsk->local_addr))
  725. return -EINVAL;
  726. /* Now bind to the provided address or select appropriate values if
  727. * none are provided (VMADDR_CID_ANY and VMADDR_PORT_ANY). Note that
  728. * like AF_INET prevents binding to a non-local IP address (in most
  729. * cases), we only allow binding to a local CID.
  730. */
  731. if (addr->svm_cid != VMADDR_CID_ANY && !vsock_find_cid(addr->svm_cid))
  732. return -EADDRNOTAVAIL;
  733. switch (sk->sk_socket->type) {
  734. case SOCK_STREAM:
  735. case SOCK_SEQPACKET:
  736. spin_lock_bh(&vsock_table_lock);
  737. retval = __vsock_bind_connectible(vsk, addr);
  738. spin_unlock_bh(&vsock_table_lock);
  739. break;
  740. case SOCK_DGRAM:
  741. retval = __vsock_bind_dgram(vsk, addr);
  742. break;
  743. default:
  744. retval = -EINVAL;
  745. break;
  746. }
  747. return retval;
  748. }
  749. static void vsock_connect_timeout(struct work_struct *work);
  750. static struct sock *__vsock_create(struct net *net,
  751. struct socket *sock,
  752. struct sock *parent,
  753. gfp_t priority,
  754. unsigned short type,
  755. int kern)
  756. {
  757. struct sock *sk;
  758. struct vsock_sock *psk;
  759. struct vsock_sock *vsk;
  760. sk = sk_alloc(net, AF_VSOCK, priority, &vsock_proto, kern);
  761. if (!sk)
  762. return NULL;
  763. sock_init_data(sock, sk);
  764. /* sk->sk_type is normally set in sock_init_data, but only if sock is
  765. * non-NULL. We make sure that our sockets always have a type by
  766. * setting it here if needed.
  767. */
  768. if (!sock)
  769. sk->sk_type = type;
  770. vsk = vsock_sk(sk);
  771. vsock_addr_init(&vsk->local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
  772. vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
  773. sk->sk_destruct = vsock_sk_destruct;
  774. sk->sk_backlog_rcv = vsock_queue_rcv_skb;
  775. sock_reset_flag(sk, SOCK_DONE);
  776. INIT_LIST_HEAD(&vsk->bound_table);
  777. INIT_LIST_HEAD(&vsk->connected_table);
  778. vsk->listener = NULL;
  779. INIT_LIST_HEAD(&vsk->pending_links);
  780. INIT_LIST_HEAD(&vsk->accept_queue);
  781. vsk->rejected = false;
  782. vsk->sent_request = false;
  783. vsk->ignore_connecting_rst = false;
  784. vsk->peer_shutdown = 0;
  785. INIT_DELAYED_WORK(&vsk->connect_work, vsock_connect_timeout);
  786. INIT_DELAYED_WORK(&vsk->pending_work, vsock_pending_work);
  787. psk = parent ? vsock_sk(parent) : NULL;
  788. if (parent) {
  789. vsk->trusted = psk->trusted;
  790. vsk->owner = get_cred(psk->owner);
  791. vsk->connect_timeout = psk->connect_timeout;
  792. vsk->buffer_size = psk->buffer_size;
  793. vsk->buffer_min_size = psk->buffer_min_size;
  794. vsk->buffer_max_size = psk->buffer_max_size;
  795. security_sk_clone(parent, sk);
  796. } else {
  797. vsk->trusted = ns_capable_noaudit(&init_user_ns, CAP_NET_ADMIN);
  798. vsk->owner = get_current_cred();
  799. vsk->connect_timeout = VSOCK_DEFAULT_CONNECT_TIMEOUT;
  800. vsk->buffer_size = VSOCK_DEFAULT_BUFFER_SIZE;
  801. vsk->buffer_min_size = VSOCK_DEFAULT_BUFFER_MIN_SIZE;
  802. vsk->buffer_max_size = VSOCK_DEFAULT_BUFFER_MAX_SIZE;
  803. }
  804. return sk;
  805. }
  806. static bool sock_type_connectible(u16 type)
  807. {
  808. return (type == SOCK_STREAM) || (type == SOCK_SEQPACKET);
  809. }
  810. static void __vsock_release(struct sock *sk, int level)
  811. {
  812. struct vsock_sock *vsk;
  813. struct sock *pending;
  814. vsk = vsock_sk(sk);
  815. pending = NULL; /* Compiler warning. */
  816. /* When "level" is SINGLE_DEPTH_NESTING, use the nested
  817. * version to avoid the warning "possible recursive locking
  818. * detected". When "level" is 0, lock_sock_nested(sk, level)
  819. * is the same as lock_sock(sk).
  820. */
  821. lock_sock_nested(sk, level);
  822. /* Indicate to vsock_remove_sock() that the socket is being released and
  823. * can be removed from the bound_table. Unlike transport reassignment
  824. * case, where the socket must remain bound despite vsock_remove_sock()
  825. * being called from the transport release() callback.
  826. */
  827. sock_set_flag(sk, SOCK_DEAD);
  828. if (vsk->transport)
  829. vsk->transport->release(vsk);
  830. else if (sock_type_connectible(sk->sk_type))
  831. vsock_remove_sock(vsk);
  832. sock_orphan(sk);
  833. sk->sk_shutdown = SHUTDOWN_MASK;
  834. skb_queue_purge(&sk->sk_receive_queue);
  835. /* Clean up any sockets that never were accepted. */
  836. while ((pending = vsock_dequeue_accept(sk)) != NULL) {
  837. __vsock_release(pending, SINGLE_DEPTH_NESTING);
  838. sock_put(pending);
  839. }
  840. release_sock(sk);
  841. sock_put(sk);
  842. }
  843. static void vsock_sk_destruct(struct sock *sk)
  844. {
  845. struct vsock_sock *vsk = vsock_sk(sk);
  846. /* Flush MSG_ZEROCOPY leftovers. */
  847. __skb_queue_purge(&sk->sk_error_queue);
  848. vsock_deassign_transport(vsk);
  849. /* When clearing these addresses, there's no need to set the family and
  850. * possibly register the address family with the kernel.
  851. */
  852. vsock_addr_init(&vsk->local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
  853. vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
  854. put_cred(vsk->owner);
  855. }
  /* sk_backlog_rcv callback installed by __vsock_create(): queue @skb on
   * @sk via sock_queue_rcv_skb() and free the skb if queueing fails.
   * Returns the sock_queue_rcv_skb() result.
   */
  static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
  {
  	int rc = sock_queue_rcv_skb(sk, skb);

  	if (rc)
  		kfree_skb(skb);

  	return rc;
  }
  864. struct sock *vsock_create_connected(struct sock *parent)
  865. {
  866. return __vsock_create(sock_net(parent), NULL, parent, GFP_KERNEL,
  867. parent->sk_type, 0);
  868. }
  869. EXPORT_SYMBOL_GPL(vsock_create_connected);
  870. s64 vsock_stream_has_data(struct vsock_sock *vsk)
  871. {
  872. if (WARN_ON(!vsk->transport))
  873. return 0;
  874. return vsk->transport->stream_has_data(vsk);
  875. }
  876. EXPORT_SYMBOL_GPL(vsock_stream_has_data);
  877. s64 vsock_connectible_has_data(struct vsock_sock *vsk)
  878. {
  879. struct sock *sk = sk_vsock(vsk);
  880. if (WARN_ON(!vsk->transport))
  881. return 0;
  882. if (sk->sk_type == SOCK_SEQPACKET)
  883. return vsk->transport->seqpacket_has_data(vsk);
  884. else
  885. return vsock_stream_has_data(vsk);
  886. }
  887. EXPORT_SYMBOL_GPL(vsock_connectible_has_data);
  888. s64 vsock_stream_has_space(struct vsock_sock *vsk)
  889. {
  890. if (WARN_ON(!vsk->transport))
  891. return 0;
  892. return vsk->transport->stream_has_space(vsk);
  893. }
  894. EXPORT_SYMBOL_GPL(vsock_stream_has_space);
  895. void vsock_data_ready(struct sock *sk)
  896. {
  897. struct vsock_sock *vsk = vsock_sk(sk);
  898. if (vsock_stream_has_data(vsk) >= sk->sk_rcvlowat ||
  899. sock_flag(sk, SOCK_DONE))
  900. sk->sk_data_ready(sk);
  901. }
  902. EXPORT_SYMBOL_GPL(vsock_data_ready);
  /* Dummy callback required by sockmap.
   * See unconditional call of saved_close() in sock_map_close().
   *
   * All teardown is done by vsock_release(); this hook does nothing.
   */
  static void vsock_close(struct sock *sk, long timeout)
  {
  	/* Intentionally empty. */
  }
  909. static int vsock_release(struct socket *sock)
  910. {
  911. struct sock *sk = sock->sk;
  912. if (!sk)
  913. return 0;
  914. sk->sk_prot->close(sk, 0);
  915. __vsock_release(sk, 0);
  916. sock->sk = NULL;
  917. sock->state = SS_FREE;
  918. return 0;
  919. }
  920. static int
  921. vsock_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len)
  922. {
  923. int err;
  924. struct sock *sk;
  925. struct sockaddr_vm *vm_addr;
  926. sk = sock->sk;
  927. if (vsock_addr_cast(addr, addr_len, &vm_addr) != 0)
  928. return -EINVAL;
  929. lock_sock(sk);
  930. err = __vsock_bind(sk, vm_addr);
  931. release_sock(sk);
  932. return err;
  933. }
  934. static int vsock_getname(struct socket *sock,
  935. struct sockaddr *addr, int peer)
  936. {
  937. int err;
  938. struct sock *sk;
  939. struct vsock_sock *vsk;
  940. struct sockaddr_vm *vm_addr;
  941. sk = sock->sk;
  942. vsk = vsock_sk(sk);
  943. err = 0;
  944. lock_sock(sk);
  945. if (peer) {
  946. if (sock->state != SS_CONNECTED) {
  947. err = -ENOTCONN;
  948. goto out;
  949. }
  950. vm_addr = &vsk->remote_addr;
  951. } else {
  952. vm_addr = &vsk->local_addr;
  953. }
  954. BUILD_BUG_ON(sizeof(*vm_addr) > sizeof(struct sockaddr_storage));
  955. memcpy(addr, vm_addr, sizeof(*vm_addr));
  956. err = sizeof(*vm_addr);
  957. out:
  958. release_sock(sk);
  959. return err;
  960. }
  961. void vsock_linger(struct sock *sk)
  962. {
  963. DEFINE_WAIT_FUNC(wait, woken_wake_function);
  964. ssize_t (*unsent)(struct vsock_sock *vsk);
  965. struct vsock_sock *vsk = vsock_sk(sk);
  966. long timeout;
  967. if (!sock_flag(sk, SOCK_LINGER))
  968. return;
  969. timeout = sk->sk_lingertime;
  970. if (!timeout)
  971. return;
  972. /* Transports must implement `unsent_bytes` if they want to support
  973. * SOCK_LINGER through `vsock_linger()` since we use it to check when
  974. * the socket can be closed.
  975. */
  976. unsent = vsk->transport->unsent_bytes;
  977. if (!unsent)
  978. return;
  979. add_wait_queue(sk_sleep(sk), &wait);
  980. do {
  981. if (sk_wait_event(sk, &timeout, unsent(vsk) == 0, &wait))
  982. break;
  983. } while (!signal_pending(current) && timeout);
  984. remove_wait_queue(sk_sleep(sk), &wait);
  985. }
  986. EXPORT_SYMBOL_GPL(vsock_linger);
  987. static int vsock_shutdown(struct socket *sock, int mode)
  988. {
  989. int err;
  990. struct sock *sk;
  991. /* User level uses SHUT_RD (0) and SHUT_WR (1), but the kernel uses
  992. * RCV_SHUTDOWN (1) and SEND_SHUTDOWN (2), so we must increment mode
  993. * here like the other address families do. Note also that the
  994. * increment makes SHUT_RDWR (2) into RCV_SHUTDOWN | SEND_SHUTDOWN (3),
  995. * which is what we want.
  996. */
  997. mode++;
  998. if ((mode & ~SHUTDOWN_MASK) || !mode)
  999. return -EINVAL;
  1000. /* If this is a connection oriented socket and it is not connected then
  1001. * bail out immediately. If it is a DGRAM socket then we must first
  1002. * kick the socket so that it wakes up from any sleeping calls, for
  1003. * example recv(), and then afterwards return the error.
  1004. */
  1005. sk = sock->sk;
  1006. lock_sock(sk);
  1007. if (sock->state == SS_UNCONNECTED) {
  1008. err = -ENOTCONN;
  1009. if (sock_type_connectible(sk->sk_type))
  1010. goto out;
  1011. } else {
  1012. sock->state = SS_DISCONNECTING;
  1013. err = 0;
  1014. }
  1015. /* Receive and send shutdowns are treated alike. */
  1016. mode = mode & (RCV_SHUTDOWN | SEND_SHUTDOWN);
  1017. if (mode) {
  1018. sk->sk_shutdown |= mode;
  1019. sk->sk_state_change(sk);
  1020. if (sock_type_connectible(sk->sk_type)) {
  1021. sock_reset_flag(sk, SOCK_DONE);
  1022. vsock_send_shutdown(sk, mode);
  1023. }
  1024. }
  1025. out:
  1026. release_sock(sk);
  1027. return err;
  1028. }
  1029. static __poll_t vsock_poll(struct file *file, struct socket *sock,
  1030. poll_table *wait)
  1031. {
  1032. struct sock *sk;
  1033. __poll_t mask;
  1034. struct vsock_sock *vsk;
  1035. sk = sock->sk;
  1036. vsk = vsock_sk(sk);
  1037. poll_wait(file, sk_sleep(sk), wait);
  1038. mask = 0;
  1039. if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
  1040. /* Signify that there has been an error on this socket. */
  1041. mask |= EPOLLERR;
  1042. /* INET sockets treat local write shutdown and peer write shutdown as a
  1043. * case of EPOLLHUP set.
  1044. */
  1045. if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
  1046. ((sk->sk_shutdown & SEND_SHUTDOWN) &&
  1047. (vsk->peer_shutdown & SEND_SHUTDOWN))) {
  1048. mask |= EPOLLHUP;
  1049. }
  1050. if (sk->sk_shutdown & RCV_SHUTDOWN ||
  1051. vsk->peer_shutdown & SEND_SHUTDOWN) {
  1052. mask |= EPOLLRDHUP;
  1053. }
  1054. if (sk_is_readable(sk))
  1055. mask |= EPOLLIN | EPOLLRDNORM;
  1056. if (sock->type == SOCK_DGRAM) {
  1057. /* For datagram sockets we can read if there is something in
  1058. * the queue and write as long as the socket isn't shutdown for
  1059. * sending.
  1060. */
  1061. if (!skb_queue_empty_lockless(&sk->sk_receive_queue) ||
  1062. (sk->sk_shutdown & RCV_SHUTDOWN)) {
  1063. mask |= EPOLLIN | EPOLLRDNORM;
  1064. }
  1065. if (!(sk->sk_shutdown & SEND_SHUTDOWN))
  1066. mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
  1067. } else if (sock_type_connectible(sk->sk_type)) {
  1068. const struct vsock_transport *transport;
  1069. lock_sock(sk);
  1070. transport = vsk->transport;
  1071. /* Listening sockets that have connections in their accept
  1072. * queue can be read.
  1073. */
  1074. if (sk->sk_state == TCP_LISTEN
  1075. && !vsock_is_accept_queue_empty(sk))
  1076. mask |= EPOLLIN | EPOLLRDNORM;
  1077. /* If there is something in the queue then we can read. */
  1078. if (transport && transport->stream_is_active(vsk) &&
  1079. !(sk->sk_shutdown & RCV_SHUTDOWN)) {
  1080. bool data_ready_now = false;
  1081. int target = sock_rcvlowat(sk, 0, INT_MAX);
  1082. int ret = transport->notify_poll_in(
  1083. vsk, target, &data_ready_now);
  1084. if (ret < 0) {
  1085. mask |= EPOLLERR;
  1086. } else {
  1087. if (data_ready_now)
  1088. mask |= EPOLLIN | EPOLLRDNORM;
  1089. }
  1090. }
  1091. /* Sockets whose connections have been closed, reset, or
  1092. * terminated should also be considered read, and we check the
  1093. * shutdown flag for that.
  1094. */
  1095. if (sk->sk_shutdown & RCV_SHUTDOWN ||
  1096. vsk->peer_shutdown & SEND_SHUTDOWN) {
  1097. mask |= EPOLLIN | EPOLLRDNORM;
  1098. }
  1099. /* Connected sockets that can produce data can be written. */
  1100. if (transport && sk->sk_state == TCP_ESTABLISHED) {
  1101. if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
  1102. bool space_avail_now = false;
  1103. int ret = transport->notify_poll_out(
  1104. vsk, 1, &space_avail_now);
  1105. if (ret < 0) {
  1106. mask |= EPOLLERR;
  1107. } else {
  1108. if (space_avail_now)
  1109. /* Remove EPOLLWRBAND since INET
  1110. * sockets are not setting it.
  1111. */
  1112. mask |= EPOLLOUT | EPOLLWRNORM;
  1113. }
  1114. }
  1115. }
  1116. /* Simulate INET socket poll behaviors, which sets
  1117. * EPOLLOUT|EPOLLWRNORM when peer is closed and nothing to read,
  1118. * but local send is not shutdown.
  1119. */
  1120. if (sk->sk_state == TCP_CLOSE || sk->sk_state == TCP_CLOSING) {
  1121. if (!(sk->sk_shutdown & SEND_SHUTDOWN))
  1122. mask |= EPOLLOUT | EPOLLWRNORM;
  1123. }
  1124. release_sock(sk);
  1125. }
  1126. return mask;
  1127. }
  1128. static int vsock_read_skb(struct sock *sk, skb_read_actor_t read_actor)
  1129. {
  1130. struct vsock_sock *vsk = vsock_sk(sk);
  1131. if (WARN_ON_ONCE(!vsk->transport))
  1132. return -ENODEV;
  1133. return vsk->transport->read_skb(vsk, read_actor);
  1134. }
  1135. static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
  1136. size_t len)
  1137. {
  1138. int err;
  1139. struct sock *sk;
  1140. struct vsock_sock *vsk;
  1141. struct sockaddr_vm *remote_addr;
  1142. const struct vsock_transport *transport;
  1143. if (msg->msg_flags & MSG_OOB)
  1144. return -EOPNOTSUPP;
  1145. /* For now, MSG_DONTWAIT is always assumed... */
  1146. err = 0;
  1147. sk = sock->sk;
  1148. vsk = vsock_sk(sk);
  1149. lock_sock(sk);
  1150. transport = vsk->transport;
  1151. err = vsock_auto_bind(vsk);
  1152. if (err)
  1153. goto out;
  1154. /* If the provided message contains an address, use that. Otherwise
  1155. * fall back on the socket's remote handle (if it has been connected).
  1156. */
  1157. if (msg->msg_name &&
  1158. vsock_addr_cast(msg->msg_name, msg->msg_namelen,
  1159. &remote_addr) == 0) {
  1160. /* Ensure this address is of the right type and is a valid
  1161. * destination.
  1162. */
  1163. if (remote_addr->svm_cid == VMADDR_CID_ANY)
  1164. remote_addr->svm_cid = transport->get_local_cid();
  1165. if (!vsock_addr_bound(remote_addr)) {
  1166. err = -EINVAL;
  1167. goto out;
  1168. }
  1169. } else if (sock->state == SS_CONNECTED) {
  1170. remote_addr = &vsk->remote_addr;
  1171. if (remote_addr->svm_cid == VMADDR_CID_ANY)
  1172. remote_addr->svm_cid = transport->get_local_cid();
  1173. /* XXX Should connect() or this function ensure remote_addr is
  1174. * bound?
  1175. */
  1176. if (!vsock_addr_bound(&vsk->remote_addr)) {
  1177. err = -EINVAL;
  1178. goto out;
  1179. }
  1180. } else {
  1181. err = -EINVAL;
  1182. goto out;
  1183. }
  1184. if (!transport->dgram_allow(vsk, remote_addr->svm_cid,
  1185. remote_addr->svm_port)) {
  1186. err = -EINVAL;
  1187. goto out;
  1188. }
  1189. err = transport->dgram_enqueue(vsk, remote_addr, msg, len);
  1190. out:
  1191. release_sock(sk);
  1192. return err;
  1193. }
  1194. static int vsock_dgram_connect(struct socket *sock,
  1195. struct sockaddr_unsized *addr, int addr_len, int flags)
  1196. {
  1197. int err;
  1198. struct sock *sk;
  1199. struct vsock_sock *vsk;
  1200. struct sockaddr_vm *remote_addr;
  1201. sk = sock->sk;
  1202. vsk = vsock_sk(sk);
  1203. err = vsock_addr_cast(addr, addr_len, &remote_addr);
  1204. if (err == -EAFNOSUPPORT && remote_addr->svm_family == AF_UNSPEC) {
  1205. lock_sock(sk);
  1206. vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY,
  1207. VMADDR_PORT_ANY);
  1208. sock->state = SS_UNCONNECTED;
  1209. release_sock(sk);
  1210. return 0;
  1211. } else if (err != 0)
  1212. return -EINVAL;
  1213. lock_sock(sk);
  1214. err = vsock_auto_bind(vsk);
  1215. if (err)
  1216. goto out;
  1217. if (!vsk->transport->dgram_allow(vsk, remote_addr->svm_cid,
  1218. remote_addr->svm_port)) {
  1219. err = -EINVAL;
  1220. goto out;
  1221. }
  1222. memcpy(&vsk->remote_addr, remote_addr, sizeof(vsk->remote_addr));
  1223. sock->state = SS_CONNECTED;
  1224. /* sock map disallows redirection of non-TCP sockets with sk_state !=
  1225. * TCP_ESTABLISHED (see sock_map_redirect_allowed()), so we set
  1226. * TCP_ESTABLISHED here to allow redirection of connected vsock dgrams.
  1227. *
  1228. * This doesn't seem to be abnormal state for datagram sockets, as the
  1229. * same approach can be see in other datagram socket types as well
  1230. * (such as unix sockets).
  1231. */
  1232. sk->sk_state = TCP_ESTABLISHED;
  1233. out:
  1234. release_sock(sk);
  1235. return err;
  1236. }
  1237. int __vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
  1238. size_t len, int flags)
  1239. {
  1240. struct sock *sk = sock->sk;
  1241. struct vsock_sock *vsk = vsock_sk(sk);
  1242. return vsk->transport->dgram_dequeue(vsk, msg, len, flags);
  1243. }
  1244. int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
  1245. size_t len, int flags)
  1246. {
  1247. #ifdef CONFIG_BPF_SYSCALL
  1248. struct sock *sk = sock->sk;
  1249. const struct proto *prot;
  1250. prot = READ_ONCE(sk->sk_prot);
  1251. if (prot != &vsock_proto)
  1252. return prot->recvmsg(sk, msg, len, flags, NULL);
  1253. #endif
  1254. return __vsock_dgram_recvmsg(sock, msg, len, flags);
  1255. }
  1256. EXPORT_SYMBOL_GPL(vsock_dgram_recvmsg);
  1257. static int vsock_do_ioctl(struct socket *sock, unsigned int cmd,
  1258. int __user *arg)
  1259. {
  1260. struct sock *sk = sock->sk;
  1261. struct vsock_sock *vsk;
  1262. int ret;
  1263. vsk = vsock_sk(sk);
  1264. switch (cmd) {
  1265. case SIOCINQ: {
  1266. ssize_t n_bytes;
  1267. if (!vsk->transport) {
  1268. ret = -EOPNOTSUPP;
  1269. break;
  1270. }
  1271. if (sock_type_connectible(sk->sk_type) &&
  1272. sk->sk_state == TCP_LISTEN) {
  1273. ret = -EINVAL;
  1274. break;
  1275. }
  1276. n_bytes = vsock_stream_has_data(vsk);
  1277. if (n_bytes < 0) {
  1278. ret = n_bytes;
  1279. break;
  1280. }
  1281. ret = put_user(n_bytes, arg);
  1282. break;
  1283. }
  1284. case SIOCOUTQ: {
  1285. ssize_t n_bytes;
  1286. if (!vsk->transport || !vsk->transport->unsent_bytes) {
  1287. ret = -EOPNOTSUPP;
  1288. break;
  1289. }
  1290. if (sock_type_connectible(sk->sk_type) && sk->sk_state == TCP_LISTEN) {
  1291. ret = -EINVAL;
  1292. break;
  1293. }
  1294. n_bytes = vsk->transport->unsent_bytes(vsk);
  1295. if (n_bytes < 0) {
  1296. ret = n_bytes;
  1297. break;
  1298. }
  1299. ret = put_user(n_bytes, arg);
  1300. break;
  1301. }
  1302. default:
  1303. ret = -ENOIOCTLCMD;
  1304. }
  1305. return ret;
  1306. }
  1307. static int vsock_ioctl(struct socket *sock, unsigned int cmd,
  1308. unsigned long arg)
  1309. {
  1310. int ret;
  1311. lock_sock(sock->sk);
  1312. ret = vsock_do_ioctl(sock, cmd, (int __user *)arg);
  1313. release_sock(sock->sk);
  1314. return ret;
  1315. }
  1316. static const struct proto_ops vsock_dgram_ops = {
  1317. .family = PF_VSOCK,
  1318. .owner = THIS_MODULE,
  1319. .release = vsock_release,
  1320. .bind = vsock_bind,
  1321. .connect = vsock_dgram_connect,
  1322. .socketpair = sock_no_socketpair,
  1323. .accept = sock_no_accept,
  1324. .getname = vsock_getname,
  1325. .poll = vsock_poll,
  1326. .ioctl = vsock_ioctl,
  1327. .listen = sock_no_listen,
  1328. .shutdown = vsock_shutdown,
  1329. .sendmsg = vsock_dgram_sendmsg,
  1330. .recvmsg = vsock_dgram_recvmsg,
  1331. .mmap = sock_no_mmap,
  1332. .read_skb = vsock_read_skb,
  1333. };
  1334. static int vsock_transport_cancel_pkt(struct vsock_sock *vsk)
  1335. {
  1336. const struct vsock_transport *transport = vsk->transport;
  1337. if (!transport || !transport->cancel_pkt)
  1338. return -EOPNOTSUPP;
  1339. return transport->cancel_pkt(vsk);
  1340. }
  1341. static void vsock_connect_timeout(struct work_struct *work)
  1342. {
  1343. struct sock *sk;
  1344. struct vsock_sock *vsk;
  1345. vsk = container_of(work, struct vsock_sock, connect_work.work);
  1346. sk = sk_vsock(vsk);
  1347. lock_sock(sk);
  1348. if (sk->sk_state == TCP_SYN_SENT &&
  1349. (sk->sk_shutdown != SHUTDOWN_MASK)) {
  1350. sk->sk_state = TCP_CLOSE;
  1351. sk->sk_socket->state = SS_UNCONNECTED;
  1352. sk->sk_err = ETIMEDOUT;
  1353. sk_error_report(sk);
  1354. vsock_transport_cancel_pkt(vsk);
  1355. }
  1356. release_sock(sk);
  1357. sock_put(sk);
  1358. }
  /* proto_ops connect handler for the stream and seqpacket ops tables.
   *
   * With the socket lock held, the function dispatches on sock->state:
   *   - SS_CONNECTED:     fails with -EISCONN.
   *   - SS_DISCONNECTING: fails with -EINVAL.
   *   - SS_CONNECTING:    returns -EALREADY for O_NONBLOCK callers, otherwise
   *                       falls through to wait for the pending attempt.
   *   - anything else:    starts a new attempt (validate address, record the
   *                       remote address, assign a transport, auto-bind, set
   *                       TCP_SYN_SENT and call transport->connect()).
   *
   * After a new attempt has been started, or when re-entering in
   * SS_CONNECTING, the caller either schedules vsk->connect_work and returns
   * immediately (O_NONBLOCK) or sleeps for up to vsk->connect_timeout jiffies.
   *
   * Returns 0 once the socket is observed without sk_err after the wait
   * loop, or a negative errno: -EINPROGRESS/-EALREADY for non-blocking
   * callers, -ETIMEDOUT on timeout, the sock_intr_errno() value on a signal,
   * -sk_err when the socket carries an error, or the setup error.
   */
  1359. static int vsock_connect(struct socket *sock, struct sockaddr_unsized *addr,
  1360. int addr_len, int flags)
  1361. {
  1362. int err;
  1363. struct sock *sk;
  1364. struct vsock_sock *vsk;
  1365. const struct vsock_transport *transport;
  1366. struct sockaddr_vm *remote_addr;
  1367. long timeout;
  1368. DEFINE_WAIT(wait);
  1369. err = 0;
  1370. sk = sock->sk;
  1371. vsk = vsock_sk(sk);
  1372. lock_sock(sk);
  1373. /* XXX AF_UNSPEC should make us disconnect like AF_INET. */
  1374. switch (sock->state) {
  1375. case SS_CONNECTED:
  1376. err = -EISCONN;
  1377. goto out;
  1378. case SS_DISCONNECTING:
  1379. err = -EINVAL;
  1380. goto out;
  1381. case SS_CONNECTING:
  1382. /* This continues on so we can move sock into the SS_CONNECTED
  1383. * state once the connection has completed (at which point err
  1384. * will be set to zero also). Otherwise, we will either wait
  1385. * for the connection or return -EALREADY should this be a
  1386. * non-blocking call.
  1387. */
  1388. err = -EALREADY;
  1389. if (flags & O_NONBLOCK)
  1390. goto out;
  1391. break;
  1392. default:
  /* New connection attempt: a listening socket or an address that
   * vsock_addr_cast() rejects is refused with -EINVAL.
   */
  1393. if ((sk->sk_state == TCP_LISTEN) ||
  1394. vsock_addr_cast(addr, addr_len, &remote_addr) != 0) {
  1395. err = -EINVAL;
  1396. goto out;
  1397. }
  1398. /* Set the remote address that we are connecting to. */
  1399. memcpy(&vsk->remote_addr, remote_addr,
  1400. sizeof(vsk->remote_addr));
  1401. err = vsock_assign_transport(vsk, NULL);
  1402. if (err)
  1403. goto out;
  1404. transport = vsk->transport;
  1405. /* The hypervisor and well-known contexts do not have socket
  1406. * endpoints.
  1407. */
  1408. if (!transport ||
  1409. !transport->stream_allow(vsk, remote_addr->svm_cid,
  1410. remote_addr->svm_port)) {
  1411. err = -ENETUNREACH;
  1412. goto out;
  1413. }
  1414. if (vsock_msgzerocopy_allow(transport)) {
  1415. set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
  1416. } else if (sock_flag(sk, SOCK_ZEROCOPY)) {
  1417. /* If this option was set before 'connect()',
  1418. * when transport was unknown, check that this
  1419. * feature is supported here.
  1420. */
  1421. err = -EOPNOTSUPP;
  1422. goto out;
  1423. }
  1424. err = vsock_auto_bind(vsk);
  1425. if (err)
  1426. goto out;
  /* sk_state is switched to TCP_SYN_SENT before the transport is
   * asked to connect; it is not reset here if connect() fails.
   */
  1427. sk->sk_state = TCP_SYN_SENT;
  1428. err = transport->connect(vsk);
  1429. if (err < 0)
  1430. goto out;
  1431. /* sk_err might have been set as a result of an earlier
  1432. * (failed) connect attempt.
  1433. */
  1434. sk->sk_err = 0;
  1435. /* Mark sock as connecting and set the error code to in
  1436. * progress in case this is a non-blocking connect.
  1437. */
  1438. sock->state = SS_CONNECTING;
  1439. err = -EINPROGRESS;
  1440. }
  1441. /* The receive path will handle all communication until we are able to
  1442. * enter the connected state. Here we wait for the connection to be
  1443. * completed or a notification of an error.
  1444. */
  1445. timeout = vsk->connect_timeout;
  1446. prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
  1447. /* If the socket is already closing or it is in an error state, there
  1448. * is no point in waiting.
  1449. */
  1450. while (sk->sk_state != TCP_ESTABLISHED &&
  1451. sk->sk_state != TCP_CLOSING && sk->sk_err == 0) {
  1452. if (flags & O_NONBLOCK) {
  1453. /* If we're not going to block, we schedule a timeout
  1454. * function to generate a timeout on the connection
  1455. * attempt, in case the peer doesn't respond in a
  1456. * timely manner. We hold on to the socket until the
  1457. * timeout fires.
  1458. */
  1459. sock_hold(sk);
  1460. /* If the timeout function is already scheduled,
  1461. * reschedule it, then ungrab the socket refcount to
  1462. * keep it balanced.
  1463. */
  1464. if (mod_delayed_work(system_percpu_wq, &vsk->connect_work,
  1465. timeout))
  1466. sock_put(sk);
  1467. /* Skip ahead to preserve error code set above. */
  1468. goto out_wait;
  1469. }
  /* Blocking path: sleep with the socket lock dropped; timeout is
   * updated to the value schedule_timeout() returns.
   */
  1470. release_sock(sk);
  1471. timeout = schedule_timeout(timeout);
  1472. lock_sock(sk);
  1473. /* Connection established. Whatever happens to socket once we
  1474. * release it, that's not connect()'s concern. No need to go
  1475. * into signal and timeout handling. Call it a day.
  1476. *
  1477. * Note that allowing to "reset" an already established socket
  1478. * here is racy and insecure.
  1479. */
  1480. if (sk->sk_state == TCP_ESTABLISHED)
  1481. break;
  1482. /* If connection was _not_ established and a signal/timeout came
  1483. * to be, we want the socket's state reset. User space may want
  1484. * to retry.
  1485. *
  1486. * sk_state != TCP_ESTABLISHED implies that socket is not on
  1487. * vsock_connected_table. We keep the binding and the transport
  1488. * assigned.
  1489. */
  1490. if (signal_pending(current) || timeout == 0) {
  1491. err = timeout == 0 ? -ETIMEDOUT : sock_intr_errno(timeout);
  1492. /* Listener might have already responded with
  1493. * VIRTIO_VSOCK_OP_RESPONSE. Its handling expects our
  1494. * sk_state == TCP_SYN_SENT, which hereby we break.
  1495. * In such case VIRTIO_VSOCK_OP_RST will follow.
  1496. */
  1497. sk->sk_state = TCP_CLOSE;
  1498. sock->state = SS_UNCONNECTED;
  1499. /* Try to cancel VIRTIO_VSOCK_OP_REQUEST skb sent out by
  1500. * transport->connect().
  1501. */
  1502. vsock_transport_cancel_pkt(vsk);
  1503. goto out_wait;
  1504. }
  1505. prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
  1506. }
  /* Loop left normally: translate a pending socket error into the
   * return value and reset the state, otherwise report success.
   */
  1507. if (sk->sk_err) {
  1508. err = -sk->sk_err;
  1509. sk->sk_state = TCP_CLOSE;
  1510. sock->state = SS_UNCONNECTED;
  1511. } else {
  1512. err = 0;
  1513. }
  1514. out_wait:
  1515. finish_wait(sk_sleep(sk), &wait);
  1516. out:
  1517. release_sock(sk);
  1518. return err;
  1519. }
  /* proto_ops accept handler for the stream and seqpacket ops tables.
   *
   * With the listener locked, waits until vsock_dequeue_accept() yields a
   * child socket or the listener has sk_err set.  The wait is bounded by
   * sock_rcvtimeo() of the listener (taking O_NONBLOCK in arg->flags into
   * account) and is interruptible.
   *
   * On success the child is grafted onto @newsock, which is marked
   * SS_CONNECTED.  If the listener carries an error while a child was
   * dequeued, the child is only marked rejected and its reference dropped.
   *
   * Returns 0 on success, -EOPNOTSUPP for a non-connectible socket type,
   * -EINVAL if the listener is not in TCP_LISTEN, -EAGAIN when the timeout
   * reaches zero, the sock_intr_errno() value on a signal, or -sk_err of the
   * listener.
   */
  1520. static int vsock_accept(struct socket *sock, struct socket *newsock,
  1521. struct proto_accept_arg *arg)
  1522. {
  1523. struct sock *listener;
  1524. int err;
  1525. struct sock *connected;
  1526. struct vsock_sock *vconnected;
  1527. long timeout;
  1528. DEFINE_WAIT(wait);
  1529. err = 0;
  1530. listener = sock->sk;
  1531. lock_sock(listener);
  1532. if (!sock_type_connectible(sock->type)) {
  1533. err = -EOPNOTSUPP;
  1534. goto out;
  1535. }
  1536. if (listener->sk_state != TCP_LISTEN) {
  1537. err = -EINVAL;
  1538. goto out;
  1539. }
  1540. /* Wait for children sockets to appear; these are the new sockets
  1541. * created upon connection establishment.
  1542. */
  1543. timeout = sock_rcvtimeo(listener, arg->flags & O_NONBLOCK);
  1544. prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE);
  1545. while ((connected = vsock_dequeue_accept(listener)) == NULL &&
  1546. listener->sk_err == 0) {
  /* Sleep with the listener unlocked.  finish_wait() runs on every
   * wakeup before the lock is re-taken, so the early exits to 'out'
   * below leave nothing on the wait queue.
   */
  1547. release_sock(listener);
  1548. timeout = schedule_timeout(timeout);
  1549. finish_wait(sk_sleep(listener), &wait);
  1550. lock_sock(listener);
  1551. if (signal_pending(current)) {
  1552. err = sock_intr_errno(timeout);
  1553. goto out;
  1554. } else if (timeout == 0) {
  1555. err = -EAGAIN;
  1556. goto out;
  1557. }
  1558. prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE);
  1559. }
  1560. finish_wait(sk_sleep(listener), &wait);
  1561. if (listener->sk_err)
  1562. err = -listener->sk_err;
  1563. if (connected) {
  /* The child was taken off the accept queue: update the listener's
   * accept-queue accounting and lock the child (nested under the
   * listener lock) while it is grafted or rejected.
   */
  1564. sk_acceptq_removed(listener);
  1565. lock_sock_nested(connected, SINGLE_DEPTH_NESTING);
  1566. vconnected = vsock_sk(connected);
  1567. /* If the listener socket has received an error, then we should
  1568. * reject this socket and return. Note that we simply mark the
  1569. * socket rejected, drop our reference, and let the cleanup
  1570. * function handle the cleanup; the fact that we found it in
  1571. * the listener's accept queue guarantees that the cleanup
  1572. * function hasn't run yet.
  1573. */
  1574. if (err) {
  1575. vconnected->rejected = true;
  1576. } else {
  1577. newsock->state = SS_CONNECTED;
  1578. sock_graft(connected, newsock);
  1579. set_bit(SOCK_CUSTOM_SOCKOPT,
  1580. &connected->sk_socket->flags);
  1581. if (vsock_msgzerocopy_allow(vconnected->transport))
  1582. set_bit(SOCK_SUPPORT_ZC,
  1583. &connected->sk_socket->flags);
  1584. }
  1585. release_sock(connected);
  1586. sock_put(connected);
  1587. }
  1588. out:
  1589. release_sock(listener);
  1590. return err;
  1591. }
  1592. static int vsock_listen(struct socket *sock, int backlog)
  1593. {
  1594. int err;
  1595. struct sock *sk;
  1596. struct vsock_sock *vsk;
  1597. sk = sock->sk;
  1598. lock_sock(sk);
  1599. if (!sock_type_connectible(sk->sk_type)) {
  1600. err = -EOPNOTSUPP;
  1601. goto out;
  1602. }
  1603. if (sock->state != SS_UNCONNECTED) {
  1604. err = -EINVAL;
  1605. goto out;
  1606. }
  1607. vsk = vsock_sk(sk);
  1608. if (!vsock_addr_bound(&vsk->local_addr)) {
  1609. err = -EINVAL;
  1610. goto out;
  1611. }
  1612. sk->sk_max_ack_backlog = backlog;
  1613. sk->sk_state = TCP_LISTEN;
  1614. err = 0;
  1615. out:
  1616. release_sock(sk);
  1617. return err;
  1618. }
  1619. static void vsock_update_buffer_size(struct vsock_sock *vsk,
  1620. const struct vsock_transport *transport,
  1621. u64 val)
  1622. {
  1623. if (val > vsk->buffer_max_size)
  1624. val = vsk->buffer_max_size;
  1625. if (val < vsk->buffer_min_size)
  1626. val = vsk->buffer_min_size;
  1627. if (val != vsk->buffer_size &&
  1628. transport && transport->notify_buffer_size)
  1629. transport->notify_buffer_size(vsk, &val);
  1630. vsk->buffer_size = val;
  1631. }
  /* proto_ops setsockopt handler for the stream and seqpacket ops tables.
   *
   * Accepts two levels:
   *   - SOL_SOCKET: only SO_ZEROCOPY is handled here (value must be 0 or 1,
   *     and is refused with -EOPNOTSUPP when an already assigned transport
   *     does not allow MSG_ZEROCOPY); every other option is passed on to
   *     sock_setsockopt() after the socket lock has been released.
   *   - AF_VSOCK: buffer size / min / max and the connect timeout.
   * Any other level yields -ENOPROTOOPT.
   *
   * Returns 0 on success or a negative errno (-EINVAL for a short or
   * out-of-range value, -EFAULT on copy failure, -ERANGE for an unusable
   * timeout, -ENOPROTOOPT for an unknown option).
   */
  1632. static int vsock_connectible_setsockopt(struct socket *sock,
  1633. int level,
  1634. int optname,
  1635. sockptr_t optval,
  1636. unsigned int optlen)
  1637. {
  1638. int err;
  1639. struct sock *sk;
  1640. struct vsock_sock *vsk;
  1641. const struct vsock_transport *transport;
  1642. u64 val;
  1643. if (level != AF_VSOCK && level != SOL_SOCKET)
  1644. return -ENOPROTOOPT;
  /* COPY_IN(_v): copy sizeof(_v) bytes from optval into _v.  On a short
   * optlen or a failed copy it sets err and jumps to 'exit', so it may
   * only be used while the socket lock is held.
   */
  1645. #define COPY_IN(_v) \
  1646. do { \
  1647. if (optlen < sizeof(_v)) { \
  1648. err = -EINVAL; \
  1649. goto exit; \
  1650. } \
  1651. if (copy_from_sockptr(&_v, optval, sizeof(_v)) != 0) { \
  1652. err = -EFAULT; \
  1653. goto exit; \
  1654. } \
  1655. } while (0)
  1656. err = 0;
  1657. sk = sock->sk;
  1658. vsk = vsock_sk(sk);
  1659. lock_sock(sk);
  1660. transport = vsk->transport;
  1661. if (level == SOL_SOCKET) {
  1662. int zerocopy;
  /* Everything except SO_ZEROCOPY is delegated; the lock is dropped
   * first and this path returns directly instead of going via 'exit'.
   */
  1663. if (optname != SO_ZEROCOPY) {
  1664. release_sock(sk);
  1665. return sock_setsockopt(sock, level, optname, optval, optlen);
  1666. }
  1667. /* Use 'int' type here, because variable to
  1668. * set this option usually has this type.
  1669. */
  1670. COPY_IN(zerocopy);
  1671. if (zerocopy < 0 || zerocopy > 1) {
  1672. err = -EINVAL;
  1673. goto exit;
  1674. }
  /* With no transport assigned yet the flag is accepted as-is. */
  1675. if (transport && !vsock_msgzerocopy_allow(transport)) {
  1676. err = -EOPNOTSUPP;
  1677. goto exit;
  1678. }
  1679. sock_valbool_flag(sk, SOCK_ZEROCOPY, zerocopy);
  1680. goto exit;
  1681. }
  /* AF_VSOCK level options.  Changing the min/max bound re-runs
   * vsock_update_buffer_size() with the current buffer size.
   */
  1682. switch (optname) {
  1683. case SO_VM_SOCKETS_BUFFER_SIZE:
  1684. COPY_IN(val);
  1685. vsock_update_buffer_size(vsk, transport, val);
  1686. break;
  1687. case SO_VM_SOCKETS_BUFFER_MAX_SIZE:
  1688. COPY_IN(val);
  1689. vsk->buffer_max_size = val;
  1690. vsock_update_buffer_size(vsk, transport, vsk->buffer_size);
  1691. break;
  1692. case SO_VM_SOCKETS_BUFFER_MIN_SIZE:
  1693. COPY_IN(val);
  1694. vsk->buffer_min_size = val;
  1695. vsock_update_buffer_size(vsk, transport, vsk->buffer_size);
  1696. break;
  1697. case SO_VM_SOCKETS_CONNECT_TIMEOUT_NEW:
  1698. case SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD: {
  1699. struct __kernel_sock_timeval tv;
  1700. err = sock_copy_user_timeval(&tv, optval, optlen,
  1701. optname == SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD);
  1702. if (err)
  1703. break;
  /* The timeval is converted to jiffies, rounding the microsecond
   * part up; a result of zero selects the default timeout.
   */
  1704. if (tv.tv_sec >= 0 && tv.tv_usec < USEC_PER_SEC &&
  1705. tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)) {
  1706. vsk->connect_timeout = tv.tv_sec * HZ +
  1707. DIV_ROUND_UP((unsigned long)tv.tv_usec, (USEC_PER_SEC / HZ));
  1708. if (vsk->connect_timeout == 0)
  1709. vsk->connect_timeout =
  1710. VSOCK_DEFAULT_CONNECT_TIMEOUT;
  1711. } else {
  1712. err = -ERANGE;
  1713. }
  1714. break;
  1715. }
  1716. default:
  1717. err = -ENOPROTOOPT;
  1718. break;
  1719. }
  1720. #undef COPY_IN
  1721. exit:
  1722. release_sock(sk);
  1723. return err;
  1724. }
  1725. static int vsock_connectible_getsockopt(struct socket *sock,
  1726. int level, int optname,
  1727. char __user *optval,
  1728. int __user *optlen)
  1729. {
  1730. struct sock *sk = sock->sk;
  1731. struct vsock_sock *vsk = vsock_sk(sk);
  1732. union {
  1733. u64 val64;
  1734. struct old_timeval32 tm32;
  1735. struct __kernel_old_timeval tm;
  1736. struct __kernel_sock_timeval stm;
  1737. } v;
  1738. int lv = sizeof(v.val64);
  1739. int len;
  1740. if (level != AF_VSOCK)
  1741. return -ENOPROTOOPT;
  1742. if (get_user(len, optlen))
  1743. return -EFAULT;
  1744. memset(&v, 0, sizeof(v));
  1745. switch (optname) {
  1746. case SO_VM_SOCKETS_BUFFER_SIZE:
  1747. v.val64 = vsk->buffer_size;
  1748. break;
  1749. case SO_VM_SOCKETS_BUFFER_MAX_SIZE:
  1750. v.val64 = vsk->buffer_max_size;
  1751. break;
  1752. case SO_VM_SOCKETS_BUFFER_MIN_SIZE:
  1753. v.val64 = vsk->buffer_min_size;
  1754. break;
  1755. case SO_VM_SOCKETS_CONNECT_TIMEOUT_NEW:
  1756. case SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD:
  1757. lv = sock_get_timeout(vsk->connect_timeout, &v,
  1758. optname == SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD);
  1759. break;
  1760. default:
  1761. return -ENOPROTOOPT;
  1762. }
  1763. if (len < lv)
  1764. return -EINVAL;
  1765. if (len > lv)
  1766. len = lv;
  1767. if (copy_to_user(optval, &v, len))
  1768. return -EFAULT;
  1769. if (put_user(len, optlen))
  1770. return -EFAULT;
  1771. return 0;
  1772. }
  /* proto_ops sendmsg handler for the stream and seqpacket ops tables.
   *
   * With the socket lock held, validates the request (no MSG_OOB, no
   * destination address, neither direction shut down, transport assigned,
   * socket established and bound, zerocopy permitted) and then loops until
   * @len bytes have been handed to the transport.  Each iteration waits for
   * vsock_stream_has_space() to become non-zero, bounded by sock_sndtimeo(),
   * and calls the transport's enqueue routine bracketed by its notify_send_*
   * callbacks.
   *
   * Returns the number of bytes written when something was written and the
   * socket is SOCK_STREAM, or when the whole buffer was sent; otherwise a
   * negative errno.  For SOCK_STREAM the result is passed through
   * sk_stream_error() before returning.
   */
  1773. static int vsock_connectible_sendmsg(struct socket *sock, struct msghdr *msg,
  1774. size_t len)
  1775. {
  1776. struct sock *sk;
  1777. struct vsock_sock *vsk;
  1778. const struct vsock_transport *transport;
  1779. ssize_t total_written;
  1780. long timeout;
  1781. int err;
  1782. struct vsock_transport_send_notify_data send_data;
  1783. DEFINE_WAIT_FUNC(wait, woken_wake_function);
  1784. sk = sock->sk;
  1785. vsk = vsock_sk(sk);
  1786. total_written = 0;
  1787. err = 0;
  1788. if (msg->msg_flags & MSG_OOB)
  1789. return -EOPNOTSUPP;
  1790. lock_sock(sk);
  1791. transport = vsk->transport;
  1792. /* Callers should not provide a destination with connection oriented
  1793. * sockets.
  1794. */
  1795. if (msg->msg_namelen) {
  1796. err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
  1797. goto out;
  1798. }
  1799. /* Send data only if both sides are not shutdown in the direction. */
  1800. if (sk->sk_shutdown & SEND_SHUTDOWN ||
  1801. vsk->peer_shutdown & RCV_SHUTDOWN) {
  1802. err = -EPIPE;
  1803. goto out;
  1804. }
  1805. if (!transport || sk->sk_state != TCP_ESTABLISHED ||
  1806. !vsock_addr_bound(&vsk->local_addr)) {
  1807. err = -ENOTCONN;
  1808. goto out;
  1809. }
  1810. if (!vsock_addr_bound(&vsk->remote_addr)) {
  1811. err = -EDESTADDRREQ;
  1812. goto out;
  1813. }
  1814. if (msg->msg_flags & MSG_ZEROCOPY &&
  1815. !vsock_msgzerocopy_allow(transport)) {
  1816. err = -EOPNOTSUPP;
  1817. goto out;
  1818. }
  1819. /* Wait for room in the produce queue to enqueue our user's data. */
  1820. timeout = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
  1821. err = transport->notify_send_init(vsk, &send_data);
  1822. if (err < 0)
  1823. goto out;
  1824. while (total_written < len) {
  1825. ssize_t written;
  /* The wait entry is queued for the duration of the inner loop and
   * removed again on every exit path from it.
   */
  1826. add_wait_queue(sk_sleep(sk), &wait);
  1827. while (vsock_stream_has_space(vsk) == 0 &&
  1828. sk->sk_err == 0 &&
  1829. !(sk->sk_shutdown & SEND_SHUTDOWN) &&
  1830. !(vsk->peer_shutdown & RCV_SHUTDOWN)) {
  1831. /* Don't wait for non-blocking sockets. */
  1832. if (timeout == 0) {
  1833. err = -EAGAIN;
  1834. remove_wait_queue(sk_sleep(sk), &wait);
  1835. goto out_err;
  1836. }
  1837. err = transport->notify_send_pre_block(vsk, &send_data);
  1838. if (err < 0) {
  1839. remove_wait_queue(sk_sleep(sk), &wait);
  1840. goto out_err;
  1841. }
  /* Sleep with the socket lock dropped; the remaining timeout is
   * carried over to the next iteration.
   */
  1842. release_sock(sk);
  1843. timeout = wait_woken(&wait, TASK_INTERRUPTIBLE, timeout);
  1844. lock_sock(sk);
  1845. if (signal_pending(current)) {
  1846. err = sock_intr_errno(timeout);
  1847. remove_wait_queue(sk_sleep(sk), &wait);
  1848. goto out_err;
  1849. } else if (timeout == 0) {
  1850. err = -EAGAIN;
  1851. remove_wait_queue(sk_sleep(sk), &wait);
  1852. goto out_err;
  1853. }
  1854. }
  1855. remove_wait_queue(sk_sleep(sk), &wait);
  1856. /* These checks occur both as part of and after the loop
  1857. * conditional since we need to check before and after
  1858. * sleeping.
  1859. */
  1860. if (sk->sk_err) {
  1861. err = -sk->sk_err;
  1862. goto out_err;
  1863. } else if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
  1864. (vsk->peer_shutdown & RCV_SHUTDOWN)) {
  1865. err = -EPIPE;
  1866. goto out_err;
  1867. }
  1868. err = transport->notify_send_pre_enqueue(vsk, &send_data);
  1869. if (err < 0)
  1870. goto out_err;
  1871. /* Note that enqueue will only write as many bytes as are free
  1872. * in the produce queue, so we don't need to ensure len is
  1873. * smaller than the queue size. It is the caller's
  1874. * responsibility to check how many bytes we were able to send.
  1875. */
  1876. if (sk->sk_type == SOCK_SEQPACKET) {
  1877. written = transport->seqpacket_enqueue(vsk,
  1878. msg, len - total_written);
  1879. } else {
  1880. written = transport->stream_enqueue(vsk,
  1881. msg, len - total_written);
  1882. }
  1883. if (written < 0) {
  1884. err = written;
  1885. goto out_err;
  1886. }
  1887. total_written += written;
  1888. err = transport->notify_send_post_enqueue(
  1889. vsk, written, &send_data);
  1890. if (err < 0)
  1891. goto out_err;
  1892. }
  /* Reached both on error and after the loop completes: a byte count
   * replaces err only under the conditions listed below.
   */
  1893. out_err:
  1894. if (total_written > 0) {
  1895. /* Return number of written bytes only if:
  1896. * 1) SOCK_STREAM socket.
  1897. * 2) SOCK_SEQPACKET socket when whole buffer is sent.
  1898. */
  1899. if (sk->sk_type == SOCK_STREAM || total_written == len)
  1900. err = total_written;
  1901. }
  1902. out:
  1903. if (sk->sk_type == SOCK_STREAM)
  1904. err = sk_stream_error(sk, msg->msg_flags, err);
  1905. release_sock(sk);
  1906. return err;
  1907. }
  /* Wait until vsock_connectible_has_data() reports something for @sk.
   *
   * The socket lock is dropped around schedule_timeout() and re-taken
   * afterwards, so the caller is expected to hold it on entry.
   *
   * @wait:      wait-queue entry supplied by the caller.
   * @timeout:   maximum time to sleep; 0 means do not block.
   * @recv_data: when non-NULL, transport->notify_recv_pre_block() is called
   *             with it and @target before each sleep.
   *
   * Returns the (non-zero) value reported by vsock_connectible_has_data() if
   * it is positive, -ENOMEM if it is negative, 0 when the loop ended because
   * of sk_err or a receive-side/peer-send shutdown, -EAGAIN when the timeout
   * is or becomes zero, the sock_intr_errno() value on a signal, or a
   * negative value from notify_recv_pre_block().
   */
  1908. static int vsock_connectible_wait_data(struct sock *sk,
  1909. struct wait_queue_entry *wait,
  1910. long timeout,
  1911. struct vsock_transport_recv_notify_data *recv_data,
  1912. size_t target)
  1913. {
  1914. const struct vsock_transport *transport;
  1915. struct vsock_sock *vsk;
  1916. s64 data;
  1917. int err;
  1918. vsk = vsock_sk(sk);
  1919. err = 0;
  1920. transport = vsk->transport;
  1921. while (1) {
  /* Register on the wait queue before checking for data, then
   * re-check on every pass through the loop.
   */
  1922. prepare_to_wait(sk_sleep(sk), wait, TASK_INTERRUPTIBLE);
  1923. data = vsock_connectible_has_data(vsk);
  1924. if (data != 0)
  1925. break;
  1926. if (sk->sk_err != 0 ||
  1927. (sk->sk_shutdown & RCV_SHUTDOWN) ||
  1928. (vsk->peer_shutdown & SEND_SHUTDOWN)) {
  1929. break;
  1930. }
  1931. /* Don't wait for non-blocking sockets. */
  1932. if (timeout == 0) {
  1933. err = -EAGAIN;
  1934. break;
  1935. }
  1936. if (recv_data) {
  1937. err = transport->notify_recv_pre_block(vsk, target, recv_data);
  1938. if (err < 0)
  1939. break;
  1940. }
  1941. release_sock(sk);
  1942. timeout = schedule_timeout(timeout);
  1943. lock_sock(sk);
  1944. if (signal_pending(current)) {
  1945. err = sock_intr_errno(timeout);
  1946. break;
  1947. } else if (timeout == 0) {
  1948. err = -EAGAIN;
  1949. break;
  1950. }
  1951. }
  1952. finish_wait(sk_sleep(sk), wait);
  /* A non-zero err takes precedence over whatever 'data' holds. */
  1953. if (err)
  1954. return err;
  1955. /* Internal transport error when checking for available
  1956. * data. XXX This should be changed to a connection
  1957. * reset in a later change.
  1958. */
  1959. if (data < 0)
  1960. return -ENOMEM;
  1961. return data;
  1962. }
  /* Receive path for SOCK_STREAM sockets, called from
   * __vsock_connectible_recvmsg() with the socket locked.
   *
   * Computes the low-water target with sock_rcvlowat(), rejects targets that
   * are not below transport->stream_rcvhiwat() with -ENOMEM, and then loops:
   * wait for data, dequeue through the transport (bracketed by the
   * notify_recv_* callbacks) and stop once a single read satisfies the
   * remaining target or MSG_PEEK is set.
   *
   * Returns the number of bytes copied if any were copied and the function
   * reached its normal exit; otherwise -sk_err, 0 on a local receive
   * shutdown, or the last error/zero value obtained in the loop.
   */
  1963. static int __vsock_stream_recvmsg(struct sock *sk, struct msghdr *msg,
  1964. size_t len, int flags)
  1965. {
  1966. struct vsock_transport_recv_notify_data recv_data;
  1967. const struct vsock_transport *transport;
  1968. struct vsock_sock *vsk;
  1969. ssize_t copied;
  1970. size_t target;
  1971. long timeout;
  1972. int err;
  1973. DEFINE_WAIT(wait);
  1974. vsk = vsock_sk(sk);
  1975. transport = vsk->transport;
  1976. /* We must not copy less than target bytes into the user's buffer
  1977. * before returning successfully, so we wait for the consume queue to
  1978. * have that much data to consume before dequeueing. Note that this
  1979. * makes it impossible to handle cases where target is greater than the
  1980. * queue size.
  1981. */
  1982. target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
  1983. if (target >= transport->stream_rcvhiwat(vsk)) {
  1984. err = -ENOMEM;
  1985. goto out;
  1986. }
  1987. timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
  1988. copied = 0;
  1989. err = transport->notify_recv_init(vsk, target, &recv_data);
  1990. if (err < 0)
  1991. goto out;
  1992. while (1) {
  1993. ssize_t read;
  /* A result <= 0 means an error or that the wait ended with nothing
   * to read.
   */
  1994. err = vsock_connectible_wait_data(sk, &wait, timeout,
  1995. &recv_data, target);
  1996. if (err <= 0)
  1997. break;
  1998. err = transport->notify_recv_pre_dequeue(vsk, target,
  1999. &recv_data);
  2000. if (err < 0)
  2001. break;
  2002. read = transport->stream_dequeue(vsk, msg, len - copied, flags);
  2003. if (read < 0) {
  2004. err = read;
  2005. break;
  2006. }
  2007. copied += read;
  /* Unlike the other failures in this loop, an error here jumps
   * straight to 'out' and is returned even if bytes were copied.
   */
  2008. err = transport->notify_recv_post_dequeue(vsk, target, read,
  2009. !(flags & MSG_PEEK), &recv_data);
  2010. if (err < 0)
  2011. goto out;
  2012. if (read >= target || flags & MSG_PEEK)
  2013. break;
  2014. target -= read;
  2015. }
  /* Result priority: bytes copied, then socket error, then local
   * receive shutdown, then whatever err the loop left behind.
   */
  2016. if (sk->sk_err)
  2017. err = -sk->sk_err;
  2018. else if (sk->sk_shutdown & RCV_SHUTDOWN)
  2019. err = 0;
  2020. if (copied > 0)
  2021. err = copied;
  2022. out:
  2023. return err;
  2024. }
  2025. static int __vsock_seqpacket_recvmsg(struct sock *sk, struct msghdr *msg,
  2026. size_t len, int flags)
  2027. {
  2028. const struct vsock_transport *transport;
  2029. struct vsock_sock *vsk;
  2030. ssize_t msg_len;
  2031. long timeout;
  2032. int err = 0;
  2033. DEFINE_WAIT(wait);
  2034. vsk = vsock_sk(sk);
  2035. transport = vsk->transport;
  2036. timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
  2037. err = vsock_connectible_wait_data(sk, &wait, timeout, NULL, 0);
  2038. if (err <= 0)
  2039. goto out;
  2040. msg_len = transport->seqpacket_dequeue(vsk, msg, flags);
  2041. if (msg_len < 0) {
  2042. err = msg_len;
  2043. goto out;
  2044. }
  2045. if (sk->sk_err) {
  2046. err = -sk->sk_err;
  2047. } else if (sk->sk_shutdown & RCV_SHUTDOWN) {
  2048. err = 0;
  2049. } else {
  2050. /* User sets MSG_TRUNC, so return real length of
  2051. * packet.
  2052. */
  2053. if (flags & MSG_TRUNC)
  2054. err = msg_len;
  2055. else
  2056. err = len - msg_data_left(msg);
  2057. /* Always set MSG_TRUNC if real length of packet is
  2058. * bigger than user's buffer.
  2059. */
  2060. if (msg_len > len)
  2061. msg->msg_flags |= MSG_TRUNC;
  2062. }
  2063. out:
  2064. return err;
  2065. }
  2066. int
  2067. __vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
  2068. int flags)
  2069. {
  2070. struct sock *sk;
  2071. struct vsock_sock *vsk;
  2072. const struct vsock_transport *transport;
  2073. int err;
  2074. sk = sock->sk;
  2075. if (unlikely(flags & MSG_ERRQUEUE))
  2076. return sock_recv_errqueue(sk, msg, len, SOL_VSOCK, VSOCK_RECVERR);
  2077. vsk = vsock_sk(sk);
  2078. err = 0;
  2079. lock_sock(sk);
  2080. transport = vsk->transport;
  2081. if (!transport || sk->sk_state != TCP_ESTABLISHED) {
  2082. /* Recvmsg is supposed to return 0 if a peer performs an
  2083. * orderly shutdown. Differentiate between that case and when a
  2084. * peer has not connected or a local shutdown occurred with the
  2085. * SOCK_DONE flag.
  2086. */
  2087. if (sock_flag(sk, SOCK_DONE))
  2088. err = 0;
  2089. else
  2090. err = -ENOTCONN;
  2091. goto out;
  2092. }
  2093. if (flags & MSG_OOB) {
  2094. err = -EOPNOTSUPP;
  2095. goto out;
  2096. }
  2097. /* We don't check peer_shutdown flag here since peer may actually shut
  2098. * down, but there can be data in the queue that a local socket can
  2099. * receive.
  2100. */
  2101. if (sk->sk_shutdown & RCV_SHUTDOWN) {
  2102. err = 0;
  2103. goto out;
  2104. }
  2105. /* It is valid on Linux to pass in a zero-length receive buffer. This
  2106. * is not an error. We may as well bail out now.
  2107. */
  2108. if (!len) {
  2109. err = 0;
  2110. goto out;
  2111. }
  2112. if (sk->sk_type == SOCK_STREAM)
  2113. err = __vsock_stream_recvmsg(sk, msg, len, flags);
  2114. else
  2115. err = __vsock_seqpacket_recvmsg(sk, msg, len, flags);
  2116. out:
  2117. release_sock(sk);
  2118. return err;
  2119. }
  2120. int
  2121. vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
  2122. int flags)
  2123. {
  2124. #ifdef CONFIG_BPF_SYSCALL
  2125. struct sock *sk = sock->sk;
  2126. const struct proto *prot;
  2127. prot = READ_ONCE(sk->sk_prot);
  2128. if (prot != &vsock_proto)
  2129. return prot->recvmsg(sk, msg, len, flags, NULL);
  2130. #endif
  2131. return __vsock_connectible_recvmsg(sock, msg, len, flags);
  2132. }
  2133. EXPORT_SYMBOL_GPL(vsock_connectible_recvmsg);
  2134. static int vsock_set_rcvlowat(struct sock *sk, int val)
  2135. {
  2136. const struct vsock_transport *transport;
  2137. struct vsock_sock *vsk;
  2138. vsk = vsock_sk(sk);
  2139. if (val > vsk->buffer_size)
  2140. return -EINVAL;
  2141. transport = vsk->transport;
  2142. if (transport && transport->notify_set_rcvlowat) {
  2143. int err;
  2144. err = transport->notify_set_rcvlowat(vsk, val);
  2145. if (err)
  2146. return err;
  2147. }
  2148. WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
  2149. return 0;
  2150. }
  2151. static const struct proto_ops vsock_stream_ops = {
  2152. .family = PF_VSOCK,
  2153. .owner = THIS_MODULE,
  2154. .release = vsock_release,
  2155. .bind = vsock_bind,
  2156. .connect = vsock_connect,
  2157. .socketpair = sock_no_socketpair,
  2158. .accept = vsock_accept,
  2159. .getname = vsock_getname,
  2160. .poll = vsock_poll,
  2161. .ioctl = vsock_ioctl,
  2162. .listen = vsock_listen,
  2163. .shutdown = vsock_shutdown,
  2164. .setsockopt = vsock_connectible_setsockopt,
  2165. .getsockopt = vsock_connectible_getsockopt,
  2166. .sendmsg = vsock_connectible_sendmsg,
  2167. .recvmsg = vsock_connectible_recvmsg,
  2168. .mmap = sock_no_mmap,
  2169. .set_rcvlowat = vsock_set_rcvlowat,
  2170. .read_skb = vsock_read_skb,
  2171. };
  2172. static const struct proto_ops vsock_seqpacket_ops = {
  2173. .family = PF_VSOCK,
  2174. .owner = THIS_MODULE,
  2175. .release = vsock_release,
  2176. .bind = vsock_bind,
  2177. .connect = vsock_connect,
  2178. .socketpair = sock_no_socketpair,
  2179. .accept = vsock_accept,
  2180. .getname = vsock_getname,
  2181. .poll = vsock_poll,
  2182. .ioctl = vsock_ioctl,
  2183. .listen = vsock_listen,
  2184. .shutdown = vsock_shutdown,
  2185. .setsockopt = vsock_connectible_setsockopt,
  2186. .getsockopt = vsock_connectible_getsockopt,
  2187. .sendmsg = vsock_connectible_sendmsg,
  2188. .recvmsg = vsock_connectible_recvmsg,
  2189. .mmap = sock_no_mmap,
  2190. .read_skb = vsock_read_skb,
  2191. };
  2192. static int vsock_create(struct net *net, struct socket *sock,
  2193. int protocol, int kern)
  2194. {
  2195. struct vsock_sock *vsk;
  2196. struct sock *sk;
  2197. int ret;
  2198. if (!sock)
  2199. return -EINVAL;
  2200. if (protocol && protocol != PF_VSOCK)
  2201. return -EPROTONOSUPPORT;
  2202. switch (sock->type) {
  2203. case SOCK_DGRAM:
  2204. sock->ops = &vsock_dgram_ops;
  2205. break;
  2206. case SOCK_STREAM:
  2207. sock->ops = &vsock_stream_ops;
  2208. break;
  2209. case SOCK_SEQPACKET:
  2210. sock->ops = &vsock_seqpacket_ops;
  2211. break;
  2212. default:
  2213. return -ESOCKTNOSUPPORT;
  2214. }
  2215. sock->state = SS_UNCONNECTED;
  2216. sk = __vsock_create(net, sock, NULL, GFP_KERNEL, 0, kern);
  2217. if (!sk)
  2218. return -ENOMEM;
  2219. vsk = vsock_sk(sk);
  2220. if (sock->type == SOCK_DGRAM) {
  2221. ret = vsock_assign_transport(vsk, NULL);
  2222. if (ret < 0) {
  2223. sock->sk = NULL;
  2224. sock_put(sk);
  2225. return ret;
  2226. }
  2227. }
  2228. /* SOCK_DGRAM doesn't have 'setsockopt' callback set in its
  2229. * proto_ops, so there is no handler for custom logic.
  2230. */
  2231. if (sock_type_connectible(sock->type))
  2232. set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags);
  2233. vsock_insert_unbound(vsk);
  2234. return 0;
  2235. }
  2236. static const struct net_proto_family vsock_family_ops = {
  2237. .family = AF_VSOCK,
  2238. .create = vsock_create,
  2239. .owner = THIS_MODULE,
  2240. };
  2241. static long vsock_dev_do_ioctl(struct file *filp,
  2242. unsigned int cmd, void __user *ptr)
  2243. {
  2244. u32 __user *p = ptr;
  2245. int retval = 0;
  2246. u32 cid;
  2247. switch (cmd) {
  2248. case IOCTL_VM_SOCKETS_GET_LOCAL_CID:
  2249. /* To be compatible with the VMCI behavior, we prioritize the
  2250. * guest CID instead of well-know host CID (VMADDR_CID_HOST).
  2251. */
  2252. cid = vsock_registered_transport_cid(&transport_g2h);
  2253. if (cid == VMADDR_CID_ANY)
  2254. cid = vsock_registered_transport_cid(&transport_h2g);
  2255. if (cid == VMADDR_CID_ANY)
  2256. cid = vsock_registered_transport_cid(&transport_local);
  2257. if (put_user(cid, p) != 0)
  2258. retval = -EFAULT;
  2259. break;
  2260. default:
  2261. retval = -ENOIOCTLCMD;
  2262. }
  2263. return retval;
  2264. }
  2265. static long vsock_dev_ioctl(struct file *filp,
  2266. unsigned int cmd, unsigned long arg)
  2267. {
  2268. return vsock_dev_do_ioctl(filp, cmd, (void __user *)arg);
  2269. }
  #ifdef CONFIG_COMPAT
  /* compat_ioctl handler of the vsock misc device: converts @arg with
   * compat_ptr() and forwards the request to vsock_dev_do_ioctl().
   */
  static long vsock_dev_compat_ioctl(struct file *filp,
  				   unsigned int cmd, unsigned long arg)
  {
  	void __user *uptr = compat_ptr(arg);

  	return vsock_dev_do_ioctl(filp, cmd, uptr);
  }
  #endif
  2277. static const struct file_operations vsock_device_ops = {
  2278. .owner = THIS_MODULE,
  2279. .unlocked_ioctl = vsock_dev_ioctl,
  2280. #ifdef CONFIG_COMPAT
  2281. .compat_ioctl = vsock_dev_compat_ioctl,
  2282. #endif
  2283. .open = nonseekable_open,
  2284. };
  2285. static struct miscdevice vsock_device = {
  2286. .name = "vsock",
  2287. .fops = &vsock_device_ops,
  2288. };
  2289. static int __vsock_net_mode_string(const struct ctl_table *table, int write,
  2290. void *buffer, size_t *lenp, loff_t *ppos,
  2291. enum vsock_net_mode mode,
  2292. enum vsock_net_mode *new_mode)
  2293. {
  2294. char data[VSOCK_NET_MODE_STR_MAX] = {0};
  2295. struct ctl_table tmp;
  2296. int ret;
  2297. if (!table->data || !table->maxlen || !*lenp) {
  2298. *lenp = 0;
  2299. return 0;
  2300. }
  2301. tmp = *table;
  2302. tmp.data = data;
  2303. if (!write) {
  2304. const char *p;
  2305. switch (mode) {
  2306. case VSOCK_NET_MODE_GLOBAL:
  2307. p = VSOCK_NET_MODE_STR_GLOBAL;
  2308. break;
  2309. case VSOCK_NET_MODE_LOCAL:
  2310. p = VSOCK_NET_MODE_STR_LOCAL;
  2311. break;
  2312. default:
  2313. WARN_ONCE(true, "netns has invalid vsock mode");
  2314. *lenp = 0;
  2315. return 0;
  2316. }
  2317. strscpy(data, p, sizeof(data));
  2318. tmp.maxlen = strlen(p);
  2319. }
  2320. ret = proc_dostring(&tmp, write, buffer, lenp, ppos);
  2321. if (ret || !write)
  2322. return ret;
  2323. if (*lenp >= sizeof(data))
  2324. return -EINVAL;
  2325. if (!strncmp(data, VSOCK_NET_MODE_STR_GLOBAL, sizeof(data)))
  2326. *new_mode = VSOCK_NET_MODE_GLOBAL;
  2327. else if (!strncmp(data, VSOCK_NET_MODE_STR_LOCAL, sizeof(data)))
  2328. *new_mode = VSOCK_NET_MODE_LOCAL;
  2329. else
  2330. return -EINVAL;
  2331. return 0;
  2332. }
  2333. static int vsock_net_mode_string(const struct ctl_table *table, int write,
  2334. void *buffer, size_t *lenp, loff_t *ppos)
  2335. {
  2336. struct net *net;
  2337. if (write)
  2338. return -EPERM;
  2339. net = container_of(table->data, struct net, vsock.mode);
  2340. return __vsock_net_mode_string(table, write, buffer, lenp, ppos,
  2341. vsock_net_mode(net), NULL);
  2342. }
  2343. static int vsock_net_child_mode_string(const struct ctl_table *table, int write,
  2344. void *buffer, size_t *lenp, loff_t *ppos)
  2345. {
  2346. enum vsock_net_mode new_mode;
  2347. struct net *net;
  2348. int ret;
  2349. net = container_of(table->data, struct net, vsock.child_ns_mode);
  2350. ret = __vsock_net_mode_string(table, write, buffer, lenp, ppos,
  2351. vsock_net_child_mode(net), &new_mode);
  2352. if (ret)
  2353. return ret;
  2354. if (write) {
  2355. /* Prevent a "local" namespace from escalating to "global",
  2356. * which would give nested namespaces access to global CIDs.
  2357. */
  2358. if (vsock_net_mode(net) == VSOCK_NET_MODE_LOCAL &&
  2359. new_mode == VSOCK_NET_MODE_GLOBAL)
  2360. return -EPERM;
  2361. if (!vsock_net_set_child_mode(net, new_mode))
  2362. return -EBUSY;
  2363. }
  2364. return 0;
  2365. }
  2366. static struct ctl_table vsock_table[] = {
  2367. {
  2368. .procname = "ns_mode",
  2369. .data = &init_net.vsock.mode,
  2370. .maxlen = VSOCK_NET_MODE_STR_MAX,
  2371. .mode = 0444,
  2372. .proc_handler = vsock_net_mode_string
  2373. },
  2374. {
  2375. .procname = "child_ns_mode",
  2376. .data = &init_net.vsock.child_ns_mode,
  2377. .maxlen = VSOCK_NET_MODE_STR_MAX,
  2378. .mode = 0644,
  2379. .proc_handler = vsock_net_child_mode_string
  2380. },
  2381. };
  2382. static int __net_init vsock_sysctl_register(struct net *net)
  2383. {
  2384. struct ctl_table *table;
  2385. if (net_eq(net, &init_net)) {
  2386. table = vsock_table;
  2387. } else {
  2388. table = kmemdup(vsock_table, sizeof(vsock_table), GFP_KERNEL);
  2389. if (!table)
  2390. goto err_alloc;
  2391. table[0].data = &net->vsock.mode;
  2392. table[1].data = &net->vsock.child_ns_mode;
  2393. }
  2394. net->vsock.sysctl_hdr = register_net_sysctl_sz(net, "net/vsock", table,
  2395. ARRAY_SIZE(vsock_table));
  2396. if (!net->vsock.sysctl_hdr)
  2397. goto err_reg;
  2398. return 0;
  2399. err_reg:
  2400. if (!net_eq(net, &init_net))
  2401. kfree(table);
  2402. err_alloc:
  2403. return -ENOMEM;
  2404. }
  2405. static void vsock_sysctl_unregister(struct net *net)
  2406. {
  2407. const struct ctl_table *table;
  2408. table = net->vsock.sysctl_hdr->ctl_table_arg;
  2409. unregister_net_sysctl_table(net->vsock.sysctl_hdr);
  2410. if (!net_eq(net, &init_net))
  2411. kfree(table);
  2412. }
  2413. static void vsock_net_init(struct net *net)
  2414. {
  2415. if (net_eq(net, &init_net))
  2416. net->vsock.mode = VSOCK_NET_MODE_GLOBAL;
  2417. else
  2418. net->vsock.mode = vsock_net_child_mode(current->nsproxy->net_ns);
  2419. net->vsock.child_ns_mode = net->vsock.mode;
  2420. net->vsock.child_ns_mode_locked = 0;
  2421. }
  2422. static __net_init int vsock_sysctl_init_net(struct net *net)
  2423. {
  2424. vsock_net_init(net);
  2425. if (vsock_sysctl_register(net))
  2426. return -ENOMEM;
  2427. return 0;
  2428. }
  2429. static __net_exit void vsock_sysctl_exit_net(struct net *net)
  2430. {
  2431. vsock_sysctl_unregister(net);
  2432. }
  2433. static struct pernet_operations vsock_sysctl_ops = {
  2434. .init = vsock_sysctl_init_net,
  2435. .exit = vsock_sysctl_exit_net,
  2436. };
  2437. static int __init vsock_init(void)
  2438. {
  2439. int err = 0;
  2440. vsock_init_tables();
  2441. vsock_proto.owner = THIS_MODULE;
  2442. vsock_device.minor = MISC_DYNAMIC_MINOR;
  2443. err = misc_register(&vsock_device);
  2444. if (err) {
  2445. pr_err("Failed to register misc device\n");
  2446. goto err_reset_transport;
  2447. }
  2448. err = proto_register(&vsock_proto, 1); /* we want our slab */
  2449. if (err) {
  2450. pr_err("Cannot register vsock protocol\n");
  2451. goto err_deregister_misc;
  2452. }
  2453. err = sock_register(&vsock_family_ops);
  2454. if (err) {
  2455. pr_err("could not register af_vsock (%d) address family: %d\n",
  2456. AF_VSOCK, err);
  2457. goto err_unregister_proto;
  2458. }
  2459. if (register_pernet_subsys(&vsock_sysctl_ops)) {
  2460. err = -ENOMEM;
  2461. goto err_unregister_sock;
  2462. }
  2463. vsock_bpf_build_proto();
  2464. return 0;
  2465. err_unregister_sock:
  2466. sock_unregister(AF_VSOCK);
  2467. err_unregister_proto:
  2468. proto_unregister(&vsock_proto);
  2469. err_deregister_misc:
  2470. misc_deregister(&vsock_device);
  2471. err_reset_transport:
  2472. return err;
  2473. }
  2474. static void __exit vsock_exit(void)
  2475. {
  2476. misc_deregister(&vsock_device);
  2477. sock_unregister(AF_VSOCK);
  2478. proto_unregister(&vsock_proto);
  2479. unregister_pernet_subsys(&vsock_sysctl_ops);
  2480. }
  2481. const struct vsock_transport *vsock_core_get_transport(struct vsock_sock *vsk)
  2482. {
  2483. return vsk->transport;
  2484. }
  2485. EXPORT_SYMBOL_GPL(vsock_core_get_transport);
  2486. int vsock_core_register(const struct vsock_transport *t, int features)
  2487. {
  2488. const struct vsock_transport *t_h2g, *t_g2h, *t_dgram, *t_local;
  2489. int err = mutex_lock_interruptible(&vsock_register_mutex);
  2490. if (err)
  2491. return err;
  2492. t_h2g = transport_h2g;
  2493. t_g2h = transport_g2h;
  2494. t_dgram = transport_dgram;
  2495. t_local = transport_local;
  2496. if (features & VSOCK_TRANSPORT_F_H2G) {
  2497. if (t_h2g) {
  2498. err = -EBUSY;
  2499. goto err_busy;
  2500. }
  2501. t_h2g = t;
  2502. }
  2503. if (features & VSOCK_TRANSPORT_F_G2H) {
  2504. if (t_g2h) {
  2505. err = -EBUSY;
  2506. goto err_busy;
  2507. }
  2508. t_g2h = t;
  2509. }
  2510. if (features & VSOCK_TRANSPORT_F_DGRAM) {
  2511. if (t_dgram) {
  2512. err = -EBUSY;
  2513. goto err_busy;
  2514. }
  2515. t_dgram = t;
  2516. }
  2517. if (features & VSOCK_TRANSPORT_F_LOCAL) {
  2518. if (t_local) {
  2519. err = -EBUSY;
  2520. goto err_busy;
  2521. }
  2522. t_local = t;
  2523. }
  2524. transport_h2g = t_h2g;
  2525. transport_g2h = t_g2h;
  2526. transport_dgram = t_dgram;
  2527. transport_local = t_local;
  2528. err_busy:
  2529. mutex_unlock(&vsock_register_mutex);
  2530. return err;
  2531. }
  2532. EXPORT_SYMBOL_GPL(vsock_core_register);
  2533. void vsock_core_unregister(const struct vsock_transport *t)
  2534. {
  2535. mutex_lock(&vsock_register_mutex);
  2536. if (transport_h2g == t)
  2537. transport_h2g = NULL;
  2538. if (transport_g2h == t)
  2539. transport_g2h = NULL;
  2540. if (transport_dgram == t)
  2541. transport_dgram = NULL;
  2542. if (transport_local == t)
  2543. transport_local = NULL;
  2544. mutex_unlock(&vsock_register_mutex);
  2545. }
  2546. EXPORT_SYMBOL_GPL(vsock_core_unregister);
  2547. module_init(vsock_init);
  2548. module_exit(vsock_exit);
  2549. MODULE_AUTHOR("VMware, Inc.");
  2550. MODULE_DESCRIPTION("VMware Virtual Socket Family");
  2551. MODULE_VERSION("1.0.2.0-k");
  2552. MODULE_LICENSE("GPL v2");