vhost.c 80 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
732783279328032813282328332843285328632873288328932903291329232933294329532963297329832993300330133023303330433053306330733083309331033113312331333143315331633173318331933203321332233233324332533263327332833293330333133323333333433353336333733383339
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /* Copyright (C) 2009 Red Hat, Inc.
  3. * Copyright (C) 2006 Rusty Russell IBM Corporation
  4. *
  5. * Author: Michael S. Tsirkin <mst@redhat.com>
  6. *
  7. * Inspiration, some code, and most witty comments come from
  8. * Documentation/virtual/lguest/lguest.c, by Rusty Russell
  9. *
  10. * Generic code for virtio server in host kernel.
  11. */
  12. #include <linux/eventfd.h>
  13. #include <linux/vhost.h>
  14. #include <linux/uio.h>
  15. #include <linux/mm.h>
  16. #include <linux/miscdevice.h>
  17. #include <linux/mutex.h>
  18. #include <linux/poll.h>
  19. #include <linux/file.h>
  20. #include <linux/highmem.h>
  21. #include <linux/slab.h>
  22. #include <linux/vmalloc.h>
  23. #include <linux/kthread.h>
  24. #include <linux/cgroup.h>
  25. #include <linux/module.h>
  26. #include <linux/sort.h>
  27. #include <linux/sched/mm.h>
  28. #include <linux/sched/signal.h>
  29. #include <linux/sched/vhost_task.h>
  30. #include <linux/interval_tree_generic.h>
  31. #include <linux/nospec.h>
  32. #include <linux/kcov.h>
  33. #include "vhost.h"
  34. static ushort max_mem_regions = 64;
  35. module_param(max_mem_regions, ushort, 0444);
  36. MODULE_PARM_DESC(max_mem_regions,
  37. "Maximum number of memory regions in memory map. (default: 64)");
  38. static int max_iotlb_entries = 2048;
  39. module_param(max_iotlb_entries, int, 0444);
  40. MODULE_PARM_DESC(max_iotlb_entries,
  41. "Maximum number of iotlb entries. (default: 2048)");
  42. static bool fork_from_owner_default = VHOST_FORK_OWNER_TASK;
  43. #ifdef CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL
  44. module_param(fork_from_owner_default, bool, 0444);
  45. MODULE_PARM_DESC(fork_from_owner_default,
  46. "Set task mode as the default(default: Y)");
  47. #endif
  48. enum {
  49. VHOST_MEMORY_F_LOG = 0x1,
  50. };
  51. #define vhost_used_event(vq) ((__virtio16 __user *)&vq->avail->ring[vq->num])
  52. #define vhost_avail_event(vq) ((__virtio16 __user *)&vq->used->ring[vq->num])
  53. #ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY
  54. static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)
  55. {
  56. vq->user_be = !virtio_legacy_is_little_endian();
  57. }
  58. static void vhost_enable_cross_endian_big(struct vhost_virtqueue *vq)
  59. {
  60. vq->user_be = true;
  61. }
  62. static void vhost_enable_cross_endian_little(struct vhost_virtqueue *vq)
  63. {
  64. vq->user_be = false;
  65. }
  66. static long vhost_set_vring_endian(struct vhost_virtqueue *vq, int __user *argp)
  67. {
  68. struct vhost_vring_state s;
  69. if (vq->private_data)
  70. return -EBUSY;
  71. if (copy_from_user(&s, argp, sizeof(s)))
  72. return -EFAULT;
  73. if (s.num != VHOST_VRING_LITTLE_ENDIAN &&
  74. s.num != VHOST_VRING_BIG_ENDIAN)
  75. return -EINVAL;
  76. if (s.num == VHOST_VRING_BIG_ENDIAN)
  77. vhost_enable_cross_endian_big(vq);
  78. else
  79. vhost_enable_cross_endian_little(vq);
  80. return 0;
  81. }
  82. static long vhost_get_vring_endian(struct vhost_virtqueue *vq, u32 idx,
  83. int __user *argp)
  84. {
  85. struct vhost_vring_state s = {
  86. .index = idx,
  87. .num = vq->user_be
  88. };
  89. if (copy_to_user(argp, &s, sizeof(s)))
  90. return -EFAULT;
  91. return 0;
  92. }
  93. static void vhost_init_is_le(struct vhost_virtqueue *vq)
  94. {
  95. /* Note for legacy virtio: user_be is initialized at reset time
  96. * according to the host endianness. If userspace does not set an
  97. * explicit endianness, the default behavior is native endian, as
  98. * expected by legacy virtio.
  99. */
  100. vq->is_le = vhost_has_feature(vq, VIRTIO_F_VERSION_1) || !vq->user_be;
  101. }
  102. #else
  103. static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)
  104. {
  105. }
  106. static long vhost_set_vring_endian(struct vhost_virtqueue *vq, int __user *argp)
  107. {
  108. return -ENOIOCTLCMD;
  109. }
  110. static long vhost_get_vring_endian(struct vhost_virtqueue *vq, u32 idx,
  111. int __user *argp)
  112. {
  113. return -ENOIOCTLCMD;
  114. }
  115. static void vhost_init_is_le(struct vhost_virtqueue *vq)
  116. {
  117. vq->is_le = vhost_has_feature(vq, VIRTIO_F_VERSION_1)
  118. || virtio_legacy_is_little_endian();
  119. }
  120. #endif /* CONFIG_VHOST_CROSS_ENDIAN_LEGACY */
  121. static void vhost_reset_is_le(struct vhost_virtqueue *vq)
  122. {
  123. vhost_init_is_le(vq);
  124. }
  125. struct vhost_flush_struct {
  126. struct vhost_work work;
  127. struct completion wait_event;
  128. };
  129. static void vhost_flush_work(struct vhost_work *work)
  130. {
  131. struct vhost_flush_struct *s;
  132. s = container_of(work, struct vhost_flush_struct, work);
  133. complete(&s->wait_event);
  134. }
  135. static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
  136. poll_table *pt)
  137. {
  138. struct vhost_poll *poll;
  139. poll = container_of(pt, struct vhost_poll, table);
  140. poll->wqh = wqh;
  141. add_wait_queue(wqh, &poll->wait);
  142. }
  143. static int vhost_poll_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync,
  144. void *key)
  145. {
  146. struct vhost_poll *poll = container_of(wait, struct vhost_poll, wait);
  147. struct vhost_work *work = &poll->work;
  148. if (!(key_to_poll(key) & poll->mask))
  149. return 0;
  150. if (!poll->dev->use_worker)
  151. work->fn(work);
  152. else
  153. vhost_poll_queue(poll);
  154. return 0;
  155. }
  156. void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn)
  157. {
  158. clear_bit(VHOST_WORK_QUEUED, &work->flags);
  159. work->fn = fn;
  160. }
  161. EXPORT_SYMBOL_GPL(vhost_work_init);
  162. /* Init poll structure */
  163. void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
  164. __poll_t mask, struct vhost_dev *dev,
  165. struct vhost_virtqueue *vq)
  166. {
  167. init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup);
  168. init_poll_funcptr(&poll->table, vhost_poll_func);
  169. poll->mask = mask;
  170. poll->dev = dev;
  171. poll->wqh = NULL;
  172. poll->vq = vq;
  173. vhost_work_init(&poll->work, fn);
  174. }
  175. EXPORT_SYMBOL_GPL(vhost_poll_init);
  176. /* Start polling a file. We add ourselves to file's wait queue. The caller must
  177. * keep a reference to a file until after vhost_poll_stop is called. */
  178. int vhost_poll_start(struct vhost_poll *poll, struct file *file)
  179. {
  180. __poll_t mask;
  181. if (poll->wqh)
  182. return 0;
  183. mask = vfs_poll(file, &poll->table);
  184. if (mask)
  185. vhost_poll_wakeup(&poll->wait, 0, 0, poll_to_key(mask));
  186. if (mask & EPOLLERR) {
  187. vhost_poll_stop(poll);
  188. return -EINVAL;
  189. }
  190. return 0;
  191. }
  192. EXPORT_SYMBOL_GPL(vhost_poll_start);
  193. /* Stop polling a file. After this function returns, it becomes safe to drop the
  194. * file reference. You must also flush afterwards. */
  195. void vhost_poll_stop(struct vhost_poll *poll)
  196. {
  197. if (poll->wqh) {
  198. remove_wait_queue(poll->wqh, &poll->wait);
  199. poll->wqh = NULL;
  200. }
  201. }
  202. EXPORT_SYMBOL_GPL(vhost_poll_stop);
  203. static void vhost_worker_queue(struct vhost_worker *worker,
  204. struct vhost_work *work)
  205. {
  206. if (!test_and_set_bit(VHOST_WORK_QUEUED, &work->flags)) {
  207. /* We can only add the work to the list after we're
  208. * sure it was not in the list.
  209. * test_and_set_bit() implies a memory barrier.
  210. */
  211. llist_add(&work->node, &worker->work_list);
  212. worker->ops->wakeup(worker);
  213. }
  214. }
  215. bool vhost_vq_work_queue(struct vhost_virtqueue *vq, struct vhost_work *work)
  216. {
  217. struct vhost_worker *worker;
  218. bool queued = false;
  219. rcu_read_lock();
  220. worker = rcu_dereference(vq->worker);
  221. if (worker) {
  222. queued = true;
  223. vhost_worker_queue(worker, work);
  224. }
  225. rcu_read_unlock();
  226. return queued;
  227. }
  228. EXPORT_SYMBOL_GPL(vhost_vq_work_queue);
  229. /**
  230. * __vhost_worker_flush - flush a worker
  231. * @worker: worker to flush
  232. *
  233. * The worker's flush_mutex must be held.
  234. */
  235. static void __vhost_worker_flush(struct vhost_worker *worker)
  236. {
  237. struct vhost_flush_struct flush;
  238. if (!worker->attachment_cnt || worker->killed)
  239. return;
  240. init_completion(&flush.wait_event);
  241. vhost_work_init(&flush.work, vhost_flush_work);
  242. vhost_worker_queue(worker, &flush.work);
  243. /*
  244. * Drop mutex in case our worker is killed and it needs to take the
  245. * mutex to force cleanup.
  246. */
  247. mutex_unlock(&worker->mutex);
  248. wait_for_completion(&flush.wait_event);
  249. mutex_lock(&worker->mutex);
  250. }
  251. static void vhost_worker_flush(struct vhost_worker *worker)
  252. {
  253. mutex_lock(&worker->mutex);
  254. __vhost_worker_flush(worker);
  255. mutex_unlock(&worker->mutex);
  256. }
  257. void vhost_dev_flush(struct vhost_dev *dev)
  258. {
  259. struct vhost_worker *worker;
  260. unsigned long i;
  261. xa_for_each(&dev->worker_xa, i, worker)
  262. vhost_worker_flush(worker);
  263. }
  264. EXPORT_SYMBOL_GPL(vhost_dev_flush);
  265. /* A lockless hint for busy polling code to exit the loop */
  266. bool vhost_vq_has_work(struct vhost_virtqueue *vq)
  267. {
  268. struct vhost_worker *worker;
  269. bool has_work = false;
  270. rcu_read_lock();
  271. worker = rcu_dereference(vq->worker);
  272. if (worker && !llist_empty(&worker->work_list))
  273. has_work = true;
  274. rcu_read_unlock();
  275. return has_work;
  276. }
  277. EXPORT_SYMBOL_GPL(vhost_vq_has_work);
  278. void vhost_poll_queue(struct vhost_poll *poll)
  279. {
  280. vhost_vq_work_queue(poll->vq, &poll->work);
  281. }
  282. EXPORT_SYMBOL_GPL(vhost_poll_queue);
  283. static void __vhost_vq_meta_reset(struct vhost_virtqueue *vq)
  284. {
  285. int j;
  286. for (j = 0; j < VHOST_NUM_ADDRS; j++)
  287. vq->meta_iotlb[j] = NULL;
  288. }
  289. static void vhost_vq_meta_reset(struct vhost_dev *d)
  290. {
  291. int i;
  292. for (i = 0; i < d->nvqs; ++i)
  293. __vhost_vq_meta_reset(d->vqs[i]);
  294. }
  295. static void vhost_vring_call_reset(struct vhost_vring_call *call_ctx)
  296. {
  297. call_ctx->ctx = NULL;
  298. memset(&call_ctx->producer, 0x0, sizeof(struct irq_bypass_producer));
  299. }
  300. bool vhost_vq_is_setup(struct vhost_virtqueue *vq)
  301. {
  302. return vq->avail && vq->desc && vq->used && vhost_vq_access_ok(vq);
  303. }
  304. EXPORT_SYMBOL_GPL(vhost_vq_is_setup);
  305. static void vhost_vq_reset(struct vhost_dev *dev,
  306. struct vhost_virtqueue *vq)
  307. {
  308. vq->num = 1;
  309. vq->desc = NULL;
  310. vq->avail = NULL;
  311. vq->used = NULL;
  312. vq->last_avail_idx = 0;
  313. vq->next_avail_head = 0;
  314. vq->avail_idx = 0;
  315. vq->last_used_idx = 0;
  316. vq->signalled_used = 0;
  317. vq->signalled_used_valid = false;
  318. vq->used_flags = 0;
  319. vq->log_used = false;
  320. vq->log_addr = -1ull;
  321. vq->private_data = NULL;
  322. virtio_features_zero(vq->acked_features_array);
  323. vq->acked_backend_features = 0;
  324. vq->log_base = NULL;
  325. vq->error_ctx = NULL;
  326. vq->kick = NULL;
  327. vq->log_ctx = NULL;
  328. vhost_disable_cross_endian(vq);
  329. vhost_reset_is_le(vq);
  330. vq->busyloop_timeout = 0;
  331. vq->umem = NULL;
  332. vq->iotlb = NULL;
  333. rcu_assign_pointer(vq->worker, NULL);
  334. vhost_vring_call_reset(&vq->call_ctx);
  335. __vhost_vq_meta_reset(vq);
  336. }
  337. static int vhost_run_work_kthread_list(void *data)
  338. {
  339. struct vhost_worker *worker = data;
  340. struct vhost_work *work, *work_next;
  341. struct vhost_dev *dev = worker->dev;
  342. struct llist_node *node;
  343. kthread_use_mm(dev->mm);
  344. for (;;) {
  345. /* mb paired w/ kthread_stop */
  346. set_current_state(TASK_INTERRUPTIBLE);
  347. if (kthread_should_stop()) {
  348. __set_current_state(TASK_RUNNING);
  349. break;
  350. }
  351. node = llist_del_all(&worker->work_list);
  352. if (!node)
  353. schedule();
  354. node = llist_reverse_order(node);
  355. /* make sure flag is seen after deletion */
  356. smp_wmb();
  357. llist_for_each_entry_safe(work, work_next, node, node) {
  358. clear_bit(VHOST_WORK_QUEUED, &work->flags);
  359. __set_current_state(TASK_RUNNING);
  360. kcov_remote_start_common(worker->kcov_handle);
  361. work->fn(work);
  362. kcov_remote_stop();
  363. cond_resched();
  364. }
  365. }
  366. kthread_unuse_mm(dev->mm);
  367. return 0;
  368. }
  369. static bool vhost_run_work_list(void *data)
  370. {
  371. struct vhost_worker *worker = data;
  372. struct vhost_work *work, *work_next;
  373. struct llist_node *node;
  374. node = llist_del_all(&worker->work_list);
  375. if (node) {
  376. __set_current_state(TASK_RUNNING);
  377. node = llist_reverse_order(node);
  378. /* make sure flag is seen after deletion */
  379. smp_wmb();
  380. llist_for_each_entry_safe(work, work_next, node, node) {
  381. clear_bit(VHOST_WORK_QUEUED, &work->flags);
  382. kcov_remote_start_common(worker->kcov_handle);
  383. work->fn(work);
  384. kcov_remote_stop();
  385. cond_resched();
  386. }
  387. }
  388. return !!node;
  389. }
  390. static void vhost_worker_killed(void *data)
  391. {
  392. struct vhost_worker *worker = data;
  393. struct vhost_dev *dev = worker->dev;
  394. struct vhost_virtqueue *vq;
  395. int i, attach_cnt = 0;
  396. mutex_lock(&worker->mutex);
  397. worker->killed = true;
  398. for (i = 0; i < dev->nvqs; i++) {
  399. vq = dev->vqs[i];
  400. mutex_lock(&vq->mutex);
  401. if (worker ==
  402. rcu_dereference_check(vq->worker,
  403. lockdep_is_held(&vq->mutex))) {
  404. rcu_assign_pointer(vq->worker, NULL);
  405. attach_cnt++;
  406. }
  407. mutex_unlock(&vq->mutex);
  408. }
  409. worker->attachment_cnt -= attach_cnt;
  410. if (attach_cnt)
  411. synchronize_rcu();
  412. /*
  413. * Finish vhost_worker_flush calls and any other works that snuck in
  414. * before the synchronize_rcu.
  415. */
  416. vhost_run_work_list(worker);
  417. mutex_unlock(&worker->mutex);
  418. }
  419. static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
  420. {
  421. kfree(vq->indirect);
  422. vq->indirect = NULL;
  423. kfree(vq->log);
  424. vq->log = NULL;
  425. kfree(vq->heads);
  426. vq->heads = NULL;
  427. kfree(vq->nheads);
  428. vq->nheads = NULL;
  429. }
  430. /* Helper to allocate iovec buffers for all vqs. */
  431. static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
  432. {
  433. struct vhost_virtqueue *vq;
  434. int i;
  435. for (i = 0; i < dev->nvqs; ++i) {
  436. vq = dev->vqs[i];
  437. vq->indirect = kmalloc_objs(*vq->indirect, UIO_MAXIOV);
  438. vq->log = kmalloc_objs(*vq->log, dev->iov_limit);
  439. vq->heads = kmalloc_objs(*vq->heads, dev->iov_limit);
  440. vq->nheads = kmalloc_array(dev->iov_limit, sizeof(*vq->nheads),
  441. GFP_KERNEL);
  442. if (!vq->indirect || !vq->log || !vq->heads || !vq->nheads)
  443. goto err_nomem;
  444. }
  445. return 0;
  446. err_nomem:
  447. for (; i >= 0; --i)
  448. vhost_vq_free_iovecs(dev->vqs[i]);
  449. return -ENOMEM;
  450. }
  451. static void vhost_dev_free_iovecs(struct vhost_dev *dev)
  452. {
  453. int i;
  454. for (i = 0; i < dev->nvqs; ++i)
  455. vhost_vq_free_iovecs(dev->vqs[i]);
  456. }
  457. bool vhost_exceeds_weight(struct vhost_virtqueue *vq,
  458. int pkts, int total_len)
  459. {
  460. struct vhost_dev *dev = vq->dev;
  461. if ((dev->byte_weight && total_len >= dev->byte_weight) ||
  462. pkts >= dev->weight) {
  463. vhost_poll_queue(&vq->poll);
  464. return true;
  465. }
  466. return false;
  467. }
  468. EXPORT_SYMBOL_GPL(vhost_exceeds_weight);
  469. static size_t vhost_get_avail_size(struct vhost_virtqueue *vq,
  470. unsigned int num)
  471. {
  472. size_t event __maybe_unused =
  473. vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
  474. return size_add(struct_size(vq->avail, ring, num), event);
  475. }
  476. static size_t vhost_get_used_size(struct vhost_virtqueue *vq,
  477. unsigned int num)
  478. {
  479. size_t event __maybe_unused =
  480. vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
  481. return size_add(struct_size(vq->used, ring, num), event);
  482. }
  483. static size_t vhost_get_desc_size(struct vhost_virtqueue *vq,
  484. unsigned int num)
  485. {
  486. return sizeof(*vq->desc) * num;
  487. }
  488. void vhost_dev_init(struct vhost_dev *dev,
  489. struct vhost_virtqueue **vqs, int nvqs,
  490. int iov_limit, int weight, int byte_weight,
  491. bool use_worker,
  492. int (*msg_handler)(struct vhost_dev *dev, u32 asid,
  493. struct vhost_iotlb_msg *msg))
  494. {
  495. struct vhost_virtqueue *vq;
  496. int i;
  497. dev->vqs = vqs;
  498. dev->nvqs = nvqs;
  499. mutex_init(&dev->mutex);
  500. dev->log_ctx = NULL;
  501. dev->umem = NULL;
  502. dev->iotlb = NULL;
  503. dev->mm = NULL;
  504. dev->iov_limit = iov_limit;
  505. dev->weight = weight;
  506. dev->byte_weight = byte_weight;
  507. dev->use_worker = use_worker;
  508. dev->msg_handler = msg_handler;
  509. dev->fork_owner = fork_from_owner_default;
  510. init_waitqueue_head(&dev->wait);
  511. INIT_LIST_HEAD(&dev->read_list);
  512. INIT_LIST_HEAD(&dev->pending_list);
  513. spin_lock_init(&dev->iotlb_lock);
  514. xa_init_flags(&dev->worker_xa, XA_FLAGS_ALLOC);
  515. for (i = 0; i < dev->nvqs; ++i) {
  516. vq = dev->vqs[i];
  517. vq->log = NULL;
  518. vq->indirect = NULL;
  519. vq->heads = NULL;
  520. vq->nheads = NULL;
  521. vq->dev = dev;
  522. mutex_init(&vq->mutex);
  523. vhost_vq_reset(dev, vq);
  524. if (vq->handle_kick)
  525. vhost_poll_init(&vq->poll, vq->handle_kick,
  526. EPOLLIN, dev, vq);
  527. }
  528. }
  529. EXPORT_SYMBOL_GPL(vhost_dev_init);
  530. /* Caller should have device mutex */
  531. long vhost_dev_check_owner(struct vhost_dev *dev)
  532. {
  533. /* Are you the owner? If not, I don't think you mean to do that */
  534. return dev->mm == current->mm ? 0 : -EPERM;
  535. }
  536. EXPORT_SYMBOL_GPL(vhost_dev_check_owner);
  537. struct vhost_attach_cgroups_struct {
  538. struct vhost_work work;
  539. struct task_struct *owner;
  540. int ret;
  541. };
  542. static void vhost_attach_cgroups_work(struct vhost_work *work)
  543. {
  544. struct vhost_attach_cgroups_struct *s;
  545. s = container_of(work, struct vhost_attach_cgroups_struct, work);
  546. s->ret = cgroup_attach_task_all(s->owner, current);
  547. }
  548. static int vhost_attach_task_to_cgroups(struct vhost_worker *worker)
  549. {
  550. struct vhost_attach_cgroups_struct attach;
  551. int saved_cnt;
  552. attach.owner = current;
  553. vhost_work_init(&attach.work, vhost_attach_cgroups_work);
  554. vhost_worker_queue(worker, &attach.work);
  555. mutex_lock(&worker->mutex);
  556. /*
  557. * Bypass attachment_cnt check in __vhost_worker_flush:
  558. * Temporarily change it to INT_MAX to bypass the check
  559. */
  560. saved_cnt = worker->attachment_cnt;
  561. worker->attachment_cnt = INT_MAX;
  562. __vhost_worker_flush(worker);
  563. worker->attachment_cnt = saved_cnt;
  564. mutex_unlock(&worker->mutex);
  565. return attach.ret;
  566. }
  567. /* Caller should have device mutex */
  568. bool vhost_dev_has_owner(struct vhost_dev *dev)
  569. {
  570. return dev->mm;
  571. }
  572. EXPORT_SYMBOL_GPL(vhost_dev_has_owner);
  573. static void vhost_attach_mm(struct vhost_dev *dev)
  574. {
  575. /* No owner, become one */
  576. if (dev->use_worker) {
  577. dev->mm = get_task_mm(current);
  578. } else {
  579. /* vDPA device does not use worker thread, so there's
  580. * no need to hold the address space for mm. This helps
  581. * to avoid deadlock in the case of mmap() which may
  582. * hold the refcnt of the file and depends on release
  583. * method to remove vma.
  584. */
  585. dev->mm = current->mm;
  586. mmgrab(dev->mm);
  587. }
  588. }
  589. static void vhost_detach_mm(struct vhost_dev *dev)
  590. {
  591. if (!dev->mm)
  592. return;
  593. if (dev->use_worker)
  594. mmput(dev->mm);
  595. else
  596. mmdrop(dev->mm);
  597. dev->mm = NULL;
  598. }
  599. static void vhost_worker_destroy(struct vhost_dev *dev,
  600. struct vhost_worker *worker)
  601. {
  602. if (!worker)
  603. return;
  604. WARN_ON(!llist_empty(&worker->work_list));
  605. xa_erase(&dev->worker_xa, worker->id);
  606. worker->ops->stop(worker);
  607. kfree(worker);
  608. }
  609. static void vhost_workers_free(struct vhost_dev *dev)
  610. {
  611. struct vhost_worker *worker;
  612. unsigned long i;
  613. if (!dev->use_worker)
  614. return;
  615. for (i = 0; i < dev->nvqs; i++)
  616. rcu_assign_pointer(dev->vqs[i]->worker, NULL);
  617. /*
  618. * Free the default worker we created and cleanup workers userspace
  619. * created but couldn't clean up (it forgot or crashed).
  620. */
  621. xa_for_each(&dev->worker_xa, i, worker)
  622. vhost_worker_destroy(dev, worker);
  623. xa_destroy(&dev->worker_xa);
  624. }
  625. static void vhost_task_wakeup(struct vhost_worker *worker)
  626. {
  627. return vhost_task_wake(worker->vtsk);
  628. }
  629. static void vhost_kthread_wakeup(struct vhost_worker *worker)
  630. {
  631. wake_up_process(worker->kthread_task);
  632. }
  633. static void vhost_task_do_stop(struct vhost_worker *worker)
  634. {
  635. return vhost_task_stop(worker->vtsk);
  636. }
  637. static void vhost_kthread_do_stop(struct vhost_worker *worker)
  638. {
  639. kthread_stop(worker->kthread_task);
  640. }
  641. static int vhost_task_worker_create(struct vhost_worker *worker,
  642. struct vhost_dev *dev, const char *name)
  643. {
  644. struct vhost_task *vtsk;
  645. u32 id;
  646. int ret;
  647. vtsk = vhost_task_create(vhost_run_work_list, vhost_worker_killed,
  648. worker, name);
  649. if (IS_ERR(vtsk))
  650. return PTR_ERR(vtsk);
  651. worker->vtsk = vtsk;
  652. vhost_task_start(vtsk);
  653. ret = xa_alloc(&dev->worker_xa, &id, worker, xa_limit_32b, GFP_KERNEL);
  654. if (ret < 0) {
  655. vhost_task_do_stop(worker);
  656. return ret;
  657. }
  658. worker->id = id;
  659. return 0;
  660. }
  661. static int vhost_kthread_worker_create(struct vhost_worker *worker,
  662. struct vhost_dev *dev, const char *name)
  663. {
  664. struct task_struct *task;
  665. u32 id;
  666. int ret;
  667. task = kthread_create(vhost_run_work_kthread_list, worker, "%s", name);
  668. if (IS_ERR(task))
  669. return PTR_ERR(task);
  670. worker->kthread_task = task;
  671. wake_up_process(task);
  672. ret = xa_alloc(&dev->worker_xa, &id, worker, xa_limit_32b, GFP_KERNEL);
  673. if (ret < 0)
  674. goto stop_worker;
  675. ret = vhost_attach_task_to_cgroups(worker);
  676. if (ret)
  677. goto free_id;
  678. worker->id = id;
  679. return 0;
  680. free_id:
  681. xa_erase(&dev->worker_xa, id);
  682. stop_worker:
  683. vhost_kthread_do_stop(worker);
  684. return ret;
  685. }
  686. static const struct vhost_worker_ops kthread_ops = {
  687. .create = vhost_kthread_worker_create,
  688. .stop = vhost_kthread_do_stop,
  689. .wakeup = vhost_kthread_wakeup,
  690. };
  691. static const struct vhost_worker_ops vhost_task_ops = {
  692. .create = vhost_task_worker_create,
  693. .stop = vhost_task_do_stop,
  694. .wakeup = vhost_task_wakeup,
  695. };
  696. static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev)
  697. {
  698. struct vhost_worker *worker;
  699. char name[TASK_COMM_LEN];
  700. int ret;
  701. const struct vhost_worker_ops *ops = dev->fork_owner ? &vhost_task_ops :
  702. &kthread_ops;
  703. worker = kzalloc_obj(*worker, GFP_KERNEL_ACCOUNT);
  704. if (!worker)
  705. return NULL;
  706. worker->dev = dev;
  707. worker->ops = ops;
  708. snprintf(name, sizeof(name), "vhost-%d", current->pid);
  709. mutex_init(&worker->mutex);
  710. init_llist_head(&worker->work_list);
  711. worker->kcov_handle = kcov_common_handle();
  712. ret = ops->create(worker, dev, name);
  713. if (ret < 0)
  714. goto free_worker;
  715. return worker;
  716. free_worker:
  717. kfree(worker);
  718. return NULL;
  719. }
  /*
   * Make @worker the worker of @vq and release the previous one.
   *
   * Does nothing if @worker is marked killed. Otherwise the vq's worker
   * pointer is switched under worker->mutex and vq->mutex and @worker's
   * attachment_cnt is bumped. If there was a previous worker that is not
   * marked killed, its attachment_cnt is dropped, after an RCU grace period
   * and a flush unless the vq has neither a backend nor a kick file.
   *
   * Caller must have device mutex
   */
  static void __vhost_vq_attach_worker(struct vhost_virtqueue *vq,
  				     struct vhost_worker *worker)
  {
  	struct vhost_worker *old_worker;

  	mutex_lock(&worker->mutex);
  	/* Never attach a vq to a worker that has been marked killed */
  	if (worker->killed) {
  		mutex_unlock(&worker->mutex);
  		return;
  	}

  	mutex_lock(&vq->mutex);

  	/* Swap the worker pointer while holding both mutexes */
  	old_worker = rcu_dereference_check(vq->worker,
  					   lockdep_is_held(&vq->mutex));
  	rcu_assign_pointer(vq->worker, worker);
  	worker->attachment_cnt++;

  	/* First attachment for this vq: nothing to release */
  	if (!old_worker) {
  		mutex_unlock(&vq->mutex);
  		mutex_unlock(&worker->mutex);
  		return;
  	}
  	mutex_unlock(&vq->mutex);
  	mutex_unlock(&worker->mutex);

  	/*
  	 * Take the worker mutex to make sure we see the work queued from
  	 * device wide flushes which don't use RCU for execution.
  	 */
  	mutex_lock(&old_worker->mutex);
  	/* A killed old worker is left alone: no flush, no count update */
  	if (old_worker->killed) {
  		mutex_unlock(&old_worker->mutex);
  		return;
  	}

  	/*
  	 * We don't want to call synchronize_rcu for every vq during setup
  	 * because it will slow down VM startup. If we haven't done
  	 * VHOST_SET_VRING_KICK and not done the driver specific
  	 * SET_ENDPOINT/RUNNING then we can skip the sync since there will
  	 * not be any works queued for scsi and net.
  	 */
  	mutex_lock(&vq->mutex);
  	if (!vhost_vq_get_backend(vq) && !vq->kick) {
  		mutex_unlock(&vq->mutex);

  		old_worker->attachment_cnt--;
  		mutex_unlock(&old_worker->mutex);
  		/*
  		 * vsock can queue anytime after VHOST_VSOCK_SET_GUEST_CID.
  		 * Warn if it adds support for multiple workers but forgets to
  		 * handle the early queueing case.
  		 */
  		WARN_ON(!old_worker->attachment_cnt &&
  			!llist_empty(&old_worker->work_list));
  		return;
  	}
  	mutex_unlock(&vq->mutex);

  	/* Make sure new vq queue/flush/poll calls see the new worker */
  	synchronize_rcu();
  	/* Make sure whatever was queued gets run */
  	__vhost_worker_flush(old_worker);
  	old_worker->attachment_cnt--;
  	mutex_unlock(&old_worker->mutex);
  }
  780. /* Caller must have device mutex */
  781. static int vhost_vq_attach_worker(struct vhost_virtqueue *vq,
  782. struct vhost_vring_worker *info)
  783. {
  784. unsigned long index = info->worker_id;
  785. struct vhost_dev *dev = vq->dev;
  786. struct vhost_worker *worker;
  787. if (!dev->use_worker)
  788. return -EINVAL;
  789. worker = xa_find(&dev->worker_xa, &index, UINT_MAX, XA_PRESENT);
  790. if (!worker || worker->id != info->worker_id)
  791. return -ENODEV;
  792. __vhost_vq_attach_worker(vq, worker);
  793. return 0;
  794. }
  795. /* Caller must have device mutex */
  796. static int vhost_new_worker(struct vhost_dev *dev,
  797. struct vhost_worker_state *info)
  798. {
  799. struct vhost_worker *worker;
  800. worker = vhost_worker_create(dev);
  801. if (!worker)
  802. return -ENOMEM;
  803. info->worker_id = worker->id;
  804. return 0;
  805. }
  806. /* Caller must have device mutex */
  807. static int vhost_free_worker(struct vhost_dev *dev,
  808. struct vhost_worker_state *info)
  809. {
  810. unsigned long index = info->worker_id;
  811. struct vhost_worker *worker;
  812. worker = xa_find(&dev->worker_xa, &index, UINT_MAX, XA_PRESENT);
  813. if (!worker || worker->id != info->worker_id)
  814. return -ENODEV;
  815. mutex_lock(&worker->mutex);
  816. if (worker->attachment_cnt || worker->killed) {
  817. mutex_unlock(&worker->mutex);
  818. return -EBUSY;
  819. }
  820. /*
  821. * A flush might have raced and snuck in before attachment_cnt was set
  822. * to zero. Make sure flushes are flushed from the queue before
  823. * freeing.
  824. */
  825. __vhost_worker_flush(worker);
  826. mutex_unlock(&worker->mutex);
  827. vhost_worker_destroy(dev, worker);
  828. return 0;
  829. }
  830. static int vhost_get_vq_from_user(struct vhost_dev *dev, void __user *argp,
  831. struct vhost_virtqueue **vq, u32 *id)
  832. {
  833. u32 __user *idxp = argp;
  834. u32 idx;
  835. long r;
  836. r = get_user(idx, idxp);
  837. if (r < 0)
  838. return r;
  839. if (idx >= dev->nvqs)
  840. return -ENOBUFS;
  841. idx = array_index_nospec(idx, dev->nvqs);
  842. *vq = dev->vqs[idx];
  843. *id = idx;
  844. return 0;
  845. }
  846. /* Caller must have device mutex */
  847. long vhost_worker_ioctl(struct vhost_dev *dev, unsigned int ioctl,
  848. void __user *argp)
  849. {
  850. struct vhost_vring_worker ring_worker;
  851. struct vhost_worker_state state;
  852. struct vhost_worker *worker;
  853. struct vhost_virtqueue *vq;
  854. long ret;
  855. u32 idx;
  856. if (!dev->use_worker)
  857. return -EINVAL;
  858. if (!vhost_dev_has_owner(dev))
  859. return -EINVAL;
  860. ret = vhost_dev_check_owner(dev);
  861. if (ret)
  862. return ret;
  863. switch (ioctl) {
  864. /* dev worker ioctls */
  865. case VHOST_NEW_WORKER:
  866. /*
  867. * vhost_tasks will account for worker threads under the parent's
  868. * NPROC value but kthreads do not. To avoid userspace overflowing
  869. * the system with worker threads fork_owner must be true.
  870. */
  871. if (!dev->fork_owner)
  872. return -EFAULT;
  873. ret = vhost_new_worker(dev, &state);
  874. if (!ret && copy_to_user(argp, &state, sizeof(state)))
  875. ret = -EFAULT;
  876. return ret;
  877. case VHOST_FREE_WORKER:
  878. if (copy_from_user(&state, argp, sizeof(state)))
  879. return -EFAULT;
  880. return vhost_free_worker(dev, &state);
  881. /* vring worker ioctls */
  882. case VHOST_ATTACH_VRING_WORKER:
  883. case VHOST_GET_VRING_WORKER:
  884. break;
  885. default:
  886. return -ENOIOCTLCMD;
  887. }
  888. ret = vhost_get_vq_from_user(dev, argp, &vq, &idx);
  889. if (ret)
  890. return ret;
  891. switch (ioctl) {
  892. case VHOST_ATTACH_VRING_WORKER:
  893. if (copy_from_user(&ring_worker, argp, sizeof(ring_worker))) {
  894. ret = -EFAULT;
  895. break;
  896. }
  897. ret = vhost_vq_attach_worker(vq, &ring_worker);
  898. break;
  899. case VHOST_GET_VRING_WORKER:
  900. worker = rcu_dereference_check(vq->worker,
  901. lockdep_is_held(&dev->mutex));
  902. if (!worker) {
  903. ret = -EINVAL;
  904. break;
  905. }
  906. ring_worker.index = idx;
  907. ring_worker.worker_id = worker->id;
  908. if (copy_to_user(argp, &ring_worker, sizeof(ring_worker)))
  909. ret = -EFAULT;
  910. break;
  911. default:
  912. ret = -ENOIOCTLCMD;
  913. break;
  914. }
  915. return ret;
  916. }
  917. EXPORT_SYMBOL_GPL(vhost_worker_ioctl);
  918. /* Caller should have device mutex */
  919. long vhost_dev_set_owner(struct vhost_dev *dev)
  920. {
  921. struct vhost_worker *worker;
  922. int err, i;
  923. /* Is there an owner already? */
  924. if (vhost_dev_has_owner(dev)) {
  925. err = -EBUSY;
  926. goto err_mm;
  927. }
  928. vhost_attach_mm(dev);
  929. err = vhost_dev_alloc_iovecs(dev);
  930. if (err)
  931. goto err_iovecs;
  932. if (dev->use_worker) {
  933. /*
  934. * This should be done last, because vsock can queue work
  935. * before VHOST_SET_OWNER so it simplifies the failure path
  936. * below since we don't have to worry about vsock queueing
  937. * while we free the worker.
  938. */
  939. worker = vhost_worker_create(dev);
  940. if (!worker) {
  941. err = -ENOMEM;
  942. goto err_worker;
  943. }
  944. for (i = 0; i < dev->nvqs; i++)
  945. __vhost_vq_attach_worker(dev->vqs[i], worker);
  946. }
  947. return 0;
  948. err_worker:
  949. vhost_dev_free_iovecs(dev);
  950. err_iovecs:
  951. vhost_detach_mm(dev);
  952. err_mm:
  953. return err;
  954. }
  955. EXPORT_SYMBOL_GPL(vhost_dev_set_owner);
  956. static struct vhost_iotlb *iotlb_alloc(void)
  957. {
  958. return vhost_iotlb_alloc(max_iotlb_entries,
  959. VHOST_IOTLB_FLAG_RETIRE);
  960. }
  /*
   * Allocate the IOTLB that a following vhost_dev_reset_owner() call takes
   * as its @umem argument. Returns whatever iotlb_alloc() returns.
   */
  struct vhost_iotlb *vhost_dev_reset_owner_prepare(void)
  {
  	struct vhost_iotlb *umem = iotlb_alloc();

  	return umem;
  }
  EXPORT_SYMBOL_GPL(vhost_dev_reset_owner_prepare);
  966. /* Caller should have device mutex */
  967. void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_iotlb *umem)
  968. {
  969. int i;
  970. vhost_dev_cleanup(dev);
  971. dev->fork_owner = fork_from_owner_default;
  972. dev->umem = umem;
  973. /* We don't need VQ locks below since vhost_dev_cleanup makes sure
  974. * VQs aren't running.
  975. */
  976. for (i = 0; i < dev->nvqs; ++i)
  977. dev->vqs[i]->umem = umem;
  978. }
  979. EXPORT_SYMBOL_GPL(vhost_dev_reset_owner);
  980. void vhost_dev_stop(struct vhost_dev *dev)
  981. {
  982. int i;
  983. for (i = 0; i < dev->nvqs; ++i) {
  984. if (dev->vqs[i]->kick && dev->vqs[i]->handle_kick)
  985. vhost_poll_stop(&dev->vqs[i]->poll);
  986. }
  987. vhost_dev_flush(dev);
  988. }
  989. EXPORT_SYMBOL_GPL(vhost_dev_stop);
  990. void vhost_clear_msg(struct vhost_dev *dev)
  991. {
  992. struct vhost_msg_node *node, *n;
  993. spin_lock(&dev->iotlb_lock);
  994. list_for_each_entry_safe(node, n, &dev->read_list, node) {
  995. list_del(&node->node);
  996. kfree(node);
  997. }
  998. list_for_each_entry_safe(node, n, &dev->pending_list, node) {
  999. list_del(&node->node);
  1000. kfree(node);
  1001. }
  1002. spin_unlock(&dev->iotlb_lock);
  1003. }
  1004. EXPORT_SYMBOL_GPL(vhost_clear_msg);
  1005. void vhost_dev_cleanup(struct vhost_dev *dev)
  1006. {
  1007. int i;
  1008. for (i = 0; i < dev->nvqs; ++i) {
  1009. if (dev->vqs[i]->error_ctx)
  1010. eventfd_ctx_put(dev->vqs[i]->error_ctx);
  1011. if (dev->vqs[i]->kick)
  1012. fput(dev->vqs[i]->kick);
  1013. if (dev->vqs[i]->call_ctx.ctx)
  1014. eventfd_ctx_put(dev->vqs[i]->call_ctx.ctx);
  1015. vhost_vq_reset(dev, dev->vqs[i]);
  1016. }
  1017. vhost_dev_free_iovecs(dev);
  1018. if (dev->log_ctx)
  1019. eventfd_ctx_put(dev->log_ctx);
  1020. dev->log_ctx = NULL;
  1021. /* No one will access memory at this point */
  1022. vhost_iotlb_free(dev->umem);
  1023. dev->umem = NULL;
  1024. vhost_iotlb_free(dev->iotlb);
  1025. dev->iotlb = NULL;
  1026. vhost_clear_msg(dev);
  1027. wake_up_interruptible_poll(&dev->wait, EPOLLIN | EPOLLRDNORM);
  1028. vhost_workers_free(dev);
  1029. vhost_detach_mm(dev);
  1030. }
  1031. EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
  1032. static bool log_access_ok(void __user *log_base, u64 addr, unsigned long sz)
  1033. {
  1034. u64 a = addr / VHOST_PAGE_SIZE / 8;
  1035. /* Make sure 64 bit math will not overflow. */
  1036. if (a > ULONG_MAX - (unsigned long)log_base ||
  1037. a + (unsigned long)log_base > ULONG_MAX)
  1038. return false;
  1039. return access_ok(log_base + a,
  1040. (sz + VHOST_PAGE_SIZE * 8 - 1) / VHOST_PAGE_SIZE / 8);
  1041. }
  1042. /* Make sure 64 bit math will not overflow. */
  1043. static bool vhost_overflow(u64 uaddr, u64 size)
  1044. {
  1045. if (uaddr > ULONG_MAX || size > ULONG_MAX)
  1046. return true;
  1047. if (!size)
  1048. return false;
  1049. return uaddr > ULONG_MAX - size + 1;
  1050. }
  1051. /* Caller should have vq mutex and device mutex. */
  1052. static bool vq_memory_access_ok(void __user *log_base, struct vhost_iotlb *umem,
  1053. int log_all)
  1054. {
  1055. struct vhost_iotlb_map *map;
  1056. if (!umem)
  1057. return false;
  1058. list_for_each_entry(map, &umem->list, link) {
  1059. unsigned long a = map->addr;
  1060. if (vhost_overflow(map->addr, map->size))
  1061. return false;
  1062. if (!access_ok((void __user *)a, map->size))
  1063. return false;
  1064. else if (log_all && !log_access_ok(log_base,
  1065. map->start,
  1066. map->size))
  1067. return false;
  1068. }
  1069. return true;
  1070. }
  1071. static inline void __user *vhost_vq_meta_fetch(struct vhost_virtqueue *vq,
  1072. u64 addr, unsigned int size,
  1073. int type)
  1074. {
  1075. const struct vhost_iotlb_map *map = vq->meta_iotlb[type];
  1076. if (!map)
  1077. return NULL;
  1078. return (void __user *)(uintptr_t)(map->addr + addr - map->start);
  1079. }
  1080. /* Can we switch to this memory table? */
  1081. /* Caller should have device mutex but not vq mutex */
  1082. static bool memory_access_ok(struct vhost_dev *d, struct vhost_iotlb *umem,
  1083. int log_all)
  1084. {
  1085. int i;
  1086. for (i = 0; i < d->nvqs; ++i) {
  1087. bool ok;
  1088. bool log;
  1089. mutex_lock(&d->vqs[i]->mutex);
  1090. log = log_all || vhost_has_feature(d->vqs[i], VHOST_F_LOG_ALL);
  1091. /* If ring is inactive, will check when it's enabled. */
  1092. if (d->vqs[i]->private_data)
  1093. ok = vq_memory_access_ok(d->vqs[i]->log_base,
  1094. umem, log);
  1095. else
  1096. ok = true;
  1097. mutex_unlock(&d->vqs[i]->mutex);
  1098. if (!ok)
  1099. return false;
  1100. }
  1101. return true;
  1102. }
  1103. static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
  1104. struct iovec iov[], int iov_size, int access);
  1105. static int vhost_copy_to_user(struct vhost_virtqueue *vq, void __user *to,
  1106. const void *from, unsigned size)
  1107. {
  1108. int ret;
  1109. if (!vq->iotlb)
  1110. return __copy_to_user(to, from, size);
  1111. else {
  1112. /* This function should be called after iotlb
  1113. * prefetch, which means we're sure that all vq
  1114. * could be access through iotlb. So -EAGAIN should
  1115. * not happen in this case.
  1116. */
  1117. struct iov_iter t;
  1118. void __user *uaddr = vhost_vq_meta_fetch(vq,
  1119. (u64)(uintptr_t)to, size,
  1120. VHOST_ADDR_USED);
  1121. if (uaddr)
  1122. return __copy_to_user(uaddr, from, size);
  1123. ret = translate_desc(vq, (u64)(uintptr_t)to, size, vq->iotlb_iov,
  1124. ARRAY_SIZE(vq->iotlb_iov),
  1125. VHOST_ACCESS_WO);
  1126. if (ret < 0)
  1127. goto out;
  1128. iov_iter_init(&t, ITER_DEST, vq->iotlb_iov, ret, size);
  1129. ret = copy_to_iter(from, size, &t);
  1130. if (ret == size)
  1131. ret = 0;
  1132. }
  1133. out:
  1134. return ret;
  1135. }
  1136. static int vhost_copy_from_user(struct vhost_virtqueue *vq, void *to,
  1137. void __user *from, unsigned size)
  1138. {
  1139. int ret;
  1140. if (!vq->iotlb)
  1141. return __copy_from_user(to, from, size);
  1142. else {
  1143. /* This function should be called after iotlb
  1144. * prefetch, which means we're sure that vq
  1145. * could be access through iotlb. So -EAGAIN should
  1146. * not happen in this case.
  1147. */
  1148. void __user *uaddr = vhost_vq_meta_fetch(vq,
  1149. (u64)(uintptr_t)from, size,
  1150. VHOST_ADDR_DESC);
  1151. struct iov_iter f;
  1152. if (uaddr)
  1153. return __copy_from_user(to, uaddr, size);
  1154. ret = translate_desc(vq, (u64)(uintptr_t)from, size, vq->iotlb_iov,
  1155. ARRAY_SIZE(vq->iotlb_iov),
  1156. VHOST_ACCESS_RO);
  1157. if (ret < 0) {
  1158. vq_err(vq, "IOTLB translation failure: uaddr "
  1159. "%p size 0x%llx\n", from,
  1160. (unsigned long long) size);
  1161. goto out;
  1162. }
  1163. iov_iter_init(&f, ITER_SOURCE, vq->iotlb_iov, ret, size);
  1164. ret = copy_from_iter(to, size, &f);
  1165. if (ret == size)
  1166. ret = 0;
  1167. }
  1168. out:
  1169. return ret;
  1170. }
  1171. static void __user *__vhost_get_user_slow(struct vhost_virtqueue *vq,
  1172. void __user *addr, unsigned int size,
  1173. int type)
  1174. {
  1175. int ret;
  1176. ret = translate_desc(vq, (u64)(uintptr_t)addr, size, vq->iotlb_iov,
  1177. ARRAY_SIZE(vq->iotlb_iov),
  1178. VHOST_ACCESS_RO);
  1179. if (ret < 0) {
  1180. vq_err(vq, "IOTLB translation failure: uaddr "
  1181. "%p size 0x%llx\n", addr,
  1182. (unsigned long long) size);
  1183. return NULL;
  1184. }
  1185. if (ret != 1 || vq->iotlb_iov[0].iov_len != size) {
  1186. vq_err(vq, "Non atomic userspace memory access: uaddr "
  1187. "%p size 0x%llx\n", addr,
  1188. (unsigned long long) size);
  1189. return NULL;
  1190. }
  1191. return vq->iotlb_iov[0].iov_base;
  1192. }
  1193. /* This function should be called after iotlb
  1194. * prefetch, which means we're sure that vq
  1195. * could be access through iotlb. So -EAGAIN should
  1196. * not happen in this case.
  1197. */
  1198. static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
  1199. void __user *addr, unsigned int size,
  1200. int type)
  1201. {
  1202. void __user *uaddr = vhost_vq_meta_fetch(vq,
  1203. (u64)(uintptr_t)addr, size, type);
  1204. if (uaddr)
  1205. return uaddr;
  1206. return __vhost_get_user_slow(vq, addr, size, type);
  1207. }
  /*
   * vhost_put_user(vq, x, ptr) - store the value x at the address ptr.
   *
   * Without an IOTLB this is put_user(x, ptr). With one, ptr is first
   * resolved through __vhost_get_user() as a VHOST_ADDR_USED access and the
   * store goes to the resolved address. Evaluates to the put_user() result,
   * or -EFAULT when the address cannot be resolved.
   *
   * The expansion declares locals named "ret" and "to", so arguments must
   * not be expressions that use those names.
   */
  #define vhost_put_user(vq, x, ptr)		\
  ({ \
  	int ret; \
  	if (!vq->iotlb) { \
  		ret = put_user(x, ptr); \
  	} else { \
  		__typeof__(ptr) to = \
  			(__typeof__(ptr)) __vhost_get_user(vq, ptr,	\
  					  sizeof(*ptr), VHOST_ADDR_USED); \
  		if (to != NULL) \
  			ret = put_user(x, to); \
  		else \
  			ret = -EFAULT;	\
  	} \
  	ret; \
  })
  1224. static inline int vhost_put_avail_event(struct vhost_virtqueue *vq)
  1225. {
  1226. return vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
  1227. vhost_avail_event(vq));
  1228. }
  1229. static inline int vhost_put_used(struct vhost_virtqueue *vq,
  1230. struct vring_used_elem *head, int idx,
  1231. int count)
  1232. {
  1233. return vhost_copy_to_user(vq, vq->used->ring + idx, head,
  1234. count * sizeof(*head));
  1235. }
  1236. static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
  1237. {
  1238. return vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags),
  1239. &vq->used->flags);
  1240. }
  1241. static inline int vhost_put_used_idx(struct vhost_virtqueue *vq)
  1242. {
  1243. return vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
  1244. &vq->used->idx);
  1245. }
  /*
   * vhost_get_user(vq, x, ptr, type) - load the value at ptr into x.
   *
   * Without an IOTLB this is get_user(x, ptr). With one, ptr is first
   * resolved through __vhost_get_user() for the given address type and the
   * load comes from the resolved address. Evaluates to the get_user()
   * result, or -EFAULT when the address cannot be resolved; in the latter
   * case x is not written.
   *
   * The expansion declares locals named "ret" and "from", so arguments must
   * not be expressions that use those names.
   */
  #define vhost_get_user(vq, x, ptr, type)		\
  ({ \
  	int ret; \
  	if (!vq->iotlb) { \
  		ret = get_user(x, ptr); \
  	} else { \
  		__typeof__(ptr) from = \
  			(__typeof__(ptr)) __vhost_get_user(vq, ptr, \
  							   sizeof(*ptr), \
  							   type); \
  		if (from != NULL) \
  			ret = get_user(x, from); \
  		else \
  			ret = -EFAULT; \
  	} \
  	ret; \
  })
  /* vhost_get_user() with the address type fixed to VHOST_ADDR_AVAIL. */
  #define vhost_get_avail(vq, x, ptr) \
  	vhost_get_user(vq, x, ptr, VHOST_ADDR_AVAIL)
  /* vhost_get_user() with the address type fixed to VHOST_ADDR_USED. */
  #define vhost_get_used(vq, x, ptr) \
  	vhost_get_user(vq, x, ptr, VHOST_ADDR_USED)
  1267. static void vhost_dev_lock_vqs(struct vhost_dev *d)
  1268. {
  1269. int i = 0;
  1270. for (i = 0; i < d->nvqs; ++i)
  1271. mutex_lock_nested(&d->vqs[i]->mutex, i);
  1272. }
  1273. static void vhost_dev_unlock_vqs(struct vhost_dev *d)
  1274. {
  1275. int i = 0;
  1276. for (i = 0; i < d->nvqs; ++i)
  1277. mutex_unlock(&d->vqs[i]->mutex);
  1278. }
  1279. static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq)
  1280. {
  1281. __virtio16 idx;
  1282. int r;
  1283. r = vhost_get_avail(vq, idx, &vq->avail->idx);
  1284. if (unlikely(r < 0)) {
  1285. vq_err(vq, "Failed to access available index at %p (%d)\n",
  1286. &vq->avail->idx, r);
  1287. return r;
  1288. }
  1289. /* Check it isn't doing very strange thing with available indexes */
  1290. vq->avail_idx = vhost16_to_cpu(vq, idx);
  1291. if (unlikely((u16)(vq->avail_idx - vq->last_avail_idx) > vq->num)) {
  1292. vq_err(vq, "Invalid available index change from %u to %u",
  1293. vq->last_avail_idx, vq->avail_idx);
  1294. return -EINVAL;
  1295. }
  1296. /* We're done if there is nothing new */
  1297. if (vq->avail_idx == vq->last_avail_idx)
  1298. return 0;
  1299. /*
  1300. * We updated vq->avail_idx so we need a memory barrier between
  1301. * the index read above and the caller reading avail ring entries.
  1302. */
  1303. smp_rmb();
  1304. return 1;
  1305. }
  1306. static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
  1307. __virtio16 *head, int idx)
  1308. {
  1309. return vhost_get_avail(vq, *head,
  1310. &vq->avail->ring[idx & (vq->num - 1)]);
  1311. }
  1312. static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq,
  1313. __virtio16 *flags)
  1314. {
  1315. return vhost_get_avail(vq, *flags, &vq->avail->flags);
  1316. }
  1317. static inline int vhost_get_used_event(struct vhost_virtqueue *vq,
  1318. __virtio16 *event)
  1319. {
  1320. return vhost_get_avail(vq, *event, vhost_used_event(vq));
  1321. }
  1322. static inline int vhost_get_used_idx(struct vhost_virtqueue *vq,
  1323. __virtio16 *idx)
  1324. {
  1325. return vhost_get_used(vq, *idx, &vq->used->idx);
  1326. }
  1327. static inline int vhost_get_desc(struct vhost_virtqueue *vq,
  1328. struct vring_desc *desc, int idx)
  1329. {
  1330. return vhost_copy_from_user(vq, desc, vq->desc + idx, sizeof(*desc));
  1331. }
  1332. static void vhost_iotlb_notify_vq(struct vhost_dev *d,
  1333. struct vhost_iotlb_msg *msg)
  1334. {
  1335. struct vhost_msg_node *node, *n;
  1336. spin_lock(&d->iotlb_lock);
  1337. list_for_each_entry_safe(node, n, &d->pending_list, node) {
  1338. struct vhost_iotlb_msg *vq_msg = &node->msg.iotlb;
  1339. if (msg->iova <= vq_msg->iova &&
  1340. msg->iova + msg->size - 1 >= vq_msg->iova &&
  1341. vq_msg->type == VHOST_IOTLB_MISS) {
  1342. vhost_poll_queue(&node->vq->poll);
  1343. list_del(&node->node);
  1344. kfree(node);
  1345. }
  1346. }
  1347. spin_unlock(&d->iotlb_lock);
  1348. }
  1349. static bool umem_access_ok(u64 uaddr, u64 size, int access)
  1350. {
  1351. unsigned long a = uaddr;
  1352. /* Make sure 64 bit math will not overflow. */
  1353. if (vhost_overflow(uaddr, size))
  1354. return false;
  1355. if ((access & VHOST_ACCESS_RO) &&
  1356. !access_ok((void __user *)a, size))
  1357. return false;
  1358. if ((access & VHOST_ACCESS_WO) &&
  1359. !access_ok((void __user *)a, size))
  1360. return false;
  1361. return true;
  1362. }
  1363. static int vhost_process_iotlb_msg(struct vhost_dev *dev, u32 asid,
  1364. struct vhost_iotlb_msg *msg)
  1365. {
  1366. int ret = 0;
  1367. if (asid != 0)
  1368. return -EINVAL;
  1369. mutex_lock(&dev->mutex);
  1370. vhost_dev_lock_vqs(dev);
  1371. switch (msg->type) {
  1372. case VHOST_IOTLB_UPDATE:
  1373. if (!dev->iotlb) {
  1374. ret = -EFAULT;
  1375. break;
  1376. }
  1377. if (!umem_access_ok(msg->uaddr, msg->size, msg->perm)) {
  1378. ret = -EFAULT;
  1379. break;
  1380. }
  1381. vhost_vq_meta_reset(dev);
  1382. if (vhost_iotlb_add_range(dev->iotlb, msg->iova,
  1383. msg->iova + msg->size - 1,
  1384. msg->uaddr, msg->perm)) {
  1385. ret = -ENOMEM;
  1386. break;
  1387. }
  1388. vhost_iotlb_notify_vq(dev, msg);
  1389. break;
  1390. case VHOST_IOTLB_INVALIDATE:
  1391. if (!dev->iotlb) {
  1392. ret = -EFAULT;
  1393. break;
  1394. }
  1395. vhost_vq_meta_reset(dev);
  1396. vhost_iotlb_del_range(dev->iotlb, msg->iova,
  1397. msg->iova + msg->size - 1);
  1398. break;
  1399. default:
  1400. ret = -EINVAL;
  1401. break;
  1402. }
  1403. vhost_dev_unlock_vqs(dev);
  1404. mutex_unlock(&dev->mutex);
  1405. return ret;
  1406. }
  1407. ssize_t vhost_chr_write_iter(struct vhost_dev *dev,
  1408. struct iov_iter *from)
  1409. {
  1410. struct vhost_iotlb_msg msg;
  1411. size_t offset;
  1412. int type, ret;
  1413. u32 asid = 0;
  1414. ret = copy_from_iter(&type, sizeof(type), from);
  1415. if (ret != sizeof(type)) {
  1416. ret = -EINVAL;
  1417. goto done;
  1418. }
  1419. switch (type) {
  1420. case VHOST_IOTLB_MSG:
  1421. /* There maybe a hole after type for V1 message type,
  1422. * so skip it here.
  1423. */
  1424. offset = offsetof(struct vhost_msg, iotlb) - sizeof(int);
  1425. break;
  1426. case VHOST_IOTLB_MSG_V2:
  1427. if (vhost_backend_has_feature(dev->vqs[0],
  1428. VHOST_BACKEND_F_IOTLB_ASID)) {
  1429. ret = copy_from_iter(&asid, sizeof(asid), from);
  1430. if (ret != sizeof(asid)) {
  1431. ret = -EINVAL;
  1432. goto done;
  1433. }
  1434. offset = 0;
  1435. } else
  1436. offset = sizeof(__u32);
  1437. break;
  1438. default:
  1439. ret = -EINVAL;
  1440. goto done;
  1441. }
  1442. iov_iter_advance(from, offset);
  1443. ret = copy_from_iter(&msg, sizeof(msg), from);
  1444. if (ret != sizeof(msg)) {
  1445. ret = -EINVAL;
  1446. goto done;
  1447. }
  1448. if (msg.type == VHOST_IOTLB_UPDATE && msg.size == 0) {
  1449. ret = -EINVAL;
  1450. goto done;
  1451. }
  1452. if (dev->msg_handler)
  1453. ret = dev->msg_handler(dev, asid, &msg);
  1454. else
  1455. ret = vhost_process_iotlb_msg(dev, asid, &msg);
  1456. if (ret) {
  1457. ret = -EFAULT;
  1458. goto done;
  1459. }
  1460. ret = (type == VHOST_IOTLB_MSG) ? sizeof(struct vhost_msg) :
  1461. sizeof(struct vhost_msg_v2);
  1462. done:
  1463. return ret;
  1464. }
  1465. EXPORT_SYMBOL(vhost_chr_write_iter);
  1466. __poll_t vhost_chr_poll(struct file *file, struct vhost_dev *dev,
  1467. poll_table *wait)
  1468. {
  1469. __poll_t mask = 0;
  1470. poll_wait(file, &dev->wait, wait);
  1471. if (!list_empty(&dev->read_list))
  1472. mask |= EPOLLIN | EPOLLRDNORM;
  1473. return mask;
  1474. }
  1475. EXPORT_SYMBOL(vhost_chr_poll);
  /*
   * Dequeue one message from dev->read_list and copy it to @to.
   *
   * Returns 0 if @to has less room than sizeof(struct vhost_msg). With
   * @noblock set, returns -EAGAIN when the list is empty; otherwise sleeps
   * interruptibly on dev->wait until a message arrives, returning
   * -ERESTARTSYS on a pending signal or -EBADFD if the device has no IOTLB.
   * On success returns the number of bytes copied. A fully copied
   * VHOST_IOTLB_MISS message is moved to dev->pending_list; any other
   * message is freed.
   */
  ssize_t vhost_chr_read_iter(struct vhost_dev *dev, struct iov_iter *to,
  			    int noblock)
  {
  	DEFINE_WAIT(wait);
  	struct vhost_msg_node *node;
  	ssize_t ret = 0;
  	unsigned size = sizeof(struct vhost_msg);

  	/* Not enough room for even a V1 message */
  	if (iov_iter_count(to) < size)
  		return 0;

  	while (1) {
  		/* Get on the wait queue before checking the list again */
  		if (!noblock)
  			prepare_to_wait(&dev->wait, &wait,
  					TASK_INTERRUPTIBLE);

  		node = vhost_dequeue_msg(dev, &dev->read_list);
  		if (node)
  			break;
  		if (noblock) {
  			ret = -EAGAIN;
  			break;
  		}
  		if (signal_pending(current)) {
  			ret = -ERESTARTSYS;
  			break;
  		}
  		if (!dev->iotlb) {
  			ret = -EBADFD;
  			break;
  		}

  		schedule();
  	}

  	if (!noblock)
  		finish_wait(&dev->wait, &wait);

  	if (node) {
  		struct vhost_iotlb_msg *msg;
  		void *start = &node->msg;

  		/* The message type decides how many bytes are copied out */
  		switch (node->msg.type) {
  		case VHOST_IOTLB_MSG:
  			size = sizeof(node->msg);
  			msg = &node->msg.iotlb;
  			break;
  		case VHOST_IOTLB_MSG_V2:
  			size = sizeof(node->msg_v2);
  			msg = &node->msg_v2.iotlb;
  			break;
  		default:
  			BUG();
  			break;
  		}

  		ret = copy_to_iter(start, size, to);
  		/* Short copies and non-miss messages are dropped here */
  		if (ret != size || msg->type != VHOST_IOTLB_MISS) {
  			kfree(node);
  			return ret;
  		}
  		/* Keep a delivered miss on pending_list */
  		vhost_enqueue_msg(dev, &dev->pending_list, node);
  	}

  	return ret;
  }
  EXPORT_SYMBOL_GPL(vhost_chr_read_iter);
  1534. static int vhost_iotlb_miss(struct vhost_virtqueue *vq, u64 iova, int access)
  1535. {
  1536. struct vhost_dev *dev = vq->dev;
  1537. struct vhost_msg_node *node;
  1538. struct vhost_iotlb_msg *msg;
  1539. bool v2 = vhost_backend_has_feature(vq, VHOST_BACKEND_F_IOTLB_MSG_V2);
  1540. node = vhost_new_msg(vq, v2 ? VHOST_IOTLB_MSG_V2 : VHOST_IOTLB_MSG);
  1541. if (!node)
  1542. return -ENOMEM;
  1543. if (v2) {
  1544. node->msg_v2.type = VHOST_IOTLB_MSG_V2;
  1545. msg = &node->msg_v2.iotlb;
  1546. } else {
  1547. msg = &node->msg.iotlb;
  1548. }
  1549. msg->type = VHOST_IOTLB_MISS;
  1550. msg->iova = iova;
  1551. msg->perm = access;
  1552. vhost_enqueue_msg(dev, &dev->read_list, node);
  1553. return 0;
  1554. }
  1555. static bool vq_access_ok(struct vhost_virtqueue *vq, unsigned int num,
  1556. vring_desc_t __user *desc,
  1557. vring_avail_t __user *avail,
  1558. vring_used_t __user *used)
  1559. {
  1560. /* If an IOTLB device is present, the vring addresses are
  1561. * GIOVAs. Access validation occurs at prefetch time. */
  1562. if (vq->iotlb)
  1563. return true;
  1564. return access_ok(desc, vhost_get_desc_size(vq, num)) &&
  1565. access_ok(avail, vhost_get_avail_size(vq, num)) &&
  1566. access_ok(used, vhost_get_used_size(vq, num));
  1567. }
  1568. static void vhost_vq_meta_update(struct vhost_virtqueue *vq,
  1569. const struct vhost_iotlb_map *map,
  1570. int type)
  1571. {
  1572. int access = (type == VHOST_ADDR_USED) ?
  1573. VHOST_ACCESS_WO : VHOST_ACCESS_RO;
  1574. if (likely(map->perm & access))
  1575. vq->meta_iotlb[type] = map;
  1576. }
  1577. static bool iotlb_access_ok(struct vhost_virtqueue *vq,
  1578. int access, u64 addr, u64 len, int type)
  1579. {
  1580. const struct vhost_iotlb_map *map;
  1581. struct vhost_iotlb *umem = vq->iotlb;
  1582. u64 s = 0, size, orig_addr = addr, last = addr + len - 1;
  1583. if (vhost_vq_meta_fetch(vq, addr, len, type))
  1584. return true;
  1585. while (len > s) {
  1586. map = vhost_iotlb_itree_first(umem, addr, last);
  1587. if (map == NULL || map->start > addr) {
  1588. vhost_iotlb_miss(vq, addr, access);
  1589. return false;
  1590. } else if (!(map->perm & access)) {
  1591. /* Report the possible access violation by
  1592. * request another translation from userspace.
  1593. */
  1594. return false;
  1595. }
  1596. size = map->size - addr + map->start;
  1597. if (orig_addr == addr && size >= len)
  1598. vhost_vq_meta_update(vq, map, type);
  1599. s += size;
  1600. addr += size;
  1601. }
  1602. return true;
  1603. }
  1604. int vq_meta_prefetch(struct vhost_virtqueue *vq)
  1605. {
  1606. unsigned int num = vq->num;
  1607. if (!vq->iotlb)
  1608. return 1;
  1609. return iotlb_access_ok(vq, VHOST_MAP_RO, (u64)(uintptr_t)vq->desc,
  1610. vhost_get_desc_size(vq, num), VHOST_ADDR_DESC) &&
  1611. iotlb_access_ok(vq, VHOST_MAP_RO, (u64)(uintptr_t)vq->avail,
  1612. vhost_get_avail_size(vq, num),
  1613. VHOST_ADDR_AVAIL) &&
  1614. iotlb_access_ok(vq, VHOST_MAP_WO, (u64)(uintptr_t)vq->used,
  1615. vhost_get_used_size(vq, num), VHOST_ADDR_USED);
  1616. }
  1617. EXPORT_SYMBOL_GPL(vq_meta_prefetch);
  1618. /* Can we log writes? */
  1619. /* Caller should have device mutex but not vq mutex */
  1620. bool vhost_log_access_ok(struct vhost_dev *dev)
  1621. {
  1622. return memory_access_ok(dev, dev->umem, 1);
  1623. }
  1624. EXPORT_SYMBOL_GPL(vhost_log_access_ok);
  1625. static bool vq_log_used_access_ok(struct vhost_virtqueue *vq,
  1626. void __user *log_base,
  1627. bool log_used,
  1628. u64 log_addr)
  1629. {
  1630. /* If an IOTLB device is present, log_addr is a GIOVA that
  1631. * will never be logged by log_used(). */
  1632. if (vq->iotlb)
  1633. return true;
  1634. return !log_used || log_access_ok(log_base, log_addr,
  1635. vhost_get_used_size(vq, vq->num));
  1636. }
  1637. /* Verify access for write logging. */
  1638. /* Caller should have vq mutex and device mutex */
  1639. static bool vq_log_access_ok(struct vhost_virtqueue *vq,
  1640. void __user *log_base)
  1641. {
  1642. return vq_memory_access_ok(log_base, vq->umem,
  1643. vhost_has_feature(vq, VHOST_F_LOG_ALL)) &&
  1644. vq_log_used_access_ok(vq, log_base, vq->log_used, vq->log_addr);
  1645. }
  1646. /* Can we start vq? */
  1647. /* Caller should have vq mutex and device mutex */
  1648. bool vhost_vq_access_ok(struct vhost_virtqueue *vq)
  1649. {
  1650. if (!vq_log_access_ok(vq, vq->log_base))
  1651. return false;
  1652. return vq_access_ok(vq, vq->num, vq->desc, vq->avail, vq->used);
  1653. }
  1654. EXPORT_SYMBOL_GPL(vhost_vq_access_ok);
  1655. static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
  1656. {
  1657. struct vhost_memory mem, *newmem;
  1658. struct vhost_memory_region *region;
  1659. struct vhost_iotlb *newumem, *oldumem;
  1660. unsigned long size = offsetof(struct vhost_memory, regions);
  1661. int i;
  1662. if (copy_from_user(&mem, m, size))
  1663. return -EFAULT;
  1664. if (mem.padding)
  1665. return -EOPNOTSUPP;
  1666. if (mem.nregions > max_mem_regions)
  1667. return -E2BIG;
  1668. newmem = kvzalloc_flex(*newmem, regions, mem.nregions);
  1669. if (!newmem)
  1670. return -ENOMEM;
  1671. memcpy(newmem, &mem, size);
  1672. if (copy_from_user(newmem->regions, m->regions,
  1673. flex_array_size(newmem, regions, mem.nregions))) {
  1674. kvfree(newmem);
  1675. return -EFAULT;
  1676. }
  1677. newumem = iotlb_alloc();
  1678. if (!newumem) {
  1679. kvfree(newmem);
  1680. return -ENOMEM;
  1681. }
  1682. for (region = newmem->regions;
  1683. region < newmem->regions + mem.nregions;
  1684. region++) {
  1685. if (vhost_iotlb_add_range(newumem,
  1686. region->guest_phys_addr,
  1687. region->guest_phys_addr +
  1688. region->memory_size - 1,
  1689. region->userspace_addr,
  1690. VHOST_MAP_RW))
  1691. goto err;
  1692. }
  1693. if (!memory_access_ok(d, newumem, 0))
  1694. goto err;
  1695. oldumem = d->umem;
  1696. d->umem = newumem;
  1697. /* All memory accesses are done under some VQ mutex. */
  1698. for (i = 0; i < d->nvqs; ++i) {
  1699. mutex_lock(&d->vqs[i]->mutex);
  1700. d->vqs[i]->umem = newumem;
  1701. mutex_unlock(&d->vqs[i]->mutex);
  1702. }
  1703. kvfree(newmem);
  1704. vhost_iotlb_free(oldumem);
  1705. return 0;
  1706. err:
  1707. vhost_iotlb_free(newumem);
  1708. kvfree(newmem);
  1709. return -EFAULT;
  1710. }
  1711. static long vhost_vring_set_num(struct vhost_dev *d,
  1712. struct vhost_virtqueue *vq,
  1713. void __user *argp)
  1714. {
  1715. struct vhost_vring_state s;
  1716. /* Resizing ring with an active backend?
  1717. * You don't want to do that. */
  1718. if (vq->private_data)
  1719. return -EBUSY;
  1720. if (copy_from_user(&s, argp, sizeof s))
  1721. return -EFAULT;
  1722. if (!s.num || s.num > 0xffff || (s.num & (s.num - 1)))
  1723. return -EINVAL;
  1724. vq->num = s.num;
  1725. return 0;
  1726. }
  1727. static long vhost_vring_set_addr(struct vhost_dev *d,
  1728. struct vhost_virtqueue *vq,
  1729. void __user *argp)
  1730. {
  1731. struct vhost_vring_addr a;
  1732. if (copy_from_user(&a, argp, sizeof a))
  1733. return -EFAULT;
  1734. if (a.flags & ~(0x1 << VHOST_VRING_F_LOG))
  1735. return -EOPNOTSUPP;
  1736. /* For 32bit, verify that the top 32bits of the user
  1737. data are set to zero. */
  1738. if ((u64)(unsigned long)a.desc_user_addr != a.desc_user_addr ||
  1739. (u64)(unsigned long)a.used_user_addr != a.used_user_addr ||
  1740. (u64)(unsigned long)a.avail_user_addr != a.avail_user_addr)
  1741. return -EFAULT;
  1742. /* Make sure it's safe to cast pointers to vring types. */
  1743. BUILD_BUG_ON(__alignof__ *vq->avail > VRING_AVAIL_ALIGN_SIZE);
  1744. BUILD_BUG_ON(__alignof__ *vq->used > VRING_USED_ALIGN_SIZE);
  1745. if ((a.avail_user_addr & (VRING_AVAIL_ALIGN_SIZE - 1)) ||
  1746. (a.used_user_addr & (VRING_USED_ALIGN_SIZE - 1)) ||
  1747. (a.log_guest_addr & (VRING_USED_ALIGN_SIZE - 1)))
  1748. return -EINVAL;
  1749. /* We only verify access here if backend is configured.
  1750. * If it is not, we don't as size might not have been setup.
  1751. * We will verify when backend is configured. */
  1752. if (vq->private_data) {
  1753. if (!vq_access_ok(vq, vq->num,
  1754. (void __user *)(unsigned long)a.desc_user_addr,
  1755. (void __user *)(unsigned long)a.avail_user_addr,
  1756. (void __user *)(unsigned long)a.used_user_addr))
  1757. return -EINVAL;
  1758. /* Also validate log access for used ring if enabled. */
  1759. if (!vq_log_used_access_ok(vq, vq->log_base,
  1760. a.flags & (0x1 << VHOST_VRING_F_LOG),
  1761. a.log_guest_addr))
  1762. return -EINVAL;
  1763. }
  1764. vq->log_used = !!(a.flags & (0x1 << VHOST_VRING_F_LOG));
  1765. vq->desc = (void __user *)(unsigned long)a.desc_user_addr;
  1766. vq->avail = (void __user *)(unsigned long)a.avail_user_addr;
  1767. vq->log_addr = a.log_guest_addr;
  1768. vq->used = (void __user *)(unsigned long)a.used_user_addr;
  1769. return 0;
  1770. }
  1771. static long vhost_vring_set_num_addr(struct vhost_dev *d,
  1772. struct vhost_virtqueue *vq,
  1773. unsigned int ioctl,
  1774. void __user *argp)
  1775. {
  1776. long r;
  1777. mutex_lock(&vq->mutex);
  1778. switch (ioctl) {
  1779. case VHOST_SET_VRING_NUM:
  1780. r = vhost_vring_set_num(d, vq, argp);
  1781. break;
  1782. case VHOST_SET_VRING_ADDR:
  1783. r = vhost_vring_set_addr(d, vq, argp);
  1784. break;
  1785. default:
  1786. BUG();
  1787. }
  1788. mutex_unlock(&vq->mutex);
  1789. return r;
  1790. }
  1791. long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
  1792. {
  1793. struct file *eventfp, *filep = NULL;
  1794. bool pollstart = false, pollstop = false;
  1795. struct eventfd_ctx *ctx = NULL;
  1796. struct vhost_virtqueue *vq;
  1797. struct vhost_vring_state s;
  1798. struct vhost_vring_file f;
  1799. u32 idx;
  1800. long r;
  1801. r = vhost_get_vq_from_user(d, argp, &vq, &idx);
  1802. if (r < 0)
  1803. return r;
  1804. if (ioctl == VHOST_SET_VRING_NUM ||
  1805. ioctl == VHOST_SET_VRING_ADDR) {
  1806. return vhost_vring_set_num_addr(d, vq, ioctl, argp);
  1807. }
  1808. mutex_lock(&vq->mutex);
  1809. switch (ioctl) {
  1810. case VHOST_SET_VRING_BASE:
  1811. /* Moving base with an active backend?
  1812. * You don't want to do that. */
  1813. if (vq->private_data) {
  1814. r = -EBUSY;
  1815. break;
  1816. }
  1817. if (copy_from_user(&s, argp, sizeof s)) {
  1818. r = -EFAULT;
  1819. break;
  1820. }
  1821. if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED)) {
  1822. vq->next_avail_head = vq->last_avail_idx =
  1823. s.num & 0xffff;
  1824. vq->last_used_idx = (s.num >> 16) & 0xffff;
  1825. } else {
  1826. if (s.num > 0xffff) {
  1827. r = -EINVAL;
  1828. break;
  1829. }
  1830. vq->next_avail_head = vq->last_avail_idx = s.num;
  1831. }
  1832. /* Forget the cached index value. */
  1833. vq->avail_idx = vq->last_avail_idx;
  1834. break;
  1835. case VHOST_GET_VRING_BASE:
  1836. s.index = idx;
  1837. if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
  1838. s.num = (u32)vq->last_avail_idx | ((u32)vq->last_used_idx << 16);
  1839. else
  1840. s.num = vq->last_avail_idx;
  1841. if (copy_to_user(argp, &s, sizeof s))
  1842. r = -EFAULT;
  1843. break;
  1844. case VHOST_SET_VRING_KICK:
  1845. if (copy_from_user(&f, argp, sizeof f)) {
  1846. r = -EFAULT;
  1847. break;
  1848. }
  1849. eventfp = f.fd == VHOST_FILE_UNBIND ? NULL : eventfd_fget(f.fd);
  1850. if (IS_ERR(eventfp)) {
  1851. r = PTR_ERR(eventfp);
  1852. break;
  1853. }
  1854. if (eventfp != vq->kick) {
  1855. pollstop = (filep = vq->kick) != NULL;
  1856. pollstart = (vq->kick = eventfp) != NULL;
  1857. } else
  1858. filep = eventfp;
  1859. break;
  1860. case VHOST_SET_VRING_CALL:
  1861. if (copy_from_user(&f, argp, sizeof f)) {
  1862. r = -EFAULT;
  1863. break;
  1864. }
  1865. ctx = f.fd == VHOST_FILE_UNBIND ? NULL : eventfd_ctx_fdget(f.fd);
  1866. if (IS_ERR(ctx)) {
  1867. r = PTR_ERR(ctx);
  1868. break;
  1869. }
  1870. swap(ctx, vq->call_ctx.ctx);
  1871. break;
  1872. case VHOST_SET_VRING_ERR:
  1873. if (copy_from_user(&f, argp, sizeof f)) {
  1874. r = -EFAULT;
  1875. break;
  1876. }
  1877. ctx = f.fd == VHOST_FILE_UNBIND ? NULL : eventfd_ctx_fdget(f.fd);
  1878. if (IS_ERR(ctx)) {
  1879. r = PTR_ERR(ctx);
  1880. break;
  1881. }
  1882. swap(ctx, vq->error_ctx);
  1883. break;
  1884. case VHOST_SET_VRING_ENDIAN:
  1885. r = vhost_set_vring_endian(vq, argp);
  1886. break;
  1887. case VHOST_GET_VRING_ENDIAN:
  1888. r = vhost_get_vring_endian(vq, idx, argp);
  1889. break;
  1890. case VHOST_SET_VRING_BUSYLOOP_TIMEOUT:
  1891. if (copy_from_user(&s, argp, sizeof(s))) {
  1892. r = -EFAULT;
  1893. break;
  1894. }
  1895. vq->busyloop_timeout = s.num;
  1896. break;
  1897. case VHOST_GET_VRING_BUSYLOOP_TIMEOUT:
  1898. s.index = idx;
  1899. s.num = vq->busyloop_timeout;
  1900. if (copy_to_user(argp, &s, sizeof(s)))
  1901. r = -EFAULT;
  1902. break;
  1903. default:
  1904. r = -ENOIOCTLCMD;
  1905. }
  1906. if (pollstop && vq->handle_kick)
  1907. vhost_poll_stop(&vq->poll);
  1908. if (!IS_ERR_OR_NULL(ctx))
  1909. eventfd_ctx_put(ctx);
  1910. if (filep)
  1911. fput(filep);
  1912. if (pollstart && vq->handle_kick)
  1913. r = vhost_poll_start(&vq->poll, vq->kick);
  1914. mutex_unlock(&vq->mutex);
  1915. if (pollstop && vq->handle_kick)
  1916. vhost_dev_flush(vq->poll.dev);
  1917. return r;
  1918. }
  1919. EXPORT_SYMBOL_GPL(vhost_vring_ioctl);
  1920. int vhost_init_device_iotlb(struct vhost_dev *d)
  1921. {
  1922. struct vhost_iotlb *niotlb, *oiotlb;
  1923. int i;
  1924. niotlb = iotlb_alloc();
  1925. if (!niotlb)
  1926. return -ENOMEM;
  1927. oiotlb = d->iotlb;
  1928. d->iotlb = niotlb;
  1929. for (i = 0; i < d->nvqs; ++i) {
  1930. struct vhost_virtqueue *vq = d->vqs[i];
  1931. mutex_lock(&vq->mutex);
  1932. vq->iotlb = niotlb;
  1933. __vhost_vq_meta_reset(vq);
  1934. mutex_unlock(&vq->mutex);
  1935. }
  1936. vhost_iotlb_free(oiotlb);
  1937. return 0;
  1938. }
  1939. EXPORT_SYMBOL_GPL(vhost_init_device_iotlb);
  1940. /* Caller must have device mutex */
  1941. long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
  1942. {
  1943. struct eventfd_ctx *ctx;
  1944. u64 p;
  1945. long r;
  1946. int i, fd;
  1947. /* If you are not the owner, you can become one */
  1948. if (ioctl == VHOST_SET_OWNER) {
  1949. r = vhost_dev_set_owner(d);
  1950. goto done;
  1951. }
  1952. #ifdef CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL
  1953. if (ioctl == VHOST_SET_FORK_FROM_OWNER) {
  1954. /* Only allow modification before owner is set */
  1955. if (vhost_dev_has_owner(d)) {
  1956. r = -EBUSY;
  1957. goto done;
  1958. }
  1959. u8 fork_owner_val;
  1960. if (get_user(fork_owner_val, (u8 __user *)argp)) {
  1961. r = -EFAULT;
  1962. goto done;
  1963. }
  1964. if (fork_owner_val != VHOST_FORK_OWNER_TASK &&
  1965. fork_owner_val != VHOST_FORK_OWNER_KTHREAD) {
  1966. r = -EINVAL;
  1967. goto done;
  1968. }
  1969. d->fork_owner = !!fork_owner_val;
  1970. r = 0;
  1971. goto done;
  1972. }
  1973. if (ioctl == VHOST_GET_FORK_FROM_OWNER) {
  1974. u8 fork_owner_val = d->fork_owner;
  1975. if (fork_owner_val != VHOST_FORK_OWNER_TASK &&
  1976. fork_owner_val != VHOST_FORK_OWNER_KTHREAD) {
  1977. r = -EINVAL;
  1978. goto done;
  1979. }
  1980. if (put_user(fork_owner_val, (u8 __user *)argp)) {
  1981. r = -EFAULT;
  1982. goto done;
  1983. }
  1984. r = 0;
  1985. goto done;
  1986. }
  1987. #endif
  1988. /* You must be the owner to do anything else */
  1989. r = vhost_dev_check_owner(d);
  1990. if (r)
  1991. goto done;
  1992. switch (ioctl) {
  1993. case VHOST_SET_MEM_TABLE:
  1994. r = vhost_set_memory(d, argp);
  1995. break;
  1996. case VHOST_SET_LOG_BASE:
  1997. if (copy_from_user(&p, argp, sizeof p)) {
  1998. r = -EFAULT;
  1999. break;
  2000. }
  2001. if ((u64)(unsigned long)p != p) {
  2002. r = -EFAULT;
  2003. break;
  2004. }
  2005. for (i = 0; i < d->nvqs; ++i) {
  2006. struct vhost_virtqueue *vq;
  2007. void __user *base = (void __user *)(unsigned long)p;
  2008. vq = d->vqs[i];
  2009. mutex_lock(&vq->mutex);
  2010. /* If ring is inactive, will check when it's enabled. */
  2011. if (vq->private_data && !vq_log_access_ok(vq, base))
  2012. r = -EFAULT;
  2013. else
  2014. vq->log_base = base;
  2015. mutex_unlock(&vq->mutex);
  2016. }
  2017. break;
  2018. case VHOST_SET_LOG_FD:
  2019. r = get_user(fd, (int __user *)argp);
  2020. if (r < 0)
  2021. break;
  2022. ctx = fd == VHOST_FILE_UNBIND ? NULL : eventfd_ctx_fdget(fd);
  2023. if (IS_ERR(ctx)) {
  2024. r = PTR_ERR(ctx);
  2025. break;
  2026. }
  2027. swap(ctx, d->log_ctx);
  2028. for (i = 0; i < d->nvqs; ++i) {
  2029. mutex_lock(&d->vqs[i]->mutex);
  2030. d->vqs[i]->log_ctx = d->log_ctx;
  2031. mutex_unlock(&d->vqs[i]->mutex);
  2032. }
  2033. if (ctx)
  2034. eventfd_ctx_put(ctx);
  2035. break;
  2036. default:
  2037. r = -ENOIOCTLCMD;
  2038. break;
  2039. }
  2040. done:
  2041. return r;
  2042. }
  2043. EXPORT_SYMBOL_GPL(vhost_dev_ioctl);
  2044. /* TODO: This is really inefficient. We need something like get_user()
  2045. * (instruction directly accesses the data, with an exception table entry
  2046. * returning -EFAULT). See Documentation/arch/x86/exception-tables.rst.
  2047. */
  2048. static int set_bit_to_user(int nr, void __user *addr)
  2049. {
  2050. unsigned long log = (unsigned long)addr;
  2051. struct page *page;
  2052. void *base;
  2053. int bit = nr + (log % PAGE_SIZE) * 8;
  2054. int r;
  2055. r = pin_user_pages_fast(log, 1, FOLL_WRITE, &page);
  2056. if (r < 0)
  2057. return r;
  2058. BUG_ON(r != 1);
  2059. base = kmap_atomic(page);
  2060. set_bit(bit, base);
  2061. kunmap_atomic(base);
  2062. unpin_user_pages_dirty_lock(&page, 1, true);
  2063. return 0;
  2064. }
  2065. static int log_write(void __user *log_base,
  2066. u64 write_address, u64 write_length)
  2067. {
  2068. u64 write_page = write_address / VHOST_PAGE_SIZE;
  2069. int r;
  2070. if (!write_length)
  2071. return 0;
  2072. write_length += write_address % VHOST_PAGE_SIZE;
  2073. for (;;) {
  2074. u64 base = (u64)(unsigned long)log_base;
  2075. u64 log = base + write_page / 8;
  2076. int bit = write_page % 8;
  2077. if ((u64)(unsigned long)log != log)
  2078. return -EFAULT;
  2079. r = set_bit_to_user(bit, (void __user *)(unsigned long)log);
  2080. if (r < 0)
  2081. return r;
  2082. if (write_length <= VHOST_PAGE_SIZE)
  2083. break;
  2084. write_length -= VHOST_PAGE_SIZE;
  2085. write_page += 1;
  2086. }
  2087. return r;
  2088. }
  2089. static int log_write_hva(struct vhost_virtqueue *vq, u64 hva, u64 len)
  2090. {
  2091. struct vhost_iotlb *umem = vq->umem;
  2092. struct vhost_iotlb_map *u;
  2093. u64 start, end, l, min;
  2094. int r;
  2095. bool hit = false;
  2096. while (len) {
  2097. min = len;
  2098. /* More than one GPAs can be mapped into a single HVA. So
  2099. * iterate all possible umems here to be safe.
  2100. */
  2101. list_for_each_entry(u, &umem->list, link) {
  2102. if (u->addr > hva - 1 + len ||
  2103. u->addr - 1 + u->size < hva)
  2104. continue;
  2105. start = max(u->addr, hva);
  2106. end = min(u->addr - 1 + u->size, hva - 1 + len);
  2107. l = end - start + 1;
  2108. r = log_write(vq->log_base,
  2109. u->start + start - u->addr,
  2110. l);
  2111. if (r < 0)
  2112. return r;
  2113. hit = true;
  2114. min = min(l, min);
  2115. }
  2116. if (!hit)
  2117. return -EFAULT;
  2118. len -= min;
  2119. hva += min;
  2120. }
  2121. return 0;
  2122. }
  2123. static int log_used(struct vhost_virtqueue *vq, u64 used_offset, u64 len)
  2124. {
  2125. struct iovec *iov = vq->log_iov;
  2126. int i, ret;
  2127. if (!vq->iotlb)
  2128. return log_write(vq->log_base, vq->log_addr + used_offset, len);
  2129. ret = translate_desc(vq, (uintptr_t)vq->used + used_offset,
  2130. len, iov, 64, VHOST_ACCESS_WO);
  2131. if (ret < 0)
  2132. return ret;
  2133. for (i = 0; i < ret; i++) {
  2134. ret = log_write_hva(vq, (uintptr_t)iov[i].iov_base,
  2135. iov[i].iov_len);
  2136. if (ret)
  2137. return ret;
  2138. }
  2139. return 0;
  2140. }
  2141. /*
  2142. * vhost_log_write() - Log in dirty page bitmap
  2143. * @vq: vhost virtqueue.
  2144. * @log: Array of dirty memory in GPA.
  2145. * @log_num: Size of vhost_log arrary.
  2146. * @len: The total length of memory buffer to log in the dirty bitmap.
  2147. * Some drivers may only partially use pages shared via the last
  2148. * vring descriptor (i.e. vhost-net RX buffer).
  2149. * Use (len == U64_MAX) to indicate the driver would log all
  2150. * pages of vring descriptors.
  2151. * @iov: Array of dirty memory in HVA.
  2152. * @count: Size of iovec array.
  2153. */
  2154. int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
  2155. unsigned int log_num, u64 len, struct iovec *iov, int count)
  2156. {
  2157. int i, r;
  2158. /* Make sure data written is seen before log. */
  2159. smp_wmb();
  2160. if (vq->iotlb) {
  2161. for (i = 0; i < count; i++) {
  2162. r = log_write_hva(vq, (uintptr_t)iov[i].iov_base,
  2163. iov[i].iov_len);
  2164. if (r < 0)
  2165. return r;
  2166. }
  2167. return 0;
  2168. }
  2169. for (i = 0; i < log_num; ++i) {
  2170. u64 l = min(log[i].len, len);
  2171. r = log_write(vq->log_base, log[i].addr, l);
  2172. if (r < 0)
  2173. return r;
  2174. if (len != U64_MAX)
  2175. len -= l;
  2176. }
  2177. if (vq->log_ctx)
  2178. eventfd_signal(vq->log_ctx);
  2179. return 0;
  2180. }
  2181. EXPORT_SYMBOL_GPL(vhost_log_write);
  2182. static int vhost_update_used_flags(struct vhost_virtqueue *vq)
  2183. {
  2184. void __user *used;
  2185. if (vhost_put_used_flags(vq))
  2186. return -EFAULT;
  2187. if (unlikely(vq->log_used)) {
  2188. /* Make sure the flag is seen before log. */
  2189. smp_wmb();
  2190. /* Log used flag write. */
  2191. used = &vq->used->flags;
  2192. log_used(vq, (used - (void __user *)vq->used),
  2193. sizeof vq->used->flags);
  2194. if (vq->log_ctx)
  2195. eventfd_signal(vq->log_ctx);
  2196. }
  2197. return 0;
  2198. }
  2199. static int vhost_update_avail_event(struct vhost_virtqueue *vq)
  2200. {
  2201. if (vhost_put_avail_event(vq))
  2202. return -EFAULT;
  2203. if (unlikely(vq->log_used)) {
  2204. void __user *used;
  2205. /* Make sure the event is seen before log. */
  2206. smp_wmb();
  2207. /* Log avail event write */
  2208. used = vhost_avail_event(vq);
  2209. log_used(vq, (used - (void __user *)vq->used),
  2210. sizeof *vhost_avail_event(vq));
  2211. if (vq->log_ctx)
  2212. eventfd_signal(vq->log_ctx);
  2213. }
  2214. return 0;
  2215. }
  2216. int vhost_vq_init_access(struct vhost_virtqueue *vq)
  2217. {
  2218. __virtio16 last_used_idx;
  2219. int r;
  2220. bool is_le = vq->is_le;
  2221. if (!vq->private_data)
  2222. return 0;
  2223. vhost_init_is_le(vq);
  2224. r = vhost_update_used_flags(vq);
  2225. if (r)
  2226. goto err;
  2227. vq->signalled_used_valid = false;
  2228. if (!vq->iotlb &&
  2229. !access_ok(&vq->used->idx, sizeof vq->used->idx)) {
  2230. r = -EFAULT;
  2231. goto err;
  2232. }
  2233. r = vhost_get_used_idx(vq, &last_used_idx);
  2234. if (r) {
  2235. vq_err(vq, "Can't access used idx at %p\n",
  2236. &vq->used->idx);
  2237. goto err;
  2238. }
  2239. vq->last_used_idx = vhost16_to_cpu(vq, last_used_idx);
  2240. return 0;
  2241. err:
  2242. vq->is_le = is_le;
  2243. return r;
  2244. }
  2245. EXPORT_SYMBOL_GPL(vhost_vq_init_access);
  2246. static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
  2247. struct iovec iov[], int iov_size, int access)
  2248. {
  2249. const struct vhost_iotlb_map *map;
  2250. struct vhost_dev *dev = vq->dev;
  2251. struct vhost_iotlb *umem = dev->iotlb ? dev->iotlb : dev->umem;
  2252. struct iovec *_iov;
  2253. u64 s = 0, last = addr + len - 1;
  2254. int ret = 0;
  2255. while ((u64)len > s) {
  2256. u64 size;
  2257. if (unlikely(ret >= iov_size)) {
  2258. ret = -ENOBUFS;
  2259. break;
  2260. }
  2261. map = vhost_iotlb_itree_first(umem, addr, last);
  2262. if (map == NULL || map->start > addr) {
  2263. if (umem != dev->iotlb) {
  2264. ret = -EFAULT;
  2265. break;
  2266. }
  2267. ret = -EAGAIN;
  2268. break;
  2269. } else if (!(map->perm & access)) {
  2270. ret = -EPERM;
  2271. break;
  2272. }
  2273. _iov = iov + ret;
  2274. size = map->size - addr + map->start;
  2275. _iov->iov_len = min((u64)len - s, size);
  2276. _iov->iov_base = (void __user *)(unsigned long)
  2277. (map->addr + addr - map->start);
  2278. s += size;
  2279. addr += size;
  2280. ++ret;
  2281. }
  2282. if (ret == -EAGAIN)
  2283. vhost_iotlb_miss(vq, addr, access);
  2284. return ret;
  2285. }
  2286. /* Each buffer in the virtqueues is actually a chain of descriptors. This
  2287. * function returns the next descriptor in the chain,
  2288. * or -1U if we're at the end. */
  2289. static unsigned next_desc(struct vhost_virtqueue *vq, struct vring_desc *desc)
  2290. {
  2291. unsigned int next;
  2292. /* If this descriptor says it doesn't chain, we're done. */
  2293. if (!(desc->flags & cpu_to_vhost16(vq, VRING_DESC_F_NEXT)))
  2294. return -1U;
  2295. /* Check they're not leading us off end of descriptors. */
  2296. next = vhost16_to_cpu(vq, READ_ONCE(desc->next));
  2297. return next;
  2298. }
  2299. static int get_indirect(struct vhost_virtqueue *vq,
  2300. struct iovec iov[], unsigned int iov_size,
  2301. unsigned int *out_num, unsigned int *in_num,
  2302. struct vhost_log *log, unsigned int *log_num,
  2303. struct vring_desc *indirect)
  2304. {
  2305. struct vring_desc desc;
  2306. unsigned int i = 0, count, found = 0;
  2307. u32 len = vhost32_to_cpu(vq, indirect->len);
  2308. struct iov_iter from;
  2309. int ret, access;
  2310. /* Sanity check */
  2311. if (unlikely(len % sizeof desc)) {
  2312. vq_err(vq, "Invalid length in indirect descriptor: "
  2313. "len 0x%llx not multiple of 0x%zx\n",
  2314. (unsigned long long)len,
  2315. sizeof desc);
  2316. return -EINVAL;
  2317. }
  2318. ret = translate_desc(vq, vhost64_to_cpu(vq, indirect->addr), len, vq->indirect,
  2319. UIO_MAXIOV, VHOST_ACCESS_RO);
  2320. if (unlikely(ret < 0)) {
  2321. if (ret != -EAGAIN)
  2322. vq_err(vq, "Translation failure %d in indirect.\n", ret);
  2323. return ret;
  2324. }
  2325. iov_iter_init(&from, ITER_SOURCE, vq->indirect, ret, len);
  2326. count = len / sizeof desc;
  2327. /* Buffers are chained via a 16 bit next field, so
  2328. * we can have at most 2^16 of these. */
  2329. if (unlikely(count > USHRT_MAX + 1)) {
  2330. vq_err(vq, "Indirect buffer length too big: %d\n",
  2331. indirect->len);
  2332. return -E2BIG;
  2333. }
  2334. do {
  2335. unsigned iov_count = *in_num + *out_num;
  2336. if (unlikely(++found > count)) {
  2337. vq_err(vq, "Loop detected: last one at %u "
  2338. "indirect size %u\n",
  2339. i, count);
  2340. return -EINVAL;
  2341. }
  2342. if (unlikely(!copy_from_iter_full(&desc, sizeof(desc), &from))) {
  2343. vq_err(vq, "Failed indirect descriptor: idx %d, %zx\n",
  2344. i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc);
  2345. return -EINVAL;
  2346. }
  2347. if (unlikely(desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT))) {
  2348. vq_err(vq, "Nested indirect descriptor: idx %d, %zx\n",
  2349. i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc);
  2350. return -EINVAL;
  2351. }
  2352. if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE))
  2353. access = VHOST_ACCESS_WO;
  2354. else
  2355. access = VHOST_ACCESS_RO;
  2356. ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
  2357. vhost32_to_cpu(vq, desc.len), iov + iov_count,
  2358. iov_size - iov_count, access);
  2359. if (unlikely(ret < 0)) {
  2360. if (ret != -EAGAIN)
  2361. vq_err(vq, "Translation failure %d indirect idx %d\n",
  2362. ret, i);
  2363. return ret;
  2364. }
  2365. /* If this is an input descriptor, increment that count. */
  2366. if (access == VHOST_ACCESS_WO) {
  2367. *in_num += ret;
  2368. if (unlikely(log && ret)) {
  2369. log[*log_num].addr = vhost64_to_cpu(vq, desc.addr);
  2370. log[*log_num].len = vhost32_to_cpu(vq, desc.len);
  2371. ++*log_num;
  2372. }
  2373. } else {
  2374. /* If it's an output descriptor, they're all supposed
  2375. * to come before any input descriptors. */
  2376. if (unlikely(*in_num)) {
  2377. vq_err(vq, "Indirect descriptor "
  2378. "has out after in: idx %d\n", i);
  2379. return -EINVAL;
  2380. }
  2381. *out_num += ret;
  2382. }
  2383. } while ((i = next_desc(vq, &desc)) != -1);
  2384. return 0;
  2385. }
  2386. /**
  2387. * vhost_get_vq_desc_n - Fetch the next available descriptor chain and build iovecs
  2388. * @vq: target virtqueue
  2389. * @iov: array that receives the scatter/gather segments
  2390. * @iov_size: capacity of @iov in elements
  2391. * @out_num: the number of output segments
  2392. * @in_num: the number of input segments
  2393. * @log: optional array to record addr/len for each writable segment; NULL if unused
  2394. * @log_num: optional output; number of entries written to @log when provided
  2395. * @ndesc: optional output; number of descriptors consumed from the available ring
  2396. * (useful for rollback via vhost_discard_vq_desc)
  2397. *
  2398. * Extracts one available descriptor chain from @vq and translates guest addresses
  2399. * into host iovecs.
  2400. *
  2401. * On success, advances @vq->last_avail_idx by 1 and @vq->next_avail_head by the
  2402. * number of descriptors consumed (also stored via @ndesc when non-NULL).
  2403. *
  2404. * Return:
  2405. * - head index in [0, @vq->num) on success;
  2406. * - @vq->num if no descriptor is currently available;
  2407. * - negative errno on failure
  2408. */
  2409. int vhost_get_vq_desc_n(struct vhost_virtqueue *vq,
  2410. struct iovec iov[], unsigned int iov_size,
  2411. unsigned int *out_num, unsigned int *in_num,
  2412. struct vhost_log *log, unsigned int *log_num,
  2413. unsigned int *ndesc)
  2414. {
  2415. bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
  2416. struct vring_desc desc;
  2417. unsigned int i, head, found = 0;
  2418. u16 last_avail_idx = vq->last_avail_idx;
  2419. __virtio16 ring_head;
  2420. int ret, access, c = 0;
  2421. if (vq->avail_idx == vq->last_avail_idx) {
  2422. ret = vhost_get_avail_idx(vq);
  2423. if (unlikely(ret < 0))
  2424. return ret;
  2425. if (!ret)
  2426. return vq->num;
  2427. }
  2428. if (in_order)
  2429. head = vq->next_avail_head & (vq->num - 1);
  2430. else {
  2431. /* Grab the next descriptor number they're
  2432. * advertising, and increment the index we've seen. */
  2433. if (unlikely(vhost_get_avail_head(vq, &ring_head,
  2434. last_avail_idx))) {
  2435. vq_err(vq, "Failed to read head: idx %d address %p\n",
  2436. last_avail_idx,
  2437. &vq->avail->ring[last_avail_idx % vq->num]);
  2438. return -EFAULT;
  2439. }
  2440. head = vhost16_to_cpu(vq, ring_head);
  2441. }
  2442. /* If their number is silly, that's an error. */
  2443. if (unlikely(head >= vq->num)) {
  2444. vq_err(vq, "Guest says index %u > %u is available",
  2445. head, vq->num);
  2446. return -EINVAL;
  2447. }
  2448. /* When we start there are none of either input nor output. */
  2449. *out_num = *in_num = 0;
  2450. if (unlikely(log))
  2451. *log_num = 0;
  2452. i = head;
  2453. do {
  2454. unsigned iov_count = *in_num + *out_num;
  2455. if (unlikely(i >= vq->num)) {
  2456. vq_err(vq, "Desc index is %u > %u, head = %u",
  2457. i, vq->num, head);
  2458. return -EINVAL;
  2459. }
  2460. if (unlikely(++found > vq->num)) {
  2461. vq_err(vq, "Loop detected: last one at %u "
  2462. "vq size %u head %u\n",
  2463. i, vq->num, head);
  2464. return -EINVAL;
  2465. }
  2466. ret = vhost_get_desc(vq, &desc, i);
  2467. if (unlikely(ret)) {
  2468. vq_err(vq, "Failed to get descriptor: idx %d addr %p\n",
  2469. i, vq->desc + i);
  2470. return -EFAULT;
  2471. }
  2472. if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT)) {
  2473. ret = get_indirect(vq, iov, iov_size,
  2474. out_num, in_num,
  2475. log, log_num, &desc);
  2476. if (unlikely(ret < 0)) {
  2477. if (ret != -EAGAIN)
  2478. vq_err(vq, "Failure detected "
  2479. "in indirect descriptor at idx %d\n", i);
  2480. return ret;
  2481. }
  2482. ++c;
  2483. continue;
  2484. }
  2485. if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE))
  2486. access = VHOST_ACCESS_WO;
  2487. else
  2488. access = VHOST_ACCESS_RO;
  2489. ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
  2490. vhost32_to_cpu(vq, desc.len), iov + iov_count,
  2491. iov_size - iov_count, access);
  2492. if (unlikely(ret < 0)) {
  2493. if (ret != -EAGAIN)
  2494. vq_err(vq, "Translation failure %d descriptor idx %d\n",
  2495. ret, i);
  2496. return ret;
  2497. }
  2498. if (access == VHOST_ACCESS_WO) {
  2499. /* If this is an input descriptor,
  2500. * increment that count. */
  2501. *in_num += ret;
  2502. if (unlikely(log && ret)) {
  2503. log[*log_num].addr = vhost64_to_cpu(vq, desc.addr);
  2504. log[*log_num].len = vhost32_to_cpu(vq, desc.len);
  2505. ++*log_num;
  2506. }
  2507. } else {
  2508. /* If it's an output descriptor, they're all supposed
  2509. * to come before any input descriptors. */
  2510. if (unlikely(*in_num)) {
  2511. vq_err(vq, "Descriptor has out after in: "
  2512. "idx %d\n", i);
  2513. return -EINVAL;
  2514. }
  2515. *out_num += ret;
  2516. }
  2517. ++c;
  2518. } while ((i = next_desc(vq, &desc)) != -1);
  2519. /* On success, increment avail index. */
  2520. vq->last_avail_idx++;
  2521. vq->next_avail_head += c;
  2522. if (ndesc)
  2523. *ndesc = c;
  2524. /* Assume notifications from guest are disabled at this point,
  2525. * if they aren't we would need to update avail_event index. */
  2526. BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
  2527. return head;
  2528. }
  2529. EXPORT_SYMBOL_GPL(vhost_get_vq_desc_n);
  2530. /* This looks in the virtqueue and for the first available buffer, and converts
  2531. * it to an iovec for convenient access. Since descriptors consist of some
  2532. * number of output then some number of input descriptors, it's actually two
  2533. * iovecs, but we pack them into one and note how many of each there were.
  2534. *
  2535. * This function returns the descriptor number found, or vq->num (which is
  2536. * never a valid descriptor number) if none was found. A negative code is
  2537. * returned on error.
  2538. */
  2539. int vhost_get_vq_desc(struct vhost_virtqueue *vq,
  2540. struct iovec iov[], unsigned int iov_size,
  2541. unsigned int *out_num, unsigned int *in_num,
  2542. struct vhost_log *log, unsigned int *log_num)
  2543. {
  2544. return vhost_get_vq_desc_n(vq, iov, iov_size, out_num, in_num,
  2545. log, log_num, NULL);
  2546. }
  2547. EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
  2548. /**
  2549. * vhost_discard_vq_desc - Reverse the effect of vhost_get_vq_desc_n()
  2550. * @vq: target virtqueue
  2551. * @nbufs: number of buffers to roll back
  2552. * @ndesc: number of descriptors to roll back
  2553. *
  2554. * Rewinds the internal consumer cursors after a failed attempt to use buffers
  2555. * returned by vhost_get_vq_desc_n().
  2556. */
  2557. void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int nbufs,
  2558. unsigned int ndesc)
  2559. {
  2560. vq->next_avail_head -= ndesc;
  2561. vq->last_avail_idx -= nbufs;
  2562. }
  2563. EXPORT_SYMBOL_GPL(vhost_discard_vq_desc);
  2564. /* After we've used one of their buffers, we tell them about it. We'll then
  2565. * want to notify the guest, using eventfd. */
  2566. int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
  2567. {
  2568. struct vring_used_elem heads = {
  2569. cpu_to_vhost32(vq, head),
  2570. cpu_to_vhost32(vq, len)
  2571. };
  2572. u16 nheads = 1;
  2573. return vhost_add_used_n(vq, &heads, &nheads, 1);
  2574. }
  2575. EXPORT_SYMBOL_GPL(vhost_add_used);
  2576. static int __vhost_add_used_n(struct vhost_virtqueue *vq,
  2577. struct vring_used_elem *heads,
  2578. unsigned count)
  2579. {
  2580. vring_used_elem_t __user *used;
  2581. u16 old, new;
  2582. int start;
  2583. start = vq->last_used_idx & (vq->num - 1);
  2584. used = vq->used->ring + start;
  2585. if (vhost_put_used(vq, heads, start, count)) {
  2586. vq_err(vq, "Failed to write used");
  2587. return -EFAULT;
  2588. }
  2589. if (unlikely(vq->log_used)) {
  2590. /* Make sure data is seen before log. */
  2591. smp_wmb();
  2592. /* Log used ring entry write. */
  2593. log_used(vq, ((void __user *)used - (void __user *)vq->used),
  2594. count * sizeof *used);
  2595. }
  2596. old = vq->last_used_idx;
  2597. new = (vq->last_used_idx += count);
  2598. /* If the driver never bothers to signal in a very long while,
  2599. * used index might wrap around. If that happens, invalidate
  2600. * signalled_used index we stored. TODO: make sure driver
  2601. * signals at least once in 2^16 and remove this. */
  2602. if (unlikely((u16)(new - vq->signalled_used) < (u16)(new - old)))
  2603. vq->signalled_used_valid = false;
  2604. return 0;
  2605. }
  2606. static int vhost_add_used_n_ooo(struct vhost_virtqueue *vq,
  2607. struct vring_used_elem *heads,
  2608. unsigned count)
  2609. {
  2610. int start, n, r;
  2611. start = vq->last_used_idx & (vq->num - 1);
  2612. n = vq->num - start;
  2613. if (n < count) {
  2614. r = __vhost_add_used_n(vq, heads, n);
  2615. if (r < 0)
  2616. return r;
  2617. heads += n;
  2618. count -= n;
  2619. }
  2620. return __vhost_add_used_n(vq, heads, count);
  2621. }
  2622. static int vhost_add_used_n_in_order(struct vhost_virtqueue *vq,
  2623. struct vring_used_elem *heads,
  2624. const u16 *nheads,
  2625. unsigned count)
  2626. {
  2627. vring_used_elem_t __user *used;
  2628. u16 old, new = vq->last_used_idx;
  2629. int start, i;
  2630. if (!nheads)
  2631. return -EINVAL;
  2632. start = vq->last_used_idx & (vq->num - 1);
  2633. used = vq->used->ring + start;
  2634. for (i = 0; i < count; i++) {
  2635. if (vhost_put_used(vq, &heads[i], start, 1)) {
  2636. vq_err(vq, "Failed to write used");
  2637. return -EFAULT;
  2638. }
  2639. start += nheads[i];
  2640. new += nheads[i];
  2641. if (start >= vq->num)
  2642. start -= vq->num;
  2643. }
  2644. if (unlikely(vq->log_used)) {
  2645. /* Make sure data is seen before log. */
  2646. smp_wmb();
  2647. /* Log used ring entry write. */
  2648. log_used(vq, ((void __user *)used - (void __user *)vq->used),
  2649. (vq->num - start) * sizeof *used);
  2650. if (start + count > vq->num)
  2651. log_used(vq, 0,
  2652. (start + count - vq->num) * sizeof *used);
  2653. }
  2654. old = vq->last_used_idx;
  2655. vq->last_used_idx = new;
  2656. /* If the driver never bothers to signal in a very long while,
  2657. * used index might wrap around. If that happens, invalidate
  2658. * signalled_used index we stored. TODO: make sure driver
  2659. * signals at least once in 2^16 and remove this. */
  2660. if (unlikely((u16)(new - vq->signalled_used) < (u16)(new - old)))
  2661. vq->signalled_used_valid = false;
  2662. return 0;
  2663. }
  2664. /* After we've used one of their buffers, we tell them about it. We'll then
  2665. * want to notify the guest, using eventfd. */
  2666. int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
  2667. u16 *nheads, unsigned count)
  2668. {
  2669. bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
  2670. int r;
  2671. if (!in_order || !nheads)
  2672. r = vhost_add_used_n_ooo(vq, heads, count);
  2673. else
  2674. r = vhost_add_used_n_in_order(vq, heads, nheads, count);
  2675. if (r < 0)
  2676. return r;
  2677. /* Make sure buffer is written before we update index. */
  2678. smp_wmb();
  2679. if (vhost_put_used_idx(vq)) {
  2680. vq_err(vq, "Failed to increment used idx");
  2681. return -EFAULT;
  2682. }
  2683. if (unlikely(vq->log_used)) {
  2684. /* Make sure used idx is seen before log. */
  2685. smp_wmb();
  2686. /* Log used index update. */
  2687. log_used(vq, offsetof(struct vring_used, idx),
  2688. sizeof vq->used->idx);
  2689. if (vq->log_ctx)
  2690. eventfd_signal(vq->log_ctx);
  2691. }
  2692. return r;
  2693. }
  2694. EXPORT_SYMBOL_GPL(vhost_add_used_n);
  2695. static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
  2696. {
  2697. __u16 old, new;
  2698. __virtio16 event;
  2699. bool v;
  2700. /* Flush out used index updates. This is paired
  2701. * with the barrier that the Guest executes when enabling
  2702. * interrupts. */
  2703. smp_mb();
  2704. if (vhost_has_feature(vq, VIRTIO_F_NOTIFY_ON_EMPTY) &&
  2705. unlikely(vq->avail_idx == vq->last_avail_idx))
  2706. return true;
  2707. if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) {
  2708. __virtio16 flags;
  2709. if (vhost_get_avail_flags(vq, &flags)) {
  2710. vq_err(vq, "Failed to get flags");
  2711. return true;
  2712. }
  2713. return !(flags & cpu_to_vhost16(vq, VRING_AVAIL_F_NO_INTERRUPT));
  2714. }
  2715. old = vq->signalled_used;
  2716. v = vq->signalled_used_valid;
  2717. new = vq->signalled_used = vq->last_used_idx;
  2718. vq->signalled_used_valid = true;
  2719. if (unlikely(!v))
  2720. return true;
  2721. if (vhost_get_used_event(vq, &event)) {
  2722. vq_err(vq, "Failed to get used event idx");
  2723. return true;
  2724. }
  2725. return vring_need_event(vhost16_to_cpu(vq, event), new, old);
  2726. }
  2727. /* This actually signals the guest, using eventfd. */
  2728. void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
  2729. {
  2730. /* Signal the Guest tell them we used something up. */
  2731. if (vq->call_ctx.ctx && vhost_notify(dev, vq))
  2732. eventfd_signal(vq->call_ctx.ctx);
  2733. }
  2734. EXPORT_SYMBOL_GPL(vhost_signal);
  2735. /* And here's the combo meal deal. Supersize me! */
  2736. void vhost_add_used_and_signal(struct vhost_dev *dev,
  2737. struct vhost_virtqueue *vq,
  2738. unsigned int head, int len)
  2739. {
  2740. vhost_add_used(vq, head, len);
  2741. vhost_signal(dev, vq);
  2742. }
  2743. EXPORT_SYMBOL_GPL(vhost_add_used_and_signal);
  2744. /* multi-buffer version of vhost_add_used_and_signal */
  2745. void vhost_add_used_and_signal_n(struct vhost_dev *dev,
  2746. struct vhost_virtqueue *vq,
  2747. struct vring_used_elem *heads,
  2748. u16 *nheads,
  2749. unsigned count)
  2750. {
  2751. vhost_add_used_n(vq, heads, nheads, count);
  2752. vhost_signal(dev, vq);
  2753. }
  2754. EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n);
  2755. /* return true if we're sure that available ring is empty */
  2756. bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
  2757. {
  2758. int r;
  2759. if (vq->avail_idx != vq->last_avail_idx)
  2760. return false;
  2761. r = vhost_get_avail_idx(vq);
  2762. /* Note: we treat error as non-empty here */
  2763. return r == 0;
  2764. }
  2765. EXPORT_SYMBOL_GPL(vhost_vq_avail_empty);
  2766. /* OK, now we need to know about added descriptors. */
  2767. bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
  2768. {
  2769. int r;
  2770. if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY))
  2771. return false;
  2772. vq->used_flags &= ~VRING_USED_F_NO_NOTIFY;
  2773. if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) {
  2774. r = vhost_update_used_flags(vq);
  2775. if (r) {
  2776. vq_err(vq, "Failed to enable notification at %p: %d\n",
  2777. &vq->used->flags, r);
  2778. return false;
  2779. }
  2780. } else {
  2781. r = vhost_update_avail_event(vq);
  2782. if (r) {
  2783. vq_err(vq, "Failed to update avail event index at %p: %d\n",
  2784. vhost_avail_event(vq), r);
  2785. return false;
  2786. }
  2787. }
  2788. /* They could have slipped one in as we were doing that: make
  2789. * sure it's written, then check again. */
  2790. smp_mb();
  2791. r = vhost_get_avail_idx(vq);
  2792. /* Note: we treat error as empty here */
  2793. if (unlikely(r < 0))
  2794. return false;
  2795. return r;
  2796. }
  2797. EXPORT_SYMBOL_GPL(vhost_enable_notify);
  2798. /* We don't need to be notified again. */
  2799. void vhost_disable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
  2800. {
  2801. int r;
  2802. if (vq->used_flags & VRING_USED_F_NO_NOTIFY)
  2803. return;
  2804. vq->used_flags |= VRING_USED_F_NO_NOTIFY;
  2805. if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) {
  2806. r = vhost_update_used_flags(vq);
  2807. if (r)
  2808. vq_err(vq, "Failed to disable notification at %p: %d\n",
  2809. &vq->used->flags, r);
  2810. }
  2811. }
  2812. EXPORT_SYMBOL_GPL(vhost_disable_notify);
  2813. /* Create a new message. */
  2814. struct vhost_msg_node *vhost_new_msg(struct vhost_virtqueue *vq, int type)
  2815. {
  2816. /* Make sure all padding within the structure is initialized. */
  2817. struct vhost_msg_node *node = kzalloc_obj(*node);
  2818. if (!node)
  2819. return NULL;
  2820. node->vq = vq;
  2821. node->msg.type = type;
  2822. return node;
  2823. }
  2824. EXPORT_SYMBOL_GPL(vhost_new_msg);
  2825. void vhost_enqueue_msg(struct vhost_dev *dev, struct list_head *head,
  2826. struct vhost_msg_node *node)
  2827. {
  2828. spin_lock(&dev->iotlb_lock);
  2829. list_add_tail(&node->node, head);
  2830. spin_unlock(&dev->iotlb_lock);
  2831. wake_up_interruptible_poll(&dev->wait, EPOLLIN | EPOLLRDNORM);
  2832. }
  2833. EXPORT_SYMBOL_GPL(vhost_enqueue_msg);
  2834. struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev,
  2835. struct list_head *head)
  2836. {
  2837. struct vhost_msg_node *node = NULL;
  2838. spin_lock(&dev->iotlb_lock);
  2839. if (!list_empty(head)) {
  2840. node = list_first_entry(head, struct vhost_msg_node,
  2841. node);
  2842. list_del(&node->node);
  2843. }
  2844. spin_unlock(&dev->iotlb_lock);
  2845. return node;
  2846. }
  2847. EXPORT_SYMBOL_GPL(vhost_dequeue_msg);
  2848. void vhost_set_backend_features(struct vhost_dev *dev, u64 features)
  2849. {
  2850. struct vhost_virtqueue *vq;
  2851. int i;
  2852. mutex_lock(&dev->mutex);
  2853. for (i = 0; i < dev->nvqs; ++i) {
  2854. vq = dev->vqs[i];
  2855. mutex_lock(&vq->mutex);
  2856. vq->acked_backend_features = features;
  2857. mutex_unlock(&vq->mutex);
  2858. }
  2859. mutex_unlock(&dev->mutex);
  2860. }
  2861. EXPORT_SYMBOL_GPL(vhost_set_backend_features);
  2862. static int __init vhost_init(void)
  2863. {
  2864. return 0;
  2865. }
  2866. static void __exit vhost_exit(void)
  2867. {
  2868. }
  2869. module_init(vhost_init);
  2870. module_exit(vhost_exit);
  2871. MODULE_VERSION("0.0.1");
  2872. MODULE_LICENSE("GPL v2");
  2873. MODULE_AUTHOR("Michael S. Tsirkin");
  2874. MODULE_DESCRIPTION("Host kernel accelerator for virtio");