mshv_root_main.c 59 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * Copyright (c) 2024, Microsoft Corporation.
  4. *
  5. * The main part of the mshv_root module, providing APIs to create
  6. * and manage guest partitions.
  7. *
  8. * Authors: Microsoft Linux virtualization team
  9. */
  10. #include <linux/entry-virt.h>
  11. #include <linux/kernel.h>
  12. #include <linux/module.h>
  13. #include <linux/fs.h>
  14. #include <linux/miscdevice.h>
  15. #include <linux/slab.h>
  16. #include <linux/file.h>
  17. #include <linux/anon_inodes.h>
  18. #include <linux/mm.h>
  19. #include <linux/io.h>
  20. #include <linux/cpuhotplug.h>
  21. #include <linux/random.h>
  22. #include <asm/mshyperv.h>
  23. #include <linux/hyperv.h>
  24. #include <linux/notifier.h>
  25. #include <linux/reboot.h>
  26. #include <linux/kexec.h>
  27. #include <linux/page-flags.h>
  28. #include <linux/crash_dump.h>
  29. #include <linux/panic_notifier.h>
  30. #include <linux/vmalloc.h>
  31. #include <linux/rseq.h>
  32. #include "mshv_eventfd.h"
  33. #include "mshv.h"
  34. #include "mshv_root.h"
  35. MODULE_AUTHOR("Microsoft");
  36. MODULE_LICENSE("GPL");
  37. MODULE_DESCRIPTION("Microsoft Hyper-V root partition VMM interface /dev/mshv");
  38. /* HV_THREAD_COUNTER */
  39. #if defined(CONFIG_X86_64)
  40. #define HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED 202
  41. #elif defined(CONFIG_ARM64)
  42. #define HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED 95
  43. #endif
  44. struct mshv_root mshv_root;
  45. enum hv_scheduler_type hv_scheduler_type;
  46. /* Once we implement the fast extended hypercall ABI they can go away. */
  47. static void * __percpu *root_scheduler_input;
  48. static void * __percpu *root_scheduler_output;
  49. static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
  50. static int mshv_dev_open(struct inode *inode, struct file *filp);
  51. static int mshv_dev_release(struct inode *inode, struct file *filp);
  52. static int mshv_vp_release(struct inode *inode, struct file *filp);
  53. static long mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
  54. static int mshv_partition_release(struct inode *inode, struct file *filp);
  55. static long mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
  56. static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma);
  57. static vm_fault_t mshv_vp_fault(struct vm_fault *vmf);
  58. static int mshv_init_async_handler(struct mshv_partition *partition);
  59. static void mshv_async_hvcall_handler(void *data, u64 *status);
  60. static const union hv_input_vtl input_vtl_zero;
  61. static const union hv_input_vtl input_vtl_normal = {
  62. .target_vtl = HV_NORMAL_VTL,
  63. .use_target_vtl = 1,
  64. };
  65. static const struct vm_operations_struct mshv_vp_vm_ops = {
  66. .fault = mshv_vp_fault,
  67. };
  68. static const struct file_operations mshv_vp_fops = {
  69. .owner = THIS_MODULE,
  70. .release = mshv_vp_release,
  71. .unlocked_ioctl = mshv_vp_ioctl,
  72. .llseek = noop_llseek,
  73. .mmap = mshv_vp_mmap,
  74. };
  75. static const struct file_operations mshv_partition_fops = {
  76. .owner = THIS_MODULE,
  77. .release = mshv_partition_release,
  78. .unlocked_ioctl = mshv_partition_ioctl,
  79. .llseek = noop_llseek,
  80. };
  81. static const struct file_operations mshv_dev_fops = {
  82. .owner = THIS_MODULE,
  83. .open = mshv_dev_open,
  84. .release = mshv_dev_release,
  85. .unlocked_ioctl = mshv_dev_ioctl,
  86. .llseek = noop_llseek,
  87. };
  88. static struct miscdevice mshv_dev = {
  89. .minor = MISC_DYNAMIC_MINOR,
  90. .name = "mshv",
  91. .fops = &mshv_dev_fops,
  92. .mode = 0600,
  93. };
  94. /*
  95. * Only allow hypercalls that have a u64 partition id as the first member of
  96. * the input structure.
  97. * These are sorted by value.
  98. */
  99. static u16 mshv_passthru_hvcalls[] = {
  100. HVCALL_GET_PARTITION_PROPERTY,
  101. HVCALL_GET_PARTITION_PROPERTY_EX,
  102. HVCALL_SET_PARTITION_PROPERTY,
  103. HVCALL_INSTALL_INTERCEPT,
  104. HVCALL_GET_VP_REGISTERS,
  105. HVCALL_SET_VP_REGISTERS,
  106. HVCALL_TRANSLATE_VIRTUAL_ADDRESS,
  107. HVCALL_CLEAR_VIRTUAL_INTERRUPT,
  108. HVCALL_REGISTER_INTERCEPT_RESULT,
  109. HVCALL_ASSERT_VIRTUAL_INTERRUPT,
  110. HVCALL_GET_GPA_PAGES_ACCESS_STATES,
  111. HVCALL_SIGNAL_EVENT_DIRECT,
  112. HVCALL_POST_MESSAGE_DIRECT,
  113. HVCALL_GET_VP_CPUID_VALUES,
  114. };
  115. /*
  116. * Only allow hypercalls that are safe to be called by the VMM with the host
  117. * partition as target (i.e. HV_PARTITION_ID_SELF). Carefully audit that a
  118. * hypercall cannot be misused by the VMM before adding it to this list.
  119. */
  120. static u16 mshv_self_passthru_hvcalls[] = {
  121. HVCALL_GET_PARTITION_PROPERTY,
  122. HVCALL_GET_PARTITION_PROPERTY_EX,
  123. };
  124. static bool mshv_hvcall_is_async(u16 code)
  125. {
  126. switch (code) {
  127. case HVCALL_SET_PARTITION_PROPERTY:
  128. return true;
  129. default:
  130. break;
  131. }
  132. return false;
  133. }
  134. static bool mshv_passthru_hvcall_allowed(u16 code, u64 pt_id)
  135. {
  136. int i;
  137. int n = ARRAY_SIZE(mshv_passthru_hvcalls);
  138. u16 *allowed_hvcalls = mshv_passthru_hvcalls;
  139. if (pt_id == HV_PARTITION_ID_SELF) {
  140. n = ARRAY_SIZE(mshv_self_passthru_hvcalls);
  141. allowed_hvcalls = mshv_self_passthru_hvcalls;
  142. }
  143. for (i = 0; i < n; ++i)
  144. if (allowed_hvcalls[i] == code)
  145. return true;
  146. return false;
  147. }
  148. static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition,
  149. bool partition_locked,
  150. void __user *user_args)
  151. {
  152. u64 status;
  153. int ret = 0;
  154. bool is_async;
  155. struct mshv_root_hvcall args;
  156. struct page *page;
  157. unsigned int pages_order;
  158. void *input_pg = NULL;
  159. void *output_pg = NULL;
  160. u16 reps_completed;
  161. u64 pt_id = partition ? partition->pt_id : HV_PARTITION_ID_SELF;
  162. if (copy_from_user(&args, user_args, sizeof(args)))
  163. return -EFAULT;
  164. if (args.status || !args.in_ptr || args.in_sz < sizeof(u64) ||
  165. mshv_field_nonzero(args, rsvd) || args.in_sz > HV_HYP_PAGE_SIZE)
  166. return -EINVAL;
  167. if (args.out_ptr && (!args.out_sz || args.out_sz > HV_HYP_PAGE_SIZE))
  168. return -EINVAL;
  169. if (!mshv_passthru_hvcall_allowed(args.code, pt_id))
  170. return -EINVAL;
  171. is_async = mshv_hvcall_is_async(args.code);
  172. if (is_async) {
  173. /* async hypercalls can only be called from partition fd */
  174. if (!partition || !partition_locked)
  175. return -EINVAL;
  176. ret = mshv_init_async_handler(partition);
  177. if (ret)
  178. return ret;
  179. }
  180. pages_order = args.out_ptr ? 1 : 0;
  181. page = alloc_pages(GFP_KERNEL, pages_order);
  182. if (!page)
  183. return -ENOMEM;
  184. input_pg = page_address(page);
  185. if (args.out_ptr)
  186. output_pg = (char *)input_pg + PAGE_SIZE;
  187. else
  188. output_pg = NULL;
  189. if (copy_from_user(input_pg, (void __user *)args.in_ptr,
  190. args.in_sz)) {
  191. ret = -EFAULT;
  192. goto free_pages_out;
  193. }
  194. /*
  195. * NOTE: This only works because all the allowed hypercalls' input
  196. * structs begin with a u64 partition_id field.
  197. */
  198. *(u64 *)input_pg = pt_id;
  199. reps_completed = 0;
  200. do {
  201. if (args.reps) {
  202. status = hv_do_rep_hypercall_ex(args.code, args.reps,
  203. 0, reps_completed,
  204. input_pg, output_pg);
  205. reps_completed = hv_repcomp(status);
  206. } else {
  207. status = hv_do_hypercall(args.code, input_pg, output_pg);
  208. }
  209. if (hv_result(status) == HV_STATUS_CALL_PENDING) {
  210. if (is_async) {
  211. mshv_async_hvcall_handler(partition, &status);
  212. } else { /* Paranoia check. This shouldn't happen! */
  213. ret = -EBADFD;
  214. goto free_pages_out;
  215. }
  216. }
  217. if (hv_result_success(status))
  218. break;
  219. if (!hv_result_needs_memory(status))
  220. ret = hv_result_to_errno(status);
  221. else
  222. ret = hv_deposit_memory(pt_id, status);
  223. } while (!ret);
  224. args.status = hv_result(status);
  225. args.reps = reps_completed;
  226. if (copy_to_user(user_args, &args, sizeof(args)))
  227. ret = -EFAULT;
  228. if (!ret && output_pg &&
  229. copy_to_user((void __user *)args.out_ptr, output_pg, args.out_sz))
  230. ret = -EFAULT;
  231. free_pages_out:
  232. free_pages((unsigned long)input_pg, pages_order);
  233. return ret;
  234. }
  235. static inline bool is_ghcb_mapping_available(void)
  236. {
  237. #if IS_ENABLED(CONFIG_X86_64)
  238. return ms_hyperv.ext_features & HV_VP_GHCB_ROOT_MAPPING_AVAILABLE;
  239. #else
  240. return 0;
  241. #endif
  242. }
  243. static int mshv_get_vp_registers(u32 vp_index, u64 partition_id, u16 count,
  244. struct hv_register_assoc *registers)
  245. {
  246. return hv_call_get_vp_registers(vp_index, partition_id,
  247. count, input_vtl_zero, registers);
  248. }
  249. static int mshv_set_vp_registers(u32 vp_index, u64 partition_id, u16 count,
  250. struct hv_register_assoc *registers)
  251. {
  252. return hv_call_set_vp_registers(vp_index, partition_id,
  253. count, input_vtl_zero, registers);
  254. }
  255. /*
  256. * Explicit guest vCPU suspend is asynchronous by nature (as it is requested by
  257. * dom0 vCPU for guest vCPU) and thus it can race with "intercept" suspend,
  258. * done by the hypervisor.
  259. * "Intercept" suspend leads to asynchronous message delivery to dom0 which
  260. * should be awaited to keep the VP loop consistent (i.e. no message pending
  261. * upon VP resume).
  262. * VP intercept suspend can't be done when the VP is explicitly suspended
  263. * already, and thus can be only two possible race scenarios:
  264. * 1. implicit suspend bit set -> explicit suspend bit set -> message sent
  265. * 2. implicit suspend bit set -> message sent -> explicit suspend bit set
  266. * Checking for implicit suspend bit set after explicit suspend request has
  267. * succeeded in either case allows us to reliably identify, if there is a
  268. * message to receive and deliver to VMM.
  269. */
  270. static int
  271. mshv_suspend_vp(const struct mshv_vp *vp, bool *message_in_flight)
  272. {
  273. struct hv_register_assoc explicit_suspend = {
  274. .name = HV_REGISTER_EXPLICIT_SUSPEND
  275. };
  276. struct hv_register_assoc intercept_suspend = {
  277. .name = HV_REGISTER_INTERCEPT_SUSPEND
  278. };
  279. union hv_explicit_suspend_register *es =
  280. &explicit_suspend.value.explicit_suspend;
  281. union hv_intercept_suspend_register *is =
  282. &intercept_suspend.value.intercept_suspend;
  283. int ret;
  284. es->suspended = 1;
  285. ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
  286. 1, &explicit_suspend);
  287. if (ret) {
  288. vp_err(vp, "Failed to explicitly suspend vCPU\n");
  289. return ret;
  290. }
  291. ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
  292. 1, &intercept_suspend);
  293. if (ret) {
  294. vp_err(vp, "Failed to get intercept suspend state\n");
  295. return ret;
  296. }
  297. *message_in_flight = is->suspended;
  298. return 0;
  299. }
  300. /*
  301. * This function is used when VPs are scheduled by the hypervisor's
  302. * scheduler.
  303. *
  304. * Caller has to make sure the registers contain cleared
  305. * HV_REGISTER_INTERCEPT_SUSPEND and HV_REGISTER_EXPLICIT_SUSPEND registers
  306. * exactly in this order (the hypervisor clears them sequentially) to avoid
  307. * potential invalid clearing a newly arrived HV_REGISTER_INTERCEPT_SUSPEND
  308. * after VP is released from HV_REGISTER_EXPLICIT_SUSPEND in case of the
  309. * opposite order.
  310. */
  311. static long mshv_run_vp_with_hyp_scheduler(struct mshv_vp *vp)
  312. {
  313. long ret;
  314. struct hv_register_assoc suspend_regs[2] = {
  315. { .name = HV_REGISTER_INTERCEPT_SUSPEND },
  316. { .name = HV_REGISTER_EXPLICIT_SUSPEND }
  317. };
  318. size_t count = ARRAY_SIZE(suspend_regs);
  319. /* Resume VP execution */
  320. ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
  321. count, suspend_regs);
  322. if (ret) {
  323. vp_err(vp, "Failed to resume vp execution. %lx\n", ret);
  324. return ret;
  325. }
  326. ret = wait_event_interruptible(vp->run.vp_suspend_queue,
  327. vp->run.kicked_by_hv == 1);
  328. if (ret) {
  329. bool message_in_flight;
  330. /*
  331. * Otherwise the waiting was interrupted by a signal: suspend
  332. * the vCPU explicitly and copy message in flight (if any).
  333. */
  334. ret = mshv_suspend_vp(vp, &message_in_flight);
  335. if (ret)
  336. return ret;
  337. /* Return if no message in flight */
  338. if (!message_in_flight)
  339. return -EINTR;
  340. /* Wait for the message in flight. */
  341. wait_event(vp->run.vp_suspend_queue, vp->run.kicked_by_hv == 1);
  342. }
  343. /*
  344. * Reset the flag to make the wait_event call above work
  345. * next time.
  346. */
  347. vp->run.kicked_by_hv = 0;
  348. return 0;
  349. }
  350. static int
  351. mshv_vp_dispatch(struct mshv_vp *vp, u32 flags,
  352. struct hv_output_dispatch_vp *res)
  353. {
  354. struct hv_input_dispatch_vp *input;
  355. struct hv_output_dispatch_vp *output;
  356. u64 status;
  357. preempt_disable();
  358. input = *this_cpu_ptr(root_scheduler_input);
  359. output = *this_cpu_ptr(root_scheduler_output);
  360. memset(input, 0, sizeof(*input));
  361. memset(output, 0, sizeof(*output));
  362. input->partition_id = vp->vp_partition->pt_id;
  363. input->vp_index = vp->vp_index;
  364. input->time_slice = 0; /* Run forever until something happens */
  365. input->spec_ctrl = 0; /* TODO: set sensible flags */
  366. input->flags = flags;
  367. vp->run.flags.root_sched_dispatched = 1;
  368. status = hv_do_hypercall(HVCALL_DISPATCH_VP, input, output);
  369. vp->run.flags.root_sched_dispatched = 0;
  370. *res = *output;
  371. preempt_enable();
  372. if (!hv_result_success(status))
  373. vp_err(vp, "%s: status %s\n", __func__,
  374. hv_result_to_string(status));
  375. return hv_result_to_errno(status);
  376. }
  377. static int
  378. mshv_vp_clear_explicit_suspend(struct mshv_vp *vp)
  379. {
  380. struct hv_register_assoc explicit_suspend = {
  381. .name = HV_REGISTER_EXPLICIT_SUSPEND,
  382. .value.explicit_suspend.suspended = 0,
  383. };
  384. int ret;
  385. ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
  386. 1, &explicit_suspend);
  387. if (ret)
  388. vp_err(vp, "Failed to unsuspend\n");
  389. return ret;
  390. }
  391. #if IS_ENABLED(CONFIG_X86_64)
  392. static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp)
  393. {
  394. if (!vp->vp_register_page)
  395. return 0;
  396. return vp->vp_register_page->interrupt_vectors.as_uint64;
  397. }
  398. #else
  399. static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp)
  400. {
  401. return 0;
  402. }
  403. #endif
  404. static bool mshv_vp_dispatch_thread_blocked(struct mshv_vp *vp)
  405. {
  406. struct hv_stats_page **stats = vp->vp_stats_pages;
  407. u64 *self_vp_cntrs = stats[HV_STATS_AREA_SELF]->data;
  408. u64 *parent_vp_cntrs = stats[HV_STATS_AREA_PARENT]->data;
  409. return parent_vp_cntrs[HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED] ||
  410. self_vp_cntrs[HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED];
  411. }
  412. static int
  413. mshv_vp_wait_for_hv_kick(struct mshv_vp *vp)
  414. {
  415. int ret;
  416. ret = wait_event_interruptible(vp->run.vp_suspend_queue,
  417. (vp->run.kicked_by_hv == 1 &&
  418. !mshv_vp_dispatch_thread_blocked(vp)) ||
  419. mshv_vp_interrupt_pending(vp));
  420. if (ret)
  421. return -EINTR;
  422. vp->run.flags.root_sched_blocked = 0;
  423. vp->run.kicked_by_hv = 0;
  424. return 0;
  425. }
  426. /* Must be called with interrupts enabled */
  427. static long mshv_run_vp_with_root_scheduler(struct mshv_vp *vp)
  428. {
  429. long ret;
  430. if (vp->run.flags.root_sched_blocked) {
  431. /*
  432. * Dispatch state of this VP is blocked. Need to wait
  433. * for the hypervisor to clear the blocked state before
  434. * dispatching it.
  435. */
  436. ret = mshv_vp_wait_for_hv_kick(vp);
  437. if (ret)
  438. return ret;
  439. }
  440. do {
  441. u32 flags = 0;
  442. struct hv_output_dispatch_vp output;
  443. if (__xfer_to_guest_mode_work_pending()) {
  444. ret = xfer_to_guest_mode_handle_work();
  445. if (ret)
  446. break;
  447. }
  448. if (vp->run.flags.intercept_suspend)
  449. flags |= HV_DISPATCH_VP_FLAG_CLEAR_INTERCEPT_SUSPEND;
  450. if (mshv_vp_interrupt_pending(vp))
  451. flags |= HV_DISPATCH_VP_FLAG_SCAN_INTERRUPT_INJECTION;
  452. ret = mshv_vp_dispatch(vp, flags, &output);
  453. if (ret)
  454. break;
  455. vp->run.flags.intercept_suspend = 0;
  456. if (output.dispatch_state == HV_VP_DISPATCH_STATE_BLOCKED) {
  457. if (output.dispatch_event ==
  458. HV_VP_DISPATCH_EVENT_SUSPEND) {
  459. /*
  460. * TODO: remove the warning once VP canceling
  461. * is supported
  462. */
  463. WARN_ONCE(atomic64_read(&vp->run.vp_signaled_count),
  464. "%s: vp#%d: unexpected explicit suspend\n",
  465. __func__, vp->vp_index);
  466. /*
  467. * Need to clear explicit suspend before
  468. * dispatching.
  469. * Explicit suspend is either:
  470. * - set right after the first VP dispatch or
  471. * - set explicitly via hypercall
  472. * Since the latter case is not yet supported,
  473. * simply clear it here.
  474. */
  475. ret = mshv_vp_clear_explicit_suspend(vp);
  476. if (ret)
  477. break;
  478. ret = mshv_vp_wait_for_hv_kick(vp);
  479. if (ret)
  480. break;
  481. } else {
  482. vp->run.flags.root_sched_blocked = 1;
  483. ret = mshv_vp_wait_for_hv_kick(vp);
  484. if (ret)
  485. break;
  486. }
  487. } else {
  488. /* HV_VP_DISPATCH_STATE_READY */
  489. if (output.dispatch_event ==
  490. HV_VP_DISPATCH_EVENT_INTERCEPT)
  491. vp->run.flags.intercept_suspend = 1;
  492. }
  493. } while (!vp->run.flags.intercept_suspend);
  494. rseq_virt_userspace_exit();
  495. return ret;
  496. }
  497. static_assert(sizeof(struct hv_message) <= MSHV_RUN_VP_BUF_SZ,
  498. "sizeof(struct hv_message) must not exceed MSHV_RUN_VP_BUF_SZ");
  499. static struct mshv_mem_region *
  500. mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
  501. {
  502. struct mshv_mem_region *region;
  503. hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) {
  504. if (gfn >= region->start_gfn &&
  505. gfn < region->start_gfn + region->nr_pages)
  506. return region;
  507. }
  508. return NULL;
  509. }
  510. static struct mshv_mem_region *
  511. mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn)
  512. {
  513. struct mshv_mem_region *region;
  514. spin_lock(&p->pt_mem_regions_lock);
  515. region = mshv_partition_region_by_gfn(p, gfn);
  516. if (!region || !mshv_region_get(region)) {
  517. spin_unlock(&p->pt_mem_regions_lock);
  518. return NULL;
  519. }
  520. spin_unlock(&p->pt_mem_regions_lock);
  521. return region;
  522. }
  523. /**
  524. * mshv_handle_gpa_intercept - Handle GPA (Guest Physical Address) intercepts.
  525. * @vp: Pointer to the virtual processor structure.
  526. *
  527. * This function processes GPA intercepts by identifying the memory region
  528. * corresponding to the intercepted GPA, aligning the page offset, and
  529. * mapping the required pages. It ensures that the region is valid and
  530. * handles faults efficiently by mapping multiple pages at once.
  531. *
  532. * Return: true if the intercept was handled successfully, false otherwise.
  533. */
  534. static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
  535. {
  536. struct mshv_partition *p = vp->vp_partition;
  537. struct mshv_mem_region *region;
  538. bool ret = false;
  539. u64 gfn;
  540. #if defined(CONFIG_X86_64)
  541. struct hv_x64_memory_intercept_message *msg =
  542. (struct hv_x64_memory_intercept_message *)
  543. vp->vp_intercept_msg_page->u.payload;
  544. #elif defined(CONFIG_ARM64)
  545. struct hv_arm64_memory_intercept_message *msg =
  546. (struct hv_arm64_memory_intercept_message *)
  547. vp->vp_intercept_msg_page->u.payload;
  548. #endif
  549. enum hv_intercept_access_type access_type =
  550. msg->header.intercept_access_type;
  551. gfn = HVPFN_DOWN(msg->guest_physical_address);
  552. region = mshv_partition_region_by_gfn_get(p, gfn);
  553. if (!region)
  554. return false;
  555. if (access_type == HV_INTERCEPT_ACCESS_WRITE &&
  556. !(region->hv_map_flags & HV_MAP_GPA_WRITABLE))
  557. goto put_region;
  558. if (access_type == HV_INTERCEPT_ACCESS_EXECUTE &&
  559. !(region->hv_map_flags & HV_MAP_GPA_EXECUTABLE))
  560. goto put_region;
  561. /* Only movable memory ranges are supported for GPA intercepts */
  562. if (region->mreg_type == MSHV_REGION_TYPE_MEM_MOVABLE)
  563. ret = mshv_region_handle_gfn_fault(region, gfn);
  564. put_region:
  565. mshv_region_put(region);
  566. return ret;
  567. }
  568. static bool mshv_vp_handle_intercept(struct mshv_vp *vp)
  569. {
  570. switch (vp->vp_intercept_msg_page->header.message_type) {
  571. case HVMSG_GPA_INTERCEPT:
  572. return mshv_handle_gpa_intercept(vp);
  573. }
  574. return false;
  575. }
  576. static long mshv_vp_ioctl_run_vp(struct mshv_vp *vp, void __user *ret_msg)
  577. {
  578. long rc;
  579. do {
  580. if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
  581. rc = mshv_run_vp_with_root_scheduler(vp);
  582. else
  583. rc = mshv_run_vp_with_hyp_scheduler(vp);
  584. } while (rc == 0 && mshv_vp_handle_intercept(vp));
  585. if (rc)
  586. return rc;
  587. if (copy_to_user(ret_msg, vp->vp_intercept_msg_page,
  588. sizeof(struct hv_message)))
  589. rc = -EFAULT;
  590. return rc;
  591. }
  592. static int
  593. mshv_vp_ioctl_get_set_state_pfn(struct mshv_vp *vp,
  594. struct hv_vp_state_data state_data,
  595. unsigned long user_pfn, size_t page_count,
  596. bool is_set)
  597. {
  598. int completed, ret = 0;
  599. unsigned long check;
  600. struct page **pages;
  601. if (page_count > INT_MAX)
  602. return -EINVAL;
  603. /*
  604. * Check the arithmetic for wraparound/overflow.
  605. * The last page address in the buffer is:
  606. * (user_pfn + (page_count - 1)) * PAGE_SIZE
  607. */
  608. if (check_add_overflow(user_pfn, (page_count - 1), &check))
  609. return -EOVERFLOW;
  610. if (check_mul_overflow(check, PAGE_SIZE, &check))
  611. return -EOVERFLOW;
  612. /* Pin user pages so hypervisor can copy directly to them */
  613. pages = kzalloc_objs(struct page *, page_count);
  614. if (!pages)
  615. return -ENOMEM;
  616. for (completed = 0; completed < page_count; completed += ret) {
  617. unsigned long user_addr = (user_pfn + completed) * PAGE_SIZE;
  618. int remaining = page_count - completed;
  619. ret = pin_user_pages_fast(user_addr, remaining, FOLL_WRITE,
  620. &pages[completed]);
  621. if (ret < 0) {
  622. vp_err(vp, "%s: Failed to pin user pages error %i\n",
  623. __func__, ret);
  624. goto unpin_pages;
  625. }
  626. }
  627. if (is_set)
  628. ret = hv_call_set_vp_state(vp->vp_index,
  629. vp->vp_partition->pt_id,
  630. state_data, page_count, pages,
  631. 0, NULL);
  632. else
  633. ret = hv_call_get_vp_state(vp->vp_index,
  634. vp->vp_partition->pt_id,
  635. state_data, page_count, pages,
  636. NULL);
  637. unpin_pages:
  638. unpin_user_pages(pages, completed);
  639. kfree(pages);
  640. return ret;
  641. }
  642. static long
  643. mshv_vp_ioctl_get_set_state(struct mshv_vp *vp,
  644. struct mshv_get_set_vp_state __user *user_args,
  645. bool is_set)
  646. {
  647. struct mshv_get_set_vp_state args;
  648. long ret = 0;
  649. union hv_output_get_vp_state vp_state;
  650. u32 data_sz;
  651. struct hv_vp_state_data state_data = {};
  652. if (copy_from_user(&args, user_args, sizeof(args)))
  653. return -EFAULT;
  654. if (args.type >= MSHV_VP_STATE_COUNT || mshv_field_nonzero(args, rsvd) ||
  655. !args.buf_sz || !PAGE_ALIGNED(args.buf_sz) ||
  656. !PAGE_ALIGNED(args.buf_ptr))
  657. return -EINVAL;
  658. if (!access_ok((void __user *)args.buf_ptr, args.buf_sz))
  659. return -EFAULT;
  660. switch (args.type) {
  661. case MSHV_VP_STATE_LAPIC:
  662. state_data.type = HV_GET_SET_VP_STATE_LAPIC_STATE;
  663. data_sz = HV_HYP_PAGE_SIZE;
  664. break;
  665. case MSHV_VP_STATE_XSAVE:
  666. {
  667. u64 data_sz_64;
  668. ret = hv_call_get_partition_property(vp->vp_partition->pt_id,
  669. HV_PARTITION_PROPERTY_XSAVE_STATES,
  670. &state_data.xsave.states.as_uint64);
  671. if (ret)
  672. return ret;
  673. ret = hv_call_get_partition_property(vp->vp_partition->pt_id,
  674. HV_PARTITION_PROPERTY_MAX_XSAVE_DATA_SIZE,
  675. &data_sz_64);
  676. if (ret)
  677. return ret;
  678. data_sz = (u32)data_sz_64;
  679. state_data.xsave.flags = 0;
  680. /* Always request legacy states */
  681. state_data.xsave.states.legacy_x87 = 1;
  682. state_data.xsave.states.legacy_sse = 1;
  683. state_data.type = HV_GET_SET_VP_STATE_XSAVE;
  684. break;
  685. }
  686. case MSHV_VP_STATE_SIMP:
  687. state_data.type = HV_GET_SET_VP_STATE_SIM_PAGE;
  688. data_sz = HV_HYP_PAGE_SIZE;
  689. break;
  690. case MSHV_VP_STATE_SIEFP:
  691. state_data.type = HV_GET_SET_VP_STATE_SIEF_PAGE;
  692. data_sz = HV_HYP_PAGE_SIZE;
  693. break;
  694. case MSHV_VP_STATE_SYNTHETIC_TIMERS:
  695. state_data.type = HV_GET_SET_VP_STATE_SYNTHETIC_TIMERS;
  696. data_sz = sizeof(vp_state.synthetic_timers_state);
  697. break;
  698. default:
  699. return -EINVAL;
  700. }
  701. if (copy_to_user(&user_args->buf_sz, &data_sz, sizeof(user_args->buf_sz)))
  702. return -EFAULT;
  703. if (data_sz > args.buf_sz)
  704. return -EINVAL;
  705. /* If the data is transmitted via pfns, delegate to helper */
  706. if (state_data.type & HV_GET_SET_VP_STATE_TYPE_PFN) {
  707. unsigned long user_pfn = PFN_DOWN(args.buf_ptr);
  708. size_t page_count = PFN_DOWN(args.buf_sz);
  709. return mshv_vp_ioctl_get_set_state_pfn(vp, state_data, user_pfn,
  710. page_count, is_set);
  711. }
  712. /* Paranoia check - this shouldn't happen! */
  713. if (data_sz > sizeof(vp_state)) {
  714. vp_err(vp, "Invalid vp state data size!\n");
  715. return -EINVAL;
  716. }
  717. if (is_set) {
  718. if (copy_from_user(&vp_state, (__user void *)args.buf_ptr, data_sz))
  719. return -EFAULT;
  720. return hv_call_set_vp_state(vp->vp_index,
  721. vp->vp_partition->pt_id,
  722. state_data, 0, NULL,
  723. sizeof(vp_state), (u8 *)&vp_state);
  724. }
  725. ret = hv_call_get_vp_state(vp->vp_index, vp->vp_partition->pt_id,
  726. state_data, 0, NULL, &vp_state);
  727. if (ret)
  728. return ret;
  729. if (copy_to_user((void __user *)args.buf_ptr, &vp_state, data_sz))
  730. return -EFAULT;
  731. return 0;
  732. }
  733. static long
  734. mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
  735. {
  736. struct mshv_vp *vp = filp->private_data;
  737. long r = -ENOTTY;
  738. if (mutex_lock_killable(&vp->vp_mutex))
  739. return -EINTR;
  740. switch (ioctl) {
  741. case MSHV_RUN_VP:
  742. r = mshv_vp_ioctl_run_vp(vp, (void __user *)arg);
  743. break;
  744. case MSHV_GET_VP_STATE:
  745. r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, false);
  746. break;
  747. case MSHV_SET_VP_STATE:
  748. r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, true);
  749. break;
  750. case MSHV_ROOT_HVCALL:
  751. r = mshv_ioctl_passthru_hvcall(vp->vp_partition, false,
  752. (void __user *)arg);
  753. break;
  754. default:
  755. vp_warn(vp, "Invalid ioctl: %#x\n", ioctl);
  756. break;
  757. }
  758. mutex_unlock(&vp->vp_mutex);
  759. return r;
  760. }
  761. static vm_fault_t mshv_vp_fault(struct vm_fault *vmf)
  762. {
  763. struct mshv_vp *vp = vmf->vma->vm_file->private_data;
  764. switch (vmf->vma->vm_pgoff) {
  765. case MSHV_VP_MMAP_OFFSET_REGISTERS:
  766. vmf->page = virt_to_page(vp->vp_register_page);
  767. break;
  768. case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE:
  769. vmf->page = virt_to_page(vp->vp_intercept_msg_page);
  770. break;
  771. case MSHV_VP_MMAP_OFFSET_GHCB:
  772. vmf->page = virt_to_page(vp->vp_ghcb_page);
  773. break;
  774. default:
  775. return VM_FAULT_SIGBUS;
  776. }
  777. get_page(vmf->page);
  778. return 0;
  779. }
  780. static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma)
  781. {
  782. struct mshv_vp *vp = file->private_data;
  783. switch (vma->vm_pgoff) {
  784. case MSHV_VP_MMAP_OFFSET_REGISTERS:
  785. if (!vp->vp_register_page)
  786. return -ENODEV;
  787. break;
  788. case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE:
  789. if (!vp->vp_intercept_msg_page)
  790. return -ENODEV;
  791. break;
  792. case MSHV_VP_MMAP_OFFSET_GHCB:
  793. if (!vp->vp_ghcb_page)
  794. return -ENODEV;
  795. break;
  796. default:
  797. return -EINVAL;
  798. }
  799. vma->vm_ops = &mshv_vp_vm_ops;
  800. return 0;
  801. }
  802. static int
  803. mshv_vp_release(struct inode *inode, struct file *filp)
  804. {
  805. struct mshv_vp *vp = filp->private_data;
  806. /* Rest of VP cleanup happens in destroy_partition() */
  807. mshv_partition_put(vp->vp_partition);
  808. return 0;
  809. }
  810. void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index,
  811. struct hv_stats_page *stats_pages[])
  812. {
  813. union hv_stats_object_identity identity = {
  814. .vp.partition_id = partition_id,
  815. .vp.vp_index = vp_index,
  816. };
  817. int err;
  818. identity.vp.stats_area_type = HV_STATS_AREA_SELF;
  819. err = hv_unmap_stats_page(HV_STATS_OBJECT_VP,
  820. stats_pages[HV_STATS_AREA_SELF],
  821. &identity);
  822. if (err)
  823. pr_err("%s: failed to unmap partition %llu vp %u self stats, err: %d\n",
  824. __func__, partition_id, vp_index, err);
  825. if (stats_pages[HV_STATS_AREA_PARENT] != stats_pages[HV_STATS_AREA_SELF]) {
  826. identity.vp.stats_area_type = HV_STATS_AREA_PARENT;
  827. err = hv_unmap_stats_page(HV_STATS_OBJECT_VP,
  828. stats_pages[HV_STATS_AREA_PARENT],
  829. &identity);
  830. if (err)
  831. pr_err("%s: failed to unmap partition %llu vp %u parent stats, err: %d\n",
  832. __func__, partition_id, vp_index, err);
  833. }
  834. }
  835. int mshv_vp_stats_map(u64 partition_id, u32 vp_index,
  836. struct hv_stats_page *stats_pages[])
  837. {
  838. union hv_stats_object_identity identity = {
  839. .vp.partition_id = partition_id,
  840. .vp.vp_index = vp_index,
  841. };
  842. int err;
  843. identity.vp.stats_area_type = HV_STATS_AREA_SELF;
  844. err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity,
  845. &stats_pages[HV_STATS_AREA_SELF]);
  846. if (err) {
  847. pr_err("%s: failed to map partition %llu vp %u self stats, err: %d\n",
  848. __func__, partition_id, vp_index, err);
  849. return err;
  850. }
  851. /*
  852. * L1VH partition cannot access its vp stats in parent area.
  853. */
  854. if (is_l1vh_parent(partition_id)) {
  855. stats_pages[HV_STATS_AREA_PARENT] = stats_pages[HV_STATS_AREA_SELF];
  856. } else {
  857. identity.vp.stats_area_type = HV_STATS_AREA_PARENT;
  858. err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity,
  859. &stats_pages[HV_STATS_AREA_PARENT]);
  860. if (err) {
  861. pr_err("%s: failed to map partition %llu vp %u parent stats, err: %d\n",
  862. __func__, partition_id, vp_index, err);
  863. goto unmap_self;
  864. }
  865. if (!stats_pages[HV_STATS_AREA_PARENT])
  866. stats_pages[HV_STATS_AREA_PARENT] = stats_pages[HV_STATS_AREA_SELF];
  867. }
  868. return 0;
  869. unmap_self:
  870. identity.vp.stats_area_type = HV_STATS_AREA_SELF;
  871. hv_unmap_stats_page(HV_STATS_OBJECT_VP,
  872. stats_pages[HV_STATS_AREA_SELF],
  873. &identity);
  874. return err;
  875. }
  876. static long
  877. mshv_partition_ioctl_create_vp(struct mshv_partition *partition,
  878. void __user *arg)
  879. {
  880. struct mshv_create_vp args;
  881. struct mshv_vp *vp;
  882. struct page *intercept_msg_page, *register_page, *ghcb_page;
  883. struct hv_stats_page *stats_pages[2];
  884. long ret;
  885. if (copy_from_user(&args, arg, sizeof(args)))
  886. return -EFAULT;
  887. if (args.vp_index >= MSHV_MAX_VPS)
  888. return -EINVAL;
  889. if (partition->pt_vp_array[args.vp_index])
  890. return -EEXIST;
  891. ret = hv_call_create_vp(NUMA_NO_NODE, partition->pt_id, args.vp_index,
  892. 0 /* Only valid for root partition VPs */);
  893. if (ret)
  894. return ret;
  895. ret = hv_map_vp_state_page(partition->pt_id, args.vp_index,
  896. HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
  897. input_vtl_zero, &intercept_msg_page);
  898. if (ret)
  899. goto destroy_vp;
  900. if (!mshv_partition_encrypted(partition)) {
  901. ret = hv_map_vp_state_page(partition->pt_id, args.vp_index,
  902. HV_VP_STATE_PAGE_REGISTERS,
  903. input_vtl_zero, &register_page);
  904. if (ret)
  905. goto unmap_intercept_message_page;
  906. }
  907. if (mshv_partition_encrypted(partition) &&
  908. is_ghcb_mapping_available()) {
  909. ret = hv_map_vp_state_page(partition->pt_id, args.vp_index,
  910. HV_VP_STATE_PAGE_GHCB,
  911. input_vtl_normal, &ghcb_page);
  912. if (ret)
  913. goto unmap_register_page;
  914. }
  915. ret = mshv_vp_stats_map(partition->pt_id, args.vp_index,
  916. stats_pages);
  917. if (ret)
  918. goto unmap_ghcb_page;
  919. vp = kzalloc_obj(*vp);
  920. if (!vp)
  921. goto unmap_stats_pages;
  922. vp->vp_partition = mshv_partition_get(partition);
  923. if (!vp->vp_partition) {
  924. ret = -EBADF;
  925. goto free_vp;
  926. }
  927. mutex_init(&vp->vp_mutex);
  928. init_waitqueue_head(&vp->run.vp_suspend_queue);
  929. atomic64_set(&vp->run.vp_signaled_count, 0);
  930. vp->vp_index = args.vp_index;
  931. vp->vp_intercept_msg_page = page_to_virt(intercept_msg_page);
  932. if (!mshv_partition_encrypted(partition))
  933. vp->vp_register_page = page_to_virt(register_page);
  934. if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available())
  935. vp->vp_ghcb_page = page_to_virt(ghcb_page);
  936. memcpy(vp->vp_stats_pages, stats_pages, sizeof(stats_pages));
  937. ret = mshv_debugfs_vp_create(vp);
  938. if (ret)
  939. goto put_partition;
  940. /*
  941. * Keep anon_inode_getfd last: it installs fd in the file struct and
  942. * thus makes the state accessible in user space.
  943. */
  944. ret = anon_inode_getfd("mshv_vp", &mshv_vp_fops, vp,
  945. O_RDWR | O_CLOEXEC);
  946. if (ret < 0)
  947. goto remove_debugfs_vp;
  948. /* already exclusive with the partition mutex for all ioctls */
  949. partition->pt_vp_count++;
  950. partition->pt_vp_array[args.vp_index] = vp;
  951. return ret;
  952. remove_debugfs_vp:
  953. mshv_debugfs_vp_remove(vp);
  954. put_partition:
  955. mshv_partition_put(partition);
  956. free_vp:
  957. kfree(vp);
  958. unmap_stats_pages:
  959. mshv_vp_stats_unmap(partition->pt_id, args.vp_index, stats_pages);
  960. unmap_ghcb_page:
  961. if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available())
  962. hv_unmap_vp_state_page(partition->pt_id, args.vp_index,
  963. HV_VP_STATE_PAGE_GHCB, ghcb_page,
  964. input_vtl_normal);
  965. unmap_register_page:
  966. if (!mshv_partition_encrypted(partition))
  967. hv_unmap_vp_state_page(partition->pt_id, args.vp_index,
  968. HV_VP_STATE_PAGE_REGISTERS,
  969. register_page, input_vtl_zero);
  970. unmap_intercept_message_page:
  971. hv_unmap_vp_state_page(partition->pt_id, args.vp_index,
  972. HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
  973. intercept_msg_page, input_vtl_zero);
  974. destroy_vp:
  975. hv_call_delete_vp(partition->pt_id, args.vp_index);
  976. return ret;
  977. }
  978. static int mshv_init_async_handler(struct mshv_partition *partition)
  979. {
  980. if (completion_done(&partition->async_hypercall)) {
  981. pt_err(partition,
  982. "Cannot issue async hypercall while another one in progress!\n");
  983. return -EPERM;
  984. }
  985. reinit_completion(&partition->async_hypercall);
  986. return 0;
  987. }
  988. static void mshv_async_hvcall_handler(void *data, u64 *status)
  989. {
  990. struct mshv_partition *partition = data;
  991. wait_for_completion(&partition->async_hypercall);
  992. pt_dbg(partition, "Async hypercall completed!\n");
  993. *status = partition->async_hypercall_status;
  994. }
  995. /*
  996. * NB: caller checks and makes sure mem->size is page aligned
  997. * Returns: 0 with regionpp updated on success, or -errno
  998. */
  999. static int mshv_partition_create_region(struct mshv_partition *partition,
  1000. struct mshv_user_mem_region *mem,
  1001. struct mshv_mem_region **regionpp,
  1002. bool is_mmio)
  1003. {
  1004. struct mshv_mem_region *rg;
  1005. u64 nr_pages = HVPFN_DOWN(mem->size);
  1006. /* Reject overlapping regions */
  1007. spin_lock(&partition->pt_mem_regions_lock);
  1008. hlist_for_each_entry(rg, &partition->pt_mem_regions, hnode) {
  1009. if (mem->guest_pfn + nr_pages <= rg->start_gfn ||
  1010. rg->start_gfn + rg->nr_pages <= mem->guest_pfn)
  1011. continue;
  1012. spin_unlock(&partition->pt_mem_regions_lock);
  1013. return -EEXIST;
  1014. }
  1015. spin_unlock(&partition->pt_mem_regions_lock);
  1016. rg = mshv_region_create(mem->guest_pfn, nr_pages,
  1017. mem->userspace_addr, mem->flags);
  1018. if (IS_ERR(rg))
  1019. return PTR_ERR(rg);
  1020. if (is_mmio)
  1021. rg->mreg_type = MSHV_REGION_TYPE_MMIO;
  1022. else if (mshv_partition_encrypted(partition) ||
  1023. !mshv_region_movable_init(rg))
  1024. rg->mreg_type = MSHV_REGION_TYPE_MEM_PINNED;
  1025. else
  1026. rg->mreg_type = MSHV_REGION_TYPE_MEM_MOVABLE;
  1027. rg->partition = partition;
  1028. *regionpp = rg;
  1029. return 0;
  1030. }
  1031. /**
  1032. * mshv_prepare_pinned_region - Pin and map memory regions
  1033. * @region: Pointer to the memory region structure
  1034. *
  1035. * This function processes memory regions that are explicitly marked as pinned.
  1036. * Pinned regions are preallocated, mapped upfront, and do not rely on fault-based
  1037. * population. The function ensures the region is properly populated, handles
  1038. * encryption requirements for SNP partitions if applicable, maps the region,
  1039. * and performs necessary sharing or eviction operations based on the mapping
  1040. * result.
  1041. *
  1042. * Return: 0 on success, negative error code on failure.
  1043. */
  1044. static int mshv_prepare_pinned_region(struct mshv_mem_region *region)
  1045. {
  1046. struct mshv_partition *partition = region->partition;
  1047. int ret;
  1048. ret = mshv_region_pin(region);
  1049. if (ret) {
  1050. pt_err(partition, "Failed to pin memory region: %d\n",
  1051. ret);
  1052. goto err_out;
  1053. }
  1054. /*
  1055. * For an SNP partition it is a requirement that for every memory region
  1056. * that we are going to map for this partition we should make sure that
  1057. * host access to that region is released. This is ensured by doing an
  1058. * additional hypercall which will update the SLAT to release host
  1059. * access to guest memory regions.
  1060. */
  1061. if (mshv_partition_encrypted(partition)) {
  1062. ret = mshv_region_unshare(region);
  1063. if (ret) {
  1064. pt_err(partition,
  1065. "Failed to unshare memory region (guest_pfn: %llu): %d\n",
  1066. region->start_gfn, ret);
  1067. goto invalidate_region;
  1068. }
  1069. }
  1070. ret = mshv_region_map(region);
  1071. if (ret && mshv_partition_encrypted(partition)) {
  1072. int shrc;
  1073. shrc = mshv_region_share(region);
  1074. if (!shrc)
  1075. goto invalidate_region;
  1076. pt_err(partition,
  1077. "Failed to share memory region (guest_pfn: %llu): %d\n",
  1078. region->start_gfn, shrc);
  1079. /*
  1080. * Don't unpin if marking shared failed because pages are no
  1081. * longer mapped in the host, ie root, anymore.
  1082. */
  1083. goto err_out;
  1084. }
  1085. return 0;
  1086. invalidate_region:
  1087. mshv_region_invalidate(region);
  1088. err_out:
  1089. return ret;
  1090. }
  1091. /*
  1092. * This maps two things: guest RAM and for pci passthru mmio space.
  1093. *
  1094. * mmio:
  1095. * - vfio overloads vm_pgoff to store the mmio start pfn/spa.
  1096. * - Two things need to happen for mapping mmio range:
  1097. * 1. mapped in the uaddr so VMM can access it.
  1098. * 2. mapped in the hwpt (gfn <-> mmio phys addr) so guest can access it.
  1099. *
  1100. * This function takes care of the second. The first one is managed by vfio,
  1101. * and hence is taken care of via vfio_pci_mmap_fault().
  1102. */
  1103. static long
  1104. mshv_map_user_memory(struct mshv_partition *partition,
  1105. struct mshv_user_mem_region *mem)
  1106. {
  1107. struct mshv_mem_region *region;
  1108. struct vm_area_struct *vma;
  1109. bool is_mmio;
  1110. ulong mmio_pfn;
  1111. long ret;
  1112. if (mem->flags & BIT(MSHV_SET_MEM_BIT_UNMAP) ||
  1113. !access_ok((const void __user *)mem->userspace_addr, mem->size))
  1114. return -EINVAL;
  1115. mmap_read_lock(current->mm);
  1116. vma = vma_lookup(current->mm, mem->userspace_addr);
  1117. is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0;
  1118. mmio_pfn = is_mmio ? vma->vm_pgoff : 0;
  1119. mmap_read_unlock(current->mm);
  1120. if (!vma)
  1121. return -EINVAL;
  1122. ret = mshv_partition_create_region(partition, mem, &region,
  1123. is_mmio);
  1124. if (ret)
  1125. return ret;
  1126. switch (region->mreg_type) {
  1127. case MSHV_REGION_TYPE_MEM_PINNED:
  1128. ret = mshv_prepare_pinned_region(region);
  1129. break;
  1130. case MSHV_REGION_TYPE_MEM_MOVABLE:
  1131. /*
  1132. * For movable memory regions, remap with no access to let
  1133. * the hypervisor track dirty pages, enabling pre-copy live
  1134. * migration.
  1135. */
  1136. ret = hv_call_map_gpa_pages(partition->pt_id,
  1137. region->start_gfn,
  1138. region->nr_pages,
  1139. HV_MAP_GPA_NO_ACCESS, NULL);
  1140. break;
  1141. case MSHV_REGION_TYPE_MMIO:
  1142. ret = hv_call_map_mmio_pages(partition->pt_id,
  1143. region->start_gfn,
  1144. mmio_pfn,
  1145. region->nr_pages);
  1146. break;
  1147. }
  1148. if (ret)
  1149. goto errout;
  1150. spin_lock(&partition->pt_mem_regions_lock);
  1151. hlist_add_head(&region->hnode, &partition->pt_mem_regions);
  1152. spin_unlock(&partition->pt_mem_regions_lock);
  1153. return 0;
  1154. errout:
  1155. mshv_region_put(region);
  1156. return ret;
  1157. }
  1158. /* Called for unmapping both the guest ram and the mmio space */
  1159. static long
  1160. mshv_unmap_user_memory(struct mshv_partition *partition,
  1161. struct mshv_user_mem_region *mem)
  1162. {
  1163. struct mshv_mem_region *region;
  1164. if (!(mem->flags & BIT(MSHV_SET_MEM_BIT_UNMAP)))
  1165. return -EINVAL;
  1166. spin_lock(&partition->pt_mem_regions_lock);
  1167. region = mshv_partition_region_by_gfn(partition, mem->guest_pfn);
  1168. if (!region) {
  1169. spin_unlock(&partition->pt_mem_regions_lock);
  1170. return -ENOENT;
  1171. }
  1172. /* Paranoia check */
  1173. if (region->start_uaddr != mem->userspace_addr ||
  1174. region->start_gfn != mem->guest_pfn ||
  1175. region->nr_pages != HVPFN_DOWN(mem->size)) {
  1176. spin_unlock(&partition->pt_mem_regions_lock);
  1177. return -EINVAL;
  1178. }
  1179. hlist_del(&region->hnode);
  1180. spin_unlock(&partition->pt_mem_regions_lock);
  1181. mshv_region_put(region);
  1182. return 0;
  1183. }
  1184. static long
  1185. mshv_partition_ioctl_set_memory(struct mshv_partition *partition,
  1186. struct mshv_user_mem_region __user *user_mem)
  1187. {
  1188. struct mshv_user_mem_region mem;
  1189. if (copy_from_user(&mem, user_mem, sizeof(mem)))
  1190. return -EFAULT;
  1191. if (!mem.size ||
  1192. !PAGE_ALIGNED(mem.size) ||
  1193. !PAGE_ALIGNED(mem.userspace_addr) ||
  1194. (mem.flags & ~MSHV_SET_MEM_FLAGS_MASK) ||
  1195. mshv_field_nonzero(mem, rsvd))
  1196. return -EINVAL;
  1197. if (mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP))
  1198. return mshv_unmap_user_memory(partition, &mem);
  1199. return mshv_map_user_memory(partition, &mem);
  1200. }
  1201. static long
  1202. mshv_partition_ioctl_ioeventfd(struct mshv_partition *partition,
  1203. void __user *user_args)
  1204. {
  1205. struct mshv_user_ioeventfd args;
  1206. if (copy_from_user(&args, user_args, sizeof(args)))
  1207. return -EFAULT;
  1208. return mshv_set_unset_ioeventfd(partition, &args);
  1209. }
  1210. static long
  1211. mshv_partition_ioctl_irqfd(struct mshv_partition *partition,
  1212. void __user *user_args)
  1213. {
  1214. struct mshv_user_irqfd args;
  1215. if (copy_from_user(&args, user_args, sizeof(args)))
  1216. return -EFAULT;
  1217. return mshv_set_unset_irqfd(partition, &args);
  1218. }
  1219. static long
  1220. mshv_partition_ioctl_get_gpap_access_bitmap(struct mshv_partition *partition,
  1221. void __user *user_args)
  1222. {
  1223. struct mshv_gpap_access_bitmap args;
  1224. union hv_gpa_page_access_state *states;
  1225. long ret, i;
  1226. union hv_gpa_page_access_state_flags hv_flags = {};
  1227. u8 hv_type_mask;
  1228. ulong bitmap_buf_sz, states_buf_sz;
  1229. int written = 0;
  1230. if (copy_from_user(&args, user_args, sizeof(args)))
  1231. return -EFAULT;
  1232. if (args.access_type >= MSHV_GPAP_ACCESS_TYPE_COUNT ||
  1233. args.access_op >= MSHV_GPAP_ACCESS_OP_COUNT ||
  1234. mshv_field_nonzero(args, rsvd) || !args.page_count ||
  1235. !args.bitmap_ptr)
  1236. return -EINVAL;
  1237. if (check_mul_overflow(args.page_count, sizeof(*states), &states_buf_sz))
  1238. return -E2BIG;
  1239. /* Num bytes needed to store bitmap; one bit per page rounded up */
  1240. bitmap_buf_sz = DIV_ROUND_UP(args.page_count, 8);
  1241. /* Sanity check */
  1242. if (bitmap_buf_sz > states_buf_sz)
  1243. return -EBADFD;
  1244. switch (args.access_type) {
  1245. case MSHV_GPAP_ACCESS_TYPE_ACCESSED:
  1246. hv_type_mask = 1;
  1247. if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) {
  1248. hv_flags.clear_accessed = 1;
  1249. /* not accessed implies not dirty */
  1250. hv_flags.clear_dirty = 1;
  1251. } else { /* MSHV_GPAP_ACCESS_OP_SET */
  1252. hv_flags.set_accessed = 1;
  1253. }
  1254. break;
  1255. case MSHV_GPAP_ACCESS_TYPE_DIRTY:
  1256. hv_type_mask = 2;
  1257. if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) {
  1258. hv_flags.clear_dirty = 1;
  1259. } else { /* MSHV_GPAP_ACCESS_OP_SET */
  1260. hv_flags.set_dirty = 1;
  1261. /* dirty implies accessed */
  1262. hv_flags.set_accessed = 1;
  1263. }
  1264. break;
  1265. }
  1266. states = vzalloc(states_buf_sz);
  1267. if (!states)
  1268. return -ENOMEM;
  1269. ret = hv_call_get_gpa_access_states(partition->pt_id, args.page_count,
  1270. args.gpap_base, hv_flags, &written,
  1271. states);
  1272. if (ret)
  1273. goto free_return;
  1274. /*
  1275. * Overwrite states buffer with bitmap - the bits in hv_type_mask
  1276. * correspond to bitfields in hv_gpa_page_access_state
  1277. */
  1278. for (i = 0; i < written; ++i)
  1279. __assign_bit(i, (ulong *)states,
  1280. states[i].as_uint8 & hv_type_mask);
  1281. /* zero the unused bits in the last byte(s) of the returned bitmap */
  1282. for (i = written; i < bitmap_buf_sz * 8; ++i)
  1283. __clear_bit(i, (ulong *)states);
  1284. if (copy_to_user((void __user *)args.bitmap_ptr, states, bitmap_buf_sz))
  1285. ret = -EFAULT;
  1286. free_return:
  1287. vfree(states);
  1288. return ret;
  1289. }
  1290. static long
  1291. mshv_partition_ioctl_set_msi_routing(struct mshv_partition *partition,
  1292. void __user *user_args)
  1293. {
  1294. struct mshv_user_irq_entry *entries = NULL;
  1295. struct mshv_user_irq_table args;
  1296. long ret;
  1297. if (copy_from_user(&args, user_args, sizeof(args)))
  1298. return -EFAULT;
  1299. if (args.nr > MSHV_MAX_GUEST_IRQS ||
  1300. mshv_field_nonzero(args, rsvd))
  1301. return -EINVAL;
  1302. if (args.nr) {
  1303. struct mshv_user_irq_table __user *urouting = user_args;
  1304. entries = vmemdup_user(urouting->entries,
  1305. array_size(sizeof(*entries),
  1306. args.nr));
  1307. if (IS_ERR(entries))
  1308. return PTR_ERR(entries);
  1309. }
  1310. ret = mshv_update_routing_table(partition, entries, args.nr);
  1311. kvfree(entries);
  1312. return ret;
  1313. }
  1314. static long
  1315. mshv_partition_ioctl_initialize(struct mshv_partition *partition)
  1316. {
  1317. long ret;
  1318. if (partition->pt_initialized)
  1319. return 0;
  1320. ret = hv_call_initialize_partition(partition->pt_id);
  1321. if (ret)
  1322. goto withdraw_mem;
  1323. ret = mshv_debugfs_partition_create(partition);
  1324. if (ret)
  1325. goto finalize_partition;
  1326. partition->pt_initialized = true;
  1327. return 0;
  1328. finalize_partition:
  1329. hv_call_finalize_partition(partition->pt_id);
  1330. withdraw_mem:
  1331. hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id);
  1332. return ret;
  1333. }
  1334. static long
  1335. mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
  1336. {
  1337. struct mshv_partition *partition = filp->private_data;
  1338. long ret;
  1339. void __user *uarg = (void __user *)arg;
  1340. if (mutex_lock_killable(&partition->pt_mutex))
  1341. return -EINTR;
  1342. switch (ioctl) {
  1343. case MSHV_INITIALIZE_PARTITION:
  1344. ret = mshv_partition_ioctl_initialize(partition);
  1345. break;
  1346. case MSHV_SET_GUEST_MEMORY:
  1347. ret = mshv_partition_ioctl_set_memory(partition, uarg);
  1348. break;
  1349. case MSHV_CREATE_VP:
  1350. ret = mshv_partition_ioctl_create_vp(partition, uarg);
  1351. break;
  1352. case MSHV_IRQFD:
  1353. ret = mshv_partition_ioctl_irqfd(partition, uarg);
  1354. break;
  1355. case MSHV_IOEVENTFD:
  1356. ret = mshv_partition_ioctl_ioeventfd(partition, uarg);
  1357. break;
  1358. case MSHV_SET_MSI_ROUTING:
  1359. ret = mshv_partition_ioctl_set_msi_routing(partition, uarg);
  1360. break;
  1361. case MSHV_GET_GPAP_ACCESS_BITMAP:
  1362. ret = mshv_partition_ioctl_get_gpap_access_bitmap(partition,
  1363. uarg);
  1364. break;
  1365. case MSHV_ROOT_HVCALL:
  1366. ret = mshv_ioctl_passthru_hvcall(partition, true, uarg);
  1367. break;
  1368. default:
  1369. ret = -ENOTTY;
  1370. }
  1371. mutex_unlock(&partition->pt_mutex);
  1372. return ret;
  1373. }
  1374. static int
  1375. disable_vp_dispatch(struct mshv_vp *vp)
  1376. {
  1377. int ret;
  1378. struct hv_register_assoc dispatch_suspend = {
  1379. .name = HV_REGISTER_DISPATCH_SUSPEND,
  1380. .value.dispatch_suspend.suspended = 1,
  1381. };
  1382. ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
  1383. 1, &dispatch_suspend);
  1384. if (ret)
  1385. vp_err(vp, "failed to suspend\n");
  1386. return ret;
  1387. }
  1388. static int
  1389. get_vp_signaled_count(struct mshv_vp *vp, u64 *count)
  1390. {
  1391. int ret;
  1392. struct hv_register_assoc root_signal_count = {
  1393. .name = HV_REGISTER_VP_ROOT_SIGNAL_COUNT,
  1394. };
  1395. ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
  1396. 1, &root_signal_count);
  1397. if (ret) {
  1398. vp_err(vp, "Failed to get root signal count");
  1399. *count = 0;
  1400. return ret;
  1401. }
  1402. *count = root_signal_count.value.reg64;
  1403. return ret;
  1404. }
  1405. static void
  1406. drain_vp_signals(struct mshv_vp *vp)
  1407. {
  1408. u64 hv_signal_count;
  1409. u64 vp_signal_count;
  1410. get_vp_signaled_count(vp, &hv_signal_count);
  1411. vp_signal_count = atomic64_read(&vp->run.vp_signaled_count);
  1412. /*
  1413. * There should be at most 1 outstanding notification, but be extra
  1414. * careful anyway.
  1415. */
  1416. while (hv_signal_count != vp_signal_count) {
  1417. WARN_ON(hv_signal_count - vp_signal_count != 1);
  1418. if (wait_event_interruptible(vp->run.vp_suspend_queue,
  1419. vp->run.kicked_by_hv == 1))
  1420. break;
  1421. vp->run.kicked_by_hv = 0;
  1422. vp_signal_count = atomic64_read(&vp->run.vp_signaled_count);
  1423. }
  1424. }
  1425. static void drain_all_vps(const struct mshv_partition *partition)
  1426. {
  1427. int i;
  1428. struct mshv_vp *vp;
  1429. /*
  1430. * VPs are reachable from ISR. It is safe to not take the partition
  1431. * lock because nobody else can enter this function and drop the
  1432. * partition from the list.
  1433. */
  1434. for (i = 0; i < MSHV_MAX_VPS; i++) {
  1435. vp = partition->pt_vp_array[i];
  1436. if (!vp)
  1437. continue;
  1438. /*
  1439. * Disable dispatching of the VP in the hypervisor. After this
  1440. * the hypervisor guarantees it won't generate any signals for
  1441. * the VP and the hypervisor's VP signal count won't change.
  1442. */
  1443. disable_vp_dispatch(vp);
  1444. drain_vp_signals(vp);
  1445. }
  1446. }
  1447. static void
  1448. remove_partition(struct mshv_partition *partition)
  1449. {
  1450. spin_lock(&mshv_root.pt_ht_lock);
  1451. hlist_del_rcu(&partition->pt_hnode);
  1452. spin_unlock(&mshv_root.pt_ht_lock);
  1453. synchronize_rcu();
  1454. }
  1455. /*
  1456. * Tear down a partition and remove it from the list.
  1457. * Partition's refcount must be 0
  1458. */
  1459. static void destroy_partition(struct mshv_partition *partition)
  1460. {
  1461. struct mshv_vp *vp;
  1462. struct mshv_mem_region *region;
  1463. struct hlist_node *n;
  1464. int i;
  1465. if (refcount_read(&partition->pt_ref_count)) {
  1466. pt_err(partition,
  1467. "Attempt to destroy partition but refcount > 0\n");
  1468. return;
  1469. }
  1470. if (partition->pt_initialized) {
  1471. /*
  1472. * We only need to drain signals for root scheduler. This should be
  1473. * done before removing the partition from the partition list.
  1474. */
  1475. if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
  1476. drain_all_vps(partition);
  1477. /* Remove vps */
  1478. for (i = 0; i < MSHV_MAX_VPS; ++i) {
  1479. vp = partition->pt_vp_array[i];
  1480. if (!vp)
  1481. continue;
  1482. mshv_debugfs_vp_remove(vp);
  1483. mshv_vp_stats_unmap(partition->pt_id, vp->vp_index,
  1484. vp->vp_stats_pages);
  1485. if (vp->vp_register_page) {
  1486. (void)hv_unmap_vp_state_page(partition->pt_id,
  1487. vp->vp_index,
  1488. HV_VP_STATE_PAGE_REGISTERS,
  1489. virt_to_page(vp->vp_register_page),
  1490. input_vtl_zero);
  1491. vp->vp_register_page = NULL;
  1492. }
  1493. (void)hv_unmap_vp_state_page(partition->pt_id,
  1494. vp->vp_index,
  1495. HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
  1496. virt_to_page(vp->vp_intercept_msg_page),
  1497. input_vtl_zero);
  1498. vp->vp_intercept_msg_page = NULL;
  1499. if (vp->vp_ghcb_page) {
  1500. (void)hv_unmap_vp_state_page(partition->pt_id,
  1501. vp->vp_index,
  1502. HV_VP_STATE_PAGE_GHCB,
  1503. virt_to_page(vp->vp_ghcb_page),
  1504. input_vtl_normal);
  1505. vp->vp_ghcb_page = NULL;
  1506. }
  1507. kfree(vp);
  1508. partition->pt_vp_array[i] = NULL;
  1509. }
  1510. mshv_debugfs_partition_remove(partition);
  1511. /* Deallocates and unmaps everything including vcpus, GPA mappings etc */
  1512. hv_call_finalize_partition(partition->pt_id);
  1513. partition->pt_initialized = false;
  1514. }
  1515. remove_partition(partition);
  1516. hlist_for_each_entry_safe(region, n, &partition->pt_mem_regions,
  1517. hnode) {
  1518. hlist_del(&region->hnode);
  1519. mshv_region_put(region);
  1520. }
  1521. /* Withdraw and free all pages we deposited */
  1522. hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id);
  1523. hv_call_delete_partition(partition->pt_id);
  1524. mshv_free_routing_table(partition);
  1525. kfree(partition);
  1526. }
  1527. struct
  1528. mshv_partition *mshv_partition_get(struct mshv_partition *partition)
  1529. {
  1530. if (refcount_inc_not_zero(&partition->pt_ref_count))
  1531. return partition;
  1532. return NULL;
  1533. }
  1534. struct
  1535. mshv_partition *mshv_partition_find(u64 partition_id)
  1536. __must_hold(RCU)
  1537. {
  1538. struct mshv_partition *p;
  1539. hash_for_each_possible_rcu(mshv_root.pt_htable, p, pt_hnode,
  1540. partition_id)
  1541. if (p->pt_id == partition_id)
  1542. return p;
  1543. return NULL;
  1544. }
  1545. void
  1546. mshv_partition_put(struct mshv_partition *partition)
  1547. {
  1548. if (refcount_dec_and_test(&partition->pt_ref_count))
  1549. destroy_partition(partition);
  1550. }
  1551. static int
  1552. mshv_partition_release(struct inode *inode, struct file *filp)
  1553. {
  1554. struct mshv_partition *partition = filp->private_data;
  1555. mshv_eventfd_release(partition);
  1556. cleanup_srcu_struct(&partition->pt_irq_srcu);
  1557. mshv_partition_put(partition);
  1558. return 0;
  1559. }
  1560. static int
  1561. add_partition(struct mshv_partition *partition)
  1562. {
  1563. spin_lock(&mshv_root.pt_ht_lock);
  1564. hash_add_rcu(mshv_root.pt_htable, &partition->pt_hnode,
  1565. partition->pt_id);
  1566. spin_unlock(&mshv_root.pt_ht_lock);
  1567. return 0;
  1568. }
  1569. static_assert(MSHV_NUM_CPU_FEATURES_BANKS ==
  1570. HV_PARTITION_PROCESSOR_FEATURES_BANKS);
  1571. static long mshv_ioctl_process_pt_flags(void __user *user_arg, u64 *pt_flags,
  1572. struct hv_partition_creation_properties *cr_props,
  1573. union hv_partition_isolation_properties *isol_props)
  1574. {
  1575. int i;
  1576. struct mshv_create_partition_v2 args;
  1577. union hv_partition_processor_features *disabled_procs;
  1578. union hv_partition_processor_xsave_features *disabled_xsave;
  1579. /* First, copy v1 struct in case user is on previous versions */
  1580. if (copy_from_user(&args, user_arg,
  1581. sizeof(struct mshv_create_partition)))
  1582. return -EFAULT;
  1583. if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) ||
  1584. args.pt_isolation >= MSHV_PT_ISOLATION_COUNT)
  1585. return -EINVAL;
  1586. disabled_procs = &cr_props->disabled_processor_features;
  1587. disabled_xsave = &cr_props->disabled_processor_xsave_features;
  1588. /* Check if user provided newer struct with feature fields */
  1589. if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES)) {
  1590. if (copy_from_user(&args, user_arg, sizeof(args)))
  1591. return -EFAULT;
  1592. /* Re-validate v1 fields after second copy_from_user() */
  1593. if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) ||
  1594. args.pt_isolation >= MSHV_PT_ISOLATION_COUNT)
  1595. return -EINVAL;
  1596. if (args.pt_num_cpu_fbanks != MSHV_NUM_CPU_FEATURES_BANKS ||
  1597. mshv_field_nonzero(args, pt_rsvd) ||
  1598. mshv_field_nonzero(args, pt_rsvd1))
  1599. return -EINVAL;
  1600. /*
  1601. * Note this assumes MSHV_NUM_CPU_FEATURES_BANKS will never
  1602. * change and equals HV_PARTITION_PROCESSOR_FEATURES_BANKS
  1603. * (i.e. 2).
  1604. *
  1605. * Further banks (index >= 2) will be modifiable as 'early'
  1606. * properties via the set partition property hypercall.
  1607. */
  1608. for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++)
  1609. disabled_procs->as_uint64[i] = args.pt_cpu_fbanks[i];
  1610. #if IS_ENABLED(CONFIG_X86_64)
  1611. disabled_xsave->as_uint64 = args.pt_disabled_xsave;
  1612. #else
  1613. /*
  1614. * In practice this field is ignored on arm64, but safer to
  1615. * zero it in case it is ever used.
  1616. */
  1617. disabled_xsave->as_uint64 = 0;
  1618. if (mshv_field_nonzero(args, pt_rsvd2))
  1619. return -EINVAL;
  1620. #endif
  1621. } else {
  1622. /*
  1623. * v1 behavior: try to enable everything. The hypervisor will
  1624. * disable features that are not supported. The banks can be
  1625. * queried via the get partition property hypercall.
  1626. */
  1627. for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++)
  1628. disabled_procs->as_uint64[i] = 0;
  1629. disabled_xsave->as_uint64 = 0;
  1630. }
  1631. /* Only support EXO partitions */
  1632. *pt_flags = HV_PARTITION_CREATION_FLAG_EXO_PARTITION |
  1633. HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED;
  1634. if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_LAPIC))
  1635. *pt_flags |= HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED;
  1636. if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_X2APIC))
  1637. *pt_flags |= HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE;
  1638. if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_GPA_SUPER_PAGES))
  1639. *pt_flags |= HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED;
  1640. if (args.pt_flags & BIT(MSHV_PT_BIT_NESTED_VIRTUALIZATION))
  1641. *pt_flags |= HV_PARTITION_CREATION_FLAG_NESTED_VIRTUALIZATION_CAPABLE;
  1642. if (args.pt_flags & BIT(MSHV_PT_BIT_SMT_ENABLED_GUEST))
  1643. *pt_flags |= HV_PARTITION_CREATION_FLAG_SMT_ENABLED_GUEST;
  1644. isol_props->as_uint64 = 0;
  1645. switch (args.pt_isolation) {
  1646. case MSHV_PT_ISOLATION_NONE:
  1647. isol_props->isolation_type = HV_PARTITION_ISOLATION_TYPE_NONE;
  1648. break;
  1649. }
  1650. return 0;
  1651. }
  1652. static long
  1653. mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev)
  1654. {
  1655. u64 creation_flags;
  1656. struct hv_partition_creation_properties creation_properties;
  1657. union hv_partition_isolation_properties isolation_properties;
  1658. struct mshv_partition *partition;
  1659. long ret;
  1660. ret = mshv_ioctl_process_pt_flags(user_arg, &creation_flags,
  1661. &creation_properties,
  1662. &isolation_properties);
  1663. if (ret)
  1664. return ret;
  1665. partition = kzalloc_obj(*partition);
  1666. if (!partition)
  1667. return -ENOMEM;
  1668. partition->pt_module_dev = module_dev;
  1669. partition->isolation_type = isolation_properties.isolation_type;
  1670. refcount_set(&partition->pt_ref_count, 1);
  1671. mutex_init(&partition->pt_mutex);
  1672. mutex_init(&partition->pt_irq_lock);
  1673. init_completion(&partition->async_hypercall);
  1674. INIT_HLIST_HEAD(&partition->irq_ack_notifier_list);
  1675. INIT_HLIST_HEAD(&partition->pt_devices);
  1676. spin_lock_init(&partition->pt_mem_regions_lock);
  1677. INIT_HLIST_HEAD(&partition->pt_mem_regions);
  1678. mshv_eventfd_init(partition);
  1679. ret = init_srcu_struct(&partition->pt_irq_srcu);
  1680. if (ret)
  1681. goto free_partition;
  1682. ret = hv_call_create_partition(creation_flags,
  1683. creation_properties,
  1684. isolation_properties,
  1685. &partition->pt_id);
  1686. if (ret)
  1687. goto cleanup_irq_srcu;
  1688. ret = add_partition(partition);
  1689. if (ret)
  1690. goto delete_partition;
  1691. ret = mshv_init_async_handler(partition);
  1692. if (!ret) {
  1693. ret = FD_ADD(O_CLOEXEC, anon_inode_getfile("mshv_partition",
  1694. &mshv_partition_fops,
  1695. partition, O_RDWR));
  1696. if (ret >= 0)
  1697. return ret;
  1698. }
  1699. remove_partition(partition);
  1700. delete_partition:
  1701. hv_call_delete_partition(partition->pt_id);
  1702. cleanup_irq_srcu:
  1703. cleanup_srcu_struct(&partition->pt_irq_srcu);
  1704. free_partition:
  1705. kfree(partition);
  1706. return ret;
  1707. }
  1708. static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl,
  1709. unsigned long arg)
  1710. {
  1711. struct miscdevice *misc = filp->private_data;
  1712. switch (ioctl) {
  1713. case MSHV_CREATE_PARTITION:
  1714. return mshv_ioctl_create_partition((void __user *)arg,
  1715. misc->this_device);
  1716. case MSHV_ROOT_HVCALL:
  1717. return mshv_ioctl_passthru_hvcall(NULL, false,
  1718. (void __user *)arg);
  1719. }
  1720. return -ENOTTY;
  1721. }
  /* Open handler for the mshv device: no per-open state, always succeeds. */
  static int mshv_dev_open(struct inode *inode, struct file *filp)
  {
  	return 0;
  }
  /* Release handler for the mshv device: nothing to tear down, returns 0. */
  static int mshv_dev_release(struct inode *inode, struct file *filp)
  {
  	return 0;
  }
  1732. static int mshv_root_sched_online;
  1733. static const char *scheduler_type_to_string(enum hv_scheduler_type type)
  1734. {
  1735. switch (type) {
  1736. case HV_SCHEDULER_TYPE_LP:
  1737. return "classic scheduler without SMT";
  1738. case HV_SCHEDULER_TYPE_LP_SMT:
  1739. return "classic scheduler with SMT";
  1740. case HV_SCHEDULER_TYPE_CORE_SMT:
  1741. return "core scheduler";
  1742. case HV_SCHEDULER_TYPE_ROOT:
  1743. return "root scheduler";
  1744. default:
  1745. return "unknown scheduler";
  1746. };
  1747. }
  1748. static int __init l1vh_retrieve_scheduler_type(enum hv_scheduler_type *out)
  1749. {
  1750. u64 integrated_sched_enabled;
  1751. int ret;
  1752. *out = HV_SCHEDULER_TYPE_CORE_SMT;
  1753. if (!mshv_root.vmm_caps.vmm_enable_integrated_scheduler)
  1754. return 0;
  1755. ret = hv_call_get_partition_property_ex(HV_PARTITION_ID_SELF,
  1756. HV_PARTITION_PROPERTY_INTEGRATED_SCHEDULER_ENABLED,
  1757. 0, &integrated_sched_enabled,
  1758. sizeof(integrated_sched_enabled));
  1759. if (ret)
  1760. return ret;
  1761. if (integrated_sched_enabled)
  1762. *out = HV_SCHEDULER_TYPE_ROOT;
  1763. return 0;
  1764. }
  1765. /* TODO move this to hv_common.c when needed outside */
  1766. static int __init hv_retrieve_scheduler_type(enum hv_scheduler_type *out)
  1767. {
  1768. struct hv_input_get_system_property *input;
  1769. struct hv_output_get_system_property *output;
  1770. unsigned long flags;
  1771. u64 status;
  1772. local_irq_save(flags);
  1773. input = *this_cpu_ptr(hyperv_pcpu_input_arg);
  1774. output = *this_cpu_ptr(hyperv_pcpu_output_arg);
  1775. memset(input, 0, sizeof(*input));
  1776. memset(output, 0, sizeof(*output));
  1777. input->property_id = HV_SYSTEM_PROPERTY_SCHEDULER_TYPE;
  1778. status = hv_do_hypercall(HVCALL_GET_SYSTEM_PROPERTY, input, output);
  1779. if (!hv_result_success(status)) {
  1780. local_irq_restore(flags);
  1781. pr_err("%s: %s\n", __func__, hv_result_to_string(status));
  1782. return hv_result_to_errno(status);
  1783. }
  1784. *out = output->scheduler_type;
  1785. local_irq_restore(flags);
  1786. return 0;
  1787. }
  1788. /* Retrieve and stash the supported scheduler type */
  1789. static int __init mshv_retrieve_scheduler_type(struct device *dev)
  1790. {
  1791. int ret;
  1792. if (hv_l1vh_partition())
  1793. ret = l1vh_retrieve_scheduler_type(&hv_scheduler_type);
  1794. else
  1795. ret = hv_retrieve_scheduler_type(&hv_scheduler_type);
  1796. if (ret)
  1797. return ret;
  1798. dev_info(dev, "Hypervisor using %s\n",
  1799. scheduler_type_to_string(hv_scheduler_type));
  1800. switch (hv_scheduler_type) {
  1801. case HV_SCHEDULER_TYPE_CORE_SMT:
  1802. case HV_SCHEDULER_TYPE_LP_SMT:
  1803. case HV_SCHEDULER_TYPE_ROOT:
  1804. case HV_SCHEDULER_TYPE_LP:
  1805. /* Supported scheduler, nothing to do */
  1806. break;
  1807. default:
  1808. dev_err(dev, "unsupported scheduler 0x%x, bailing.\n",
  1809. hv_scheduler_type);
  1810. return -EOPNOTSUPP;
  1811. }
  1812. return 0;
  1813. }
  1814. static int mshv_root_scheduler_init(unsigned int cpu)
  1815. {
  1816. void **inputarg, **outputarg, *p;
  1817. inputarg = (void **)this_cpu_ptr(root_scheduler_input);
  1818. outputarg = (void **)this_cpu_ptr(root_scheduler_output);
  1819. /* Allocate two consecutive pages. One for input, one for output. */
  1820. p = kmalloc(2 * HV_HYP_PAGE_SIZE, GFP_KERNEL);
  1821. if (!p)
  1822. return -ENOMEM;
  1823. *inputarg = p;
  1824. *outputarg = (char *)p + HV_HYP_PAGE_SIZE;
  1825. return 0;
  1826. }
  1827. static int mshv_root_scheduler_cleanup(unsigned int cpu)
  1828. {
  1829. void *p, **inputarg, **outputarg;
  1830. inputarg = (void **)this_cpu_ptr(root_scheduler_input);
  1831. outputarg = (void **)this_cpu_ptr(root_scheduler_output);
  1832. p = *inputarg;
  1833. *inputarg = NULL;
  1834. *outputarg = NULL;
  1835. kfree(p);
  1836. return 0;
  1837. }
  1838. /* Must be called after retrieving the scheduler type */
  1839. static int
  1840. root_scheduler_init(struct device *dev)
  1841. {
  1842. int ret;
  1843. if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT)
  1844. return 0;
  1845. root_scheduler_input = alloc_percpu(void *);
  1846. root_scheduler_output = alloc_percpu(void *);
  1847. if (!root_scheduler_input || !root_scheduler_output) {
  1848. dev_err(dev, "Failed to allocate root scheduler buffers\n");
  1849. ret = -ENOMEM;
  1850. goto out;
  1851. }
  1852. ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_root_sched",
  1853. mshv_root_scheduler_init,
  1854. mshv_root_scheduler_cleanup);
  1855. if (ret < 0) {
  1856. dev_err(dev, "Failed to setup root scheduler state: %i\n", ret);
  1857. goto out;
  1858. }
  1859. mshv_root_sched_online = ret;
  1860. return 0;
  1861. out:
  1862. free_percpu(root_scheduler_input);
  1863. free_percpu(root_scheduler_output);
  1864. return ret;
  1865. }
  1866. static void
  1867. root_scheduler_deinit(void)
  1868. {
  1869. if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT)
  1870. return;
  1871. cpuhp_remove_state(mshv_root_sched_online);
  1872. free_percpu(root_scheduler_input);
  1873. free_percpu(root_scheduler_output);
  1874. }
  1875. static int __init mshv_init_vmm_caps(struct device *dev)
  1876. {
  1877. int ret;
  1878. ret = hv_call_get_partition_property_ex(HV_PARTITION_ID_SELF,
  1879. HV_PARTITION_PROPERTY_VMM_CAPABILITIES,
  1880. 0, &mshv_root.vmm_caps,
  1881. sizeof(mshv_root.vmm_caps));
  1882. if (ret && hv_l1vh_partition()) {
  1883. dev_err(dev, "Failed to get VMM capabilities: %d\n", ret);
  1884. return ret;
  1885. }
  1886. dev_dbg(dev, "vmm_caps = %#llx\n", mshv_root.vmm_caps.as_uint64[0]);
  1887. return 0;
  1888. }
  1889. static int __init mshv_parent_partition_init(void)
  1890. {
  1891. int ret;
  1892. struct device *dev;
  1893. union hv_hypervisor_version_info version_info;
  1894. if (!hv_parent_partition() || is_kdump_kernel())
  1895. return -ENODEV;
  1896. if (hv_get_hypervisor_version(&version_info))
  1897. return -ENODEV;
  1898. ret = misc_register(&mshv_dev);
  1899. if (ret)
  1900. return ret;
  1901. dev = mshv_dev.this_device;
  1902. if (version_info.build_number < MSHV_HV_MIN_VERSION ||
  1903. version_info.build_number > MSHV_HV_MAX_VERSION) {
  1904. dev_err(dev, "Running on unvalidated Hyper-V version\n");
  1905. dev_err(dev, "Versions: current: %u min: %u max: %u\n",
  1906. version_info.build_number, MSHV_HV_MIN_VERSION,
  1907. MSHV_HV_MAX_VERSION);
  1908. }
  1909. ret = mshv_synic_init(dev);
  1910. if (ret)
  1911. goto device_deregister;
  1912. ret = mshv_init_vmm_caps(dev);
  1913. if (ret)
  1914. goto synic_cleanup;
  1915. ret = mshv_retrieve_scheduler_type(dev);
  1916. if (ret)
  1917. goto synic_cleanup;
  1918. ret = root_scheduler_init(dev);
  1919. if (ret)
  1920. goto synic_cleanup;
  1921. ret = mshv_debugfs_init();
  1922. if (ret)
  1923. goto deinit_root_scheduler;
  1924. ret = mshv_irqfd_wq_init();
  1925. if (ret)
  1926. goto exit_debugfs;
  1927. spin_lock_init(&mshv_root.pt_ht_lock);
  1928. hash_init(mshv_root.pt_htable);
  1929. hv_setup_mshv_handler(mshv_isr);
  1930. return 0;
  1931. exit_debugfs:
  1932. mshv_debugfs_exit();
  1933. deinit_root_scheduler:
  1934. root_scheduler_deinit();
  1935. synic_cleanup:
  1936. mshv_synic_exit();
  1937. device_deregister:
  1938. misc_deregister(&mshv_dev);
  1939. return ret;
  1940. }
  1941. static void __exit mshv_parent_partition_exit(void)
  1942. {
  1943. hv_setup_mshv_handler(NULL);
  1944. mshv_port_table_fini();
  1945. mshv_debugfs_exit();
  1946. misc_deregister(&mshv_dev);
  1947. mshv_irqfd_wq_cleanup();
  1948. root_scheduler_deinit();
  1949. mshv_synic_exit();
  1950. }
  1951. module_init(mshv_parent_partition_init);
  1952. module_exit(mshv_parent_partition_exit);