| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761
27712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761
77717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762
27722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357 |
- // SPDX-License-Identifier: GPL-2.0-only
- /*
- * Copyright (c) 2024, Microsoft Corporation.
- *
- * The main part of the mshv_root module, providing APIs to create
- * and manage guest partitions.
- *
- * Authors: Microsoft Linux virtualization team
- */
- #include <linux/entry-virt.h>
- #include <linux/kernel.h>
- #include <linux/module.h>
- #include <linux/fs.h>
- #include <linux/miscdevice.h>
- #include <linux/slab.h>
- #include <linux/file.h>
- #include <linux/anon_inodes.h>
- #include <linux/mm.h>
- #include <linux/io.h>
- #include <linux/cpuhotplug.h>
- #include <linux/random.h>
- #include <asm/mshyperv.h>
- #include <linux/hyperv.h>
- #include <linux/notifier.h>
- #include <linux/reboot.h>
- #include <linux/kexec.h>
- #include <linux/page-flags.h>
- #include <linux/crash_dump.h>
- #include <linux/panic_notifier.h>
- #include <linux/vmalloc.h>
- #include <linux/rseq.h>
- #include "mshv_eventfd.h"
- #include "mshv.h"
- #include "mshv_root.h"
- MODULE_AUTHOR("Microsoft");
- MODULE_LICENSE("GPL");
- MODULE_DESCRIPTION("Microsoft Hyper-V root partition VMM interface /dev/mshv");
- /* HV_THREAD_COUNTER */
- #if defined(CONFIG_X86_64)
- #define HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED 202
- #elif defined(CONFIG_ARM64)
- #define HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED 95
- #endif
- struct mshv_root mshv_root;
- enum hv_scheduler_type hv_scheduler_type;
- /* Once we implement the fast extended hypercall ABI they can go away. */
- static void * __percpu *root_scheduler_input;
- static void * __percpu *root_scheduler_output;
- static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
- static int mshv_dev_open(struct inode *inode, struct file *filp);
- static int mshv_dev_release(struct inode *inode, struct file *filp);
- static int mshv_vp_release(struct inode *inode, struct file *filp);
- static long mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
- static int mshv_partition_release(struct inode *inode, struct file *filp);
- static long mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
- static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma);
- static vm_fault_t mshv_vp_fault(struct vm_fault *vmf);
- static int mshv_init_async_handler(struct mshv_partition *partition);
- static void mshv_async_hvcall_handler(void *data, u64 *status);
- static const union hv_input_vtl input_vtl_zero;
- static const union hv_input_vtl input_vtl_normal = {
- .target_vtl = HV_NORMAL_VTL,
- .use_target_vtl = 1,
- };
- static const struct vm_operations_struct mshv_vp_vm_ops = {
- .fault = mshv_vp_fault,
- };
- static const struct file_operations mshv_vp_fops = {
- .owner = THIS_MODULE,
- .release = mshv_vp_release,
- .unlocked_ioctl = mshv_vp_ioctl,
- .llseek = noop_llseek,
- .mmap = mshv_vp_mmap,
- };
- static const struct file_operations mshv_partition_fops = {
- .owner = THIS_MODULE,
- .release = mshv_partition_release,
- .unlocked_ioctl = mshv_partition_ioctl,
- .llseek = noop_llseek,
- };
- static const struct file_operations mshv_dev_fops = {
- .owner = THIS_MODULE,
- .open = mshv_dev_open,
- .release = mshv_dev_release,
- .unlocked_ioctl = mshv_dev_ioctl,
- .llseek = noop_llseek,
- };
- static struct miscdevice mshv_dev = {
- .minor = MISC_DYNAMIC_MINOR,
- .name = "mshv",
- .fops = &mshv_dev_fops,
- .mode = 0600,
- };
- /*
- * Only allow hypercalls that have a u64 partition id as the first member of
- * the input structure.
- * These are sorted by value.
- */
- static u16 mshv_passthru_hvcalls[] = {
- HVCALL_GET_PARTITION_PROPERTY,
- HVCALL_GET_PARTITION_PROPERTY_EX,
- HVCALL_SET_PARTITION_PROPERTY,
- HVCALL_INSTALL_INTERCEPT,
- HVCALL_GET_VP_REGISTERS,
- HVCALL_SET_VP_REGISTERS,
- HVCALL_TRANSLATE_VIRTUAL_ADDRESS,
- HVCALL_CLEAR_VIRTUAL_INTERRUPT,
- HVCALL_REGISTER_INTERCEPT_RESULT,
- HVCALL_ASSERT_VIRTUAL_INTERRUPT,
- HVCALL_GET_GPA_PAGES_ACCESS_STATES,
- HVCALL_SIGNAL_EVENT_DIRECT,
- HVCALL_POST_MESSAGE_DIRECT,
- HVCALL_GET_VP_CPUID_VALUES,
- };
- /*
- * Only allow hypercalls that are safe to be called by the VMM with the host
- * partition as target (i.e. HV_PARTITION_ID_SELF). Carefully audit that a
- * hypercall cannot be misused by the VMM before adding it to this list.
- */
- static u16 mshv_self_passthru_hvcalls[] = {
- HVCALL_GET_PARTITION_PROPERTY,
- HVCALL_GET_PARTITION_PROPERTY_EX,
- };
- static bool mshv_hvcall_is_async(u16 code)
- {
- switch (code) {
- case HVCALL_SET_PARTITION_PROPERTY:
- return true;
- default:
- break;
- }
- return false;
- }
- static bool mshv_passthru_hvcall_allowed(u16 code, u64 pt_id)
- {
- int i;
- int n = ARRAY_SIZE(mshv_passthru_hvcalls);
- u16 *allowed_hvcalls = mshv_passthru_hvcalls;
- if (pt_id == HV_PARTITION_ID_SELF) {
- n = ARRAY_SIZE(mshv_self_passthru_hvcalls);
- allowed_hvcalls = mshv_self_passthru_hvcalls;
- }
- for (i = 0; i < n; ++i)
- if (allowed_hvcalls[i] == code)
- return true;
- return false;
- }
- static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition,
- bool partition_locked,
- void __user *user_args)
- {
- u64 status;
- int ret = 0;
- bool is_async;
- struct mshv_root_hvcall args;
- struct page *page;
- unsigned int pages_order;
- void *input_pg = NULL;
- void *output_pg = NULL;
- u16 reps_completed;
- u64 pt_id = partition ? partition->pt_id : HV_PARTITION_ID_SELF;
- if (copy_from_user(&args, user_args, sizeof(args)))
- return -EFAULT;
- if (args.status || !args.in_ptr || args.in_sz < sizeof(u64) ||
- mshv_field_nonzero(args, rsvd) || args.in_sz > HV_HYP_PAGE_SIZE)
- return -EINVAL;
- if (args.out_ptr && (!args.out_sz || args.out_sz > HV_HYP_PAGE_SIZE))
- return -EINVAL;
- if (!mshv_passthru_hvcall_allowed(args.code, pt_id))
- return -EINVAL;
- is_async = mshv_hvcall_is_async(args.code);
- if (is_async) {
- /* async hypercalls can only be called from partition fd */
- if (!partition || !partition_locked)
- return -EINVAL;
- ret = mshv_init_async_handler(partition);
- if (ret)
- return ret;
- }
- pages_order = args.out_ptr ? 1 : 0;
- page = alloc_pages(GFP_KERNEL, pages_order);
- if (!page)
- return -ENOMEM;
- input_pg = page_address(page);
- if (args.out_ptr)
- output_pg = (char *)input_pg + PAGE_SIZE;
- else
- output_pg = NULL;
- if (copy_from_user(input_pg, (void __user *)args.in_ptr,
- args.in_sz)) {
- ret = -EFAULT;
- goto free_pages_out;
- }
- /*
- * NOTE: This only works because all the allowed hypercalls' input
- * structs begin with a u64 partition_id field.
- */
- *(u64 *)input_pg = pt_id;
- reps_completed = 0;
- do {
- if (args.reps) {
- status = hv_do_rep_hypercall_ex(args.code, args.reps,
- 0, reps_completed,
- input_pg, output_pg);
- reps_completed = hv_repcomp(status);
- } else {
- status = hv_do_hypercall(args.code, input_pg, output_pg);
- }
- if (hv_result(status) == HV_STATUS_CALL_PENDING) {
- if (is_async) {
- mshv_async_hvcall_handler(partition, &status);
- } else { /* Paranoia check. This shouldn't happen! */
- ret = -EBADFD;
- goto free_pages_out;
- }
- }
- if (hv_result_success(status))
- break;
- if (!hv_result_needs_memory(status))
- ret = hv_result_to_errno(status);
- else
- ret = hv_deposit_memory(pt_id, status);
- } while (!ret);
- args.status = hv_result(status);
- args.reps = reps_completed;
- if (copy_to_user(user_args, &args, sizeof(args)))
- ret = -EFAULT;
- if (!ret && output_pg &&
- copy_to_user((void __user *)args.out_ptr, output_pg, args.out_sz))
- ret = -EFAULT;
- free_pages_out:
- free_pages((unsigned long)input_pg, pages_order);
- return ret;
- }
- static inline bool is_ghcb_mapping_available(void)
- {
- #if IS_ENABLED(CONFIG_X86_64)
- return ms_hyperv.ext_features & HV_VP_GHCB_ROOT_MAPPING_AVAILABLE;
- #else
- return 0;
- #endif
- }
- static int mshv_get_vp_registers(u32 vp_index, u64 partition_id, u16 count,
- struct hv_register_assoc *registers)
- {
- return hv_call_get_vp_registers(vp_index, partition_id,
- count, input_vtl_zero, registers);
- }
- static int mshv_set_vp_registers(u32 vp_index, u64 partition_id, u16 count,
- struct hv_register_assoc *registers)
- {
- return hv_call_set_vp_registers(vp_index, partition_id,
- count, input_vtl_zero, registers);
- }
- /*
- * Explicit guest vCPU suspend is asynchronous by nature (as it is requested by
- * dom0 vCPU for guest vCPU) and thus it can race with "intercept" suspend,
- * done by the hypervisor.
- * "Intercept" suspend leads to asynchronous message delivery to dom0 which
- * should be awaited to keep the VP loop consistent (i.e. no message pending
- * upon VP resume).
- * VP intercept suspend can't be done when the VP is explicitly suspended
- * already, and thus can be only two possible race scenarios:
- * 1. implicit suspend bit set -> explicit suspend bit set -> message sent
- * 2. implicit suspend bit set -> message sent -> explicit suspend bit set
- * Checking for implicit suspend bit set after explicit suspend request has
- * succeeded in either case allows us to reliably identify, if there is a
- * message to receive and deliver to VMM.
- */
- static int
- mshv_suspend_vp(const struct mshv_vp *vp, bool *message_in_flight)
- {
- struct hv_register_assoc explicit_suspend = {
- .name = HV_REGISTER_EXPLICIT_SUSPEND
- };
- struct hv_register_assoc intercept_suspend = {
- .name = HV_REGISTER_INTERCEPT_SUSPEND
- };
- union hv_explicit_suspend_register *es =
- &explicit_suspend.value.explicit_suspend;
- union hv_intercept_suspend_register *is =
- &intercept_suspend.value.intercept_suspend;
- int ret;
- es->suspended = 1;
- ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
- 1, &explicit_suspend);
- if (ret) {
- vp_err(vp, "Failed to explicitly suspend vCPU\n");
- return ret;
- }
- ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
- 1, &intercept_suspend);
- if (ret) {
- vp_err(vp, "Failed to get intercept suspend state\n");
- return ret;
- }
- *message_in_flight = is->suspended;
- return 0;
- }
- /*
- * This function is used when VPs are scheduled by the hypervisor's
- * scheduler.
- *
- * Caller has to make sure the registers contain cleared
- * HV_REGISTER_INTERCEPT_SUSPEND and HV_REGISTER_EXPLICIT_SUSPEND registers
- * exactly in this order (the hypervisor clears them sequentially) to avoid
- * potential invalid clearing a newly arrived HV_REGISTER_INTERCEPT_SUSPEND
- * after VP is released from HV_REGISTER_EXPLICIT_SUSPEND in case of the
- * opposite order.
- */
- static long mshv_run_vp_with_hyp_scheduler(struct mshv_vp *vp)
- {
- long ret;
- struct hv_register_assoc suspend_regs[2] = {
- { .name = HV_REGISTER_INTERCEPT_SUSPEND },
- { .name = HV_REGISTER_EXPLICIT_SUSPEND }
- };
- size_t count = ARRAY_SIZE(suspend_regs);
- /* Resume VP execution */
- ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
- count, suspend_regs);
- if (ret) {
- vp_err(vp, "Failed to resume vp execution. %lx\n", ret);
- return ret;
- }
- ret = wait_event_interruptible(vp->run.vp_suspend_queue,
- vp->run.kicked_by_hv == 1);
- if (ret) {
- bool message_in_flight;
- /*
- * Otherwise the waiting was interrupted by a signal: suspend
- * the vCPU explicitly and copy message in flight (if any).
- */
- ret = mshv_suspend_vp(vp, &message_in_flight);
- if (ret)
- return ret;
- /* Return if no message in flight */
- if (!message_in_flight)
- return -EINTR;
- /* Wait for the message in flight. */
- wait_event(vp->run.vp_suspend_queue, vp->run.kicked_by_hv == 1);
- }
- /*
- * Reset the flag to make the wait_event call above work
- * next time.
- */
- vp->run.kicked_by_hv = 0;
- return 0;
- }
- static int
- mshv_vp_dispatch(struct mshv_vp *vp, u32 flags,
- struct hv_output_dispatch_vp *res)
- {
- struct hv_input_dispatch_vp *input;
- struct hv_output_dispatch_vp *output;
- u64 status;
- preempt_disable();
- input = *this_cpu_ptr(root_scheduler_input);
- output = *this_cpu_ptr(root_scheduler_output);
- memset(input, 0, sizeof(*input));
- memset(output, 0, sizeof(*output));
- input->partition_id = vp->vp_partition->pt_id;
- input->vp_index = vp->vp_index;
- input->time_slice = 0; /* Run forever until something happens */
- input->spec_ctrl = 0; /* TODO: set sensible flags */
- input->flags = flags;
- vp->run.flags.root_sched_dispatched = 1;
- status = hv_do_hypercall(HVCALL_DISPATCH_VP, input, output);
- vp->run.flags.root_sched_dispatched = 0;
- *res = *output;
- preempt_enable();
- if (!hv_result_success(status))
- vp_err(vp, "%s: status %s\n", __func__,
- hv_result_to_string(status));
- return hv_result_to_errno(status);
- }
- static int
- mshv_vp_clear_explicit_suspend(struct mshv_vp *vp)
- {
- struct hv_register_assoc explicit_suspend = {
- .name = HV_REGISTER_EXPLICIT_SUSPEND,
- .value.explicit_suspend.suspended = 0,
- };
- int ret;
- ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
- 1, &explicit_suspend);
- if (ret)
- vp_err(vp, "Failed to unsuspend\n");
- return ret;
- }
- #if IS_ENABLED(CONFIG_X86_64)
- static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp)
- {
- if (!vp->vp_register_page)
- return 0;
- return vp->vp_register_page->interrupt_vectors.as_uint64;
- }
- #else
- static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp)
- {
- return 0;
- }
- #endif
- static bool mshv_vp_dispatch_thread_blocked(struct mshv_vp *vp)
- {
- struct hv_stats_page **stats = vp->vp_stats_pages;
- u64 *self_vp_cntrs = stats[HV_STATS_AREA_SELF]->data;
- u64 *parent_vp_cntrs = stats[HV_STATS_AREA_PARENT]->data;
- return parent_vp_cntrs[HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED] ||
- self_vp_cntrs[HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED];
- }
- static int
- mshv_vp_wait_for_hv_kick(struct mshv_vp *vp)
- {
- int ret;
- ret = wait_event_interruptible(vp->run.vp_suspend_queue,
- (vp->run.kicked_by_hv == 1 &&
- !mshv_vp_dispatch_thread_blocked(vp)) ||
- mshv_vp_interrupt_pending(vp));
- if (ret)
- return -EINTR;
- vp->run.flags.root_sched_blocked = 0;
- vp->run.kicked_by_hv = 0;
- return 0;
- }
- /* Must be called with interrupts enabled */
- static long mshv_run_vp_with_root_scheduler(struct mshv_vp *vp)
- {
- long ret;
- if (vp->run.flags.root_sched_blocked) {
- /*
- * Dispatch state of this VP is blocked. Need to wait
- * for the hypervisor to clear the blocked state before
- * dispatching it.
- */
- ret = mshv_vp_wait_for_hv_kick(vp);
- if (ret)
- return ret;
- }
- do {
- u32 flags = 0;
- struct hv_output_dispatch_vp output;
- if (__xfer_to_guest_mode_work_pending()) {
- ret = xfer_to_guest_mode_handle_work();
- if (ret)
- break;
- }
- if (vp->run.flags.intercept_suspend)
- flags |= HV_DISPATCH_VP_FLAG_CLEAR_INTERCEPT_SUSPEND;
- if (mshv_vp_interrupt_pending(vp))
- flags |= HV_DISPATCH_VP_FLAG_SCAN_INTERRUPT_INJECTION;
- ret = mshv_vp_dispatch(vp, flags, &output);
- if (ret)
- break;
- vp->run.flags.intercept_suspend = 0;
- if (output.dispatch_state == HV_VP_DISPATCH_STATE_BLOCKED) {
- if (output.dispatch_event ==
- HV_VP_DISPATCH_EVENT_SUSPEND) {
- /*
- * TODO: remove the warning once VP canceling
- * is supported
- */
- WARN_ONCE(atomic64_read(&vp->run.vp_signaled_count),
- "%s: vp#%d: unexpected explicit suspend\n",
- __func__, vp->vp_index);
- /*
- * Need to clear explicit suspend before
- * dispatching.
- * Explicit suspend is either:
- * - set right after the first VP dispatch or
- * - set explicitly via hypercall
- * Since the latter case is not yet supported,
- * simply clear it here.
- */
- ret = mshv_vp_clear_explicit_suspend(vp);
- if (ret)
- break;
- ret = mshv_vp_wait_for_hv_kick(vp);
- if (ret)
- break;
- } else {
- vp->run.flags.root_sched_blocked = 1;
- ret = mshv_vp_wait_for_hv_kick(vp);
- if (ret)
- break;
- }
- } else {
- /* HV_VP_DISPATCH_STATE_READY */
- if (output.dispatch_event ==
- HV_VP_DISPATCH_EVENT_INTERCEPT)
- vp->run.flags.intercept_suspend = 1;
- }
- } while (!vp->run.flags.intercept_suspend);
- rseq_virt_userspace_exit();
- return ret;
- }
- static_assert(sizeof(struct hv_message) <= MSHV_RUN_VP_BUF_SZ,
- "sizeof(struct hv_message) must not exceed MSHV_RUN_VP_BUF_SZ");
- static struct mshv_mem_region *
- mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
- {
- struct mshv_mem_region *region;
- hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) {
- if (gfn >= region->start_gfn &&
- gfn < region->start_gfn + region->nr_pages)
- return region;
- }
- return NULL;
- }
- static struct mshv_mem_region *
- mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn)
- {
- struct mshv_mem_region *region;
- spin_lock(&p->pt_mem_regions_lock);
- region = mshv_partition_region_by_gfn(p, gfn);
- if (!region || !mshv_region_get(region)) {
- spin_unlock(&p->pt_mem_regions_lock);
- return NULL;
- }
- spin_unlock(&p->pt_mem_regions_lock);
- return region;
- }
- /**
- * mshv_handle_gpa_intercept - Handle GPA (Guest Physical Address) intercepts.
- * @vp: Pointer to the virtual processor structure.
- *
- * This function processes GPA intercepts by identifying the memory region
- * corresponding to the intercepted GPA, aligning the page offset, and
- * mapping the required pages. It ensures that the region is valid and
- * handles faults efficiently by mapping multiple pages at once.
- *
- * Return: true if the intercept was handled successfully, false otherwise.
- */
- static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
- {
- struct mshv_partition *p = vp->vp_partition;
- struct mshv_mem_region *region;
- bool ret = false;
- u64 gfn;
- #if defined(CONFIG_X86_64)
- struct hv_x64_memory_intercept_message *msg =
- (struct hv_x64_memory_intercept_message *)
- vp->vp_intercept_msg_page->u.payload;
- #elif defined(CONFIG_ARM64)
- struct hv_arm64_memory_intercept_message *msg =
- (struct hv_arm64_memory_intercept_message *)
- vp->vp_intercept_msg_page->u.payload;
- #endif
- enum hv_intercept_access_type access_type =
- msg->header.intercept_access_type;
- gfn = HVPFN_DOWN(msg->guest_physical_address);
- region = mshv_partition_region_by_gfn_get(p, gfn);
- if (!region)
- return false;
- if (access_type == HV_INTERCEPT_ACCESS_WRITE &&
- !(region->hv_map_flags & HV_MAP_GPA_WRITABLE))
- goto put_region;
- if (access_type == HV_INTERCEPT_ACCESS_EXECUTE &&
- !(region->hv_map_flags & HV_MAP_GPA_EXECUTABLE))
- goto put_region;
- /* Only movable memory ranges are supported for GPA intercepts */
- if (region->mreg_type == MSHV_REGION_TYPE_MEM_MOVABLE)
- ret = mshv_region_handle_gfn_fault(region, gfn);
- put_region:
- mshv_region_put(region);
- return ret;
- }
- static bool mshv_vp_handle_intercept(struct mshv_vp *vp)
- {
- switch (vp->vp_intercept_msg_page->header.message_type) {
- case HVMSG_GPA_INTERCEPT:
- return mshv_handle_gpa_intercept(vp);
- }
- return false;
- }
- static long mshv_vp_ioctl_run_vp(struct mshv_vp *vp, void __user *ret_msg)
- {
- long rc;
- do {
- if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
- rc = mshv_run_vp_with_root_scheduler(vp);
- else
- rc = mshv_run_vp_with_hyp_scheduler(vp);
- } while (rc == 0 && mshv_vp_handle_intercept(vp));
- if (rc)
- return rc;
- if (copy_to_user(ret_msg, vp->vp_intercept_msg_page,
- sizeof(struct hv_message)))
- rc = -EFAULT;
- return rc;
- }
- static int
- mshv_vp_ioctl_get_set_state_pfn(struct mshv_vp *vp,
- struct hv_vp_state_data state_data,
- unsigned long user_pfn, size_t page_count,
- bool is_set)
- {
- int completed, ret = 0;
- unsigned long check;
- struct page **pages;
- if (page_count > INT_MAX)
- return -EINVAL;
- /*
- * Check the arithmetic for wraparound/overflow.
- * The last page address in the buffer is:
- * (user_pfn + (page_count - 1)) * PAGE_SIZE
- */
- if (check_add_overflow(user_pfn, (page_count - 1), &check))
- return -EOVERFLOW;
- if (check_mul_overflow(check, PAGE_SIZE, &check))
- return -EOVERFLOW;
- /* Pin user pages so hypervisor can copy directly to them */
- pages = kzalloc_objs(struct page *, page_count);
- if (!pages)
- return -ENOMEM;
- for (completed = 0; completed < page_count; completed += ret) {
- unsigned long user_addr = (user_pfn + completed) * PAGE_SIZE;
- int remaining = page_count - completed;
- ret = pin_user_pages_fast(user_addr, remaining, FOLL_WRITE,
- &pages[completed]);
- if (ret < 0) {
- vp_err(vp, "%s: Failed to pin user pages error %i\n",
- __func__, ret);
- goto unpin_pages;
- }
- }
- if (is_set)
- ret = hv_call_set_vp_state(vp->vp_index,
- vp->vp_partition->pt_id,
- state_data, page_count, pages,
- 0, NULL);
- else
- ret = hv_call_get_vp_state(vp->vp_index,
- vp->vp_partition->pt_id,
- state_data, page_count, pages,
- NULL);
- unpin_pages:
- unpin_user_pages(pages, completed);
- kfree(pages);
- return ret;
- }
- static long
- mshv_vp_ioctl_get_set_state(struct mshv_vp *vp,
- struct mshv_get_set_vp_state __user *user_args,
- bool is_set)
- {
- struct mshv_get_set_vp_state args;
- long ret = 0;
- union hv_output_get_vp_state vp_state;
- u32 data_sz;
- struct hv_vp_state_data state_data = {};
- if (copy_from_user(&args, user_args, sizeof(args)))
- return -EFAULT;
- if (args.type >= MSHV_VP_STATE_COUNT || mshv_field_nonzero(args, rsvd) ||
- !args.buf_sz || !PAGE_ALIGNED(args.buf_sz) ||
- !PAGE_ALIGNED(args.buf_ptr))
- return -EINVAL;
- if (!access_ok((void __user *)args.buf_ptr, args.buf_sz))
- return -EFAULT;
- switch (args.type) {
- case MSHV_VP_STATE_LAPIC:
- state_data.type = HV_GET_SET_VP_STATE_LAPIC_STATE;
- data_sz = HV_HYP_PAGE_SIZE;
- break;
- case MSHV_VP_STATE_XSAVE:
- {
- u64 data_sz_64;
- ret = hv_call_get_partition_property(vp->vp_partition->pt_id,
- HV_PARTITION_PROPERTY_XSAVE_STATES,
- &state_data.xsave.states.as_uint64);
- if (ret)
- return ret;
- ret = hv_call_get_partition_property(vp->vp_partition->pt_id,
- HV_PARTITION_PROPERTY_MAX_XSAVE_DATA_SIZE,
- &data_sz_64);
- if (ret)
- return ret;
- data_sz = (u32)data_sz_64;
- state_data.xsave.flags = 0;
- /* Always request legacy states */
- state_data.xsave.states.legacy_x87 = 1;
- state_data.xsave.states.legacy_sse = 1;
- state_data.type = HV_GET_SET_VP_STATE_XSAVE;
- break;
- }
- case MSHV_VP_STATE_SIMP:
- state_data.type = HV_GET_SET_VP_STATE_SIM_PAGE;
- data_sz = HV_HYP_PAGE_SIZE;
- break;
- case MSHV_VP_STATE_SIEFP:
- state_data.type = HV_GET_SET_VP_STATE_SIEF_PAGE;
- data_sz = HV_HYP_PAGE_SIZE;
- break;
- case MSHV_VP_STATE_SYNTHETIC_TIMERS:
- state_data.type = HV_GET_SET_VP_STATE_SYNTHETIC_TIMERS;
- data_sz = sizeof(vp_state.synthetic_timers_state);
- break;
- default:
- return -EINVAL;
- }
- if (copy_to_user(&user_args->buf_sz, &data_sz, sizeof(user_args->buf_sz)))
- return -EFAULT;
- if (data_sz > args.buf_sz)
- return -EINVAL;
- /* If the data is transmitted via pfns, delegate to helper */
- if (state_data.type & HV_GET_SET_VP_STATE_TYPE_PFN) {
- unsigned long user_pfn = PFN_DOWN(args.buf_ptr);
- size_t page_count = PFN_DOWN(args.buf_sz);
- return mshv_vp_ioctl_get_set_state_pfn(vp, state_data, user_pfn,
- page_count, is_set);
- }
- /* Paranoia check - this shouldn't happen! */
- if (data_sz > sizeof(vp_state)) {
- vp_err(vp, "Invalid vp state data size!\n");
- return -EINVAL;
- }
- if (is_set) {
- if (copy_from_user(&vp_state, (__user void *)args.buf_ptr, data_sz))
- return -EFAULT;
- return hv_call_set_vp_state(vp->vp_index,
- vp->vp_partition->pt_id,
- state_data, 0, NULL,
- sizeof(vp_state), (u8 *)&vp_state);
- }
- ret = hv_call_get_vp_state(vp->vp_index, vp->vp_partition->pt_id,
- state_data, 0, NULL, &vp_state);
- if (ret)
- return ret;
- if (copy_to_user((void __user *)args.buf_ptr, &vp_state, data_sz))
- return -EFAULT;
- return 0;
- }
- static long
- mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
- {
- struct mshv_vp *vp = filp->private_data;
- long r = -ENOTTY;
- if (mutex_lock_killable(&vp->vp_mutex))
- return -EINTR;
- switch (ioctl) {
- case MSHV_RUN_VP:
- r = mshv_vp_ioctl_run_vp(vp, (void __user *)arg);
- break;
- case MSHV_GET_VP_STATE:
- r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, false);
- break;
- case MSHV_SET_VP_STATE:
- r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, true);
- break;
- case MSHV_ROOT_HVCALL:
- r = mshv_ioctl_passthru_hvcall(vp->vp_partition, false,
- (void __user *)arg);
- break;
- default:
- vp_warn(vp, "Invalid ioctl: %#x\n", ioctl);
- break;
- }
- mutex_unlock(&vp->vp_mutex);
- return r;
- }
- static vm_fault_t mshv_vp_fault(struct vm_fault *vmf)
- {
- struct mshv_vp *vp = vmf->vma->vm_file->private_data;
- switch (vmf->vma->vm_pgoff) {
- case MSHV_VP_MMAP_OFFSET_REGISTERS:
- vmf->page = virt_to_page(vp->vp_register_page);
- break;
- case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE:
- vmf->page = virt_to_page(vp->vp_intercept_msg_page);
- break;
- case MSHV_VP_MMAP_OFFSET_GHCB:
- vmf->page = virt_to_page(vp->vp_ghcb_page);
- break;
- default:
- return VM_FAULT_SIGBUS;
- }
- get_page(vmf->page);
- return 0;
- }
- static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma)
- {
- struct mshv_vp *vp = file->private_data;
- switch (vma->vm_pgoff) {
- case MSHV_VP_MMAP_OFFSET_REGISTERS:
- if (!vp->vp_register_page)
- return -ENODEV;
- break;
- case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE:
- if (!vp->vp_intercept_msg_page)
- return -ENODEV;
- break;
- case MSHV_VP_MMAP_OFFSET_GHCB:
- if (!vp->vp_ghcb_page)
- return -ENODEV;
- break;
- default:
- return -EINVAL;
- }
- vma->vm_ops = &mshv_vp_vm_ops;
- return 0;
- }
- static int
- mshv_vp_release(struct inode *inode, struct file *filp)
- {
- struct mshv_vp *vp = filp->private_data;
- /* Rest of VP cleanup happens in destroy_partition() */
- mshv_partition_put(vp->vp_partition);
- return 0;
- }
- void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index,
- struct hv_stats_page *stats_pages[])
- {
- union hv_stats_object_identity identity = {
- .vp.partition_id = partition_id,
- .vp.vp_index = vp_index,
- };
- int err;
- identity.vp.stats_area_type = HV_STATS_AREA_SELF;
- err = hv_unmap_stats_page(HV_STATS_OBJECT_VP,
- stats_pages[HV_STATS_AREA_SELF],
- &identity);
- if (err)
- pr_err("%s: failed to unmap partition %llu vp %u self stats, err: %d\n",
- __func__, partition_id, vp_index, err);
- if (stats_pages[HV_STATS_AREA_PARENT] != stats_pages[HV_STATS_AREA_SELF]) {
- identity.vp.stats_area_type = HV_STATS_AREA_PARENT;
- err = hv_unmap_stats_page(HV_STATS_OBJECT_VP,
- stats_pages[HV_STATS_AREA_PARENT],
- &identity);
- if (err)
- pr_err("%s: failed to unmap partition %llu vp %u parent stats, err: %d\n",
- __func__, partition_id, vp_index, err);
- }
- }
- int mshv_vp_stats_map(u64 partition_id, u32 vp_index,
- struct hv_stats_page *stats_pages[])
- {
- union hv_stats_object_identity identity = {
- .vp.partition_id = partition_id,
- .vp.vp_index = vp_index,
- };
- int err;
- identity.vp.stats_area_type = HV_STATS_AREA_SELF;
- err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity,
- &stats_pages[HV_STATS_AREA_SELF]);
- if (err) {
- pr_err("%s: failed to map partition %llu vp %u self stats, err: %d\n",
- __func__, partition_id, vp_index, err);
- return err;
- }
- /*
- * L1VH partition cannot access its vp stats in parent area.
- */
- if (is_l1vh_parent(partition_id)) {
- stats_pages[HV_STATS_AREA_PARENT] = stats_pages[HV_STATS_AREA_SELF];
- } else {
- identity.vp.stats_area_type = HV_STATS_AREA_PARENT;
- err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity,
- &stats_pages[HV_STATS_AREA_PARENT]);
- if (err) {
- pr_err("%s: failed to map partition %llu vp %u parent stats, err: %d\n",
- __func__, partition_id, vp_index, err);
- goto unmap_self;
- }
- if (!stats_pages[HV_STATS_AREA_PARENT])
- stats_pages[HV_STATS_AREA_PARENT] = stats_pages[HV_STATS_AREA_SELF];
- }
- return 0;
- unmap_self:
- identity.vp.stats_area_type = HV_STATS_AREA_SELF;
- hv_unmap_stats_page(HV_STATS_OBJECT_VP,
- stats_pages[HV_STATS_AREA_SELF],
- &identity);
- return err;
- }
- static long
- mshv_partition_ioctl_create_vp(struct mshv_partition *partition,
- void __user *arg)
- {
- struct mshv_create_vp args;
- struct mshv_vp *vp;
- struct page *intercept_msg_page, *register_page, *ghcb_page;
- struct hv_stats_page *stats_pages[2];
- long ret;
- if (copy_from_user(&args, arg, sizeof(args)))
- return -EFAULT;
- if (args.vp_index >= MSHV_MAX_VPS)
- return -EINVAL;
- if (partition->pt_vp_array[args.vp_index])
- return -EEXIST;
- ret = hv_call_create_vp(NUMA_NO_NODE, partition->pt_id, args.vp_index,
- 0 /* Only valid for root partition VPs */);
- if (ret)
- return ret;
- ret = hv_map_vp_state_page(partition->pt_id, args.vp_index,
- HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
- input_vtl_zero, &intercept_msg_page);
- if (ret)
- goto destroy_vp;
- if (!mshv_partition_encrypted(partition)) {
- ret = hv_map_vp_state_page(partition->pt_id, args.vp_index,
- HV_VP_STATE_PAGE_REGISTERS,
- input_vtl_zero, ®ister_page);
- if (ret)
- goto unmap_intercept_message_page;
- }
- if (mshv_partition_encrypted(partition) &&
- is_ghcb_mapping_available()) {
- ret = hv_map_vp_state_page(partition->pt_id, args.vp_index,
- HV_VP_STATE_PAGE_GHCB,
- input_vtl_normal, &ghcb_page);
- if (ret)
- goto unmap_register_page;
- }
- ret = mshv_vp_stats_map(partition->pt_id, args.vp_index,
- stats_pages);
- if (ret)
- goto unmap_ghcb_page;
- vp = kzalloc_obj(*vp);
- if (!vp)
- goto unmap_stats_pages;
- vp->vp_partition = mshv_partition_get(partition);
- if (!vp->vp_partition) {
- ret = -EBADF;
- goto free_vp;
- }
- mutex_init(&vp->vp_mutex);
- init_waitqueue_head(&vp->run.vp_suspend_queue);
- atomic64_set(&vp->run.vp_signaled_count, 0);
- vp->vp_index = args.vp_index;
- vp->vp_intercept_msg_page = page_to_virt(intercept_msg_page);
- if (!mshv_partition_encrypted(partition))
- vp->vp_register_page = page_to_virt(register_page);
- if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available())
- vp->vp_ghcb_page = page_to_virt(ghcb_page);
- memcpy(vp->vp_stats_pages, stats_pages, sizeof(stats_pages));
- ret = mshv_debugfs_vp_create(vp);
- if (ret)
- goto put_partition;
- /*
- * Keep anon_inode_getfd last: it installs fd in the file struct and
- * thus makes the state accessible in user space.
- */
- ret = anon_inode_getfd("mshv_vp", &mshv_vp_fops, vp,
- O_RDWR | O_CLOEXEC);
- if (ret < 0)
- goto remove_debugfs_vp;
- /* already exclusive with the partition mutex for all ioctls */
- partition->pt_vp_count++;
- partition->pt_vp_array[args.vp_index] = vp;
- return ret;
- remove_debugfs_vp:
- mshv_debugfs_vp_remove(vp);
- put_partition:
- mshv_partition_put(partition);
- free_vp:
- kfree(vp);
- unmap_stats_pages:
- mshv_vp_stats_unmap(partition->pt_id, args.vp_index, stats_pages);
- unmap_ghcb_page:
- if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available())
- hv_unmap_vp_state_page(partition->pt_id, args.vp_index,
- HV_VP_STATE_PAGE_GHCB, ghcb_page,
- input_vtl_normal);
- unmap_register_page:
- if (!mshv_partition_encrypted(partition))
- hv_unmap_vp_state_page(partition->pt_id, args.vp_index,
- HV_VP_STATE_PAGE_REGISTERS,
- register_page, input_vtl_zero);
- unmap_intercept_message_page:
- hv_unmap_vp_state_page(partition->pt_id, args.vp_index,
- HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
- intercept_msg_page, input_vtl_zero);
- destroy_vp:
- hv_call_delete_vp(partition->pt_id, args.vp_index);
- return ret;
- }
- static int mshv_init_async_handler(struct mshv_partition *partition)
- {
- if (completion_done(&partition->async_hypercall)) {
- pt_err(partition,
- "Cannot issue async hypercall while another one in progress!\n");
- return -EPERM;
- }
- reinit_completion(&partition->async_hypercall);
- return 0;
- }
- static void mshv_async_hvcall_handler(void *data, u64 *status)
- {
- struct mshv_partition *partition = data;
- wait_for_completion(&partition->async_hypercall);
- pt_dbg(partition, "Async hypercall completed!\n");
- *status = partition->async_hypercall_status;
- }
- /*
- * NB: caller checks and makes sure mem->size is page aligned
- * Returns: 0 with regionpp updated on success, or -errno
- */
- static int mshv_partition_create_region(struct mshv_partition *partition,
- struct mshv_user_mem_region *mem,
- struct mshv_mem_region **regionpp,
- bool is_mmio)
- {
- struct mshv_mem_region *rg;
- u64 nr_pages = HVPFN_DOWN(mem->size);
- /* Reject overlapping regions */
- spin_lock(&partition->pt_mem_regions_lock);
- hlist_for_each_entry(rg, &partition->pt_mem_regions, hnode) {
- if (mem->guest_pfn + nr_pages <= rg->start_gfn ||
- rg->start_gfn + rg->nr_pages <= mem->guest_pfn)
- continue;
- spin_unlock(&partition->pt_mem_regions_lock);
- return -EEXIST;
- }
- spin_unlock(&partition->pt_mem_regions_lock);
- rg = mshv_region_create(mem->guest_pfn, nr_pages,
- mem->userspace_addr, mem->flags);
- if (IS_ERR(rg))
- return PTR_ERR(rg);
- if (is_mmio)
- rg->mreg_type = MSHV_REGION_TYPE_MMIO;
- else if (mshv_partition_encrypted(partition) ||
- !mshv_region_movable_init(rg))
- rg->mreg_type = MSHV_REGION_TYPE_MEM_PINNED;
- else
- rg->mreg_type = MSHV_REGION_TYPE_MEM_MOVABLE;
- rg->partition = partition;
- *regionpp = rg;
- return 0;
- }
- /**
- * mshv_prepare_pinned_region - Pin and map memory regions
- * @region: Pointer to the memory region structure
- *
- * This function processes memory regions that are explicitly marked as pinned.
- * Pinned regions are preallocated, mapped upfront, and do not rely on fault-based
- * population. The function ensures the region is properly populated, handles
- * encryption requirements for SNP partitions if applicable, maps the region,
- * and performs necessary sharing or eviction operations based on the mapping
- * result.
- *
- * Return: 0 on success, negative error code on failure.
- */
- static int mshv_prepare_pinned_region(struct mshv_mem_region *region)
- {
- struct mshv_partition *partition = region->partition;
- int ret;
- ret = mshv_region_pin(region);
- if (ret) {
- pt_err(partition, "Failed to pin memory region: %d\n",
- ret);
- goto err_out;
- }
- /*
- * For an SNP partition it is a requirement that for every memory region
- * that we are going to map for this partition we should make sure that
- * host access to that region is released. This is ensured by doing an
- * additional hypercall which will update the SLAT to release host
- * access to guest memory regions.
- */
- if (mshv_partition_encrypted(partition)) {
- ret = mshv_region_unshare(region);
- if (ret) {
- pt_err(partition,
- "Failed to unshare memory region (guest_pfn: %llu): %d\n",
- region->start_gfn, ret);
- goto invalidate_region;
- }
- }
- ret = mshv_region_map(region);
- if (ret && mshv_partition_encrypted(partition)) {
- int shrc;
- shrc = mshv_region_share(region);
- if (!shrc)
- goto invalidate_region;
- pt_err(partition,
- "Failed to share memory region (guest_pfn: %llu): %d\n",
- region->start_gfn, shrc);
- /*
- * Don't unpin if marking shared failed because pages are no
- * longer mapped in the host, ie root, anymore.
- */
- goto err_out;
- }
- return 0;
- invalidate_region:
- mshv_region_invalidate(region);
- err_out:
- return ret;
- }
- /*
- * This maps two things: guest RAM and for pci passthru mmio space.
- *
- * mmio:
- * - vfio overloads vm_pgoff to store the mmio start pfn/spa.
- * - Two things need to happen for mapping mmio range:
- * 1. mapped in the uaddr so VMM can access it.
- * 2. mapped in the hwpt (gfn <-> mmio phys addr) so guest can access it.
- *
- * This function takes care of the second. The first one is managed by vfio,
- * and hence is taken care of via vfio_pci_mmap_fault().
- */
- static long
- mshv_map_user_memory(struct mshv_partition *partition,
- struct mshv_user_mem_region *mem)
- {
- struct mshv_mem_region *region;
- struct vm_area_struct *vma;
- bool is_mmio;
- ulong mmio_pfn;
- long ret;
- if (mem->flags & BIT(MSHV_SET_MEM_BIT_UNMAP) ||
- !access_ok((const void __user *)mem->userspace_addr, mem->size))
- return -EINVAL;
- mmap_read_lock(current->mm);
- vma = vma_lookup(current->mm, mem->userspace_addr);
- is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0;
- mmio_pfn = is_mmio ? vma->vm_pgoff : 0;
- mmap_read_unlock(current->mm);
- if (!vma)
- return -EINVAL;
- ret = mshv_partition_create_region(partition, mem, ®ion,
- is_mmio);
- if (ret)
- return ret;
- switch (region->mreg_type) {
- case MSHV_REGION_TYPE_MEM_PINNED:
- ret = mshv_prepare_pinned_region(region);
- break;
- case MSHV_REGION_TYPE_MEM_MOVABLE:
- /*
- * For movable memory regions, remap with no access to let
- * the hypervisor track dirty pages, enabling pre-copy live
- * migration.
- */
- ret = hv_call_map_gpa_pages(partition->pt_id,
- region->start_gfn,
- region->nr_pages,
- HV_MAP_GPA_NO_ACCESS, NULL);
- break;
- case MSHV_REGION_TYPE_MMIO:
- ret = hv_call_map_mmio_pages(partition->pt_id,
- region->start_gfn,
- mmio_pfn,
- region->nr_pages);
- break;
- }
- if (ret)
- goto errout;
- spin_lock(&partition->pt_mem_regions_lock);
- hlist_add_head(®ion->hnode, &partition->pt_mem_regions);
- spin_unlock(&partition->pt_mem_regions_lock);
- return 0;
- errout:
- mshv_region_put(region);
- return ret;
- }
- /* Called for unmapping both the guest ram and the mmio space */
- static long
- mshv_unmap_user_memory(struct mshv_partition *partition,
- struct mshv_user_mem_region *mem)
- {
- struct mshv_mem_region *region;
- if (!(mem->flags & BIT(MSHV_SET_MEM_BIT_UNMAP)))
- return -EINVAL;
- spin_lock(&partition->pt_mem_regions_lock);
- region = mshv_partition_region_by_gfn(partition, mem->guest_pfn);
- if (!region) {
- spin_unlock(&partition->pt_mem_regions_lock);
- return -ENOENT;
- }
- /* Paranoia check */
- if (region->start_uaddr != mem->userspace_addr ||
- region->start_gfn != mem->guest_pfn ||
- region->nr_pages != HVPFN_DOWN(mem->size)) {
- spin_unlock(&partition->pt_mem_regions_lock);
- return -EINVAL;
- }
- hlist_del(®ion->hnode);
- spin_unlock(&partition->pt_mem_regions_lock);
- mshv_region_put(region);
- return 0;
- }
- static long
- mshv_partition_ioctl_set_memory(struct mshv_partition *partition,
- struct mshv_user_mem_region __user *user_mem)
- {
- struct mshv_user_mem_region mem;
- if (copy_from_user(&mem, user_mem, sizeof(mem)))
- return -EFAULT;
- if (!mem.size ||
- !PAGE_ALIGNED(mem.size) ||
- !PAGE_ALIGNED(mem.userspace_addr) ||
- (mem.flags & ~MSHV_SET_MEM_FLAGS_MASK) ||
- mshv_field_nonzero(mem, rsvd))
- return -EINVAL;
- if (mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP))
- return mshv_unmap_user_memory(partition, &mem);
- return mshv_map_user_memory(partition, &mem);
- }
- static long
- mshv_partition_ioctl_ioeventfd(struct mshv_partition *partition,
- void __user *user_args)
- {
- struct mshv_user_ioeventfd args;
- if (copy_from_user(&args, user_args, sizeof(args)))
- return -EFAULT;
- return mshv_set_unset_ioeventfd(partition, &args);
- }
- static long
- mshv_partition_ioctl_irqfd(struct mshv_partition *partition,
- void __user *user_args)
- {
- struct mshv_user_irqfd args;
- if (copy_from_user(&args, user_args, sizeof(args)))
- return -EFAULT;
- return mshv_set_unset_irqfd(partition, &args);
- }
- static long
- mshv_partition_ioctl_get_gpap_access_bitmap(struct mshv_partition *partition,
- void __user *user_args)
- {
- struct mshv_gpap_access_bitmap args;
- union hv_gpa_page_access_state *states;
- long ret, i;
- union hv_gpa_page_access_state_flags hv_flags = {};
- u8 hv_type_mask;
- ulong bitmap_buf_sz, states_buf_sz;
- int written = 0;
- if (copy_from_user(&args, user_args, sizeof(args)))
- return -EFAULT;
- if (args.access_type >= MSHV_GPAP_ACCESS_TYPE_COUNT ||
- args.access_op >= MSHV_GPAP_ACCESS_OP_COUNT ||
- mshv_field_nonzero(args, rsvd) || !args.page_count ||
- !args.bitmap_ptr)
- return -EINVAL;
- if (check_mul_overflow(args.page_count, sizeof(*states), &states_buf_sz))
- return -E2BIG;
- /* Num bytes needed to store bitmap; one bit per page rounded up */
- bitmap_buf_sz = DIV_ROUND_UP(args.page_count, 8);
- /* Sanity check */
- if (bitmap_buf_sz > states_buf_sz)
- return -EBADFD;
- switch (args.access_type) {
- case MSHV_GPAP_ACCESS_TYPE_ACCESSED:
- hv_type_mask = 1;
- if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) {
- hv_flags.clear_accessed = 1;
- /* not accessed implies not dirty */
- hv_flags.clear_dirty = 1;
- } else { /* MSHV_GPAP_ACCESS_OP_SET */
- hv_flags.set_accessed = 1;
- }
- break;
- case MSHV_GPAP_ACCESS_TYPE_DIRTY:
- hv_type_mask = 2;
- if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) {
- hv_flags.clear_dirty = 1;
- } else { /* MSHV_GPAP_ACCESS_OP_SET */
- hv_flags.set_dirty = 1;
- /* dirty implies accessed */
- hv_flags.set_accessed = 1;
- }
- break;
- }
- states = vzalloc(states_buf_sz);
- if (!states)
- return -ENOMEM;
- ret = hv_call_get_gpa_access_states(partition->pt_id, args.page_count,
- args.gpap_base, hv_flags, &written,
- states);
- if (ret)
- goto free_return;
- /*
- * Overwrite states buffer with bitmap - the bits in hv_type_mask
- * correspond to bitfields in hv_gpa_page_access_state
- */
- for (i = 0; i < written; ++i)
- __assign_bit(i, (ulong *)states,
- states[i].as_uint8 & hv_type_mask);
- /* zero the unused bits in the last byte(s) of the returned bitmap */
- for (i = written; i < bitmap_buf_sz * 8; ++i)
- __clear_bit(i, (ulong *)states);
- if (copy_to_user((void __user *)args.bitmap_ptr, states, bitmap_buf_sz))
- ret = -EFAULT;
- free_return:
- vfree(states);
- return ret;
- }
- static long
- mshv_partition_ioctl_set_msi_routing(struct mshv_partition *partition,
- void __user *user_args)
- {
- struct mshv_user_irq_entry *entries = NULL;
- struct mshv_user_irq_table args;
- long ret;
- if (copy_from_user(&args, user_args, sizeof(args)))
- return -EFAULT;
- if (args.nr > MSHV_MAX_GUEST_IRQS ||
- mshv_field_nonzero(args, rsvd))
- return -EINVAL;
- if (args.nr) {
- struct mshv_user_irq_table __user *urouting = user_args;
- entries = vmemdup_user(urouting->entries,
- array_size(sizeof(*entries),
- args.nr));
- if (IS_ERR(entries))
- return PTR_ERR(entries);
- }
- ret = mshv_update_routing_table(partition, entries, args.nr);
- kvfree(entries);
- return ret;
- }
- static long
- mshv_partition_ioctl_initialize(struct mshv_partition *partition)
- {
- long ret;
- if (partition->pt_initialized)
- return 0;
- ret = hv_call_initialize_partition(partition->pt_id);
- if (ret)
- goto withdraw_mem;
- ret = mshv_debugfs_partition_create(partition);
- if (ret)
- goto finalize_partition;
- partition->pt_initialized = true;
- return 0;
- finalize_partition:
- hv_call_finalize_partition(partition->pt_id);
- withdraw_mem:
- hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id);
- return ret;
- }
- static long
- mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
- {
- struct mshv_partition *partition = filp->private_data;
- long ret;
- void __user *uarg = (void __user *)arg;
- if (mutex_lock_killable(&partition->pt_mutex))
- return -EINTR;
- switch (ioctl) {
- case MSHV_INITIALIZE_PARTITION:
- ret = mshv_partition_ioctl_initialize(partition);
- break;
- case MSHV_SET_GUEST_MEMORY:
- ret = mshv_partition_ioctl_set_memory(partition, uarg);
- break;
- case MSHV_CREATE_VP:
- ret = mshv_partition_ioctl_create_vp(partition, uarg);
- break;
- case MSHV_IRQFD:
- ret = mshv_partition_ioctl_irqfd(partition, uarg);
- break;
- case MSHV_IOEVENTFD:
- ret = mshv_partition_ioctl_ioeventfd(partition, uarg);
- break;
- case MSHV_SET_MSI_ROUTING:
- ret = mshv_partition_ioctl_set_msi_routing(partition, uarg);
- break;
- case MSHV_GET_GPAP_ACCESS_BITMAP:
- ret = mshv_partition_ioctl_get_gpap_access_bitmap(partition,
- uarg);
- break;
- case MSHV_ROOT_HVCALL:
- ret = mshv_ioctl_passthru_hvcall(partition, true, uarg);
- break;
- default:
- ret = -ENOTTY;
- }
- mutex_unlock(&partition->pt_mutex);
- return ret;
- }
- static int
- disable_vp_dispatch(struct mshv_vp *vp)
- {
- int ret;
- struct hv_register_assoc dispatch_suspend = {
- .name = HV_REGISTER_DISPATCH_SUSPEND,
- .value.dispatch_suspend.suspended = 1,
- };
- ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
- 1, &dispatch_suspend);
- if (ret)
- vp_err(vp, "failed to suspend\n");
- return ret;
- }
- static int
- get_vp_signaled_count(struct mshv_vp *vp, u64 *count)
- {
- int ret;
- struct hv_register_assoc root_signal_count = {
- .name = HV_REGISTER_VP_ROOT_SIGNAL_COUNT,
- };
- ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
- 1, &root_signal_count);
- if (ret) {
- vp_err(vp, "Failed to get root signal count");
- *count = 0;
- return ret;
- }
- *count = root_signal_count.value.reg64;
- return ret;
- }
- static void
- drain_vp_signals(struct mshv_vp *vp)
- {
- u64 hv_signal_count;
- u64 vp_signal_count;
- get_vp_signaled_count(vp, &hv_signal_count);
- vp_signal_count = atomic64_read(&vp->run.vp_signaled_count);
- /*
- * There should be at most 1 outstanding notification, but be extra
- * careful anyway.
- */
- while (hv_signal_count != vp_signal_count) {
- WARN_ON(hv_signal_count - vp_signal_count != 1);
- if (wait_event_interruptible(vp->run.vp_suspend_queue,
- vp->run.kicked_by_hv == 1))
- break;
- vp->run.kicked_by_hv = 0;
- vp_signal_count = atomic64_read(&vp->run.vp_signaled_count);
- }
- }
- static void drain_all_vps(const struct mshv_partition *partition)
- {
- int i;
- struct mshv_vp *vp;
- /*
- * VPs are reachable from ISR. It is safe to not take the partition
- * lock because nobody else can enter this function and drop the
- * partition from the list.
- */
- for (i = 0; i < MSHV_MAX_VPS; i++) {
- vp = partition->pt_vp_array[i];
- if (!vp)
- continue;
- /*
- * Disable dispatching of the VP in the hypervisor. After this
- * the hypervisor guarantees it won't generate any signals for
- * the VP and the hypervisor's VP signal count won't change.
- */
- disable_vp_dispatch(vp);
- drain_vp_signals(vp);
- }
- }
- static void
- remove_partition(struct mshv_partition *partition)
- {
- spin_lock(&mshv_root.pt_ht_lock);
- hlist_del_rcu(&partition->pt_hnode);
- spin_unlock(&mshv_root.pt_ht_lock);
- synchronize_rcu();
- }
- /*
- * Tear down a partition and remove it from the list.
- * Partition's refcount must be 0
- */
- static void destroy_partition(struct mshv_partition *partition)
- {
- struct mshv_vp *vp;
- struct mshv_mem_region *region;
- struct hlist_node *n;
- int i;
- if (refcount_read(&partition->pt_ref_count)) {
- pt_err(partition,
- "Attempt to destroy partition but refcount > 0\n");
- return;
- }
- if (partition->pt_initialized) {
- /*
- * We only need to drain signals for root scheduler. This should be
- * done before removing the partition from the partition list.
- */
- if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
- drain_all_vps(partition);
- /* Remove vps */
- for (i = 0; i < MSHV_MAX_VPS; ++i) {
- vp = partition->pt_vp_array[i];
- if (!vp)
- continue;
- mshv_debugfs_vp_remove(vp);
- mshv_vp_stats_unmap(partition->pt_id, vp->vp_index,
- vp->vp_stats_pages);
- if (vp->vp_register_page) {
- (void)hv_unmap_vp_state_page(partition->pt_id,
- vp->vp_index,
- HV_VP_STATE_PAGE_REGISTERS,
- virt_to_page(vp->vp_register_page),
- input_vtl_zero);
- vp->vp_register_page = NULL;
- }
- (void)hv_unmap_vp_state_page(partition->pt_id,
- vp->vp_index,
- HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
- virt_to_page(vp->vp_intercept_msg_page),
- input_vtl_zero);
- vp->vp_intercept_msg_page = NULL;
- if (vp->vp_ghcb_page) {
- (void)hv_unmap_vp_state_page(partition->pt_id,
- vp->vp_index,
- HV_VP_STATE_PAGE_GHCB,
- virt_to_page(vp->vp_ghcb_page),
- input_vtl_normal);
- vp->vp_ghcb_page = NULL;
- }
- kfree(vp);
- partition->pt_vp_array[i] = NULL;
- }
- mshv_debugfs_partition_remove(partition);
- /* Deallocates and unmaps everything including vcpus, GPA mappings etc */
- hv_call_finalize_partition(partition->pt_id);
- partition->pt_initialized = false;
- }
- remove_partition(partition);
- hlist_for_each_entry_safe(region, n, &partition->pt_mem_regions,
- hnode) {
- hlist_del(®ion->hnode);
- mshv_region_put(region);
- }
- /* Withdraw and free all pages we deposited */
- hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id);
- hv_call_delete_partition(partition->pt_id);
- mshv_free_routing_table(partition);
- kfree(partition);
- }
- struct
- mshv_partition *mshv_partition_get(struct mshv_partition *partition)
- {
- if (refcount_inc_not_zero(&partition->pt_ref_count))
- return partition;
- return NULL;
- }
- struct
- mshv_partition *mshv_partition_find(u64 partition_id)
- __must_hold(RCU)
- {
- struct mshv_partition *p;
- hash_for_each_possible_rcu(mshv_root.pt_htable, p, pt_hnode,
- partition_id)
- if (p->pt_id == partition_id)
- return p;
- return NULL;
- }
- void
- mshv_partition_put(struct mshv_partition *partition)
- {
- if (refcount_dec_and_test(&partition->pt_ref_count))
- destroy_partition(partition);
- }
- static int
- mshv_partition_release(struct inode *inode, struct file *filp)
- {
- struct mshv_partition *partition = filp->private_data;
- mshv_eventfd_release(partition);
- cleanup_srcu_struct(&partition->pt_irq_srcu);
- mshv_partition_put(partition);
- return 0;
- }
- static int
- add_partition(struct mshv_partition *partition)
- {
- spin_lock(&mshv_root.pt_ht_lock);
- hash_add_rcu(mshv_root.pt_htable, &partition->pt_hnode,
- partition->pt_id);
- spin_unlock(&mshv_root.pt_ht_lock);
- return 0;
- }
- static_assert(MSHV_NUM_CPU_FEATURES_BANKS ==
- HV_PARTITION_PROCESSOR_FEATURES_BANKS);
- static long mshv_ioctl_process_pt_flags(void __user *user_arg, u64 *pt_flags,
- struct hv_partition_creation_properties *cr_props,
- union hv_partition_isolation_properties *isol_props)
- {
- int i;
- struct mshv_create_partition_v2 args;
- union hv_partition_processor_features *disabled_procs;
- union hv_partition_processor_xsave_features *disabled_xsave;
- /* First, copy v1 struct in case user is on previous versions */
- if (copy_from_user(&args, user_arg,
- sizeof(struct mshv_create_partition)))
- return -EFAULT;
- if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) ||
- args.pt_isolation >= MSHV_PT_ISOLATION_COUNT)
- return -EINVAL;
- disabled_procs = &cr_props->disabled_processor_features;
- disabled_xsave = &cr_props->disabled_processor_xsave_features;
- /* Check if user provided newer struct with feature fields */
- if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES)) {
- if (copy_from_user(&args, user_arg, sizeof(args)))
- return -EFAULT;
- /* Re-validate v1 fields after second copy_from_user() */
- if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) ||
- args.pt_isolation >= MSHV_PT_ISOLATION_COUNT)
- return -EINVAL;
- if (args.pt_num_cpu_fbanks != MSHV_NUM_CPU_FEATURES_BANKS ||
- mshv_field_nonzero(args, pt_rsvd) ||
- mshv_field_nonzero(args, pt_rsvd1))
- return -EINVAL;
- /*
- * Note this assumes MSHV_NUM_CPU_FEATURES_BANKS will never
- * change and equals HV_PARTITION_PROCESSOR_FEATURES_BANKS
- * (i.e. 2).
- *
- * Further banks (index >= 2) will be modifiable as 'early'
- * properties via the set partition property hypercall.
- */
- for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++)
- disabled_procs->as_uint64[i] = args.pt_cpu_fbanks[i];
- #if IS_ENABLED(CONFIG_X86_64)
- disabled_xsave->as_uint64 = args.pt_disabled_xsave;
- #else
- /*
- * In practice this field is ignored on arm64, but safer to
- * zero it in case it is ever used.
- */
- disabled_xsave->as_uint64 = 0;
- if (mshv_field_nonzero(args, pt_rsvd2))
- return -EINVAL;
- #endif
- } else {
- /*
- * v1 behavior: try to enable everything. The hypervisor will
- * disable features that are not supported. The banks can be
- * queried via the get partition property hypercall.
- */
- for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++)
- disabled_procs->as_uint64[i] = 0;
- disabled_xsave->as_uint64 = 0;
- }
- /* Only support EXO partitions */
- *pt_flags = HV_PARTITION_CREATION_FLAG_EXO_PARTITION |
- HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED;
- if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_LAPIC))
- *pt_flags |= HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED;
- if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_X2APIC))
- *pt_flags |= HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE;
- if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_GPA_SUPER_PAGES))
- *pt_flags |= HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED;
- if (args.pt_flags & BIT(MSHV_PT_BIT_NESTED_VIRTUALIZATION))
- *pt_flags |= HV_PARTITION_CREATION_FLAG_NESTED_VIRTUALIZATION_CAPABLE;
- if (args.pt_flags & BIT(MSHV_PT_BIT_SMT_ENABLED_GUEST))
- *pt_flags |= HV_PARTITION_CREATION_FLAG_SMT_ENABLED_GUEST;
- isol_props->as_uint64 = 0;
- switch (args.pt_isolation) {
- case MSHV_PT_ISOLATION_NONE:
- isol_props->isolation_type = HV_PARTITION_ISOLATION_TYPE_NONE;
- break;
- }
- return 0;
- }
- static long
- mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev)
- {
- u64 creation_flags;
- struct hv_partition_creation_properties creation_properties;
- union hv_partition_isolation_properties isolation_properties;
- struct mshv_partition *partition;
- long ret;
- ret = mshv_ioctl_process_pt_flags(user_arg, &creation_flags,
- &creation_properties,
- &isolation_properties);
- if (ret)
- return ret;
- partition = kzalloc_obj(*partition);
- if (!partition)
- return -ENOMEM;
- partition->pt_module_dev = module_dev;
- partition->isolation_type = isolation_properties.isolation_type;
- refcount_set(&partition->pt_ref_count, 1);
- mutex_init(&partition->pt_mutex);
- mutex_init(&partition->pt_irq_lock);
- init_completion(&partition->async_hypercall);
- INIT_HLIST_HEAD(&partition->irq_ack_notifier_list);
- INIT_HLIST_HEAD(&partition->pt_devices);
- spin_lock_init(&partition->pt_mem_regions_lock);
- INIT_HLIST_HEAD(&partition->pt_mem_regions);
- mshv_eventfd_init(partition);
- ret = init_srcu_struct(&partition->pt_irq_srcu);
- if (ret)
- goto free_partition;
- ret = hv_call_create_partition(creation_flags,
- creation_properties,
- isolation_properties,
- &partition->pt_id);
- if (ret)
- goto cleanup_irq_srcu;
- ret = add_partition(partition);
- if (ret)
- goto delete_partition;
- ret = mshv_init_async_handler(partition);
- if (!ret) {
- ret = FD_ADD(O_CLOEXEC, anon_inode_getfile("mshv_partition",
- &mshv_partition_fops,
- partition, O_RDWR));
- if (ret >= 0)
- return ret;
- }
- remove_partition(partition);
- delete_partition:
- hv_call_delete_partition(partition->pt_id);
- cleanup_irq_srcu:
- cleanup_srcu_struct(&partition->pt_irq_srcu);
- free_partition:
- kfree(partition);
- return ret;
- }
- static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl,
- unsigned long arg)
- {
- struct miscdevice *misc = filp->private_data;
- switch (ioctl) {
- case MSHV_CREATE_PARTITION:
- return mshv_ioctl_create_partition((void __user *)arg,
- misc->this_device);
- case MSHV_ROOT_HVCALL:
- return mshv_ioctl_passthru_hvcall(NULL, false,
- (void __user *)arg);
- }
- return -ENOTTY;
- }
/* Open handler for the mshv device: does nothing and always succeeds. */
static int
mshv_dev_open(struct inode *inode, struct file *filp)
{
	return 0;
}
/* Release handler for the mshv device: does nothing and always succeeds. */
static int
mshv_dev_release(struct inode *inode, struct file *filp)
{
	return 0;
}
/*
 * NOTE(review): not used in the visible part of the file; presumably the
 * CPU hotplug state id for the root scheduler callbacks - confirm against
 * root_scheduler_init().
 */
static int mshv_root_sched_online;
- static const char *scheduler_type_to_string(enum hv_scheduler_type type)
- {
- switch (type) {
- case HV_SCHEDULER_TYPE_LP:
- return "classic scheduler without SMT";
- case HV_SCHEDULER_TYPE_LP_SMT:
- return "classic scheduler with SMT";
- case HV_SCHEDULER_TYPE_CORE_SMT:
- return "core scheduler";
- case HV_SCHEDULER_TYPE_ROOT:
- return "root scheduler";
- default:
- return "unknown scheduler";
- };
- }
- static int __init l1vh_retrieve_scheduler_type(enum hv_scheduler_type *out)
- {
- u64 integrated_sched_enabled;
- int ret;
- *out = HV_SCHEDULER_TYPE_CORE_SMT;
- if (!mshv_root.vmm_caps.vmm_enable_integrated_scheduler)
- return 0;
- ret = hv_call_get_partition_property_ex(HV_PARTITION_ID_SELF,
- HV_PARTITION_PROPERTY_INTEGRATED_SCHEDULER_ENABLED,
- 0, &integrated_sched_enabled,
- sizeof(integrated_sched_enabled));
- if (ret)
- return ret;
- if (integrated_sched_enabled)
- *out = HV_SCHEDULER_TYPE_ROOT;
- return 0;
- }
- /* TODO move this to hv_common.c when needed outside */
- static int __init hv_retrieve_scheduler_type(enum hv_scheduler_type *out)
- {
- struct hv_input_get_system_property *input;
- struct hv_output_get_system_property *output;
- unsigned long flags;
- u64 status;
- local_irq_save(flags);
- input = *this_cpu_ptr(hyperv_pcpu_input_arg);
- output = *this_cpu_ptr(hyperv_pcpu_output_arg);
- memset(input, 0, sizeof(*input));
- memset(output, 0, sizeof(*output));
- input->property_id = HV_SYSTEM_PROPERTY_SCHEDULER_TYPE;
- status = hv_do_hypercall(HVCALL_GET_SYSTEM_PROPERTY, input, output);
- if (!hv_result_success(status)) {
- local_irq_restore(flags);
- pr_err("%s: %s\n", __func__, hv_result_to_string(status));
- return hv_result_to_errno(status);
- }
- *out = output->scheduler_type;
- local_irq_restore(flags);
- return 0;
- }
- /* Retrieve and stash the supported scheduler type */
- static int __init mshv_retrieve_scheduler_type(struct device *dev)
- {
- int ret;
- if (hv_l1vh_partition())
- ret = l1vh_retrieve_scheduler_type(&hv_scheduler_type);
- else
- ret = hv_retrieve_scheduler_type(&hv_scheduler_type);
- if (ret)
- return ret;
- dev_info(dev, "Hypervisor using %s\n",
- scheduler_type_to_string(hv_scheduler_type));
- switch (hv_scheduler_type) {
- case HV_SCHEDULER_TYPE_CORE_SMT:
- case HV_SCHEDULER_TYPE_LP_SMT:
- case HV_SCHEDULER_TYPE_ROOT:
- case HV_SCHEDULER_TYPE_LP:
- /* Supported scheduler, nothing to do */
- break;
- default:
- dev_err(dev, "unsupported scheduler 0x%x, bailing.\n",
- hv_scheduler_type);
- return -EOPNOTSUPP;
- }
- return 0;
- }
- static int mshv_root_scheduler_init(unsigned int cpu)
- {
- void **inputarg, **outputarg, *p;
- inputarg = (void **)this_cpu_ptr(root_scheduler_input);
- outputarg = (void **)this_cpu_ptr(root_scheduler_output);
- /* Allocate two consecutive pages. One for input, one for output. */
- p = kmalloc(2 * HV_HYP_PAGE_SIZE, GFP_KERNEL);
- if (!p)
- return -ENOMEM;
- *inputarg = p;
- *outputarg = (char *)p + HV_HYP_PAGE_SIZE;
- return 0;
- }
- static int mshv_root_scheduler_cleanup(unsigned int cpu)
- {
- void *p, **inputarg, **outputarg;
- inputarg = (void **)this_cpu_ptr(root_scheduler_input);
- outputarg = (void **)this_cpu_ptr(root_scheduler_output);
- p = *inputarg;
- *inputarg = NULL;
- *outputarg = NULL;
- kfree(p);
- return 0;
- }
- /* Must be called after retrieving the scheduler type */
- static int
- root_scheduler_init(struct device *dev)
- {
- int ret;
- if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT)
- return 0;
- root_scheduler_input = alloc_percpu(void *);
- root_scheduler_output = alloc_percpu(void *);
- if (!root_scheduler_input || !root_scheduler_output) {
- dev_err(dev, "Failed to allocate root scheduler buffers\n");
- ret = -ENOMEM;
- goto out;
- }
- ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_root_sched",
- mshv_root_scheduler_init,
- mshv_root_scheduler_cleanup);
- if (ret < 0) {
- dev_err(dev, "Failed to setup root scheduler state: %i\n", ret);
- goto out;
- }
- mshv_root_sched_online = ret;
- return 0;
- out:
- free_percpu(root_scheduler_input);
- free_percpu(root_scheduler_output);
- return ret;
- }
- static void
- root_scheduler_deinit(void)
- {
- if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT)
- return;
- cpuhp_remove_state(mshv_root_sched_online);
- free_percpu(root_scheduler_input);
- free_percpu(root_scheduler_output);
- }
- static int __init mshv_init_vmm_caps(struct device *dev)
- {
- int ret;
- ret = hv_call_get_partition_property_ex(HV_PARTITION_ID_SELF,
- HV_PARTITION_PROPERTY_VMM_CAPABILITIES,
- 0, &mshv_root.vmm_caps,
- sizeof(mshv_root.vmm_caps));
- if (ret && hv_l1vh_partition()) {
- dev_err(dev, "Failed to get VMM capabilities: %d\n", ret);
- return ret;
- }
- dev_dbg(dev, "vmm_caps = %#llx\n", mshv_root.vmm_caps.as_uint64[0]);
- return 0;
- }
- static int __init mshv_parent_partition_init(void)
- {
- int ret;
- struct device *dev;
- union hv_hypervisor_version_info version_info;
- if (!hv_parent_partition() || is_kdump_kernel())
- return -ENODEV;
- if (hv_get_hypervisor_version(&version_info))
- return -ENODEV;
- ret = misc_register(&mshv_dev);
- if (ret)
- return ret;
- dev = mshv_dev.this_device;
- if (version_info.build_number < MSHV_HV_MIN_VERSION ||
- version_info.build_number > MSHV_HV_MAX_VERSION) {
- dev_err(dev, "Running on unvalidated Hyper-V version\n");
- dev_err(dev, "Versions: current: %u min: %u max: %u\n",
- version_info.build_number, MSHV_HV_MIN_VERSION,
- MSHV_HV_MAX_VERSION);
- }
- ret = mshv_synic_init(dev);
- if (ret)
- goto device_deregister;
- ret = mshv_init_vmm_caps(dev);
- if (ret)
- goto synic_cleanup;
- ret = mshv_retrieve_scheduler_type(dev);
- if (ret)
- goto synic_cleanup;
- ret = root_scheduler_init(dev);
- if (ret)
- goto synic_cleanup;
- ret = mshv_debugfs_init();
- if (ret)
- goto deinit_root_scheduler;
- ret = mshv_irqfd_wq_init();
- if (ret)
- goto exit_debugfs;
- spin_lock_init(&mshv_root.pt_ht_lock);
- hash_init(mshv_root.pt_htable);
- hv_setup_mshv_handler(mshv_isr);
- return 0;
- exit_debugfs:
- mshv_debugfs_exit();
- deinit_root_scheduler:
- root_scheduler_deinit();
- synic_cleanup:
- mshv_synic_exit();
- device_deregister:
- misc_deregister(&mshv_dev);
- return ret;
- }
- static void __exit mshv_parent_partition_exit(void)
- {
- hv_setup_mshv_handler(NULL);
- mshv_port_table_fini();
- mshv_debugfs_exit();
- misc_deregister(&mshv_dev);
- mshv_irqfd_wq_cleanup();
- root_scheduler_deinit();
- mshv_synic_exit();
- }
- module_init(mshv_parent_partition_init); /* driver entry point */
- module_exit(mshv_parent_partition_exit); /* driver teardown on unload */
|