kvm_main.c 167 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527
75278527952805281528252835284528552865287528852895290529152925293529452955296529752985299530053015302530353045305530653075308530953105311531253135314531553165317531853195320532153225323532453255326532753285329533053315332533353345335533653375338533953405341534253435344534553465347534853495350535153525353535453555356535753585359536053615362536353645365536653675368536953705371537253735374537553765377537853795380538153825383538453855386538753885389539053915392539353945395539653975398539954005401540254035404540554065407540854095410541154125413541454155416541754185419542054215422542354245425542654275428542954305431543254335434543554365437543854395440544154425443544454455446544754485449545054515452545354545455545654575458545954605461546254635464546554665467546854695470547154725473547454755476547754785479548054815482548354845485548654875488548954905491549254935494549554965497549854995500550155025503550455055506550755085509551055115512551355145515551655175518551955205521552255235524552555265527552855295530553155325533553455355536553755385539554055415542554355445545554655475548554955505551555255535554555555565557555855595560556155625563556455655566556755685569557055715572557355745575557655775578557955805581558255835584558555865587558855895590559155925593559455955596559755985599560056015602560356045605560656075608560956105611561256135614561556165617561856195620562156225623562456255626562756285629563056315632563356345635563656375638563956405641564256435644564556465647564856495650565156525653565456555656565756585659566056615662566356645665566656675668566956705671567256735674567556765677567856795680568156825683568456855686568756885689569056915692569356945695569656975698569957005701570257035704570557065707570857095710571157125713571457155716571757185719572057215722572357245725572657275728572957305731573257335734573557365737573857395740574157425743574457455746574757485749575057515752575357545755575657575758575957605761576257635764576557665767576857695770577157725773577457755776577
75778577957805781578257835784578557865787578857895790579157925793579457955796579757985799580058015802580358045805580658075808580958105811581258135814581558165817581858195820582158225823582458255826582758285829583058315832583358345835583658375838583958405841584258435844584558465847584858495850585158525853585458555856585758585859586058615862586358645865586658675868586958705871587258735874587558765877587858795880588158825883588458855886588758885889589058915892589358945895589658975898589959005901590259035904590559065907590859095910591159125913591459155916591759185919592059215922592359245925592659275928592959305931593259335934593559365937593859395940594159425943594459455946594759485949595059515952595359545955595659575958595959605961596259635964596559665967596859695970597159725973597459755976597759785979598059815982598359845985598659875988598959905991599259935994599559965997599859996000600160026003600460056006600760086009601060116012601360146015601660176018601960206021602260236024602560266027602860296030603160326033603460356036603760386039604060416042604360446045604660476048604960506051605260536054605560566057605860596060606160626063606460656066606760686069607060716072607360746075607660776078607960806081608260836084608560866087608860896090609160926093609460956096609760986099610061016102610361046105610661076108610961106111611261136114611561166117611861196120612161226123612461256126612761286129613061316132613361346135613661376138613961406141614261436144614561466147614861496150615161526153615461556156615761586159616061616162616361646165616661676168616961706171617261736174617561766177617861796180618161826183618461856186618761886189619061916192619361946195619661976198619962006201620262036204620562066207620862096210621162126213621462156216621762186219622062216222622362246225622662276228622962306231623262336234623562366237623862396240624162426243624462456246624762486249625062516252625362546255625662576258625962606261626262636264626562666267626862696270627162726273627462756276627
76278627962806281628262836284628562866287628862896290629162926293629462956296629762986299630063016302630363046305630663076308630963106311631263136314631563166317631863196320632163226323632463256326632763286329633063316332633363346335633663376338633963406341634263436344634563466347634863496350635163526353635463556356635763586359636063616362636363646365636663676368636963706371637263736374637563766377637863796380638163826383638463856386638763886389639063916392639363946395639663976398639964006401640264036404640564066407640864096410641164126413641464156416641764186419642064216422642364246425642664276428642964306431643264336434643564366437643864396440644164426443644464456446644764486449645064516452645364546455645664576458645964606461646264636464646564666467646864696470647164726473647464756476647764786479648064816482648364846485648664876488648964906491649264936494649564966497649864996500650165026503650465056506650765086509651065116512651365146515651665176518651965206521652265236524652565266527652865296530653165326533653465356536653765386539654065416542654365446545654665476548654965506551655265536554655565566557655865596560656165626563656465656566656765686569657065716572657365746575657665776578657965806581658265836584658565866587658865896590659165926593659465956596
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * Kernel-based Virtual Machine (KVM) Hypervisor
  4. *
  5. * Copyright (C) 2006 Qumranet, Inc.
  6. * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  7. *
  8. * Authors:
  9. * Avi Kivity <avi@qumranet.com>
  10. * Yaniv Kamay <yaniv@qumranet.com>
  11. */
  12. #include <kvm/iodev.h>
  13. #include <linux/kvm_host.h>
  14. #include <linux/kvm.h>
  15. #include <linux/module.h>
  16. #include <linux/errno.h>
  17. #include <linux/percpu.h>
  18. #include <linux/mm.h>
  19. #include <linux/miscdevice.h>
  20. #include <linux/vmalloc.h>
  21. #include <linux/reboot.h>
  22. #include <linux/debugfs.h>
  23. #include <linux/highmem.h>
  24. #include <linux/file.h>
  25. #include <linux/syscore_ops.h>
  26. #include <linux/cpu.h>
  27. #include <linux/sched/signal.h>
  28. #include <linux/sched/mm.h>
  29. #include <linux/sched/stat.h>
  30. #include <linux/cpumask.h>
  31. #include <linux/smp.h>
  32. #include <linux/anon_inodes.h>
  33. #include <linux/profile.h>
  34. #include <linux/kvm_para.h>
  35. #include <linux/pagemap.h>
  36. #include <linux/mman.h>
  37. #include <linux/swap.h>
  38. #include <linux/bitops.h>
  39. #include <linux/spinlock.h>
  40. #include <linux/compat.h>
  41. #include <linux/srcu.h>
  42. #include <linux/hugetlb.h>
  43. #include <linux/slab.h>
  44. #include <linux/sort.h>
  45. #include <linux/bsearch.h>
  46. #include <linux/io.h>
  47. #include <linux/lockdep.h>
  48. #include <linux/kthread.h>
  49. #include <linux/suspend.h>
  50. #include <linux/rseq.h>
  51. #include <asm/processor.h>
  52. #include <asm/ioctl.h>
  53. #include <linux/uaccess.h>
  54. #include "coalesced_mmio.h"
  55. #include "async_pf.h"
  56. #include "kvm_mm.h"
  57. #include "vfio.h"
  58. #include <trace/events/ipi.h>
  59. #define CREATE_TRACE_POINTS
  60. #include <trace/events/kvm.h>
  61. #include <linux/kvm_dirty_ring.h>
  62. /* Worst case buffer size needed for holding an integer. */
  63. #define ITOA_MAX_LEN 12
  64. MODULE_AUTHOR("Qumranet");
  65. MODULE_DESCRIPTION("Kernel-based Virtual Machine (KVM) Hypervisor");
  66. MODULE_LICENSE("GPL");
  67. /* Architectures should define their poll value according to the halt latency */
  68. unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
  69. module_param(halt_poll_ns, uint, 0644);
  70. EXPORT_SYMBOL_FOR_KVM_INTERNAL(halt_poll_ns);
  71. /* Default doubles per-vcpu halt_poll_ns. */
  72. unsigned int halt_poll_ns_grow = 2;
  73. module_param(halt_poll_ns_grow, uint, 0644);
  74. EXPORT_SYMBOL_FOR_KVM_INTERNAL(halt_poll_ns_grow);
  75. /* The start value to grow halt_poll_ns from */
  76. unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
  77. module_param(halt_poll_ns_grow_start, uint, 0644);
  78. EXPORT_SYMBOL_FOR_KVM_INTERNAL(halt_poll_ns_grow_start);
  79. /* Default halves per-vcpu halt_poll_ns. */
  80. unsigned int halt_poll_ns_shrink = 2;
  81. module_param(halt_poll_ns_shrink, uint, 0644);
  82. EXPORT_SYMBOL_FOR_KVM_INTERNAL(halt_poll_ns_shrink);
  83. /*
  84. * Allow direct access (from KVM or the CPU) without MMU notifier protection
  85. * to unpinned pages.
  86. */
  87. static bool allow_unsafe_mappings;
  88. module_param(allow_unsafe_mappings, bool, 0444);
  89. /*
  90. * Ordering of locks:
  91. *
  92. * kvm->lock --> kvm->slots_lock --> kvm->irq_lock
  93. */
  94. DEFINE_MUTEX(kvm_lock);
  95. LIST_HEAD(vm_list);
  96. static struct kmem_cache *kvm_vcpu_cache;
  97. static __read_mostly struct preempt_ops kvm_preempt_ops;
  98. static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);
  99. static struct dentry *kvm_debugfs_dir;
  100. static const struct file_operations stat_fops_per_vm;
  101. static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
  102. unsigned long arg);
  103. #ifdef CONFIG_KVM_COMPAT
  104. static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
  105. unsigned long arg);
  106. #define KVM_COMPAT(c) .compat_ioctl = (c)
  107. #else
  108. /*
  109. * For architectures that don't implement a compat infrastructure,
  110. * adopt a double line of defense:
  111. * - Prevent a compat task from opening /dev/kvm
  112. * - If the open has been done by a 64bit task, and the KVM fd
  113. * passed to a compat task, let the ioctls fail.
  114. */
  /*
   * Compat ioctl handler used when no compat infrastructure is built in.
   * All arguments are ignored and every call is rejected with -EINVAL, so an
   * ioctl issued through this handler always fails.
   */
  static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
                                  unsigned long arg)
  {
          return -EINVAL;
  }
  /*
   * Open handler used when no compat infrastructure is built in.
   *
   * Returns -ENODEV when the opener is a compat task (as reported by
   * is_compat_task()), and 0 otherwise. @inode and @file are not inspected.
   */
  static int kvm_no_compat_open(struct inode *inode, struct file *file)
  {
          if (is_compat_task())
                  return -ENODEV;

          return 0;
  }
  121. #define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl, \
  122. .open = kvm_no_compat_open
  123. #endif
  124. static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
  125. #define KVM_EVENT_CREATE_VM 0
  126. #define KVM_EVENT_DESTROY_VM 1
  127. static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
  128. static unsigned long long kvm_createvm_count;
  129. static unsigned long long kvm_active_vms;
  130. static DEFINE_PER_CPU(cpumask_var_t, cpu_kick_mask);
  131. __weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
  132. {
  133. }
  134. /*
  135. * Switches to specified vcpu, until a matching vcpu_put()
  136. */
  137. void vcpu_load(struct kvm_vcpu *vcpu)
  138. {
  139. int cpu = get_cpu();
  140. __this_cpu_write(kvm_running_vcpu, vcpu);
  141. preempt_notifier_register(&vcpu->preempt_notifier);
  142. kvm_arch_vcpu_load(vcpu, cpu);
  143. put_cpu();
  144. }
  145. EXPORT_SYMBOL_FOR_KVM_INTERNAL(vcpu_load);
  146. void vcpu_put(struct kvm_vcpu *vcpu)
  147. {
  148. preempt_disable();
  149. kvm_arch_vcpu_put(vcpu);
  150. preempt_notifier_unregister(&vcpu->preempt_notifier);
  151. __this_cpu_write(kvm_running_vcpu, NULL);
  152. preempt_enable();
  153. }
  154. EXPORT_SYMBOL_FOR_KVM_INTERNAL(vcpu_put);
  155. /* TODO: merge with kvm_arch_vcpu_should_kick */
  156. static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req)
  157. {
  158. int mode = kvm_vcpu_exiting_guest_mode(vcpu);
  159. /*
  160. * We need to wait for the VCPU to reenable interrupts and get out of
  161. * READING_SHADOW_PAGE_TABLES mode.
  162. */
  163. if (req & KVM_REQUEST_WAIT)
  164. return mode != OUTSIDE_GUEST_MODE;
  165. /*
  166. * Need to kick a running VCPU, but otherwise there is nothing to do.
  167. */
  168. return mode == IN_GUEST_MODE;
  169. }
  /*
   * Callback passed to smp_call_function_many() by kvm_kick_many_cpus().
   * It deliberately does no work and never touches its argument.
   */
  static void ack_kick(void *_completed)
  {
          /* Intentionally empty. */
  }
  173. static inline bool kvm_kick_many_cpus(struct cpumask *cpus, bool wait)
  174. {
  175. if (cpumask_empty(cpus))
  176. return false;
  177. smp_call_function_many(cpus, ack_kick, NULL, wait);
  178. return true;
  179. }
  180. static void kvm_make_vcpu_request(struct kvm_vcpu *vcpu, unsigned int req,
  181. struct cpumask *tmp, int current_cpu)
  182. {
  183. int cpu;
  184. if (likely(!(req & KVM_REQUEST_NO_ACTION)))
  185. __kvm_make_request(req, vcpu);
  186. if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
  187. return;
  188. /*
  189. * Note, the vCPU could get migrated to a different pCPU at any point
  190. * after kvm_request_needs_ipi(), which could result in sending an IPI
  191. * to the previous pCPU. But, that's OK because the purpose of the IPI
  192. * is to ensure the vCPU returns to OUTSIDE_GUEST_MODE, which is
  193. * satisfied if the vCPU migrates. Entering READING_SHADOW_PAGE_TABLES
  194. * after this point is also OK, as the requirement is only that KVM wait
  195. * for vCPUs that were reading SPTEs _before_ any changes were
  196. * finalized. See kvm_vcpu_kick() for more details on handling requests.
  197. */
  198. if (kvm_request_needs_ipi(vcpu, req)) {
  199. cpu = READ_ONCE(vcpu->cpu);
  200. if (cpu != -1 && cpu != current_cpu)
  201. __cpumask_set_cpu(cpu, tmp);
  202. }
  203. }
  204. bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
  205. unsigned long *vcpu_bitmap)
  206. {
  207. struct kvm_vcpu *vcpu;
  208. struct cpumask *cpus;
  209. int i, me;
  210. bool called;
  211. me = get_cpu();
  212. cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
  213. cpumask_clear(cpus);
  214. for_each_set_bit(i, vcpu_bitmap, KVM_MAX_VCPUS) {
  215. vcpu = kvm_get_vcpu(kvm, i);
  216. if (!vcpu)
  217. continue;
  218. kvm_make_vcpu_request(vcpu, req, cpus, me);
  219. }
  220. called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
  221. put_cpu();
  222. return called;
  223. }
  224. bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
  225. {
  226. struct kvm_vcpu *vcpu;
  227. struct cpumask *cpus;
  228. unsigned long i;
  229. bool called;
  230. int me;
  231. me = get_cpu();
  232. cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
  233. cpumask_clear(cpus);
  234. kvm_for_each_vcpu(i, vcpu, kvm)
  235. kvm_make_vcpu_request(vcpu, req, cpus, me);
  236. called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
  237. put_cpu();
  238. return called;
  239. }
  240. EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_make_all_cpus_request);
  241. void kvm_flush_remote_tlbs(struct kvm *kvm)
  242. {
  243. ++kvm->stat.generic.remote_tlb_flush_requests;
  244. /*
  245. * We want to publish modifications to the page tables before reading
  246. * mode. Pairs with a memory barrier in arch-specific code.
  247. * - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest
  248. * and smp_mb in walk_shadow_page_lockless_begin/end.
  249. * - powerpc: smp_mb in kvmppc_prepare_to_enter.
  250. *
  251. * There is already an smp_mb__after_atomic() before
  252. * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that
  253. * barrier here.
  254. */
  255. if (!kvm_arch_flush_remote_tlbs(kvm)
  256. || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
  257. ++kvm->stat.generic.remote_tlb_flush;
  258. }
  259. EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_flush_remote_tlbs);
  260. void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages)
  261. {
  262. if (!kvm_arch_flush_remote_tlbs_range(kvm, gfn, nr_pages))
  263. return;
  264. /*
  265. * Fall back to a flushing entire TLBs if the architecture range-based
  266. * TLB invalidation is unsupported or can't be performed for whatever
  267. * reason.
  268. */
  269. kvm_flush_remote_tlbs(kvm);
  270. }
  271. void kvm_flush_remote_tlbs_memslot(struct kvm *kvm,
  272. const struct kvm_memory_slot *memslot)
  273. {
  274. /*
  275. * All current use cases for flushing the TLBs for a specific memslot
  276. * are related to dirty logging, and many do the TLB flush out of
  277. * mmu_lock. The interaction between the various operations on memslot
  278. * must be serialized by slots_lock to ensure the TLB flush from one
  279. * operation is observed by any other operation on the same memslot.
  280. */
  281. lockdep_assert_held(&kvm->slots_lock);
  282. kvm_flush_remote_tlbs_range(kvm, memslot->base_gfn, memslot->npages);
  283. }
  /*
   * Ask the architecture to flush all shadow state of @kvm, then notify it
   * that guest memory has been reclaimed.
   */
  static void kvm_flush_shadow_all(struct kvm *kvm)
  {
  	kvm_arch_flush_shadow_all(kvm);

  	kvm_arch_guest_memory_reclaimed(kvm);
  }
  289. #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
  290. static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
  291. gfp_t gfp_flags)
  292. {
  293. void *page;
  294. gfp_flags |= mc->gfp_zero;
  295. if (mc->kmem_cache)
  296. return kmem_cache_alloc(mc->kmem_cache, gfp_flags);
  297. page = (void *)__get_free_page(gfp_flags);
  298. if (page && mc->init_value)
  299. memset64(page, mc->init_value, PAGE_SIZE / sizeof(u64));
  300. return page;
  301. }
  302. int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min)
  303. {
  304. gfp_t gfp = mc->gfp_custom ? mc->gfp_custom : GFP_KERNEL_ACCOUNT;
  305. void *obj;
  306. if (mc->nobjs >= min)
  307. return 0;
  308. if (unlikely(!mc->objects)) {
  309. if (WARN_ON_ONCE(!capacity))
  310. return -EIO;
  311. /*
  312. * Custom init values can be used only for page allocations,
  313. * and obviously conflict with __GFP_ZERO.
  314. */
  315. if (WARN_ON_ONCE(mc->init_value && (mc->kmem_cache || mc->gfp_zero)))
  316. return -EIO;
  317. mc->objects = kvmalloc_array(capacity, sizeof(void *), gfp);
  318. if (!mc->objects)
  319. return -ENOMEM;
  320. mc->capacity = capacity;
  321. }
  322. /* It is illegal to request a different capacity across topups. */
  323. if (WARN_ON_ONCE(mc->capacity != capacity))
  324. return -EIO;
  325. while (mc->nobjs < mc->capacity) {
  326. obj = mmu_memory_cache_alloc_obj(mc, gfp);
  327. if (!obj)
  328. return mc->nobjs >= min ? 0 : -ENOMEM;
  329. mc->objects[mc->nobjs++] = obj;
  330. }
  331. return 0;
  332. }
  333. int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
  334. {
  335. return __kvm_mmu_topup_memory_cache(mc, KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE, min);
  336. }
  337. int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
  338. {
  339. return mc->nobjs;
  340. }
  341. void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
  342. {
  343. while (mc->nobjs) {
  344. if (mc->kmem_cache)
  345. kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]);
  346. else
  347. free_page((unsigned long)mc->objects[--mc->nobjs]);
  348. }
  349. kvfree(mc->objects);
  350. mc->objects = NULL;
  351. mc->capacity = 0;
  352. }
  353. void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
  354. {
  355. void *p;
  356. if (WARN_ON(!mc->nobjs))
  357. p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT);
  358. else
  359. p = mc->objects[--mc->nobjs];
  360. BUG_ON(!p);
  361. return p;
  362. }
  363. #endif
  364. static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
  365. {
  366. mutex_init(&vcpu->mutex);
  367. vcpu->cpu = -1;
  368. vcpu->kvm = kvm;
  369. vcpu->vcpu_id = id;
  370. vcpu->pid = NULL;
  371. rwlock_init(&vcpu->pid_lock);
  372. #ifndef __KVM_HAVE_ARCH_WQP
  373. rcuwait_init(&vcpu->wait);
  374. #endif
  375. kvm_async_pf_vcpu_init(vcpu);
  376. kvm_vcpu_set_in_spin_loop(vcpu, false);
  377. kvm_vcpu_set_dy_eligible(vcpu, false);
  378. vcpu->preempted = false;
  379. vcpu->ready = false;
  380. preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
  381. vcpu->last_used_slot = NULL;
  382. /* Fill the stats id string for the vcpu */
  383. snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d",
  384. task_pid_nr(current), id);
  385. }
  386. static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
  387. {
  388. kvm_arch_vcpu_destroy(vcpu);
  389. kvm_dirty_ring_free(&vcpu->dirty_ring);
  390. /*
  391. * No need for rcu_read_lock as VCPU_RUN is the only place that changes
  392. * the vcpu->pid pointer, and at destruction time all file descriptors
  393. * are already gone.
  394. */
  395. put_pid(vcpu->pid);
  396. free_page((unsigned long)vcpu->run);
  397. kmem_cache_free(kvm_vcpu_cache, vcpu);
  398. }
  399. void kvm_destroy_vcpus(struct kvm *kvm)
  400. {
  401. unsigned long i;
  402. struct kvm_vcpu *vcpu;
  403. kvm_for_each_vcpu(i, vcpu, kvm) {
  404. kvm_vcpu_destroy(vcpu);
  405. xa_erase(&kvm->vcpu_array, i);
  406. /*
  407. * Assert that the vCPU isn't visible in any way, to ensure KVM
  408. * doesn't trigger a use-after-free if destroying vCPUs results
  409. * in VM-wide request, e.g. to flush remote TLBs when tearing
  410. * down MMUs, or to mark the VM dead if a KVM_BUG_ON() fires.
  411. */
  412. WARN_ON_ONCE(xa_load(&kvm->vcpu_array, i) || kvm_get_vcpu(kvm, i));
  413. }
  414. atomic_set(&kvm->online_vcpus, 0);
  415. }
  416. EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_destroy_vcpus);
  417. static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
  418. {
  419. return container_of(mn, struct kvm, mmu_notifier);
  420. }
  421. typedef bool (*gfn_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);
  422. typedef void (*on_lock_fn_t)(struct kvm *kvm);
  423. struct kvm_mmu_notifier_range {
  424. /*
  425. * 64-bit addresses, as KVM notifiers can operate on host virtual
  426. * addresses (unsigned long) and guest physical addresses (64-bit).
  427. */
  428. u64 start;
  429. u64 end;
  430. union kvm_mmu_notifier_arg arg;
  431. gfn_handler_t handler;
  432. on_lock_fn_t on_lock;
  433. bool flush_on_ret;
  434. bool may_block;
  435. bool lockless;
  436. };
  437. /*
  438. * The inner-most helper returns a tuple containing the return value from the
  439. * arch- and action-specific handler, plus a flag indicating whether or not at
  440. * least one memslot was found, i.e. if the handler found guest memory.
  441. *
  442. * Note, most notifiers are averse to booleans, so even though KVM tracks the
  443. * return from arch code as a bool, outer helpers will cast it to an int. :-(
  444. */
  445. typedef struct kvm_mmu_notifier_return {
  446. bool ret;
  447. bool found_memslot;
  448. } kvm_mn_ret_t;
  /*
   * A dedicated stub, rather than NULL, marks "no callback function/handler".
   * The compiler technically can't guarantee that a real function will have a
   * non-zero address, and so it will generate code to check for !NULL,
   * whereas a comparison against a stub will be elided at compile time
   * (unless the compiler is getting long in the tooth, e.g. gcc 4.9).
   */
  static void kvm_null_fn(void)
  {
  	/* Never meant to be called; only its address is compared. */
  }

  /* True if @fn is the "no callback" stub above. */
  #define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn)
  /*
   * Iterate over each memslot intersecting [start, last] (inclusive) range.
   *
   * @node:  struct interval_tree_node * cursor; NULL once the walk is over.
   * @slots: pointer to the memslot set whose hva_tree is searched.
   * @start: first address of the range.
   * @last:  last address of the range (inclusive).
   *
   * The final line deliberately carries no continuation backslash, so the
   * macro never splices in whatever source line follows the definition.
   */
  #define kvm_for_each_memslot_in_hva_range(node, slots, start, last)	\
  	for (node = interval_tree_iter_first(&(slots)->hva_tree, start, last); \
  	     node;							\
  	     node = interval_tree_iter_next(node, start, last))
  465. static __always_inline kvm_mn_ret_t kvm_handle_hva_range(struct kvm *kvm,
  466. const struct kvm_mmu_notifier_range *range)
  467. {
  468. struct kvm_mmu_notifier_return r = {
  469. .ret = false,
  470. .found_memslot = false,
  471. };
  472. struct kvm_gfn_range gfn_range;
  473. struct kvm_memory_slot *slot;
  474. struct kvm_memslots *slots;
  475. int i, idx;
  476. if (WARN_ON_ONCE(range->end <= range->start))
  477. return r;
  478. /* A null handler is allowed if and only if on_lock() is provided. */
  479. if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) &&
  480. IS_KVM_NULL_FN(range->handler)))
  481. return r;
  482. /* on_lock will never be called for lockless walks */
  483. if (WARN_ON_ONCE(range->lockless && !IS_KVM_NULL_FN(range->on_lock)))
  484. return r;
  485. idx = srcu_read_lock(&kvm->srcu);
  486. for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
  487. struct interval_tree_node *node;
  488. slots = __kvm_memslots(kvm, i);
  489. kvm_for_each_memslot_in_hva_range(node, slots,
  490. range->start, range->end - 1) {
  491. unsigned long hva_start, hva_end;
  492. slot = container_of(node, struct kvm_memory_slot, hva_node[slots->node_idx]);
  493. hva_start = max_t(unsigned long, range->start, slot->userspace_addr);
  494. hva_end = min_t(unsigned long, range->end,
  495. slot->userspace_addr + (slot->npages << PAGE_SHIFT));
  496. /*
  497. * To optimize for the likely case where the address
  498. * range is covered by zero or one memslots, don't
  499. * bother making these conditional (to avoid writes on
  500. * the second or later invocation of the handler).
  501. */
  502. gfn_range.arg = range->arg;
  503. gfn_range.may_block = range->may_block;
  504. /*
  505. * HVA-based notifications aren't relevant to private
  506. * mappings as they don't have a userspace mapping.
  507. */
  508. gfn_range.attr_filter = KVM_FILTER_SHARED;
  509. /*
  510. * {gfn(page) | page intersects with [hva_start, hva_end)} =
  511. * {gfn_start, gfn_start+1, ..., gfn_end-1}.
  512. */
  513. gfn_range.start = hva_to_gfn_memslot(hva_start, slot);
  514. gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
  515. gfn_range.slot = slot;
  516. gfn_range.lockless = range->lockless;
  517. if (!r.found_memslot) {
  518. r.found_memslot = true;
  519. if (!range->lockless) {
  520. KVM_MMU_LOCK(kvm);
  521. if (!IS_KVM_NULL_FN(range->on_lock))
  522. range->on_lock(kvm);
  523. if (IS_KVM_NULL_FN(range->handler))
  524. goto mmu_unlock;
  525. }
  526. }
  527. r.ret |= range->handler(kvm, &gfn_range);
  528. }
  529. }
  530. if (range->flush_on_ret && r.ret)
  531. kvm_flush_remote_tlbs(kvm);
  532. mmu_unlock:
  533. if (r.found_memslot && !range->lockless)
  534. KVM_MMU_UNLOCK(kvm);
  535. srcu_read_unlock(&kvm->srcu, idx);
  536. return r;
  537. }
  538. static __always_inline int kvm_age_hva_range(struct mmu_notifier *mn,
  539. unsigned long start,
  540. unsigned long end,
  541. gfn_handler_t handler,
  542. bool flush_on_ret)
  543. {
  544. struct kvm *kvm = mmu_notifier_to_kvm(mn);
  545. const struct kvm_mmu_notifier_range range = {
  546. .start = start,
  547. .end = end,
  548. .handler = handler,
  549. .on_lock = (void *)kvm_null_fn,
  550. .flush_on_ret = flush_on_ret,
  551. .may_block = false,
  552. .lockless = IS_ENABLED(CONFIG_KVM_MMU_LOCKLESS_AGING),
  553. };
  554. return kvm_handle_hva_range(kvm, &range).ret;
  555. }
  556. static __always_inline int kvm_age_hva_range_no_flush(struct mmu_notifier *mn,
  557. unsigned long start,
  558. unsigned long end,
  559. gfn_handler_t handler)
  560. {
  561. return kvm_age_hva_range(mn, start, end, handler, false);
  562. }
  563. void kvm_mmu_invalidate_begin(struct kvm *kvm)
  564. {
  565. lockdep_assert_held_write(&kvm->mmu_lock);
  566. /*
  567. * The count increase must become visible at unlock time as no
  568. * spte can be established without taking the mmu_lock and
  569. * count is also read inside the mmu_lock critical section.
  570. */
  571. kvm->mmu_invalidate_in_progress++;
  572. if (likely(kvm->mmu_invalidate_in_progress == 1)) {
  573. kvm->mmu_invalidate_range_start = INVALID_GPA;
  574. kvm->mmu_invalidate_range_end = INVALID_GPA;
  575. }
  576. }
  577. void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end)
  578. {
  579. lockdep_assert_held_write(&kvm->mmu_lock);
  580. WARN_ON_ONCE(!kvm->mmu_invalidate_in_progress);
  581. if (likely(kvm->mmu_invalidate_range_start == INVALID_GPA)) {
  582. kvm->mmu_invalidate_range_start = start;
  583. kvm->mmu_invalidate_range_end = end;
  584. } else {
  585. /*
  586. * Fully tracking multiple concurrent ranges has diminishing
  587. * returns. Keep things simple and just find the minimal range
  588. * which includes the current and new ranges. As there won't be
  589. * enough information to subtract a range after its invalidate
  590. * completes, any ranges invalidated concurrently will
  591. * accumulate and persist until all outstanding invalidates
  592. * complete.
  593. */
  594. kvm->mmu_invalidate_range_start =
  595. min(kvm->mmu_invalidate_range_start, start);
  596. kvm->mmu_invalidate_range_end =
  597. max(kvm->mmu_invalidate_range_end, end);
  598. }
  599. }
  600. bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
  601. {
  602. kvm_mmu_invalidate_range_add(kvm, range->start, range->end);
  603. return kvm_unmap_gfn_range(kvm, range);
  604. }
  605. static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
  606. const struct mmu_notifier_range *range)
  607. {
  608. struct kvm *kvm = mmu_notifier_to_kvm(mn);
  609. const struct kvm_mmu_notifier_range hva_range = {
  610. .start = range->start,
  611. .end = range->end,
  612. .handler = kvm_mmu_unmap_gfn_range,
  613. .on_lock = kvm_mmu_invalidate_begin,
  614. .flush_on_ret = true,
  615. .may_block = mmu_notifier_range_blockable(range),
  616. };
  617. trace_kvm_unmap_hva_range(range->start, range->end);
  618. /*
  619. * Prevent memslot modification between range_start() and range_end()
  620. * so that conditionally locking provides the same result in both
  621. * functions. Without that guarantee, the mmu_invalidate_in_progress
  622. * adjustments will be imbalanced.
  623. *
  624. * Pairs with the decrement in range_end().
  625. */
  626. spin_lock(&kvm->mn_invalidate_lock);
  627. kvm->mn_active_invalidate_count++;
  628. spin_unlock(&kvm->mn_invalidate_lock);
  629. /*
  630. * Invalidate pfn caches _before_ invalidating the secondary MMUs, i.e.
  631. * before acquiring mmu_lock, to avoid holding mmu_lock while acquiring
  632. * each cache's lock. There are relatively few caches in existence at
  633. * any given time, and the caches themselves can check for hva overlap,
  634. * i.e. don't need to rely on memslot overlap checks for performance.
  635. * Because this runs without holding mmu_lock, the pfn caches must use
  636. * mn_active_invalidate_count (see above) instead of
  637. * mmu_invalidate_in_progress.
  638. */
  639. gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end);
  640. /*
  641. * If one or more memslots were found and thus zapped, notify arch code
  642. * that guest memory has been reclaimed. This needs to be done *after*
  643. * dropping mmu_lock, as x86's reclaim path is slooooow.
  644. */
  645. if (kvm_handle_hva_range(kvm, &hva_range).found_memslot)
  646. kvm_arch_guest_memory_reclaimed(kvm);
  647. return 0;
  648. }
  649. void kvm_mmu_invalidate_end(struct kvm *kvm)
  650. {
  651. lockdep_assert_held_write(&kvm->mmu_lock);
  652. /*
  653. * This sequence increase will notify the kvm page fault that
  654. * the page that is going to be mapped in the spte could have
  655. * been freed.
  656. */
  657. kvm->mmu_invalidate_seq++;
  658. smp_wmb();
  659. /*
  660. * The above sequence increase must be visible before the
  661. * below count decrease, which is ensured by the smp_wmb above
  662. * in conjunction with the smp_rmb in mmu_invalidate_retry().
  663. */
  664. kvm->mmu_invalidate_in_progress--;
  665. KVM_BUG_ON(kvm->mmu_invalidate_in_progress < 0, kvm);
  666. /*
  667. * Assert that at least one range was added between start() and end().
  668. * Not adding a range isn't fatal, but it is a KVM bug.
  669. */
  670. WARN_ON_ONCE(kvm->mmu_invalidate_range_start == INVALID_GPA);
  671. }
  672. static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
  673. const struct mmu_notifier_range *range)
  674. {
  675. struct kvm *kvm = mmu_notifier_to_kvm(mn);
  676. const struct kvm_mmu_notifier_range hva_range = {
  677. .start = range->start,
  678. .end = range->end,
  679. .handler = (void *)kvm_null_fn,
  680. .on_lock = kvm_mmu_invalidate_end,
  681. .flush_on_ret = false,
  682. .may_block = mmu_notifier_range_blockable(range),
  683. };
  684. bool wake;
  685. kvm_handle_hva_range(kvm, &hva_range);
  686. /* Pairs with the increment in range_start(). */
  687. spin_lock(&kvm->mn_invalidate_lock);
  688. if (!WARN_ON_ONCE(!kvm->mn_active_invalidate_count))
  689. --kvm->mn_active_invalidate_count;
  690. wake = !kvm->mn_active_invalidate_count;
  691. spin_unlock(&kvm->mn_invalidate_lock);
  692. /*
  693. * There can only be one waiter, since the wait happens under
  694. * slots_lock.
  695. */
  696. if (wake)
  697. rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);
  698. }
  699. static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
  700. struct mm_struct *mm,
  701. unsigned long start,
  702. unsigned long end)
  703. {
  704. trace_kvm_age_hva(start, end);
  705. return kvm_age_hva_range(mn, start, end, kvm_age_gfn,
  706. !IS_ENABLED(CONFIG_KVM_ELIDE_TLB_FLUSH_IF_YOUNG));
  707. }
  708. static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
  709. struct mm_struct *mm,
  710. unsigned long start,
  711. unsigned long end)
  712. {
  713. trace_kvm_age_hva(start, end);
  714. /*
  715. * Even though we do not flush TLB, this will still adversely
  716. * affect performance on pre-Haswell Intel EPT, where there is
  717. * no EPT Access Bit to clear so that we have to tear down EPT
  718. * tables instead. If we find this unacceptable, we can always
  719. * add a parameter to kvm_age_hva so that it effectively doesn't
  720. * do anything on clear_young.
  721. *
  722. * Also note that currently we never issue secondary TLB flushes
  723. * from clear_young, leaving this job up to the regular system
  724. * cadence. If we find this inaccurate, we might come up with a
  725. * more sophisticated heuristic later.
  726. */
  727. return kvm_age_hva_range_no_flush(mn, start, end, kvm_age_gfn);
  728. }
  729. static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
  730. struct mm_struct *mm,
  731. unsigned long address)
  732. {
  733. trace_kvm_test_age_hva(address);
  734. return kvm_age_hva_range_no_flush(mn, address, address + 1,
  735. kvm_test_age_gfn);
  736. }
  737. static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
  738. struct mm_struct *mm)
  739. {
  740. struct kvm *kvm = mmu_notifier_to_kvm(mn);
  741. int idx;
  742. idx = srcu_read_lock(&kvm->srcu);
  743. kvm_flush_shadow_all(kvm);
  744. srcu_read_unlock(&kvm->srcu, idx);
  745. }
  746. static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
  747. .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
  748. .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
  749. .clear_flush_young = kvm_mmu_notifier_clear_flush_young,
  750. .clear_young = kvm_mmu_notifier_clear_young,
  751. .test_young = kvm_mmu_notifier_test_young,
  752. .release = kvm_mmu_notifier_release,
  753. };
  754. static int kvm_init_mmu_notifier(struct kvm *kvm)
  755. {
  756. kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
  757. return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
  758. }
  759. #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
  760. static int kvm_pm_notifier_call(struct notifier_block *bl,
  761. unsigned long state,
  762. void *unused)
  763. {
  764. struct kvm *kvm = container_of(bl, struct kvm, pm_notifier);
  765. return kvm_arch_pm_notifier(kvm, state);
  766. }
  767. static void kvm_init_pm_notifier(struct kvm *kvm)
  768. {
  769. kvm->pm_notifier.notifier_call = kvm_pm_notifier_call;
  770. /* Suspend KVM before we suspend ftrace, RCU, etc. */
  771. kvm->pm_notifier.priority = INT_MAX;
  772. register_pm_notifier(&kvm->pm_notifier);
  773. }
  774. static void kvm_destroy_pm_notifier(struct kvm *kvm)
  775. {
  776. unregister_pm_notifier(&kvm->pm_notifier);
  777. }
  778. #else /* !CONFIG_HAVE_KVM_PM_NOTIFIER */
  /* No-op variant used when CONFIG_HAVE_KVM_PM_NOTIFIER is not set. */
  static void kvm_init_pm_notifier(struct kvm *kvm)
  {
  	/* Nothing to register. */
  }
  /* No-op variant used when CONFIG_HAVE_KVM_PM_NOTIFIER is not set. */
  static void kvm_destroy_pm_notifier(struct kvm *kvm)
  {
  	/* Nothing to unregister. */
  }
  785. #endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
  786. static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
  787. {
  788. if (!memslot->dirty_bitmap)
  789. return;
  790. vfree(memslot->dirty_bitmap);
  791. memslot->dirty_bitmap = NULL;
  792. }
  793. /* This does not remove the slot from struct kvm_memslots data structures */
  794. static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
  795. {
  796. if (slot->flags & KVM_MEM_GUEST_MEMFD)
  797. kvm_gmem_unbind(slot);
  798. kvm_destroy_dirty_bitmap(slot);
  799. kvm_arch_free_memslot(kvm, slot);
  800. kfree(slot);
  801. }
  802. static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
  803. {
  804. struct hlist_node *idnode;
  805. struct kvm_memory_slot *memslot;
  806. int bkt;
  807. /*
  808. * The same memslot objects live in both active and inactive sets,
  809. * arbitrarily free using index '1' so the second invocation of this
  810. * function isn't operating over a structure with dangling pointers
  811. * (even though this function isn't actually touching them).
  812. */
  813. if (!slots->node_idx)
  814. return;
  815. hash_for_each_safe(slots->id_hash, bkt, idnode, memslot, id_node[1])
  816. kvm_free_memslot(kvm, memslot);
  817. }
  818. static umode_t kvm_stats_debugfs_mode(const struct kvm_stats_desc *desc)
  819. {
  820. switch (desc->flags & KVM_STATS_TYPE_MASK) {
  821. case KVM_STATS_TYPE_INSTANT:
  822. return 0444;
  823. case KVM_STATS_TYPE_CUMULATIVE:
  824. case KVM_STATS_TYPE_PEAK:
  825. default:
  826. return 0644;
  827. }
  828. }
  829. static void kvm_destroy_vm_debugfs(struct kvm *kvm)
  830. {
  831. int i;
  832. int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
  833. kvm_vcpu_stats_header.num_desc;
  834. if (IS_ERR(kvm->debugfs_dentry))
  835. return;
  836. debugfs_remove_recursive(kvm->debugfs_dentry);
  837. if (kvm->debugfs_stat_data) {
  838. for (i = 0; i < kvm_debugfs_num_entries; i++)
  839. kfree(kvm->debugfs_stat_data[i]);
  840. kfree(kvm->debugfs_stat_data);
  841. }
  842. }
  843. static int kvm_create_vm_debugfs(struct kvm *kvm, const char *fdname)
  844. {
  845. static DEFINE_MUTEX(kvm_debugfs_lock);
  846. struct dentry *dent;
  847. char dir_name[ITOA_MAX_LEN * 2];
  848. struct kvm_stat_data *stat_data;
  849. const struct kvm_stats_desc *pdesc;
  850. int i, ret = -ENOMEM;
  851. int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
  852. kvm_vcpu_stats_header.num_desc;
  853. if (!debugfs_initialized())
  854. return 0;
  855. snprintf(dir_name, sizeof(dir_name), "%d-%s", task_pid_nr(current), fdname);
  856. mutex_lock(&kvm_debugfs_lock);
  857. dent = debugfs_lookup(dir_name, kvm_debugfs_dir);
  858. if (dent) {
  859. pr_warn_ratelimited("KVM: debugfs: duplicate directory %s\n", dir_name);
  860. dput(dent);
  861. mutex_unlock(&kvm_debugfs_lock);
  862. return 0;
  863. }
  864. dent = debugfs_create_dir(dir_name, kvm_debugfs_dir);
  865. mutex_unlock(&kvm_debugfs_lock);
  866. if (IS_ERR(dent))
  867. return 0;
  868. kvm->debugfs_dentry = dent;
  869. kvm->debugfs_stat_data = kzalloc_objs(*kvm->debugfs_stat_data,
  870. kvm_debugfs_num_entries,
  871. GFP_KERNEL_ACCOUNT);
  872. if (!kvm->debugfs_stat_data)
  873. goto out_err;
  874. for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
  875. pdesc = &kvm_vm_stats_desc[i];
  876. stat_data = kzalloc_obj(*stat_data, GFP_KERNEL_ACCOUNT);
  877. if (!stat_data)
  878. goto out_err;
  879. stat_data->kvm = kvm;
  880. stat_data->desc = pdesc;
  881. stat_data->kind = KVM_STAT_VM;
  882. kvm->debugfs_stat_data[i] = stat_data;
  883. debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
  884. kvm->debugfs_dentry, stat_data,
  885. &stat_fops_per_vm);
  886. }
  887. for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
  888. pdesc = &kvm_vcpu_stats_desc[i];
  889. stat_data = kzalloc_obj(*stat_data, GFP_KERNEL_ACCOUNT);
  890. if (!stat_data)
  891. goto out_err;
  892. stat_data->kvm = kvm;
  893. stat_data->desc = pdesc;
  894. stat_data->kind = KVM_STAT_VCPU;
  895. kvm->debugfs_stat_data[i + kvm_vm_stats_header.num_desc] = stat_data;
  896. debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
  897. kvm->debugfs_dentry, stat_data,
  898. &stat_fops_per_vm);
  899. }
  900. kvm_arch_create_vm_debugfs(kvm);
  901. return 0;
  902. out_err:
  903. kvm_destroy_vm_debugfs(kvm);
  904. return ret;
  905. }
  906. /*
  907. * Called just after removing the VM from the vm_list, but before doing any
  908. * other destruction.
  909. */
  910. void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
  911. {
  912. }
  913. /*
  914. * Called after per-vm debugfs created. When called kvm->debugfs_dentry should
  915. * be setup already, so we can create arch-specific debugfs entries under it.
  916. * Cleanup should be automatic done in kvm_destroy_vm_debugfs() recursively, so
  917. * a per-arch destroy interface is not needed.
  918. */
  919. void __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
  920. {
  921. }
  922. /* Called only on cleanup and destruction paths when there are no users. */
  923. static inline struct kvm_io_bus *kvm_get_bus_for_destruction(struct kvm *kvm,
  924. enum kvm_bus idx)
  925. {
  926. return rcu_dereference_protected(kvm->buses[idx],
  927. !refcount_read(&kvm->users_count));
  928. }
  929. static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
  930. {
  931. struct kvm *kvm = kvm_arch_alloc_vm();
  932. struct kvm_memslots *slots;
  933. int r, i, j;
  934. if (!kvm)
  935. return ERR_PTR(-ENOMEM);
  936. KVM_MMU_LOCK_INIT(kvm);
  937. mmgrab(current->mm);
  938. kvm->mm = current->mm;
  939. kvm_eventfd_init(kvm);
  940. mutex_init(&kvm->lock);
  941. mutex_init(&kvm->irq_lock);
  942. mutex_init(&kvm->slots_lock);
  943. mutex_init(&kvm->slots_arch_lock);
  944. spin_lock_init(&kvm->mn_invalidate_lock);
  945. rcuwait_init(&kvm->mn_memslots_update_rcuwait);
  946. xa_init(&kvm->vcpu_array);
  947. #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
  948. xa_init(&kvm->mem_attr_array);
  949. #endif
  950. INIT_LIST_HEAD(&kvm->gpc_list);
  951. spin_lock_init(&kvm->gpc_lock);
  952. INIT_LIST_HEAD(&kvm->devices);
  953. kvm->max_vcpus = KVM_MAX_VCPUS;
  954. BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
  955. /*
  956. * Force subsequent debugfs file creations to fail if the VM directory
  957. * is not created (by kvm_create_vm_debugfs()).
  958. */
  959. kvm->debugfs_dentry = ERR_PTR(-ENOENT);
  960. snprintf(kvm->stats_id, sizeof(kvm->stats_id), "kvm-%d",
  961. task_pid_nr(current));
  962. r = -ENOMEM;
  963. if (init_srcu_struct(&kvm->srcu))
  964. goto out_err_no_srcu;
  965. if (init_srcu_struct(&kvm->irq_srcu))
  966. goto out_err_no_irq_srcu;
  967. r = kvm_init_irq_routing(kvm);
  968. if (r)
  969. goto out_err_no_irq_routing;
  970. refcount_set(&kvm->users_count, 1);
  971. for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
  972. for (j = 0; j < 2; j++) {
  973. slots = &kvm->__memslots[i][j];
  974. atomic_long_set(&slots->last_used_slot, (unsigned long)NULL);
  975. slots->hva_tree = RB_ROOT_CACHED;
  976. slots->gfn_tree = RB_ROOT;
  977. hash_init(slots->id_hash);
  978. slots->node_idx = j;
  979. /* Generations must be different for each address space. */
  980. slots->generation = i;
  981. }
  982. rcu_assign_pointer(kvm->memslots[i], &kvm->__memslots[i][0]);
  983. }
  984. r = -ENOMEM;
  985. for (i = 0; i < KVM_NR_BUSES; i++) {
  986. rcu_assign_pointer(kvm->buses[i],
  987. kzalloc_obj(struct kvm_io_bus, GFP_KERNEL_ACCOUNT));
  988. if (!kvm->buses[i])
  989. goto out_err_no_arch_destroy_vm;
  990. }
  991. r = kvm_arch_init_vm(kvm, type);
  992. if (r)
  993. goto out_err_no_arch_destroy_vm;
  994. r = kvm_enable_virtualization();
  995. if (r)
  996. goto out_err_no_disable;
  997. #ifdef CONFIG_HAVE_KVM_IRQCHIP
  998. INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
  999. #endif
  1000. r = kvm_init_mmu_notifier(kvm);
  1001. if (r)
  1002. goto out_err_no_mmu_notifier;
  1003. r = kvm_coalesced_mmio_init(kvm);
  1004. if (r < 0)
  1005. goto out_no_coalesced_mmio;
  1006. r = kvm_create_vm_debugfs(kvm, fdname);
  1007. if (r)
  1008. goto out_err_no_debugfs;
  1009. mutex_lock(&kvm_lock);
  1010. list_add(&kvm->vm_list, &vm_list);
  1011. mutex_unlock(&kvm_lock);
  1012. preempt_notifier_inc();
  1013. kvm_init_pm_notifier(kvm);
  1014. return kvm;
  1015. out_err_no_debugfs:
  1016. kvm_coalesced_mmio_free(kvm);
  1017. out_no_coalesced_mmio:
  1018. if (kvm->mmu_notifier.ops)
  1019. mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
  1020. out_err_no_mmu_notifier:
  1021. kvm_disable_virtualization();
  1022. out_err_no_disable:
  1023. kvm_arch_destroy_vm(kvm);
  1024. out_err_no_arch_destroy_vm:
  1025. WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
  1026. for (i = 0; i < KVM_NR_BUSES; i++)
  1027. kfree(kvm_get_bus_for_destruction(kvm, i));
  1028. kvm_free_irq_routing(kvm);
  1029. out_err_no_irq_routing:
  1030. cleanup_srcu_struct(&kvm->irq_srcu);
  1031. out_err_no_irq_srcu:
  1032. cleanup_srcu_struct(&kvm->srcu);
  1033. out_err_no_srcu:
  1034. kvm_arch_free_vm(kvm);
  1035. mmdrop(current->mm);
  1036. return ERR_PTR(r);
  1037. }
  1038. static void kvm_destroy_devices(struct kvm *kvm)
  1039. {
  1040. struct kvm_device *dev, *tmp;
  1041. /*
  1042. * We do not need to take the kvm->lock here, because nobody else
  1043. * has a reference to the struct kvm at this point and therefore
  1044. * cannot access the devices list anyhow.
  1045. *
  1046. * The device list is generally managed as an rculist, but list_del()
  1047. * is used intentionally here. If a bug in KVM introduced a reader that
  1048. * was not backed by a reference on the kvm struct, the hope is that
  1049. * it'd consume the poisoned forward pointer instead of suffering a
  1050. * use-after-free, even though this cannot be guaranteed.
  1051. */
  1052. list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
  1053. list_del(&dev->vm_node);
  1054. dev->ops->destroy(dev);
  1055. }
  1056. }
  1057. static void kvm_destroy_vm(struct kvm *kvm)
  1058. {
  1059. int i;
  1060. struct mm_struct *mm = kvm->mm;
  1061. kvm_destroy_pm_notifier(kvm);
  1062. kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
  1063. kvm_destroy_vm_debugfs(kvm);
  1064. mutex_lock(&kvm_lock);
  1065. list_del(&kvm->vm_list);
  1066. mutex_unlock(&kvm_lock);
  1067. kvm_arch_pre_destroy_vm(kvm);
  1068. kvm_free_irq_routing(kvm);
  1069. for (i = 0; i < KVM_NR_BUSES; i++) {
  1070. struct kvm_io_bus *bus = kvm_get_bus_for_destruction(kvm, i);
  1071. if (bus)
  1072. kvm_io_bus_destroy(bus);
  1073. kvm->buses[i] = NULL;
  1074. }
  1075. kvm_coalesced_mmio_free(kvm);
  1076. mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
  1077. /*
  1078. * At this point, pending calls to invalidate_range_start()
  1079. * have completed but no more MMU notifiers will run, so
  1080. * mn_active_invalidate_count may remain unbalanced.
  1081. * No threads can be waiting in kvm_swap_active_memslots() as the
  1082. * last reference on KVM has been dropped, but freeing
  1083. * memslots would deadlock without this manual intervention.
  1084. *
  1085. * If the count isn't unbalanced, i.e. KVM did NOT unregister its MMU
  1086. * notifier between a start() and end(), then there shouldn't be any
  1087. * in-progress invalidations.
  1088. */
  1089. WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
  1090. if (kvm->mn_active_invalidate_count)
  1091. kvm->mn_active_invalidate_count = 0;
  1092. else
  1093. WARN_ON(kvm->mmu_invalidate_in_progress);
  1094. kvm_arch_destroy_vm(kvm);
  1095. kvm_destroy_devices(kvm);
  1096. for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
  1097. kvm_free_memslots(kvm, &kvm->__memslots[i][0]);
  1098. kvm_free_memslots(kvm, &kvm->__memslots[i][1]);
  1099. }
  1100. cleanup_srcu_struct(&kvm->irq_srcu);
  1101. srcu_barrier(&kvm->srcu);
  1102. cleanup_srcu_struct(&kvm->srcu);
  1103. #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
  1104. xa_destroy(&kvm->mem_attr_array);
  1105. #endif
  1106. kvm_arch_free_vm(kvm);
  1107. preempt_notifier_dec();
  1108. kvm_disable_virtualization();
  1109. mmdrop(mm);
  1110. }
  1111. void kvm_get_kvm(struct kvm *kvm)
  1112. {
  1113. refcount_inc(&kvm->users_count);
  1114. }
  1115. EXPORT_SYMBOL_GPL(kvm_get_kvm);
  1116. /*
  1117. * Make sure the vm is not during destruction, which is a safe version of
  1118. * kvm_get_kvm(). Return true if kvm referenced successfully, false otherwise.
  1119. */
  1120. bool kvm_get_kvm_safe(struct kvm *kvm)
  1121. {
  1122. return refcount_inc_not_zero(&kvm->users_count);
  1123. }
  1124. EXPORT_SYMBOL_GPL(kvm_get_kvm_safe);
  1125. void kvm_put_kvm(struct kvm *kvm)
  1126. {
  1127. if (refcount_dec_and_test(&kvm->users_count))
  1128. kvm_destroy_vm(kvm);
  1129. }
  1130. EXPORT_SYMBOL_GPL(kvm_put_kvm);
  1131. /*
  1132. * Used to put a reference that was taken on behalf of an object associated
  1133. * with a user-visible file descriptor, e.g. a vcpu or device, if installation
  1134. * of the new file descriptor fails and the reference cannot be transferred to
  1135. * its final owner. In such cases, the caller is still actively using @kvm and
  1136. * will fail miserably if the refcount unexpectedly hits zero.
  1137. */
  1138. void kvm_put_kvm_no_destroy(struct kvm *kvm)
  1139. {
  1140. WARN_ON(refcount_dec_and_test(&kvm->users_count));
  1141. }
  1142. EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_put_kvm_no_destroy);
  1143. static int kvm_vm_release(struct inode *inode, struct file *filp)
  1144. {
  1145. struct kvm *kvm = filp->private_data;
  1146. kvm_irqfd_release(kvm);
  1147. kvm_put_kvm(kvm);
  1148. return 0;
  1149. }
  1150. int kvm_trylock_all_vcpus(struct kvm *kvm)
  1151. {
  1152. struct kvm_vcpu *vcpu;
  1153. unsigned long i, j;
  1154. lockdep_assert_held(&kvm->lock);
  1155. kvm_for_each_vcpu(i, vcpu, kvm)
  1156. if (!mutex_trylock_nest_lock(&vcpu->mutex, &kvm->lock))
  1157. goto out_unlock;
  1158. return 0;
  1159. out_unlock:
  1160. kvm_for_each_vcpu(j, vcpu, kvm) {
  1161. if (i == j)
  1162. break;
  1163. mutex_unlock(&vcpu->mutex);
  1164. }
  1165. return -EINTR;
  1166. }
  1167. EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_trylock_all_vcpus);
  1168. int kvm_lock_all_vcpus(struct kvm *kvm)
  1169. {
  1170. struct kvm_vcpu *vcpu;
  1171. unsigned long i, j;
  1172. int r;
  1173. lockdep_assert_held(&kvm->lock);
  1174. kvm_for_each_vcpu(i, vcpu, kvm) {
  1175. r = mutex_lock_killable_nest_lock(&vcpu->mutex, &kvm->lock);
  1176. if (r)
  1177. goto out_unlock;
  1178. }
  1179. return 0;
  1180. out_unlock:
  1181. kvm_for_each_vcpu(j, vcpu, kvm) {
  1182. if (i == j)
  1183. break;
  1184. mutex_unlock(&vcpu->mutex);
  1185. }
  1186. return r;
  1187. }
  1188. EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lock_all_vcpus);
  1189. void kvm_unlock_all_vcpus(struct kvm *kvm)
  1190. {
  1191. struct kvm_vcpu *vcpu;
  1192. unsigned long i;
  1193. lockdep_assert_held(&kvm->lock);
  1194. kvm_for_each_vcpu(i, vcpu, kvm)
  1195. mutex_unlock(&vcpu->mutex);
  1196. }
  1197. EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_unlock_all_vcpus);
  1198. /*
  1199. * Allocation size is twice as large as the actual dirty bitmap size.
  1200. * See kvm_vm_ioctl_get_dirty_log() why this is needed.
  1201. */
  1202. static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot)
  1203. {
  1204. unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(memslot);
  1205. memslot->dirty_bitmap = __vcalloc(2, dirty_bytes, GFP_KERNEL_ACCOUNT);
  1206. if (!memslot->dirty_bitmap)
  1207. return -ENOMEM;
  1208. return 0;
  1209. }
  1210. static struct kvm_memslots *kvm_get_inactive_memslots(struct kvm *kvm, int as_id)
  1211. {
  1212. struct kvm_memslots *active = __kvm_memslots(kvm, as_id);
  1213. int node_idx_inactive = active->node_idx ^ 1;
  1214. return &kvm->__memslots[as_id][node_idx_inactive];
  1215. }
  1216. /*
  1217. * Helper to get the address space ID when one of memslot pointers may be NULL.
  1218. * This also serves as a sanity that at least one of the pointers is non-NULL,
  1219. * and that their address space IDs don't diverge.
  1220. */
  1221. static int kvm_memslots_get_as_id(struct kvm_memory_slot *a,
  1222. struct kvm_memory_slot *b)
  1223. {
  1224. if (WARN_ON_ONCE(!a && !b))
  1225. return 0;
  1226. if (!a)
  1227. return b->as_id;
  1228. if (!b)
  1229. return a->as_id;
  1230. WARN_ON_ONCE(a->as_id != b->as_id);
  1231. return a->as_id;
  1232. }
  1233. static void kvm_insert_gfn_node(struct kvm_memslots *slots,
  1234. struct kvm_memory_slot *slot)
  1235. {
  1236. struct rb_root *gfn_tree = &slots->gfn_tree;
  1237. struct rb_node **node, *parent;
  1238. int idx = slots->node_idx;
  1239. parent = NULL;
  1240. for (node = &gfn_tree->rb_node; *node; ) {
  1241. struct kvm_memory_slot *tmp;
  1242. tmp = container_of(*node, struct kvm_memory_slot, gfn_node[idx]);
  1243. parent = *node;
  1244. if (slot->base_gfn < tmp->base_gfn)
  1245. node = &(*node)->rb_left;
  1246. else if (slot->base_gfn > tmp->base_gfn)
  1247. node = &(*node)->rb_right;
  1248. else
  1249. BUG();
  1250. }
  1251. rb_link_node(&slot->gfn_node[idx], parent, node);
  1252. rb_insert_color(&slot->gfn_node[idx], gfn_tree);
  1253. }
  1254. static void kvm_erase_gfn_node(struct kvm_memslots *slots,
  1255. struct kvm_memory_slot *slot)
  1256. {
  1257. rb_erase(&slot->gfn_node[slots->node_idx], &slots->gfn_tree);
  1258. }
  1259. static void kvm_replace_gfn_node(struct kvm_memslots *slots,
  1260. struct kvm_memory_slot *old,
  1261. struct kvm_memory_slot *new)
  1262. {
  1263. int idx = slots->node_idx;
  1264. WARN_ON_ONCE(old->base_gfn != new->base_gfn);
  1265. rb_replace_node(&old->gfn_node[idx], &new->gfn_node[idx],
  1266. &slots->gfn_tree);
  1267. }
  1268. /*
  1269. * Replace @old with @new in the inactive memslots.
  1270. *
  1271. * With NULL @old this simply adds @new.
  1272. * With NULL @new this simply removes @old.
  1273. *
  1274. * If @new is non-NULL its hva_node[slots_idx] range has to be set
  1275. * appropriately.
  1276. */
  1277. static void kvm_replace_memslot(struct kvm *kvm,
  1278. struct kvm_memory_slot *old,
  1279. struct kvm_memory_slot *new)
  1280. {
  1281. int as_id = kvm_memslots_get_as_id(old, new);
  1282. struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);
  1283. int idx = slots->node_idx;
  1284. if (old) {
  1285. hash_del(&old->id_node[idx]);
  1286. interval_tree_remove(&old->hva_node[idx], &slots->hva_tree);
  1287. if ((long)old == atomic_long_read(&slots->last_used_slot))
  1288. atomic_long_set(&slots->last_used_slot, (long)new);
  1289. if (!new) {
  1290. kvm_erase_gfn_node(slots, old);
  1291. return;
  1292. }
  1293. }
  1294. /*
  1295. * Initialize @new's hva range. Do this even when replacing an @old
  1296. * slot, kvm_copy_memslot() deliberately does not touch node data.
  1297. */
  1298. new->hva_node[idx].start = new->userspace_addr;
  1299. new->hva_node[idx].last = new->userspace_addr +
  1300. (new->npages << PAGE_SHIFT) - 1;
  1301. /*
  1302. * (Re)Add the new memslot. There is no O(1) interval_tree_replace(),
  1303. * hva_node needs to be swapped with remove+insert even though hva can't
  1304. * change when replacing an existing slot.
  1305. */
  1306. hash_add(slots->id_hash, &new->id_node[idx], new->id);
  1307. interval_tree_insert(&new->hva_node[idx], &slots->hva_tree);
  1308. /*
  1309. * If the memslot gfn is unchanged, rb_replace_node() can be used to
  1310. * switch the node in the gfn tree instead of removing the old and
  1311. * inserting the new as two separate operations. Replacement is a
  1312. * single O(1) operation versus two O(log(n)) operations for
  1313. * remove+insert.
  1314. */
  1315. if (old && old->base_gfn == new->base_gfn) {
  1316. kvm_replace_gfn_node(slots, old, new);
  1317. } else {
  1318. if (old)
  1319. kvm_erase_gfn_node(slots, old);
  1320. kvm_insert_gfn_node(slots, new);
  1321. }
  1322. }
  /*
   * Memslot flags that never touch the extra space of struct
   * kvm_userspace_memory_region2.  These are the only flags that
   * KVM_SET_USER_MEMORY_REGION_V1_FLAGS allows.
   */
  #define KVM_SET_USER_MEMORY_REGION_V1_FLAGS \
      (KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY)
  1330. static int check_memory_region_flags(struct kvm *kvm,
  1331. const struct kvm_userspace_memory_region2 *mem)
  1332. {
  1333. u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
  1334. if (IS_ENABLED(CONFIG_KVM_GUEST_MEMFD))
  1335. valid_flags |= KVM_MEM_GUEST_MEMFD;
  1336. /* Dirty logging private memory is not currently supported. */
  1337. if (mem->flags & KVM_MEM_GUEST_MEMFD)
  1338. valid_flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
  1339. /*
  1340. * GUEST_MEMFD is incompatible with read-only memslots, as writes to
  1341. * read-only memslots have emulated MMIO, not page fault, semantics,
  1342. * and KVM doesn't allow emulated MMIO for private memory.
  1343. */
  1344. if (kvm_arch_has_readonly_mem(kvm) &&
  1345. !(mem->flags & KVM_MEM_GUEST_MEMFD))
  1346. valid_flags |= KVM_MEM_READONLY;
  1347. if (mem->flags & ~valid_flags)
  1348. return -EINVAL;
  1349. return 0;
  1350. }
  1351. static void kvm_swap_active_memslots(struct kvm *kvm, int as_id)
  1352. {
  1353. struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);
  1354. /* Grab the generation from the activate memslots. */
  1355. u64 gen = __kvm_memslots(kvm, as_id)->generation;
  1356. WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
  1357. slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
  1358. /*
  1359. * Do not store the new memslots while there are invalidations in
  1360. * progress, otherwise the locking in invalidate_range_start and
  1361. * invalidate_range_end will be unbalanced.
  1362. */
  1363. spin_lock(&kvm->mn_invalidate_lock);
  1364. prepare_to_rcuwait(&kvm->mn_memslots_update_rcuwait);
  1365. while (kvm->mn_active_invalidate_count) {
  1366. set_current_state(TASK_UNINTERRUPTIBLE);
  1367. spin_unlock(&kvm->mn_invalidate_lock);
  1368. schedule();
  1369. spin_lock(&kvm->mn_invalidate_lock);
  1370. }
  1371. finish_rcuwait(&kvm->mn_memslots_update_rcuwait);
  1372. rcu_assign_pointer(kvm->memslots[as_id], slots);
  1373. spin_unlock(&kvm->mn_invalidate_lock);
  1374. /*
  1375. * Acquired in kvm_set_memslot. Must be released before synchronize
  1376. * SRCU below in order to avoid deadlock with another thread
  1377. * acquiring the slots_arch_lock in an srcu critical section.
  1378. */
  1379. mutex_unlock(&kvm->slots_arch_lock);
  1380. synchronize_srcu_expedited(&kvm->srcu);
  1381. /*
  1382. * Increment the new memslot generation a second time, dropping the
  1383. * update in-progress flag and incrementing the generation based on
  1384. * the number of address spaces. This provides a unique and easily
  1385. * identifiable generation number while the memslots are in flux.
  1386. */
  1387. gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
  1388. /*
  1389. * Generations must be unique even across address spaces. We do not need
  1390. * a global counter for that, instead the generation space is evenly split
  1391. * across address spaces. For example, with two address spaces, address
  1392. * space 0 will use generations 0, 2, 4, ... while address space 1 will
  1393. * use generations 1, 3, 5, ...
  1394. */
  1395. gen += kvm_arch_nr_memslot_as_ids(kvm);
  1396. kvm_arch_memslots_updated(kvm, gen);
  1397. slots->generation = gen;
  1398. }
  1399. static int kvm_prepare_memory_region(struct kvm *kvm,
  1400. const struct kvm_memory_slot *old,
  1401. struct kvm_memory_slot *new,
  1402. enum kvm_mr_change change)
  1403. {
  1404. int r;
  1405. /*
  1406. * If dirty logging is disabled, nullify the bitmap; the old bitmap
  1407. * will be freed on "commit". If logging is enabled in both old and
  1408. * new, reuse the existing bitmap. If logging is enabled only in the
  1409. * new and KVM isn't using a ring buffer, allocate and initialize a
  1410. * new bitmap.
  1411. */
  1412. if (change != KVM_MR_DELETE) {
  1413. if (!(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
  1414. new->dirty_bitmap = NULL;
  1415. else if (old && old->dirty_bitmap)
  1416. new->dirty_bitmap = old->dirty_bitmap;
  1417. else if (kvm_use_dirty_bitmap(kvm)) {
  1418. r = kvm_alloc_dirty_bitmap(new);
  1419. if (r)
  1420. return r;
  1421. if (kvm_dirty_log_manual_protect_and_init_set(kvm))
  1422. bitmap_set(new->dirty_bitmap, 0, new->npages);
  1423. }
  1424. }
  1425. r = kvm_arch_prepare_memory_region(kvm, old, new, change);
  1426. /* Free the bitmap on failure if it was allocated above. */
  1427. if (r && new && new->dirty_bitmap && (!old || !old->dirty_bitmap))
  1428. kvm_destroy_dirty_bitmap(new);
  1429. return r;
  1430. }
  1431. static void kvm_commit_memory_region(struct kvm *kvm,
  1432. struct kvm_memory_slot *old,
  1433. const struct kvm_memory_slot *new,
  1434. enum kvm_mr_change change)
  1435. {
  1436. int old_flags = old ? old->flags : 0;
  1437. int new_flags = new ? new->flags : 0;
  1438. /*
  1439. * Update the total number of memslot pages before calling the arch
  1440. * hook so that architectures can consume the result directly.
  1441. */
  1442. if (change == KVM_MR_DELETE)
  1443. kvm->nr_memslot_pages -= old->npages;
  1444. else if (change == KVM_MR_CREATE)
  1445. kvm->nr_memslot_pages += new->npages;
  1446. if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES) {
  1447. int change = (new_flags & KVM_MEM_LOG_DIRTY_PAGES) ? 1 : -1;
  1448. atomic_set(&kvm->nr_memslots_dirty_logging,
  1449. atomic_read(&kvm->nr_memslots_dirty_logging) + change);
  1450. }
  1451. kvm_arch_commit_memory_region(kvm, old, new, change);
  1452. switch (change) {
  1453. case KVM_MR_CREATE:
  1454. /* Nothing more to do. */
  1455. break;
  1456. case KVM_MR_DELETE:
  1457. /* Free the old memslot and all its metadata. */
  1458. kvm_free_memslot(kvm, old);
  1459. break;
  1460. case KVM_MR_MOVE:
  1461. /*
  1462. * Moving a guest_memfd memslot isn't supported, and will never
  1463. * be supported.
  1464. */
  1465. WARN_ON_ONCE(old->flags & KVM_MEM_GUEST_MEMFD);
  1466. fallthrough;
  1467. case KVM_MR_FLAGS_ONLY:
  1468. /*
  1469. * Free the dirty bitmap as needed; the below check encompasses
  1470. * both the flags and whether a ring buffer is being used)
  1471. */
  1472. if (old->dirty_bitmap && !new->dirty_bitmap)
  1473. kvm_destroy_dirty_bitmap(old);
  1474. /*
  1475. * Unbind the guest_memfd instance as needed; the @new slot has
  1476. * already created its own binding. TODO: Drop the WARN when
  1477. * dirty logging guest_memfd memslots is supported. Until then,
  1478. * flags-only changes on guest_memfd slots should be impossible.
  1479. */
  1480. if (WARN_ON_ONCE(old->flags & KVM_MEM_GUEST_MEMFD))
  1481. kvm_gmem_unbind(old);
  1482. /*
  1483. * The final quirk. Free the detached, old slot, but only its
  1484. * memory, not any metadata. Metadata, including arch specific
  1485. * data, may be reused by @new.
  1486. */
  1487. kfree(old);
  1488. break;
  1489. default:
  1490. BUG();
  1491. }
  1492. }
  /*
   * Activate @new, which the caller must already have installed in the
   * inactive slots: swap the active slots, then propagate @new over @old once
   * @old has become unreachable and can be safely modified.
   *
   * A NULL @old simply adds @new to @active (while swapping the sets).
   * A NULL @new simply removes @old from @active and frees it (while also
   * swapping the sets).
   */
  static void kvm_activate_memslot(struct kvm *kvm,
                                   struct kvm_memory_slot *old,
                                   struct kvm_memory_slot *new)
  {
      kvm_swap_active_memslots(kvm, kvm_memslots_get_as_id(old, new));

      /* Mirror the change into the set that has just become inactive. */
      kvm_replace_memslot(kvm, old, new);
  }
  1511. static void kvm_copy_memslot(struct kvm_memory_slot *dest,
  1512. const struct kvm_memory_slot *src)
  1513. {
  1514. dest->base_gfn = src->base_gfn;
  1515. dest->npages = src->npages;
  1516. dest->dirty_bitmap = src->dirty_bitmap;
  1517. dest->arch = src->arch;
  1518. dest->userspace_addr = src->userspace_addr;
  1519. dest->flags = src->flags;
  1520. dest->id = src->id;
  1521. dest->as_id = src->as_id;
  1522. }
  1523. static void kvm_invalidate_memslot(struct kvm *kvm,
  1524. struct kvm_memory_slot *old,
  1525. struct kvm_memory_slot *invalid_slot)
  1526. {
  1527. /*
  1528. * Mark the current slot INVALID. As with all memslot modifications,
  1529. * this must be done on an unreachable slot to avoid modifying the
  1530. * current slot in the active tree.
  1531. */
  1532. kvm_copy_memslot(invalid_slot, old);
  1533. invalid_slot->flags |= KVM_MEMSLOT_INVALID;
  1534. kvm_replace_memslot(kvm, old, invalid_slot);
  1535. /*
  1536. * Activate the slot that is now marked INVALID, but don't propagate
  1537. * the slot to the now inactive slots. The slot is either going to be
  1538. * deleted or recreated as a new slot.
  1539. */
  1540. kvm_swap_active_memslots(kvm, old->as_id);
  1541. /*
  1542. * From this point no new shadow pages pointing to a deleted, or moved,
  1543. * memslot will be created. Validation of sp->gfn happens in:
  1544. * - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
  1545. * - kvm_is_visible_gfn (mmu_check_root)
  1546. */
  1547. kvm_arch_flush_shadow_memslot(kvm, old);
  1548. kvm_arch_guest_memory_reclaimed(kvm);
  1549. /* Was released by kvm_swap_active_memslots(), reacquire. */
  1550. mutex_lock(&kvm->slots_arch_lock);
  1551. /*
  1552. * Copy the arch-specific field of the newly-installed slot back to the
  1553. * old slot as the arch data could have changed between releasing
  1554. * slots_arch_lock in kvm_swap_active_memslots() and re-acquiring the lock
  1555. * above. Writers are required to retrieve memslots *after* acquiring
  1556. * slots_arch_lock, thus the active slot's data is guaranteed to be fresh.
  1557. */
  1558. old->arch = invalid_slot->arch;
  1559. }
  1560. static void kvm_create_memslot(struct kvm *kvm,
  1561. struct kvm_memory_slot *new)
  1562. {
  1563. /* Add the new memslot to the inactive set and activate. */
  1564. kvm_replace_memslot(kvm, NULL, new);
  1565. kvm_activate_memslot(kvm, NULL, new);
  1566. }
  1567. static void kvm_delete_memslot(struct kvm *kvm,
  1568. struct kvm_memory_slot *old,
  1569. struct kvm_memory_slot *invalid_slot)
  1570. {
  1571. /*
  1572. * Remove the old memslot (in the inactive memslots) by passing NULL as
  1573. * the "new" slot, and for the invalid version in the active slots.
  1574. */
  1575. kvm_replace_memslot(kvm, old, NULL);
  1576. kvm_activate_memslot(kvm, invalid_slot, NULL);
  1577. }
  /* MOVE: substitute @new for @old in both memslot sets. */
  static void kvm_move_memslot(struct kvm *kvm,
                               struct kvm_memory_slot *old,
                               struct kvm_memory_slot *new,
                               struct kvm_memory_slot *invalid_slot)
  {
      /*
       * Replace the old memslot in the inactive slots first, then swap the
       * sets and replace the current INVALID slot with the new one too.
       */
      kvm_replace_memslot(kvm, old, new);
      kvm_activate_memslot(kvm, invalid_slot, new);
  }
  /* FLAGS_ONLY: substitute the updated copy @new for @old in both sets. */
  static void kvm_update_flags_memslot(struct kvm *kvm,
                                       struct kvm_memory_slot *old,
                                       struct kvm_memory_slot *new)
  {
      /*
       * Much like the MOVE case, except that the slot does not have to be
       * zapped as an intermediate step: the old memslot is simply replaced
       * with a new, updated copy in both memslot sets.
       */
      kvm_replace_memslot(kvm, old, new);
      kvm_activate_memslot(kvm, old, new);
  }
  1602. static int kvm_set_memslot(struct kvm *kvm,
  1603. struct kvm_memory_slot *old,
  1604. struct kvm_memory_slot *new,
  1605. enum kvm_mr_change change)
  1606. {
  1607. struct kvm_memory_slot *invalid_slot;
  1608. int r;
  1609. /*
  1610. * Released in kvm_swap_active_memslots().
  1611. *
  1612. * Must be held from before the current memslots are copied until after
  1613. * the new memslots are installed with rcu_assign_pointer, then
  1614. * released before the synchronize srcu in kvm_swap_active_memslots().
  1615. *
  1616. * When modifying memslots outside of the slots_lock, must be held
  1617. * before reading the pointer to the current memslots until after all
  1618. * changes to those memslots are complete.
  1619. *
  1620. * These rules ensure that installing new memslots does not lose
  1621. * changes made to the previous memslots.
  1622. */
  1623. mutex_lock(&kvm->slots_arch_lock);
  1624. /*
  1625. * Invalidate the old slot if it's being deleted or moved. This is
  1626. * done prior to actually deleting/moving the memslot to allow vCPUs to
  1627. * continue running by ensuring there are no mappings or shadow pages
  1628. * for the memslot when it is deleted/moved. Without pre-invalidation
  1629. * (and without a lock), a window would exist between effecting the
  1630. * delete/move and committing the changes in arch code where KVM or a
  1631. * guest could access a non-existent memslot.
  1632. *
  1633. * Modifications are done on a temporary, unreachable slot. The old
  1634. * slot needs to be preserved in case a later step fails and the
  1635. * invalidation needs to be reverted.
  1636. */
  1637. if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
  1638. invalid_slot = kzalloc_obj(*invalid_slot, GFP_KERNEL_ACCOUNT);
  1639. if (!invalid_slot) {
  1640. mutex_unlock(&kvm->slots_arch_lock);
  1641. return -ENOMEM;
  1642. }
  1643. kvm_invalidate_memslot(kvm, old, invalid_slot);
  1644. }
  1645. r = kvm_prepare_memory_region(kvm, old, new, change);
  1646. if (r) {
  1647. /*
  1648. * For DELETE/MOVE, revert the above INVALID change. No
  1649. * modifications required since the original slot was preserved
  1650. * in the inactive slots. Changing the active memslots also
  1651. * release slots_arch_lock.
  1652. */
  1653. if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
  1654. kvm_activate_memslot(kvm, invalid_slot, old);
  1655. kfree(invalid_slot);
  1656. } else {
  1657. mutex_unlock(&kvm->slots_arch_lock);
  1658. }
  1659. return r;
  1660. }
  1661. /*
  1662. * For DELETE and MOVE, the working slot is now active as the INVALID
  1663. * version of the old slot. MOVE is particularly special as it reuses
  1664. * the old slot and returns a copy of the old slot (in working_slot).
  1665. * For CREATE, there is no old slot. For DELETE and FLAGS_ONLY, the
  1666. * old slot is detached but otherwise preserved.
  1667. */
  1668. if (change == KVM_MR_CREATE)
  1669. kvm_create_memslot(kvm, new);
  1670. else if (change == KVM_MR_DELETE)
  1671. kvm_delete_memslot(kvm, old, invalid_slot);
  1672. else if (change == KVM_MR_MOVE)
  1673. kvm_move_memslot(kvm, old, new, invalid_slot);
  1674. else if (change == KVM_MR_FLAGS_ONLY)
  1675. kvm_update_flags_memslot(kvm, old, new);
  1676. else
  1677. BUG();
  1678. /* Free the temporary INVALID slot used for DELETE and MOVE. */
  1679. if (change == KVM_MR_DELETE || change == KVM_MR_MOVE)
  1680. kfree(invalid_slot);
  1681. /*
  1682. * No need to refresh new->arch, changes after dropping slots_arch_lock
  1683. * will directly hit the final, active memslot. Architectures are
  1684. * responsible for knowing that new->arch may be stale.
  1685. */
  1686. kvm_commit_memory_region(kvm, old, new, change);
  1687. return 0;
  1688. }
  1689. static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id,
  1690. gfn_t start, gfn_t end)
  1691. {
  1692. struct kvm_memslot_iter iter;
  1693. kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
  1694. if (iter.slot->id != id)
  1695. return true;
  1696. }
  1697. return false;
  1698. }
  1699. static int kvm_set_memory_region(struct kvm *kvm,
  1700. const struct kvm_userspace_memory_region2 *mem)
  1701. {
  1702. struct kvm_memory_slot *old, *new;
  1703. struct kvm_memslots *slots;
  1704. enum kvm_mr_change change;
  1705. unsigned long npages;
  1706. gfn_t base_gfn;
  1707. int as_id, id;
  1708. int r;
  1709. lockdep_assert_held(&kvm->slots_lock);
  1710. r = check_memory_region_flags(kvm, mem);
  1711. if (r)
  1712. return r;
  1713. as_id = mem->slot >> 16;
  1714. id = (u16)mem->slot;
  1715. /* General sanity checks */
  1716. if ((mem->memory_size & (PAGE_SIZE - 1)) ||
  1717. (mem->memory_size != (unsigned long)mem->memory_size))
  1718. return -EINVAL;
  1719. if (mem->guest_phys_addr & (PAGE_SIZE - 1))
  1720. return -EINVAL;
  1721. /* We can read the guest memory with __xxx_user() later on. */
  1722. if ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
  1723. (mem->userspace_addr != untagged_addr(mem->userspace_addr)) ||
  1724. !access_ok((void __user *)(unsigned long)mem->userspace_addr,
  1725. mem->memory_size))
  1726. return -EINVAL;
  1727. if (mem->flags & KVM_MEM_GUEST_MEMFD &&
  1728. (mem->guest_memfd_offset & (PAGE_SIZE - 1) ||
  1729. mem->guest_memfd_offset + mem->memory_size < mem->guest_memfd_offset))
  1730. return -EINVAL;
  1731. if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_MEM_SLOTS_NUM)
  1732. return -EINVAL;
  1733. if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
  1734. return -EINVAL;
  1735. /*
  1736. * The size of userspace-defined memory regions is restricted in order
  1737. * to play nice with dirty bitmap operations, which are indexed with an
  1738. * "unsigned int". KVM's internal memory regions don't support dirty
  1739. * logging, and so are exempt.
  1740. */
  1741. if (id < KVM_USER_MEM_SLOTS &&
  1742. (mem->memory_size >> PAGE_SHIFT) > KVM_MEM_MAX_NR_PAGES)
  1743. return -EINVAL;
  1744. slots = __kvm_memslots(kvm, as_id);
  1745. /*
  1746. * Note, the old memslot (and the pointer itself!) may be invalidated
  1747. * and/or destroyed by kvm_set_memslot().
  1748. */
  1749. old = id_to_memslot(slots, id);
  1750. if (!mem->memory_size) {
  1751. if (!old || !old->npages)
  1752. return -EINVAL;
  1753. if (WARN_ON_ONCE(kvm->nr_memslot_pages < old->npages))
  1754. return -EIO;
  1755. return kvm_set_memslot(kvm, old, NULL, KVM_MR_DELETE);
  1756. }
  1757. base_gfn = (mem->guest_phys_addr >> PAGE_SHIFT);
  1758. npages = (mem->memory_size >> PAGE_SHIFT);
  1759. if (!old || !old->npages) {
  1760. change = KVM_MR_CREATE;
  1761. /*
  1762. * To simplify KVM internals, the total number of pages across
  1763. * all memslots must fit in an unsigned long.
  1764. */
  1765. if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages)
  1766. return -EINVAL;
  1767. } else { /* Modify an existing slot. */
  1768. /* Private memslots are immutable, they can only be deleted. */
  1769. if (mem->flags & KVM_MEM_GUEST_MEMFD)
  1770. return -EINVAL;
  1771. if ((mem->userspace_addr != old->userspace_addr) ||
  1772. (npages != old->npages) ||
  1773. ((mem->flags ^ old->flags) & (KVM_MEM_READONLY | KVM_MEM_GUEST_MEMFD)))
  1774. return -EINVAL;
  1775. if (base_gfn != old->base_gfn)
  1776. change = KVM_MR_MOVE;
  1777. else if (mem->flags != old->flags)
  1778. change = KVM_MR_FLAGS_ONLY;
  1779. else /* Nothing to change. */
  1780. return 0;
  1781. }
  1782. if ((change == KVM_MR_CREATE || change == KVM_MR_MOVE) &&
  1783. kvm_check_memslot_overlap(slots, id, base_gfn, base_gfn + npages))
  1784. return -EEXIST;
  1785. /* Allocate a slot that will persist in the memslot. */
  1786. new = kzalloc_obj(*new, GFP_KERNEL_ACCOUNT);
  1787. if (!new)
  1788. return -ENOMEM;
  1789. new->as_id = as_id;
  1790. new->id = id;
  1791. new->base_gfn = base_gfn;
  1792. new->npages = npages;
  1793. new->flags = mem->flags;
  1794. new->userspace_addr = mem->userspace_addr;
  1795. if (mem->flags & KVM_MEM_GUEST_MEMFD) {
  1796. r = kvm_gmem_bind(kvm, new, mem->guest_memfd, mem->guest_memfd_offset);
  1797. if (r)
  1798. goto out;
  1799. }
  1800. r = kvm_set_memslot(kvm, old, new, change);
  1801. if (r)
  1802. goto out_unbind;
  1803. return 0;
  1804. out_unbind:
  1805. if (mem->flags & KVM_MEM_GUEST_MEMFD)
  1806. kvm_gmem_unbind(new);
  1807. out:
  1808. kfree(new);
  1809. return r;
  1810. }
  1811. int kvm_set_internal_memslot(struct kvm *kvm,
  1812. const struct kvm_userspace_memory_region2 *mem)
  1813. {
  1814. if (WARN_ON_ONCE(mem->slot < KVM_USER_MEM_SLOTS))
  1815. return -EINVAL;
  1816. if (WARN_ON_ONCE(mem->flags))
  1817. return -EINVAL;
  1818. return kvm_set_memory_region(kvm, mem);
  1819. }
  1820. EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_internal_memslot);
  1821. static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
  1822. struct kvm_userspace_memory_region2 *mem)
  1823. {
  1824. if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
  1825. return -EINVAL;
  1826. guard(mutex)(&kvm->slots_lock);
  1827. return kvm_set_memory_region(kvm, mem);
  1828. }
  1829. #ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
  1830. /**
  1831. * kvm_get_dirty_log - get a snapshot of dirty pages
  1832. * @kvm: pointer to kvm instance
  1833. * @log: slot id and address to which we copy the log
  1834. * @is_dirty: set to '1' if any dirty pages were found
  1835. * @memslot: set to the associated memslot, always valid on success
  1836. */
  1837. int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
  1838. int *is_dirty, struct kvm_memory_slot **memslot)
  1839. {
  1840. struct kvm_memslots *slots;
  1841. int i, as_id, id;
  1842. unsigned long n;
  1843. unsigned long any = 0;
  1844. /* Dirty ring tracking may be exclusive to dirty log tracking */
  1845. if (!kvm_use_dirty_bitmap(kvm))
  1846. return -ENXIO;
  1847. *memslot = NULL;
  1848. *is_dirty = 0;
  1849. as_id = log->slot >> 16;
  1850. id = (u16)log->slot;
  1851. if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
  1852. return -EINVAL;
  1853. slots = __kvm_memslots(kvm, as_id);
  1854. *memslot = id_to_memslot(slots, id);
  1855. if (!(*memslot) || !(*memslot)->dirty_bitmap)
  1856. return -ENOENT;
  1857. kvm_arch_sync_dirty_log(kvm, *memslot);
  1858. n = kvm_dirty_bitmap_bytes(*memslot);
  1859. for (i = 0; !any && i < n/sizeof(long); ++i)
  1860. any = (*memslot)->dirty_bitmap[i];
  1861. if (copy_to_user(log->dirty_bitmap, (*memslot)->dirty_bitmap, n))
  1862. return -EFAULT;
  1863. if (any)
  1864. *is_dirty = 1;
  1865. return 0;
  1866. }
  1867. EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_dirty_log);
  1868. #else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
  1869. /**
  1870. * kvm_get_dirty_log_protect - get a snapshot of dirty pages
  1871. * and reenable dirty page tracking for the corresponding pages.
  1872. * @kvm: pointer to kvm instance
  1873. * @log: slot id and address to which we copy the log
  1874. *
  1875. * We need to keep it in mind that VCPU threads can write to the bitmap
  1876. * concurrently. So, to avoid losing track of dirty pages we keep the
  1877. * following order:
  1878. *
  1879. * 1. Take a snapshot of the bit and clear it if needed.
  1880. * 2. Write protect the corresponding page.
  1881. * 3. Copy the snapshot to the userspace.
  1882. * 4. Upon return caller flushes TLB's if needed.
  1883. *
  1884. * Between 2 and 4, the guest may write to the page using the remaining TLB
  1885. * entry. This is not a problem because the page is reported dirty using
  1886. * the snapshot taken before and step 4 ensures that writes done after
  1887. * exiting to userspace will be logged for the next call.
  1888. *
  1889. */
  1890. static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
  1891. {
  1892. struct kvm_memslots *slots;
  1893. struct kvm_memory_slot *memslot;
  1894. int i, as_id, id;
  1895. unsigned long n;
  1896. unsigned long *dirty_bitmap;
  1897. unsigned long *dirty_bitmap_buffer;
  1898. bool flush;
  1899. /* Dirty ring tracking may be exclusive to dirty log tracking */
  1900. if (!kvm_use_dirty_bitmap(kvm))
  1901. return -ENXIO;
  1902. as_id = log->slot >> 16;
  1903. id = (u16)log->slot;
  1904. if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
  1905. return -EINVAL;
  1906. slots = __kvm_memslots(kvm, as_id);
  1907. memslot = id_to_memslot(slots, id);
  1908. if (!memslot || !memslot->dirty_bitmap)
  1909. return -ENOENT;
  1910. dirty_bitmap = memslot->dirty_bitmap;
  1911. kvm_arch_sync_dirty_log(kvm, memslot);
  1912. n = kvm_dirty_bitmap_bytes(memslot);
  1913. flush = false;
  1914. if (kvm->manual_dirty_log_protect) {
  1915. /*
  1916. * Unlike kvm_get_dirty_log, we always return false in *flush,
  1917. * because no flush is needed until KVM_CLEAR_DIRTY_LOG. There
  1918. * is some code duplication between this function and
  1919. * kvm_get_dirty_log, but hopefully all architecture
  1920. * transition to kvm_get_dirty_log_protect and kvm_get_dirty_log
  1921. * can be eliminated.
  1922. */
  1923. dirty_bitmap_buffer = dirty_bitmap;
  1924. } else {
  1925. dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
  1926. memset(dirty_bitmap_buffer, 0, n);
  1927. KVM_MMU_LOCK(kvm);
  1928. for (i = 0; i < n / sizeof(long); i++) {
  1929. unsigned long mask;
  1930. gfn_t offset;
  1931. if (!dirty_bitmap[i])
  1932. continue;
  1933. flush = true;
  1934. mask = xchg(&dirty_bitmap[i], 0);
  1935. dirty_bitmap_buffer[i] = mask;
  1936. offset = i * BITS_PER_LONG;
  1937. kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
  1938. offset, mask);
  1939. }
  1940. KVM_MMU_UNLOCK(kvm);
  1941. }
  1942. if (flush)
  1943. kvm_flush_remote_tlbs_memslot(kvm, memslot);
  1944. if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
  1945. return -EFAULT;
  1946. return 0;
  1947. }
  1948. /**
  1949. * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
  1950. * @kvm: kvm instance
  1951. * @log: slot id and address to which we copy the log
  1952. *
  1953. * Steps 1-4 below provide general overview of dirty page logging. See
  1954. * kvm_get_dirty_log_protect() function description for additional details.
  1955. *
  1956. * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
  1957. * always flush the TLB (step 4) even if previous step failed and the dirty
  1958. * bitmap may be corrupt. Regardless of previous outcome the KVM logging API
  1959. * does not preclude user space subsequent dirty log read. Flushing TLB ensures
  1960. * writes will be marked dirty for next log read.
  1961. *
  1962. * 1. Take a snapshot of the bit and clear it if needed.
  1963. * 2. Write protect the corresponding page.
  1964. * 3. Copy the snapshot to the userspace.
  1965. * 4. Flush TLB's if needed.
  1966. */
  1967. static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
  1968. struct kvm_dirty_log *log)
  1969. {
  1970. int r;
  1971. mutex_lock(&kvm->slots_lock);
  1972. r = kvm_get_dirty_log_protect(kvm, log);
  1973. mutex_unlock(&kvm->slots_lock);
  1974. return r;
  1975. }
  1976. /**
  1977. * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
  1978. * and reenable dirty page tracking for the corresponding pages.
  1979. * @kvm: pointer to kvm instance
  1980. * @log: slot id and address from which to fetch the bitmap of dirty pages
  1981. */
  1982. static int kvm_clear_dirty_log_protect(struct kvm *kvm,
  1983. struct kvm_clear_dirty_log *log)
  1984. {
  1985. struct kvm_memslots *slots;
  1986. struct kvm_memory_slot *memslot;
  1987. int as_id, id;
  1988. gfn_t offset;
  1989. unsigned long i, n;
  1990. unsigned long *dirty_bitmap;
  1991. unsigned long *dirty_bitmap_buffer;
  1992. bool flush;
  1993. /* Dirty ring tracking may be exclusive to dirty log tracking */
  1994. if (!kvm_use_dirty_bitmap(kvm))
  1995. return -ENXIO;
  1996. as_id = log->slot >> 16;
  1997. id = (u16)log->slot;
  1998. if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
  1999. return -EINVAL;
  2000. if (log->first_page & 63)
  2001. return -EINVAL;
  2002. slots = __kvm_memslots(kvm, as_id);
  2003. memslot = id_to_memslot(slots, id);
  2004. if (!memslot || !memslot->dirty_bitmap)
  2005. return -ENOENT;
  2006. dirty_bitmap = memslot->dirty_bitmap;
  2007. n = ALIGN(log->num_pages, BITS_PER_LONG) / 8;
  2008. if (log->first_page > memslot->npages ||
  2009. log->num_pages > memslot->npages - log->first_page ||
  2010. (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63)))
  2011. return -EINVAL;
  2012. kvm_arch_sync_dirty_log(kvm, memslot);
  2013. flush = false;
  2014. dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
  2015. if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
  2016. return -EFAULT;
  2017. KVM_MMU_LOCK(kvm);
  2018. for (offset = log->first_page, i = offset / BITS_PER_LONG,
  2019. n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--;
  2020. i++, offset += BITS_PER_LONG) {
  2021. unsigned long mask = *dirty_bitmap_buffer++;
  2022. atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];
  2023. if (!mask)
  2024. continue;
  2025. mask &= atomic_long_fetch_andnot(mask, p);
  2026. /*
  2027. * mask contains the bits that really have been cleared. This
  2028. * never includes any bits beyond the length of the memslot (if
  2029. * the length is not aligned to 64 pages), therefore it is not
  2030. * a problem if userspace sets them in log->dirty_bitmap.
  2031. */
  2032. if (mask) {
  2033. flush = true;
  2034. kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
  2035. offset, mask);
  2036. }
  2037. }
  2038. KVM_MMU_UNLOCK(kvm);
  2039. if (flush)
  2040. kvm_flush_remote_tlbs_memslot(kvm, memslot);
  2041. return 0;
  2042. }
  2043. static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
  2044. struct kvm_clear_dirty_log *log)
  2045. {
  2046. int r;
  2047. mutex_lock(&kvm->slots_lock);
  2048. r = kvm_clear_dirty_log_protect(kvm, log);
  2049. mutex_unlock(&kvm->slots_lock);
  2050. return r;
  2051. }
  2052. #endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
  2053. #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
  2054. static u64 kvm_supported_mem_attributes(struct kvm *kvm)
  2055. {
  2056. if (!kvm || kvm_arch_has_private_mem(kvm))
  2057. return KVM_MEMORY_ATTRIBUTE_PRIVATE;
  2058. return 0;
  2059. }
  2060. /*
  2061. * Returns true if _all_ gfns in the range [@start, @end) have attributes
  2062. * such that the bits in @mask match @attrs.
  2063. */
  2064. bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
  2065. unsigned long mask, unsigned long attrs)
  2066. {
  2067. XA_STATE(xas, &kvm->mem_attr_array, start);
  2068. unsigned long index;
  2069. void *entry;
  2070. mask &= kvm_supported_mem_attributes(kvm);
  2071. if (attrs & ~mask)
  2072. return false;
  2073. if (end == start + 1)
  2074. return (kvm_get_memory_attributes(kvm, start) & mask) == attrs;
  2075. guard(rcu)();
  2076. if (!attrs)
  2077. return !xas_find(&xas, end - 1);
  2078. for (index = start; index < end; index++) {
  2079. do {
  2080. entry = xas_next(&xas);
  2081. } while (xas_retry(&xas, entry));
  2082. if (xas.xa_index != index ||
  2083. (xa_to_value(entry) & mask) != attrs)
  2084. return false;
  2085. }
  2086. return true;
  2087. }
  2088. static __always_inline void kvm_handle_gfn_range(struct kvm *kvm,
  2089. struct kvm_mmu_notifier_range *range)
  2090. {
  2091. struct kvm_gfn_range gfn_range;
  2092. struct kvm_memory_slot *slot;
  2093. struct kvm_memslots *slots;
  2094. struct kvm_memslot_iter iter;
  2095. bool found_memslot = false;
  2096. bool ret = false;
  2097. int i;
  2098. gfn_range.arg = range->arg;
  2099. gfn_range.may_block = range->may_block;
  2100. /*
  2101. * If/when KVM supports more attributes beyond private .vs shared, this
  2102. * _could_ set KVM_FILTER_{SHARED,PRIVATE} appropriately if the entire target
  2103. * range already has the desired private vs. shared state (it's unclear
  2104. * if that is a net win). For now, KVM reaches this point if and only
  2105. * if the private flag is being toggled, i.e. all mappings are in play.
  2106. */
  2107. for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
  2108. slots = __kvm_memslots(kvm, i);
  2109. kvm_for_each_memslot_in_gfn_range(&iter, slots, range->start, range->end) {
  2110. slot = iter.slot;
  2111. gfn_range.slot = slot;
  2112. gfn_range.start = max(range->start, slot->base_gfn);
  2113. gfn_range.end = min(range->end, slot->base_gfn + slot->npages);
  2114. if (gfn_range.start >= gfn_range.end)
  2115. continue;
  2116. if (!found_memslot) {
  2117. found_memslot = true;
  2118. KVM_MMU_LOCK(kvm);
  2119. if (!IS_KVM_NULL_FN(range->on_lock))
  2120. range->on_lock(kvm);
  2121. }
  2122. ret |= range->handler(kvm, &gfn_range);
  2123. }
  2124. }
  2125. if (range->flush_on_ret && ret)
  2126. kvm_flush_remote_tlbs(kvm);
  2127. if (found_memslot)
  2128. KVM_MMU_UNLOCK(kvm);
  2129. }
  2130. static bool kvm_pre_set_memory_attributes(struct kvm *kvm,
  2131. struct kvm_gfn_range *range)
  2132. {
  2133. /*
  2134. * Unconditionally add the range to the invalidation set, regardless of
  2135. * whether or not the arch callback actually needs to zap SPTEs. E.g.
  2136. * if KVM supports RWX attributes in the future and the attributes are
  2137. * going from R=>RW, zapping isn't strictly necessary. Unconditionally
  2138. * adding the range allows KVM to require that MMU invalidations add at
  2139. * least one range between begin() and end(), e.g. allows KVM to detect
  2140. * bugs where the add() is missed. Relaxing the rule *might* be safe,
  2141. * but it's not obvious that allowing new mappings while the attributes
  2142. * are in flux is desirable or worth the complexity.
  2143. */
  2144. kvm_mmu_invalidate_range_add(kvm, range->start, range->end);
  2145. return kvm_arch_pre_set_memory_attributes(kvm, range);
  2146. }
  2147. /* Set @attributes for the gfn range [@start, @end). */
  2148. static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
  2149. unsigned long attributes)
  2150. {
  2151. struct kvm_mmu_notifier_range pre_set_range = {
  2152. .start = start,
  2153. .end = end,
  2154. .arg.attributes = attributes,
  2155. .handler = kvm_pre_set_memory_attributes,
  2156. .on_lock = kvm_mmu_invalidate_begin,
  2157. .flush_on_ret = true,
  2158. .may_block = true,
  2159. };
  2160. struct kvm_mmu_notifier_range post_set_range = {
  2161. .start = start,
  2162. .end = end,
  2163. .arg.attributes = attributes,
  2164. .handler = kvm_arch_post_set_memory_attributes,
  2165. .on_lock = kvm_mmu_invalidate_end,
  2166. .may_block = true,
  2167. };
  2168. unsigned long i;
  2169. void *entry;
  2170. int r = 0;
  2171. entry = attributes ? xa_mk_value(attributes) : NULL;
  2172. trace_kvm_vm_set_mem_attributes(start, end, attributes);
  2173. mutex_lock(&kvm->slots_lock);
  2174. /* Nothing to do if the entire range has the desired attributes. */
  2175. if (kvm_range_has_memory_attributes(kvm, start, end, ~0, attributes))
  2176. goto out_unlock;
  2177. /*
  2178. * Reserve memory ahead of time to avoid having to deal with failures
  2179. * partway through setting the new attributes.
  2180. */
  2181. for (i = start; i < end; i++) {
  2182. r = xa_reserve(&kvm->mem_attr_array, i, GFP_KERNEL_ACCOUNT);
  2183. if (r)
  2184. goto out_unlock;
  2185. cond_resched();
  2186. }
  2187. kvm_handle_gfn_range(kvm, &pre_set_range);
  2188. for (i = start; i < end; i++) {
  2189. r = xa_err(xa_store(&kvm->mem_attr_array, i, entry,
  2190. GFP_KERNEL_ACCOUNT));
  2191. KVM_BUG_ON(r, kvm);
  2192. cond_resched();
  2193. }
  2194. kvm_handle_gfn_range(kvm, &post_set_range);
  2195. out_unlock:
  2196. mutex_unlock(&kvm->slots_lock);
  2197. return r;
  2198. }
  2199. static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
  2200. struct kvm_memory_attributes *attrs)
  2201. {
  2202. gfn_t start, end;
  2203. /* flags is currently not used. */
  2204. if (attrs->flags)
  2205. return -EINVAL;
  2206. if (attrs->attributes & ~kvm_supported_mem_attributes(kvm))
  2207. return -EINVAL;
  2208. if (attrs->size == 0 || attrs->address + attrs->size < attrs->address)
  2209. return -EINVAL;
  2210. if (!PAGE_ALIGNED(attrs->address) || !PAGE_ALIGNED(attrs->size))
  2211. return -EINVAL;
  2212. start = attrs->address >> PAGE_SHIFT;
  2213. end = (attrs->address + attrs->size) >> PAGE_SHIFT;
  2214. /*
  2215. * xarray tracks data using "unsigned long", and as a result so does
  2216. * KVM. For simplicity, supports generic attributes only on 64-bit
  2217. * architectures.
  2218. */
  2219. BUILD_BUG_ON(sizeof(attrs->attributes) != sizeof(unsigned long));
  2220. return kvm_vm_set_mem_attributes(kvm, start, end, attrs->attributes);
  2221. }
  2222. #endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
  2223. struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
  2224. {
  2225. return __gfn_to_memslot(kvm_memslots(kvm), gfn);
  2226. }
  2227. EXPORT_SYMBOL_FOR_KVM_INTERNAL(gfn_to_memslot);
  2228. struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
  2229. {
  2230. struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
  2231. u64 gen = slots->generation;
  2232. struct kvm_memory_slot *slot;
  2233. /*
  2234. * This also protects against using a memslot from a different address space,
  2235. * since different address spaces have different generation numbers.
  2236. */
  2237. if (unlikely(gen != vcpu->last_used_slot_gen)) {
  2238. vcpu->last_used_slot = NULL;
  2239. vcpu->last_used_slot_gen = gen;
  2240. }
  2241. slot = try_get_memslot(vcpu->last_used_slot, gfn);
  2242. if (slot)
  2243. return slot;
  2244. /*
  2245. * Fall back to searching all memslots. We purposely use
  2246. * search_memslots() instead of __gfn_to_memslot() to avoid
  2247. * thrashing the VM-wide last_used_slot in kvm_memslots.
  2248. */
  2249. slot = search_memslots(slots, gfn, false);
  2250. if (slot) {
  2251. vcpu->last_used_slot = slot;
  2252. return slot;
  2253. }
  2254. return NULL;
  2255. }
  2256. EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_gfn_to_memslot);
  2257. bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
  2258. {
  2259. struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
  2260. return kvm_is_visible_memslot(memslot);
  2261. }
  2262. EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_is_visible_gfn);
  2263. bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
  2264. {
  2265. struct kvm_memory_slot *memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
  2266. return kvm_is_visible_memslot(memslot);
  2267. }
  2268. EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_is_visible_gfn);
  2269. unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
  2270. {
  2271. struct vm_area_struct *vma;
  2272. unsigned long addr, size;
  2273. size = PAGE_SIZE;
  2274. addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
  2275. if (kvm_is_error_hva(addr))
  2276. return PAGE_SIZE;
  2277. mmap_read_lock(current->mm);
  2278. vma = find_vma(current->mm, addr);
  2279. if (!vma)
  2280. goto out;
  2281. size = vma_kernel_pagesize(vma);
  2282. out:
  2283. mmap_read_unlock(current->mm);
  2284. return size;
  2285. }
  2286. static bool memslot_is_readonly(const struct kvm_memory_slot *slot)
  2287. {
  2288. return slot->flags & KVM_MEM_READONLY;
  2289. }
  2290. static unsigned long __gfn_to_hva_many(const struct kvm_memory_slot *slot, gfn_t gfn,
  2291. gfn_t *nr_pages, bool write)
  2292. {
  2293. if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
  2294. return KVM_HVA_ERR_BAD;
  2295. if (memslot_is_readonly(slot) && write)
  2296. return KVM_HVA_ERR_RO_BAD;
  2297. if (nr_pages)
  2298. *nr_pages = slot->npages - (gfn - slot->base_gfn);
  2299. return __gfn_to_hva_memslot(slot, gfn);
  2300. }
  2301. static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
  2302. gfn_t *nr_pages)
  2303. {
  2304. return __gfn_to_hva_many(slot, gfn, nr_pages, true);
  2305. }
  2306. unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
  2307. gfn_t gfn)
  2308. {
  2309. return gfn_to_hva_many(slot, gfn, NULL);
  2310. }
  2311. EXPORT_SYMBOL_FOR_KVM_INTERNAL(gfn_to_hva_memslot);
  2312. unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
  2313. {
  2314. return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
  2315. }
  2316. EXPORT_SYMBOL_FOR_KVM_INTERNAL(gfn_to_hva);
  2317. unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
  2318. {
  2319. return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL);
  2320. }
  2321. EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_gfn_to_hva);
  2322. /*
  2323. * Return the hva of a @gfn and the R/W attribute if possible.
  2324. *
  2325. * @slot: the kvm_memory_slot which contains @gfn
  2326. * @gfn: the gfn to be translated
  2327. * @writable: used to return the read/write attribute of the @slot if the hva
  2328. * is valid and @writable is not NULL
  2329. */
  2330. unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
  2331. gfn_t gfn, bool *writable)
  2332. {
  2333. unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
  2334. if (!kvm_is_error_hva(hva) && writable)
  2335. *writable = !memslot_is_readonly(slot);
  2336. return hva;
  2337. }
  2338. unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
  2339. {
  2340. struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
  2341. return gfn_to_hva_memslot_prot(slot, gfn, writable);
  2342. }
  2343. unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable)
  2344. {
  2345. struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
  2346. return gfn_to_hva_memslot_prot(slot, gfn, writable);
  2347. }
  2348. static bool kvm_is_ad_tracked_page(struct page *page)
  2349. {
  2350. /*
  2351. * Per page-flags.h, pages tagged PG_reserved "should in general not be
  2352. * touched (e.g. set dirty) except by its owner".
  2353. */
  2354. return !PageReserved(page);
  2355. }
  /* Mark @page dirty, unless it is a page KVM must not touch (PG_reserved). */
  static void kvm_set_page_dirty(struct page *page)
  {
  	if (!kvm_is_ad_tracked_page(page))
  		return;

  	SetPageDirty(page);
  }
  /* Mark @page accessed, unless it is a page KVM must not touch (PG_reserved). */
  static void kvm_set_page_accessed(struct page *page)
  {
  	if (!kvm_is_ad_tracked_page(page))
  		return;

  	mark_page_accessed(page);
  }
  /*
   * Drop a reference to @page after marking it accessed.  A NULL @page is
   * silently ignored.
   */
  void kvm_release_page_clean(struct page *page)
  {
  	if (page) {
  		kvm_set_page_accessed(page);
  		put_page(page);
  	}
  }
  EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_release_page_clean);
  /*
   * Mark @page dirty, then release it exactly like kvm_release_page_clean().
   * A NULL @page is silently ignored.
   */
  void kvm_release_page_dirty(struct page *page)
  {
  	if (page) {
  		kvm_set_page_dirty(page);
  		kvm_release_page_clean(page);
  	}
  }
  EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_release_page_dirty);
  2382. static kvm_pfn_t kvm_resolve_pfn(struct kvm_follow_pfn *kfp, struct page *page,
  2383. struct follow_pfnmap_args *map, bool writable)
  2384. {
  2385. kvm_pfn_t pfn;
  2386. WARN_ON_ONCE(!!page == !!map);
  2387. if (kfp->map_writable)
  2388. *kfp->map_writable = writable;
  2389. if (map)
  2390. pfn = map->pfn;
  2391. else
  2392. pfn = page_to_pfn(page);
  2393. *kfp->refcounted_page = page;
  2394. return pfn;
  2395. }
  2396. /*
  2397. * The fast path to get the writable pfn which will be stored in @pfn,
  2398. * true indicates success, otherwise false is returned.
  2399. */
  2400. static bool hva_to_pfn_fast(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn)
  2401. {
  2402. struct page *page;
  2403. bool r;
  2404. /*
  2405. * Try the fast-only path when the caller wants to pin/get the page for
  2406. * writing. If the caller only wants to read the page, KVM must go
  2407. * down the full, slow path in order to avoid racing an operation that
  2408. * breaks Copy-on-Write (CoW), e.g. so that KVM doesn't end up pointing
  2409. * at the old, read-only page while mm/ points at a new, writable page.
  2410. */
  2411. if (!((kfp->flags & FOLL_WRITE) || kfp->map_writable))
  2412. return false;
  2413. if (kfp->pin)
  2414. r = pin_user_pages_fast(kfp->hva, 1, FOLL_WRITE, &page) == 1;
  2415. else
  2416. r = get_user_page_fast_only(kfp->hva, FOLL_WRITE, &page);
  2417. if (r) {
  2418. *pfn = kvm_resolve_pfn(kfp, page, NULL, true);
  2419. return true;
  2420. }
  2421. return false;
  2422. }
  2423. /*
  2424. * The slow path to get the pfn of the specified host virtual address,
  2425. * 1 indicates success, -errno is returned if error is detected.
  2426. */
  2427. static int hva_to_pfn_slow(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn)
  2428. {
  2429. /*
  2430. * When a VCPU accesses a page that is not mapped into the secondary
  2431. * MMU, we lookup the page using GUP to map it, so the guest VCPU can
  2432. * make progress. We always want to honor NUMA hinting faults in that
  2433. * case, because GUP usage corresponds to memory accesses from the VCPU.
  2434. * Otherwise, we'd not trigger NUMA hinting faults once a page is
  2435. * mapped into the secondary MMU and gets accessed by a VCPU.
  2436. *
  2437. * Note that get_user_page_fast_only() and FOLL_WRITE for now
  2438. * implicitly honor NUMA hinting faults and don't need this flag.
  2439. */
  2440. unsigned int flags = FOLL_HWPOISON | FOLL_HONOR_NUMA_FAULT | kfp->flags;
  2441. struct page *page, *wpage;
  2442. int npages;
  2443. if (kfp->pin)
  2444. npages = pin_user_pages_unlocked(kfp->hva, 1, &page, flags);
  2445. else
  2446. npages = get_user_pages_unlocked(kfp->hva, 1, &page, flags);
  2447. if (npages != 1)
  2448. return npages;
  2449. /*
  2450. * Pinning is mutually exclusive with opportunistically mapping a read
  2451. * fault as writable, as KVM should never pin pages when mapping memory
  2452. * into the guest (pinning is only for direct accesses from KVM).
  2453. */
  2454. if (WARN_ON_ONCE(kfp->map_writable && kfp->pin))
  2455. goto out;
  2456. /* map read fault as writable if possible */
  2457. if (!(flags & FOLL_WRITE) && kfp->map_writable &&
  2458. get_user_page_fast_only(kfp->hva, FOLL_WRITE, &wpage)) {
  2459. put_page(page);
  2460. page = wpage;
  2461. flags |= FOLL_WRITE;
  2462. }
  2463. out:
  2464. *pfn = kvm_resolve_pfn(kfp, page, NULL, flags & FOLL_WRITE);
  2465. return npages;
  2466. }
  2467. static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
  2468. {
  2469. if (unlikely(!(vma->vm_flags & VM_READ)))
  2470. return false;
  2471. if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
  2472. return false;
  2473. return true;
  2474. }
  2475. static int hva_to_pfn_remapped(struct vm_area_struct *vma,
  2476. struct kvm_follow_pfn *kfp, kvm_pfn_t *p_pfn)
  2477. {
  2478. struct follow_pfnmap_args args = { .vma = vma, .address = kfp->hva };
  2479. bool write_fault = kfp->flags & FOLL_WRITE;
  2480. int r;
  2481. /*
  2482. * Remapped memory cannot be pinned in any meaningful sense. Bail if
  2483. * the caller wants to pin the page, i.e. access the page outside of
  2484. * MMU notifier protection, and unsafe umappings are disallowed.
  2485. */
  2486. if (kfp->pin && !allow_unsafe_mappings)
  2487. return -EINVAL;
  2488. r = follow_pfnmap_start(&args);
  2489. if (r) {
  2490. /*
  2491. * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
  2492. * not call the fault handler, so do it here.
  2493. */
  2494. bool unlocked = false;
  2495. r = fixup_user_fault(current->mm, kfp->hva,
  2496. (write_fault ? FAULT_FLAG_WRITE : 0),
  2497. &unlocked);
  2498. if (unlocked)
  2499. return -EAGAIN;
  2500. if (r)
  2501. return r;
  2502. r = follow_pfnmap_start(&args);
  2503. if (r)
  2504. return r;
  2505. }
  2506. if (write_fault && !args.writable) {
  2507. *p_pfn = KVM_PFN_ERR_RO_FAULT;
  2508. goto out;
  2509. }
  2510. *p_pfn = kvm_resolve_pfn(kfp, NULL, &args, args.writable);
  2511. out:
  2512. follow_pfnmap_end(&args);
  2513. return r;
  2514. }
  2515. kvm_pfn_t hva_to_pfn(struct kvm_follow_pfn *kfp)
  2516. {
  2517. struct vm_area_struct *vma;
  2518. kvm_pfn_t pfn;
  2519. int npages, r;
  2520. might_sleep();
  2521. if (WARN_ON_ONCE(!kfp->refcounted_page))
  2522. return KVM_PFN_ERR_FAULT;
  2523. if (hva_to_pfn_fast(kfp, &pfn))
  2524. return pfn;
  2525. npages = hva_to_pfn_slow(kfp, &pfn);
  2526. if (npages == 1)
  2527. return pfn;
  2528. if (npages == -EINTR || npages == -EAGAIN)
  2529. return KVM_PFN_ERR_SIGPENDING;
  2530. if (npages == -EHWPOISON)
  2531. return KVM_PFN_ERR_HWPOISON;
  2532. mmap_read_lock(current->mm);
  2533. retry:
  2534. vma = vma_lookup(current->mm, kfp->hva);
  2535. if (vma == NULL)
  2536. pfn = KVM_PFN_ERR_FAULT;
  2537. else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
  2538. r = hva_to_pfn_remapped(vma, kfp, &pfn);
  2539. if (r == -EAGAIN)
  2540. goto retry;
  2541. if (r < 0)
  2542. pfn = KVM_PFN_ERR_FAULT;
  2543. } else {
  2544. if ((kfp->flags & FOLL_NOWAIT) &&
  2545. vma_is_valid(vma, kfp->flags & FOLL_WRITE))
  2546. pfn = KVM_PFN_ERR_NEEDS_IO;
  2547. else
  2548. pfn = KVM_PFN_ERR_FAULT;
  2549. }
  2550. mmap_read_unlock(current->mm);
  2551. return pfn;
  2552. }
  2553. static kvm_pfn_t kvm_follow_pfn(struct kvm_follow_pfn *kfp)
  2554. {
  2555. kfp->hva = __gfn_to_hva_many(kfp->slot, kfp->gfn, NULL,
  2556. kfp->flags & FOLL_WRITE);
  2557. if (kfp->hva == KVM_HVA_ERR_RO_BAD)
  2558. return KVM_PFN_ERR_RO_FAULT;
  2559. if (kvm_is_error_hva(kfp->hva))
  2560. return KVM_PFN_NOSLOT;
  2561. if (memslot_is_readonly(kfp->slot) && kfp->map_writable) {
  2562. *kfp->map_writable = false;
  2563. kfp->map_writable = NULL;
  2564. }
  2565. return hva_to_pfn(kfp);
  2566. }
  2567. kvm_pfn_t __kvm_faultin_pfn(const struct kvm_memory_slot *slot, gfn_t gfn,
  2568. unsigned int foll, bool *writable,
  2569. struct page **refcounted_page)
  2570. {
  2571. struct kvm_follow_pfn kfp = {
  2572. .slot = slot,
  2573. .gfn = gfn,
  2574. .flags = foll,
  2575. .map_writable = writable,
  2576. .refcounted_page = refcounted_page,
  2577. };
  2578. if (WARN_ON_ONCE(!writable || !refcounted_page))
  2579. return KVM_PFN_ERR_FAULT;
  2580. *writable = false;
  2581. *refcounted_page = NULL;
  2582. return kvm_follow_pfn(&kfp);
  2583. }
  2584. EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_faultin_pfn);
  2585. int kvm_prefetch_pages(struct kvm_memory_slot *slot, gfn_t gfn,
  2586. struct page **pages, int nr_pages)
  2587. {
  2588. unsigned long addr;
  2589. gfn_t entry = 0;
  2590. addr = gfn_to_hva_many(slot, gfn, &entry);
  2591. if (kvm_is_error_hva(addr))
  2592. return -1;
  2593. if (entry < nr_pages)
  2594. return 0;
  2595. return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages);
  2596. }
  2597. EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_prefetch_pages);
  2598. /*
  2599. * Don't use this API unless you are absolutely, positively certain that KVM
  2600. * needs to get a struct page, e.g. to pin the page for firmware DMA.
  2601. *
  2602. * FIXME: Users of this API likely need to FOLL_PIN the page, not just elevate
  2603. * its refcount.
  2604. */
  2605. struct page *__gfn_to_page(struct kvm *kvm, gfn_t gfn, bool write)
  2606. {
  2607. struct page *refcounted_page = NULL;
  2608. struct kvm_follow_pfn kfp = {
  2609. .slot = gfn_to_memslot(kvm, gfn),
  2610. .gfn = gfn,
  2611. .flags = write ? FOLL_WRITE : 0,
  2612. .refcounted_page = &refcounted_page,
  2613. };
  2614. (void)kvm_follow_pfn(&kfp);
  2615. return refcounted_page;
  2616. }
  2617. EXPORT_SYMBOL_FOR_KVM_INTERNAL(__gfn_to_page);
  2618. int __kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map,
  2619. bool writable)
  2620. {
  2621. struct kvm_follow_pfn kfp = {
  2622. .slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn),
  2623. .gfn = gfn,
  2624. .flags = writable ? FOLL_WRITE : 0,
  2625. .refcounted_page = &map->pinned_page,
  2626. .pin = true,
  2627. };
  2628. map->pinned_page = NULL;
  2629. map->page = NULL;
  2630. map->hva = NULL;
  2631. map->gfn = gfn;
  2632. map->writable = writable;
  2633. map->pfn = kvm_follow_pfn(&kfp);
  2634. if (is_error_noslot_pfn(map->pfn))
  2635. return -EINVAL;
  2636. if (pfn_valid(map->pfn)) {
  2637. map->page = pfn_to_page(map->pfn);
  2638. map->hva = kmap(map->page);
  2639. #ifdef CONFIG_HAS_IOMEM
  2640. } else {
  2641. map->hva = memremap(pfn_to_hpa(map->pfn), PAGE_SIZE, MEMREMAP_WB);
  2642. #endif
  2643. }
  2644. return map->hva ? 0 : -EFAULT;
  2645. }
  2646. EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_vcpu_map);
  2647. void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map)
  2648. {
  2649. if (!map->hva)
  2650. return;
  2651. if (map->page)
  2652. kunmap(map->page);
  2653. #ifdef CONFIG_HAS_IOMEM
  2654. else
  2655. memunmap(map->hva);
  2656. #endif
  2657. if (map->writable)
  2658. kvm_vcpu_mark_page_dirty(vcpu, map->gfn);
  2659. if (map->pinned_page) {
  2660. if (map->writable)
  2661. kvm_set_page_dirty(map->pinned_page);
  2662. kvm_set_page_accessed(map->pinned_page);
  2663. unpin_user_page(map->pinned_page);
  2664. }
  2665. map->hva = NULL;
  2666. map->page = NULL;
  2667. map->pinned_page = NULL;
  2668. }
  2669. EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_unmap);
  2670. static int next_segment(unsigned long len, int offset)
  2671. {
  2672. if (len > PAGE_SIZE - offset)
  2673. return PAGE_SIZE - offset;
  2674. else
  2675. return len;
  2676. }
  2677. /* Copy @len bytes from guest memory at '(@gfn * PAGE_SIZE) + @offset' to @data */
  2678. static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
  2679. void *data, int offset, int len)
  2680. {
  2681. int r;
  2682. unsigned long addr;
  2683. if (WARN_ON_ONCE(offset + len > PAGE_SIZE))
  2684. return -EFAULT;
  2685. addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
  2686. if (kvm_is_error_hva(addr))
  2687. return -EFAULT;
  2688. r = __copy_from_user(data, (void __user *)addr + offset, len);
  2689. if (r)
  2690. return -EFAULT;
  2691. return 0;
  2692. }
  2693. int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
  2694. int len)
  2695. {
  2696. struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
  2697. return __kvm_read_guest_page(slot, gfn, data, offset, len);
  2698. }
  2699. EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_read_guest_page);
  2700. int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data,
  2701. int offset, int len)
  2702. {
  2703. struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
  2704. return __kvm_read_guest_page(slot, gfn, data, offset, len);
  2705. }
  2706. EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_read_guest_page);
  2707. int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
  2708. {
  2709. gfn_t gfn = gpa >> PAGE_SHIFT;
  2710. int seg;
  2711. int offset = offset_in_page(gpa);
  2712. int ret;
  2713. while ((seg = next_segment(len, offset)) != 0) {
  2714. ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
  2715. if (ret < 0)
  2716. return ret;
  2717. offset = 0;
  2718. len -= seg;
  2719. data += seg;
  2720. ++gfn;
  2721. }
  2722. return 0;
  2723. }
  2724. EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_read_guest);
  2725. int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len)
  2726. {
  2727. gfn_t gfn = gpa >> PAGE_SHIFT;
  2728. int seg;
  2729. int offset = offset_in_page(gpa);
  2730. int ret;
  2731. while ((seg = next_segment(len, offset)) != 0) {
  2732. ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg);
  2733. if (ret < 0)
  2734. return ret;
  2735. offset = 0;
  2736. len -= seg;
  2737. data += seg;
  2738. ++gfn;
  2739. }
  2740. return 0;
  2741. }
  2742. EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_read_guest);
  2743. static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
  2744. void *data, int offset, unsigned long len)
  2745. {
  2746. int r;
  2747. unsigned long addr;
  2748. if (WARN_ON_ONCE(offset + len > PAGE_SIZE))
  2749. return -EFAULT;
  2750. addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
  2751. if (kvm_is_error_hva(addr))
  2752. return -EFAULT;
  2753. pagefault_disable();
  2754. r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
  2755. pagefault_enable();
  2756. if (r)
  2757. return -EFAULT;
  2758. return 0;
  2759. }
  2760. int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
  2761. void *data, unsigned long len)
  2762. {
  2763. gfn_t gfn = gpa >> PAGE_SHIFT;
  2764. struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
  2765. int offset = offset_in_page(gpa);
  2766. return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
  2767. }
  2768. EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_read_guest_atomic);
  2769. /* Copy @len bytes from @data into guest memory at '(@gfn * PAGE_SIZE) + @offset' */
  2770. static int __kvm_write_guest_page(struct kvm *kvm,
  2771. struct kvm_memory_slot *memslot, gfn_t gfn,
  2772. const void *data, int offset, int len)
  2773. {
  2774. int r;
  2775. unsigned long addr;
  2776. if (WARN_ON_ONCE(offset + len > PAGE_SIZE))
  2777. return -EFAULT;
  2778. addr = gfn_to_hva_memslot(memslot, gfn);
  2779. if (kvm_is_error_hva(addr))
  2780. return -EFAULT;
  2781. r = __copy_to_user((void __user *)addr + offset, data, len);
  2782. if (r)
  2783. return -EFAULT;
  2784. mark_page_dirty_in_slot(kvm, memslot, gfn);
  2785. return 0;
  2786. }
  2787. int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
  2788. const void *data, int offset, int len)
  2789. {
  2790. struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
  2791. return __kvm_write_guest_page(kvm, slot, gfn, data, offset, len);
  2792. }
  2793. EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_write_guest_page);
  2794. int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
  2795. const void *data, int offset, int len)
  2796. {
  2797. struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
  2798. return __kvm_write_guest_page(vcpu->kvm, slot, gfn, data, offset, len);
  2799. }
  2800. EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_write_guest_page);
  2801. int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
  2802. unsigned long len)
  2803. {
  2804. gfn_t gfn = gpa >> PAGE_SHIFT;
  2805. int seg;
  2806. int offset = offset_in_page(gpa);
  2807. int ret;
  2808. while ((seg = next_segment(len, offset)) != 0) {
  2809. ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
  2810. if (ret < 0)
  2811. return ret;
  2812. offset = 0;
  2813. len -= seg;
  2814. data += seg;
  2815. ++gfn;
  2816. }
  2817. return 0;
  2818. }
  2819. EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_write_guest);
  2820. int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
  2821. unsigned long len)
  2822. {
  2823. gfn_t gfn = gpa >> PAGE_SHIFT;
  2824. int seg;
  2825. int offset = offset_in_page(gpa);
  2826. int ret;
  2827. while ((seg = next_segment(len, offset)) != 0) {
  2828. ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg);
  2829. if (ret < 0)
  2830. return ret;
  2831. offset = 0;
  2832. len -= seg;
  2833. data += seg;
  2834. ++gfn;
  2835. }
  2836. return 0;
  2837. }
  2838. EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_write_guest);
  2839. static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
  2840. struct gfn_to_hva_cache *ghc,
  2841. gpa_t gpa, unsigned long len)
  2842. {
  2843. int offset = offset_in_page(gpa);
  2844. gfn_t start_gfn = gpa >> PAGE_SHIFT;
  2845. gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;
  2846. gfn_t nr_pages_needed = end_gfn - start_gfn + 1;
  2847. gfn_t nr_pages_avail;
  2848. /* Update ghc->generation before performing any error checks. */
  2849. ghc->generation = slots->generation;
  2850. if (start_gfn > end_gfn) {
  2851. ghc->hva = KVM_HVA_ERR_BAD;
  2852. return -EINVAL;
  2853. }
  2854. /*
  2855. * If the requested region crosses two memslots, we still
  2856. * verify that the entire region is valid here.
  2857. */
  2858. for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) {
  2859. ghc->memslot = __gfn_to_memslot(slots, start_gfn);
  2860. ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
  2861. &nr_pages_avail);
  2862. if (kvm_is_error_hva(ghc->hva))
  2863. return -EFAULT;
  2864. }
  2865. /* Use the slow path for cross page reads and writes. */
  2866. if (nr_pages_needed == 1)
  2867. ghc->hva += offset;
  2868. else
  2869. ghc->memslot = NULL;
  2870. ghc->gpa = gpa;
  2871. ghc->len = len;
  2872. return 0;
  2873. }
  2874. int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
  2875. gpa_t gpa, unsigned long len)
  2876. {
  2877. struct kvm_memslots *slots = kvm_memslots(kvm);
  2878. return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len);
  2879. }
  2880. EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gfn_to_hva_cache_init);
  2881. int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
  2882. void *data, unsigned int offset,
  2883. unsigned long len)
  2884. {
  2885. struct kvm_memslots *slots = kvm_memslots(kvm);
  2886. int r;
  2887. gpa_t gpa = ghc->gpa + offset;
  2888. if (WARN_ON_ONCE(len + offset > ghc->len))
  2889. return -EINVAL;
  2890. if (slots->generation != ghc->generation) {
  2891. if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
  2892. return -EFAULT;
  2893. }
  2894. if (kvm_is_error_hva(ghc->hva))
  2895. return -EFAULT;
  2896. if (unlikely(!ghc->memslot))
  2897. return kvm_write_guest(kvm, gpa, data, len);
  2898. r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
  2899. if (r)
  2900. return -EFAULT;
  2901. mark_page_dirty_in_slot(kvm, ghc->memslot, gpa >> PAGE_SHIFT);
  2902. return 0;
  2903. }
  2904. EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_write_guest_offset_cached);
  2905. int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
  2906. void *data, unsigned long len)
  2907. {
  2908. return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len);
  2909. }
  2910. EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_write_guest_cached);
  2911. int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
  2912. void *data, unsigned int offset,
  2913. unsigned long len)
  2914. {
  2915. struct kvm_memslots *slots = kvm_memslots(kvm);
  2916. int r;
  2917. gpa_t gpa = ghc->gpa + offset;
  2918. if (WARN_ON_ONCE(len + offset > ghc->len))
  2919. return -EINVAL;
  2920. if (slots->generation != ghc->generation) {
  2921. if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
  2922. return -EFAULT;
  2923. }
  2924. if (kvm_is_error_hva(ghc->hva))
  2925. return -EFAULT;
  2926. if (unlikely(!ghc->memslot))
  2927. return kvm_read_guest(kvm, gpa, data, len);
  2928. r = __copy_from_user(data, (void __user *)ghc->hva + offset, len);
  2929. if (r)
  2930. return -EFAULT;
  2931. return 0;
  2932. }
  2933. EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_read_guest_offset_cached);
  2934. int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
  2935. void *data, unsigned long len)
  2936. {
  2937. return kvm_read_guest_offset_cached(kvm, ghc, data, 0, len);
  2938. }
  2939. EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_read_guest_cached);
  2940. int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
  2941. {
  2942. const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
  2943. gfn_t gfn = gpa >> PAGE_SHIFT;
  2944. int seg;
  2945. int offset = offset_in_page(gpa);
  2946. int ret;
  2947. while ((seg = next_segment(len, offset)) != 0) {
  2948. ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, seg);
  2949. if (ret < 0)
  2950. return ret;
  2951. offset = 0;
  2952. len -= seg;
  2953. ++gfn;
  2954. }
  2955. return 0;
  2956. }
  2957. EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_clear_guest);
  2958. void mark_page_dirty_in_slot(struct kvm *kvm,
  2959. const struct kvm_memory_slot *memslot,
  2960. gfn_t gfn)
  2961. {
  2962. struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
  2963. #ifdef CONFIG_HAVE_KVM_DIRTY_RING
  2964. if (WARN_ON_ONCE(vcpu && vcpu->kvm != kvm))
  2965. return;
  2966. WARN_ON_ONCE(!vcpu && !kvm_arch_allow_write_without_running_vcpu(kvm));
  2967. #endif
  2968. if (memslot && kvm_slot_dirty_track_enabled(memslot)) {
  2969. unsigned long rel_gfn = gfn - memslot->base_gfn;
  2970. u32 slot = (memslot->as_id << 16) | memslot->id;
  2971. if (kvm->dirty_ring_size && vcpu)
  2972. kvm_dirty_ring_push(vcpu, slot, rel_gfn);
  2973. else if (memslot->dirty_bitmap)
  2974. set_bit_le(rel_gfn, memslot->dirty_bitmap);
  2975. }
  2976. }
  2977. EXPORT_SYMBOL_FOR_KVM_INTERNAL(mark_page_dirty_in_slot);
  2978. void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
  2979. {
  2980. struct kvm_memory_slot *memslot;
  2981. memslot = gfn_to_memslot(kvm, gfn);
  2982. mark_page_dirty_in_slot(kvm, memslot, gfn);
  2983. }
  2984. EXPORT_SYMBOL_FOR_KVM_INTERNAL(mark_page_dirty);
  2985. void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
  2986. {
  2987. struct kvm_memory_slot *memslot;
  2988. memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
  2989. mark_page_dirty_in_slot(vcpu->kvm, memslot, gfn);
  2990. }
  2991. EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_mark_page_dirty);
  2992. void kvm_sigset_activate(struct kvm_vcpu *vcpu)
  2993. {
  2994. if (!vcpu->sigset_active)
  2995. return;
  2996. /*
  2997. * This does a lockless modification of ->real_blocked, which is fine
  2998. * because, only current can change ->real_blocked and all readers of
  2999. * ->real_blocked don't care as long ->real_blocked is always a subset
  3000. * of ->blocked.
  3001. */
  3002. sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked);
  3003. }
  3004. void kvm_sigset_deactivate(struct kvm_vcpu *vcpu)
  3005. {
  3006. if (!vcpu->sigset_active)
  3007. return;
  3008. sigprocmask(SIG_SETMASK, &current->real_blocked, NULL);
  3009. sigemptyset(&current->real_blocked);
  3010. }
  3011. static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
  3012. {
  3013. unsigned int old, val, grow, grow_start;
  3014. old = val = vcpu->halt_poll_ns;
  3015. grow_start = READ_ONCE(halt_poll_ns_grow_start);
  3016. grow = READ_ONCE(halt_poll_ns_grow);
  3017. if (!grow)
  3018. goto out;
  3019. val *= grow;
  3020. if (val < grow_start)
  3021. val = grow_start;
  3022. vcpu->halt_poll_ns = val;
  3023. out:
  3024. trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
  3025. }
  3026. static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
  3027. {
  3028. unsigned int old, val, shrink, grow_start;
  3029. old = val = vcpu->halt_poll_ns;
  3030. shrink = READ_ONCE(halt_poll_ns_shrink);
  3031. grow_start = READ_ONCE(halt_poll_ns_grow_start);
  3032. if (shrink == 0)
  3033. val = 0;
  3034. else
  3035. val /= shrink;
  3036. if (val < grow_start)
  3037. val = 0;
  3038. vcpu->halt_poll_ns = val;
  3039. trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
  3040. }
  3041. static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
  3042. {
  3043. int ret = -EINTR;
  3044. int idx = srcu_read_lock(&vcpu->kvm->srcu);
  3045. if (kvm_arch_vcpu_runnable(vcpu))
  3046. goto out;
  3047. if (kvm_cpu_has_pending_timer(vcpu))
  3048. goto out;
  3049. if (signal_pending(current))
  3050. goto out;
  3051. if (kvm_check_request(KVM_REQ_UNBLOCK, vcpu))
  3052. goto out;
  3053. ret = 0;
  3054. out:
  3055. srcu_read_unlock(&vcpu->kvm->srcu, idx);
  3056. return ret;
  3057. }
  3058. /*
  3059. * Block the vCPU until the vCPU is runnable, an event arrives, or a signal is
  3060. * pending. This is mostly used when halting a vCPU, but may also be used
  3061. * directly for other vCPU non-runnable states, e.g. x86's Wait-For-SIPI.
  3062. */
  3063. bool kvm_vcpu_block(struct kvm_vcpu *vcpu)
  3064. {
  3065. struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);
  3066. bool waited = false;
  3067. vcpu->stat.generic.blocking = 1;
  3068. preempt_disable();
  3069. kvm_arch_vcpu_blocking(vcpu);
  3070. prepare_to_rcuwait(wait);
  3071. preempt_enable();
  3072. for (;;) {
  3073. set_current_state(TASK_INTERRUPTIBLE);
  3074. if (kvm_vcpu_check_block(vcpu) < 0)
  3075. break;
  3076. waited = true;
  3077. schedule();
  3078. }
  3079. preempt_disable();
  3080. finish_rcuwait(wait);
  3081. kvm_arch_vcpu_unblocking(vcpu);
  3082. preempt_enable();
  3083. vcpu->stat.generic.blocking = 0;
  3084. return waited;
  3085. }
  3086. static inline void update_halt_poll_stats(struct kvm_vcpu *vcpu, ktime_t start,
  3087. ktime_t end, bool success)
  3088. {
  3089. struct kvm_vcpu_stat_generic *stats = &vcpu->stat.generic;
  3090. u64 poll_ns = ktime_to_ns(ktime_sub(end, start));
  3091. ++vcpu->stat.generic.halt_attempted_poll;
  3092. if (success) {
  3093. ++vcpu->stat.generic.halt_successful_poll;
  3094. if (!vcpu_valid_wakeup(vcpu))
  3095. ++vcpu->stat.generic.halt_poll_invalid;
  3096. stats->halt_poll_success_ns += poll_ns;
  3097. KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_success_hist, poll_ns);
  3098. } else {
  3099. stats->halt_poll_fail_ns += poll_ns;
  3100. KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_fail_hist, poll_ns);
  3101. }
  3102. }
  3103. static unsigned int kvm_vcpu_max_halt_poll_ns(struct kvm_vcpu *vcpu)
  3104. {
  3105. struct kvm *kvm = vcpu->kvm;
  3106. if (kvm->override_halt_poll_ns) {
  3107. /*
  3108. * Ensure kvm->max_halt_poll_ns is not read before
  3109. * kvm->override_halt_poll_ns.
  3110. *
  3111. * Pairs with the smp_wmb() when enabling KVM_CAP_HALT_POLL.
  3112. */
  3113. smp_rmb();
  3114. return READ_ONCE(kvm->max_halt_poll_ns);
  3115. }
  3116. return READ_ONCE(halt_poll_ns);
  3117. }
  3118. /*
  3119. * Emulate a vCPU halt condition, e.g. HLT on x86, WFI on arm, etc... If halt
  3120. * polling is enabled, busy wait for a short time before blocking to avoid the
  3121. * expensive block+unblock sequence if a wake event arrives soon after the vCPU
  3122. * is halted.
  3123. */
  3124. void kvm_vcpu_halt(struct kvm_vcpu *vcpu)
  3125. {
  3126. unsigned int max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu);
  3127. bool halt_poll_allowed = !kvm_arch_no_poll(vcpu);
  3128. ktime_t start, cur, poll_end;
  3129. bool waited = false;
  3130. bool do_halt_poll;
  3131. u64 halt_ns;
  3132. if (vcpu->halt_poll_ns > max_halt_poll_ns)
  3133. vcpu->halt_poll_ns = max_halt_poll_ns;
  3134. do_halt_poll = halt_poll_allowed && vcpu->halt_poll_ns;
  3135. start = cur = poll_end = ktime_get();
  3136. if (do_halt_poll) {
  3137. ktime_t stop = ktime_add_ns(start, vcpu->halt_poll_ns);
  3138. do {
  3139. if (kvm_vcpu_check_block(vcpu) < 0)
  3140. goto out;
  3141. cpu_relax();
  3142. poll_end = cur = ktime_get();
  3143. } while (kvm_vcpu_can_poll(cur, stop));
  3144. }
  3145. waited = kvm_vcpu_block(vcpu);
  3146. cur = ktime_get();
  3147. if (waited) {
  3148. vcpu->stat.generic.halt_wait_ns +=
  3149. ktime_to_ns(cur) - ktime_to_ns(poll_end);
  3150. KVM_STATS_LOG_HIST_UPDATE(vcpu->stat.generic.halt_wait_hist,
  3151. ktime_to_ns(cur) - ktime_to_ns(poll_end));
  3152. }
  3153. out:
  3154. /* The total time the vCPU was "halted", including polling time. */
  3155. halt_ns = ktime_to_ns(cur) - ktime_to_ns(start);
  3156. /*
  3157. * Note, halt-polling is considered successful so long as the vCPU was
  3158. * never actually scheduled out, i.e. even if the wake event arrived
  3159. * after of the halt-polling loop itself, but before the full wait.
  3160. */
  3161. if (do_halt_poll)
  3162. update_halt_poll_stats(vcpu, start, poll_end, !waited);
  3163. if (halt_poll_allowed) {
  3164. /* Recompute the max halt poll time in case it changed. */
  3165. max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu);
  3166. if (!vcpu_valid_wakeup(vcpu)) {
  3167. shrink_halt_poll_ns(vcpu);
  3168. } else if (max_halt_poll_ns) {
  3169. if (halt_ns <= vcpu->halt_poll_ns)
  3170. ;
  3171. /* we had a long block, shrink polling */
  3172. else if (vcpu->halt_poll_ns &&
  3173. halt_ns > max_halt_poll_ns)
  3174. shrink_halt_poll_ns(vcpu);
  3175. /* we had a short halt and our poll time is too small */
  3176. else if (vcpu->halt_poll_ns < max_halt_poll_ns &&
  3177. halt_ns < max_halt_poll_ns)
  3178. grow_halt_poll_ns(vcpu);
  3179. } else {
  3180. vcpu->halt_poll_ns = 0;
  3181. }
  3182. }
  3183. trace_kvm_vcpu_wakeup(halt_ns, waited, vcpu_valid_wakeup(vcpu));
  3184. }
  3185. EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_halt);
  3186. bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
  3187. {
  3188. if (__kvm_vcpu_wake_up(vcpu)) {
  3189. WRITE_ONCE(vcpu->ready, true);
  3190. ++vcpu->stat.generic.halt_wakeup;
  3191. return true;
  3192. }
  3193. return false;
  3194. }
  3195. EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_wake_up);
  3196. #ifndef CONFIG_S390
  3197. /*
  3198. * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
  3199. */
  3200. void __kvm_vcpu_kick(struct kvm_vcpu *vcpu, bool wait)
  3201. {
  3202. int me, cpu;
  3203. if (kvm_vcpu_wake_up(vcpu))
  3204. return;
  3205. me = get_cpu();
  3206. /*
  3207. * The only state change done outside the vcpu mutex is IN_GUEST_MODE
  3208. * to EXITING_GUEST_MODE. Therefore the moderately expensive "should
  3209. * kick" check does not need atomic operations if kvm_vcpu_kick is used
  3210. * within the vCPU thread itself.
  3211. */
  3212. if (vcpu == __this_cpu_read(kvm_running_vcpu)) {
  3213. if (vcpu->mode == IN_GUEST_MODE)
  3214. WRITE_ONCE(vcpu->mode, EXITING_GUEST_MODE);
  3215. goto out;
  3216. }
  3217. /*
  3218. * Note, the vCPU could get migrated to a different pCPU at any point
  3219. * after kvm_arch_vcpu_should_kick(), which could result in sending an
  3220. * IPI to the previous pCPU. But, that's ok because the purpose of the
  3221. * IPI is to force the vCPU to leave IN_GUEST_MODE, and migrating the
  3222. * vCPU also requires it to leave IN_GUEST_MODE.
  3223. */
  3224. if (kvm_arch_vcpu_should_kick(vcpu)) {
  3225. cpu = READ_ONCE(vcpu->cpu);
  3226. if (cpu != me && (unsigned int)cpu < nr_cpu_ids && cpu_online(cpu)) {
  3227. /*
  3228. * Use a reschedule IPI to kick the vCPU if the caller
  3229. * doesn't need to wait for a response, as KVM allows
  3230. * kicking vCPUs while IRQs are disabled, but using the
  3231. * SMP function call framework with IRQs disabled can
  3232. * deadlock due to taking cross-CPU locks.
  3233. */
  3234. if (wait)
  3235. smp_call_function_single(cpu, ack_kick, NULL, wait);
  3236. else
  3237. smp_send_reschedule(cpu);
  3238. }
  3239. }
  3240. out:
  3241. put_cpu();
  3242. }
  3243. EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_vcpu_kick);
  3244. #endif /* !CONFIG_S390 */
  3245. int kvm_vcpu_yield_to(struct kvm_vcpu *target)
  3246. {
  3247. struct task_struct *task = NULL;
  3248. int ret;
  3249. if (!read_trylock(&target->pid_lock))
  3250. return 0;
  3251. if (target->pid)
  3252. task = get_pid_task(target->pid, PIDTYPE_PID);
  3253. read_unlock(&target->pid_lock);
  3254. if (!task)
  3255. return 0;
  3256. ret = yield_to(task, 1);
  3257. put_task_struct(task);
  3258. return ret;
  3259. }
  3260. EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_yield_to);
  3261. /*
  3262. * Helper that checks whether a VCPU is eligible for directed yield.
  3263. * Most eligible candidate to yield is decided by following heuristics:
  3264. *
  3265. * (a) VCPU which has not done pl-exit or cpu relax intercepted recently
  3266. * (preempted lock holder), indicated by @in_spin_loop.
  3267. * Set at the beginning and cleared at the end of interception/PLE handler.
  3268. *
  3269. * (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get
  3270. * chance last time (mostly it has become eligible now since we have probably
  3271. * yielded to lockholder in last iteration. This is done by toggling
  3272. * @dy_eligible each time a VCPU checked for eligibility.)
  3273. *
  3274. * Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding
  3275. * to preempted lock-holder could result in wrong VCPU selection and CPU
  3276. * burning. Giving priority for a potential lock-holder increases lock
  3277. * progress.
  3278. *
  3279. * Since algorithm is based on heuristics, accessing another VCPU data without
  3280. * locking does not harm. It may result in trying to yield to same VCPU, fail
  3281. * and continue with next VCPU and so on.
  3282. */
  3283. static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
  3284. {
  3285. #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
  3286. bool eligible;
  3287. eligible = !vcpu->spin_loop.in_spin_loop ||
  3288. vcpu->spin_loop.dy_eligible;
  3289. if (vcpu->spin_loop.in_spin_loop)
  3290. kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
  3291. return eligible;
  3292. #else
  3293. return true;
  3294. #endif
  3295. }
  3296. /*
  3297. * Unlike kvm_arch_vcpu_runnable, this function is called outside
  3298. * a vcpu_load/vcpu_put pair. However, for most architectures
  3299. * kvm_arch_vcpu_runnable does not require vcpu_load.
  3300. */
  3301. bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
  3302. {
  3303. return kvm_arch_vcpu_runnable(vcpu);
  3304. }
  3305. static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
  3306. {
  3307. if (kvm_arch_dy_runnable(vcpu))
  3308. return true;
  3309. #ifdef CONFIG_KVM_ASYNC_PF
  3310. if (!list_empty_careful(&vcpu->async_pf.done))
  3311. return true;
  3312. #endif
  3313. return false;
  3314. }
  3315. /*
  3316. * By default, simply query the target vCPU's current mode when checking if a
  3317. * vCPU was preempted in kernel mode. All architectures except x86 (or more
  3318. * specifical, except VMX) allow querying whether or not a vCPU is in kernel
  3319. * mode even if the vCPU is NOT loaded, i.e. using kvm_arch_vcpu_in_kernel()
  3320. * directly for cross-vCPU checks is functionally correct and accurate.
  3321. */
  3322. bool __weak kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu)
  3323. {
  3324. return kvm_arch_vcpu_in_kernel(vcpu);
  3325. }
  3326. bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
  3327. {
  3328. return false;
  3329. }
  3330. void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
  3331. {
  3332. int nr_vcpus, start, i, idx, yielded;
  3333. struct kvm *kvm = me->kvm;
  3334. struct kvm_vcpu *vcpu;
  3335. int try = 3;
  3336. nr_vcpus = atomic_read(&kvm->online_vcpus);
  3337. if (nr_vcpus < 2)
  3338. return;
  3339. /* Pairs with the smp_wmb() in kvm_vm_ioctl_create_vcpu(). */
  3340. smp_rmb();
  3341. kvm_vcpu_set_in_spin_loop(me, true);
  3342. /*
  3343. * The current vCPU ("me") is spinning in kernel mode, i.e. is likely
  3344. * waiting for a resource to become available. Attempt to yield to a
  3345. * vCPU that is runnable, but not currently running, e.g. because the
  3346. * vCPU was preempted by a higher priority task. With luck, the vCPU
  3347. * that was preempted is holding a lock or some other resource that the
  3348. * current vCPU is waiting to acquire, and yielding to the other vCPU
  3349. * will allow it to make forward progress and release the lock (or kick
  3350. * the spinning vCPU, etc).
  3351. *
  3352. * Since KVM has no insight into what exactly the guest is doing,
  3353. * approximate a round-robin selection by iterating over all vCPUs,
  3354. * starting at the last boosted vCPU. I.e. if N=kvm->last_boosted_vcpu,
  3355. * iterate over vCPU[N+1]..vCPU[N-1], wrapping as needed.
  3356. *
  3357. * Note, this is inherently racy, e.g. if multiple vCPUs are spinning,
  3358. * they may all try to yield to the same vCPU(s). But as above, this
  3359. * is all best effort due to KVM's lack of visibility into the guest.
  3360. */
  3361. start = READ_ONCE(kvm->last_boosted_vcpu) + 1;
  3362. for (i = 0; i < nr_vcpus; i++) {
  3363. idx = (start + i) % nr_vcpus;
  3364. if (idx == me->vcpu_idx)
  3365. continue;
  3366. vcpu = xa_load(&kvm->vcpu_array, idx);
  3367. if (!READ_ONCE(vcpu->ready))
  3368. continue;
  3369. if (kvm_vcpu_is_blocking(vcpu) && !vcpu_dy_runnable(vcpu))
  3370. continue;
  3371. /*
  3372. * Treat the target vCPU as being in-kernel if it has a pending
  3373. * interrupt, as the vCPU trying to yield may be spinning
  3374. * waiting on IPI delivery, i.e. the target vCPU is in-kernel
  3375. * for the purposes of directed yield.
  3376. */
  3377. if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
  3378. !kvm_arch_dy_has_pending_interrupt(vcpu) &&
  3379. !kvm_arch_vcpu_preempted_in_kernel(vcpu))
  3380. continue;
  3381. if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
  3382. continue;
  3383. yielded = kvm_vcpu_yield_to(vcpu);
  3384. if (yielded > 0) {
  3385. WRITE_ONCE(kvm->last_boosted_vcpu, idx);
  3386. break;
  3387. } else if (yielded < 0 && !--try) {
  3388. break;
  3389. }
  3390. }
  3391. kvm_vcpu_set_in_spin_loop(me, false);
  3392. /* Ensure vcpu is not eligible during next spinloop */
  3393. kvm_vcpu_set_dy_eligible(me, false);
  3394. }
  3395. EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_on_spin);
  3396. static bool kvm_page_in_dirty_ring(struct kvm *kvm, unsigned long pgoff)
  3397. {
  3398. #ifdef CONFIG_HAVE_KVM_DIRTY_RING
  3399. return (pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET) &&
  3400. (pgoff < KVM_DIRTY_LOG_PAGE_OFFSET +
  3401. kvm->dirty_ring_size / PAGE_SIZE);
  3402. #else
  3403. return false;
  3404. #endif
  3405. }
  3406. static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
  3407. {
  3408. struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data;
  3409. struct page *page;
  3410. if (vmf->pgoff == 0)
  3411. page = virt_to_page(vcpu->run);
  3412. #ifdef CONFIG_X86
  3413. else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
  3414. page = virt_to_page(vcpu->arch.pio_data);
  3415. #endif
  3416. #ifdef CONFIG_KVM_MMIO
  3417. else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
  3418. page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
  3419. #endif
  3420. else if (kvm_page_in_dirty_ring(vcpu->kvm, vmf->pgoff))
  3421. page = kvm_dirty_ring_get_page(
  3422. &vcpu->dirty_ring,
  3423. vmf->pgoff - KVM_DIRTY_LOG_PAGE_OFFSET);
  3424. else
  3425. return kvm_arch_vcpu_fault(vcpu, vmf);
  3426. get_page(page);
  3427. vmf->page = page;
  3428. return 0;
  3429. }
/*
 * VMA operations installed by kvm_vcpu_mmap(): pages of a vCPU mapping are
 * populated on demand by kvm_vcpu_fault().
 */
static const struct vm_operations_struct kvm_vcpu_vm_ops = {
	.fault = kvm_vcpu_fault,
};
  3433. static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
  3434. {
  3435. struct kvm_vcpu *vcpu = file->private_data;
  3436. unsigned long pages = vma_pages(vma);
  3437. if ((kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff) ||
  3438. kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff + pages - 1)) &&
  3439. ((vma->vm_flags & VM_EXEC) || !(vma->vm_flags & VM_SHARED)))
  3440. return -EINVAL;
  3441. vma->vm_ops = &kvm_vcpu_vm_ops;
  3442. return 0;
  3443. }
  3444. static int kvm_vcpu_release(struct inode *inode, struct file *filp)
  3445. {
  3446. struct kvm_vcpu *vcpu = filp->private_data;
  3447. kvm_put_kvm(vcpu->kvm);
  3448. return 0;
  3449. }
  3450. static struct file_operations kvm_vcpu_fops = {
  3451. .release = kvm_vcpu_release,
  3452. .unlocked_ioctl = kvm_vcpu_ioctl,
  3453. .mmap = kvm_vcpu_mmap,
  3454. .llseek = noop_llseek,
  3455. KVM_COMPAT(kvm_vcpu_compat_ioctl),
  3456. };
  3457. /*
  3458. * Allocates an inode for the vcpu.
  3459. */
  3460. static int create_vcpu_fd(struct kvm_vcpu *vcpu)
  3461. {
  3462. char name[8 + 1 + ITOA_MAX_LEN + 1];
  3463. snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id);
  3464. return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
  3465. }
  3466. #ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
  3467. static int vcpu_get_pid(void *data, u64 *val)
  3468. {
  3469. struct kvm_vcpu *vcpu = data;
  3470. read_lock(&vcpu->pid_lock);
  3471. *val = pid_nr(vcpu->pid);
  3472. read_unlock(&vcpu->pid_lock);
  3473. return 0;
  3474. }
  3475. DEFINE_SIMPLE_ATTRIBUTE(vcpu_get_pid_fops, vcpu_get_pid, NULL, "%llu\n");
  3476. static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
  3477. {
  3478. struct dentry *debugfs_dentry;
  3479. char dir_name[ITOA_MAX_LEN * 2];
  3480. if (!debugfs_initialized())
  3481. return;
  3482. snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
  3483. debugfs_dentry = debugfs_create_dir(dir_name,
  3484. vcpu->kvm->debugfs_dentry);
  3485. debugfs_create_file("pid", 0444, debugfs_dentry, vcpu,
  3486. &vcpu_get_pid_fops);
  3487. kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry);
  3488. }
  3489. #endif
/*
 * Create one vCPU with ID @id in @kvm and return a new file descriptor for it.
 *
 * Returns the fd produced by create_vcpu_fd() on success.  On failure returns
 * a negative errno: -EINVAL if @id is out of range or the VM already has
 * kvm->max_vcpus vCPUs, -ENOMEM if the vCPU or its run page cannot be
 * allocated, -EEXIST if a vCPU with @id already exists, or the error reported
 * by the arch hooks, dirty ring allocation, xa_insert() or fd creation.
 */
static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
{
	int r;
	struct kvm_vcpu *vcpu;
	struct page *page;

	/*
	 * KVM tracks vCPU IDs as 'int', be kind to userspace and reject
	 * too-large values instead of silently truncating.
	 *
	 * Ensure KVM_MAX_VCPU_IDS isn't pushed above INT_MAX without first
	 * changing the storage type (at the very least, IDs should be tracked
	 * as unsigned ints).
	 */
	BUILD_BUG_ON(KVM_MAX_VCPU_IDS > INT_MAX);
	if (id >= KVM_MAX_VCPU_IDS)
		return -EINVAL;

	mutex_lock(&kvm->lock);
	if (kvm->created_vcpus >= kvm->max_vcpus) {
		mutex_unlock(&kvm->lock);
		return -EINVAL;
	}

	r = kvm_arch_vcpu_precreate(kvm, id);
	if (r) {
		mutex_unlock(&kvm->lock);
		return r;
	}

	/*
	 * Account for the new vCPU while still holding kvm->lock.  The
	 * allocations below run without the lock; on any failure the count is
	 * rolled back at the vcpu_decrement label.
	 */
	kvm->created_vcpus++;
	mutex_unlock(&kvm->lock);

	vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
	if (!vcpu) {
		r = -ENOMEM;
		goto vcpu_decrement;
	}

	/* struct kvm_run is backed by exactly one zeroed page. */
	BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE);
	page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
	if (!page) {
		r = -ENOMEM;
		goto vcpu_free;
	}
	vcpu->run = page_address(page);

	kvm_vcpu_init(vcpu, kvm, id);

	r = kvm_arch_vcpu_create(vcpu);
	if (r)
		goto vcpu_free_run_page;

	/* A dirty ring is only allocated if a ring size was configured. */
	if (kvm->dirty_ring_size) {
		r = kvm_dirty_ring_alloc(kvm, &vcpu->dirty_ring,
					 id, kvm->dirty_ring_size);
		if (r)
			goto arch_vcpu_destroy;
	}

	mutex_lock(&kvm->lock);

	/* The ID uniqueness check and the array insertion share one critical section. */
	if (kvm_get_vcpu_by_id(kvm, id)) {
		r = -EEXIST;
		goto unlock_vcpu_destroy;
	}

	/* The new vCPU takes the next free index, i.e. the current online count. */
	vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
	r = xa_insert(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, GFP_KERNEL_ACCOUNT);
	WARN_ON_ONCE(r == -EBUSY);
	if (r)
		goto unlock_vcpu_destroy;

	/*
	 * Now it's all set up, let userspace reach it.  Grab the vCPU's mutex
	 * so that userspace can't invoke vCPU ioctl()s until the vCPU is fully
	 * visible (per online_vcpus), e.g. so that KVM doesn't get tricked
	 * into a NULL-pointer dereference because KVM thinks the _current_
	 * vCPU doesn't exist.  As a bonus, taking vcpu->mutex ensures lockdep
	 * knows it's taken *inside* kvm->lock.
	 */
	mutex_lock(&vcpu->mutex);
	/* Reference on the VM, dropped by kvm_vcpu_release() when the fd is closed. */
	kvm_get_kvm(kvm);
	r = create_vcpu_fd(vcpu);
	if (r < 0)
		goto kvm_put_xa_erase;

	/*
	 * Pairs with smp_rmb() in kvm_get_vcpu.  Store the vcpu
	 * pointer before kvm->online_vcpu's incremented value.
	 */
	smp_wmb();
	atomic_inc(&kvm->online_vcpus);
	mutex_unlock(&vcpu->mutex);

	mutex_unlock(&kvm->lock);
	kvm_arch_vcpu_postcreate(vcpu);
	kvm_create_vcpu_debugfs(vcpu);
	return r;

	/*
	 * Error unwinding: each label undoes the setup steps that had already
	 * succeeded when the jump was taken, in reverse order of setup.
	 */
kvm_put_xa_erase:
	mutex_unlock(&vcpu->mutex);
	kvm_put_kvm_no_destroy(kvm);
	xa_erase(&kvm->vcpu_array, vcpu->vcpu_idx);
unlock_vcpu_destroy:
	mutex_unlock(&kvm->lock);
	kvm_dirty_ring_free(&vcpu->dirty_ring);
arch_vcpu_destroy:
	kvm_arch_vcpu_destroy(vcpu);
vcpu_free_run_page:
	free_page((unsigned long)vcpu->run);
vcpu_free:
	kmem_cache_free(kvm_vcpu_cache, vcpu);
vcpu_decrement:
	mutex_lock(&kvm->lock);
	kvm->created_vcpus--;
	mutex_unlock(&kvm->lock);
	return r;
}
  3596. static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
  3597. {
  3598. if (sigset) {
  3599. sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
  3600. vcpu->sigset_active = 1;
  3601. vcpu->sigset = *sigset;
  3602. } else
  3603. vcpu->sigset_active = 0;
  3604. return 0;
  3605. }
  3606. static ssize_t kvm_vcpu_stats_read(struct file *file, char __user *user_buffer,
  3607. size_t size, loff_t *offset)
  3608. {
  3609. struct kvm_vcpu *vcpu = file->private_data;
  3610. return kvm_stats_read(vcpu->stats_id, &kvm_vcpu_stats_header,
  3611. &kvm_vcpu_stats_desc[0], &vcpu->stat,
  3612. sizeof(vcpu->stat), user_buffer, size, offset);
  3613. }
  3614. static int kvm_vcpu_stats_release(struct inode *inode, struct file *file)
  3615. {
  3616. struct kvm_vcpu *vcpu = file->private_data;
  3617. kvm_put_kvm(vcpu->kvm);
  3618. return 0;
  3619. }
/*
 * File operations for the read-only vCPU stats file created by
 * kvm_vcpu_ioctl_get_stats_fd().
 */
static const struct file_operations kvm_vcpu_stats_fops = {
	.owner = THIS_MODULE,
	.read = kvm_vcpu_stats_read,
	.release = kvm_vcpu_stats_release,
	.llseek = noop_llseek,
};
  3626. static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
  3627. {
  3628. int fd;
  3629. struct file *file;
  3630. char name[15 + ITOA_MAX_LEN + 1];
  3631. snprintf(name, sizeof(name), "kvm-vcpu-stats:%d", vcpu->vcpu_id);
  3632. fd = get_unused_fd_flags(O_CLOEXEC);
  3633. if (fd < 0)
  3634. return fd;
  3635. file = anon_inode_getfile_fmode(name, &kvm_vcpu_stats_fops, vcpu,
  3636. O_RDONLY, FMODE_PREAD);
  3637. if (IS_ERR(file)) {
  3638. put_unused_fd(fd);
  3639. return PTR_ERR(file);
  3640. }
  3641. kvm_get_kvm(vcpu->kvm);
  3642. fd_install(fd, file);
  3643. return fd;
  3644. }
  3645. #ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY
  3646. static int kvm_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
  3647. struct kvm_pre_fault_memory *range)
  3648. {
  3649. int idx;
  3650. long r;
  3651. u64 full_size;
  3652. if (range->flags)
  3653. return -EINVAL;
  3654. if (!PAGE_ALIGNED(range->gpa) ||
  3655. !PAGE_ALIGNED(range->size) ||
  3656. range->gpa + range->size <= range->gpa)
  3657. return -EINVAL;
  3658. vcpu_load(vcpu);
  3659. idx = srcu_read_lock(&vcpu->kvm->srcu);
  3660. full_size = range->size;
  3661. do {
  3662. if (signal_pending(current)) {
  3663. r = -EINTR;
  3664. break;
  3665. }
  3666. r = kvm_arch_vcpu_pre_fault_memory(vcpu, range);
  3667. if (WARN_ON_ONCE(r == 0 || r == -EIO))
  3668. break;
  3669. if (r < 0)
  3670. break;
  3671. range->size -= r;
  3672. range->gpa += r;
  3673. cond_resched();
  3674. } while (range->size);
  3675. srcu_read_unlock(&vcpu->kvm->srcu, idx);
  3676. vcpu_put(vcpu);
  3677. /* Return success if at least one page was mapped successfully. */
  3678. return full_size == range->size ? r : 0;
  3679. }
  3680. #endif
  3681. static int kvm_wait_for_vcpu_online(struct kvm_vcpu *vcpu)
  3682. {
  3683. struct kvm *kvm = vcpu->kvm;
  3684. /*
  3685. * In practice, this happy path will always be taken, as a well-behaved
  3686. * VMM will never invoke a vCPU ioctl() before KVM_CREATE_VCPU returns.
  3687. */
  3688. if (likely(vcpu->vcpu_idx < atomic_read(&kvm->online_vcpus)))
  3689. return 0;
  3690. /*
  3691. * Acquire and release the vCPU's mutex to wait for vCPU creation to
  3692. * complete (kvm_vm_ioctl_create_vcpu() holds the mutex until the vCPU
  3693. * is fully online).
  3694. */
  3695. if (mutex_lock_killable(&vcpu->mutex))
  3696. return -EINTR;
  3697. mutex_unlock(&vcpu->mutex);
  3698. if (WARN_ON_ONCE(!kvm_get_vcpu(kvm, vcpu->vcpu_idx)))
  3699. return -EIO;
  3700. return 0;
  3701. }
/*
 * ioctl() entry point for a vCPU file descriptor.
 *
 * Callers whose mm differs from the VM's mm, or whose VM is marked dead, get
 * -EIO; ioctl numbers that are not of type KVMIO get -EINVAL.  After waiting
 * for the vCPU to be online, arch code is offered the request without
 * vcpu->mutex held; anything it declines (-ENOIOCTLCMD) is handled below with
 * vcpu->mutex held, with kvm_arch_vcpu_ioctl() as the fallback for commands
 * not listed in the switch.
 *
 * Buffers tracked in @fpu and @kvm_sregs are freed at the common exit, so the
 * cases that allocate them simply break or jump to out.
 */
static long kvm_vcpu_ioctl(struct file *filp,
			   unsigned int ioctl, unsigned long arg)
{
	struct kvm_vcpu *vcpu = filp->private_data;
	void __user *argp = (void __user *)arg;
	int r;
	struct kvm_fpu *fpu = NULL;
	struct kvm_sregs *kvm_sregs = NULL;

	if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
		return -EIO;

	if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
		return -EINVAL;

	/*
	 * Wait for the vCPU to be online before handling the ioctl(), as KVM
	 * assumes the vCPU is reachable via vcpu_array, i.e. may dereference
	 * a NULL pointer if userspace invokes an ioctl() before KVM is ready.
	 */
	r = kvm_wait_for_vcpu_online(vcpu);
	if (r)
		return r;

	/*
	 * Let arch code handle select vCPU ioctls without holding vcpu->mutex,
	 * e.g. to support ioctls that can run asynchronous to vCPU execution.
	 */
	r = kvm_arch_vcpu_unlocked_ioctl(filp, ioctl, arg);
	if (r != -ENOIOCTLCMD)
		return r;

	if (mutex_lock_killable(&vcpu->mutex))
		return -EINTR;
	switch (ioctl) {
	case KVM_RUN: {
		struct pid *oldpid;
		/* KVM_RUN takes no argument; a non-zero one is rejected. */
		r = -EINVAL;
		if (arg)
			goto out;

		/*
		 * Note, vcpu->pid is primarily protected by vcpu->mutex. The
		 * dedicated r/w lock allows other tasks, e.g. other vCPUs, to
		 * read vcpu->pid while this vCPU is in KVM_RUN, e.g. to yield
		 * directly to this vCPU
		 */
		oldpid = vcpu->pid;
		if (unlikely(oldpid != task_pid(current))) {
			/* The thread running this VCPU changed. */
			struct pid *newpid;

			r = kvm_arch_vcpu_run_pid_change(vcpu);
			if (r)
				break;

			/* Publish the new pid under the write lock, then drop the old reference. */
			newpid = get_task_pid(current, PIDTYPE_PID);
			write_lock(&vcpu->pid_lock);
			vcpu->pid = newpid;
			write_unlock(&vcpu->pid_lock);

			put_pid(oldpid);
		}
		/* Sampled once from the shared run page, hence READ_ONCE(). */
		vcpu->wants_to_run = !READ_ONCE(vcpu->run->immediate_exit__unsafe);
		r = kvm_arch_vcpu_ioctl_run(vcpu);
		vcpu->wants_to_run = false;

		/*
		 * FIXME: Remove this hack once all KVM architectures
		 * support the generic TIF bits, i.e. a dedicated TIF_RSEQ.
		 */
		rseq_virt_userspace_exit();

		trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
		break;
	}
	case KVM_GET_REGS: {
		struct kvm_regs *kvm_regs;

		r = -ENOMEM;
		kvm_regs = kzalloc_obj(struct kvm_regs);
		if (!kvm_regs)
			goto out;
		r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
		if (r)
			goto out_free1;
		r = -EFAULT;
		if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
			goto out_free1;
		r = 0;
		/* kvm_regs is local to this case, so it is freed here rather than at out. */
out_free1:
		kfree(kvm_regs);
		break;
	}
	case KVM_SET_REGS: {
		struct kvm_regs *kvm_regs;

		kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
		if (IS_ERR(kvm_regs)) {
			r = PTR_ERR(kvm_regs);
			goto out;
		}
		r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
		kfree(kvm_regs);
		break;
	}
	case KVM_GET_SREGS: {
		kvm_sregs = kzalloc_obj(struct kvm_sregs);
		r = -ENOMEM;
		if (!kvm_sregs)
			goto out;
		r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_SREGS: {
		kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
		if (IS_ERR(kvm_sregs)) {
			r = PTR_ERR(kvm_sregs);
			/* Don't hand an ERR_PTR to the kfree() at the common exit. */
			kvm_sregs = NULL;
			goto out;
		}
		r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
		break;
	}
	case KVM_GET_MP_STATE: {
		struct kvm_mp_state mp_state;

		r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, &mp_state, sizeof(mp_state)))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_MP_STATE: {
		struct kvm_mp_state mp_state;

		r = -EFAULT;
		if (copy_from_user(&mp_state, argp, sizeof(mp_state)))
			goto out;
		r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
		break;
	}
	case KVM_TRANSLATE: {
		struct kvm_translation tr;

		/* In/out: the request is read from and the result written back to userspace. */
		r = -EFAULT;
		if (copy_from_user(&tr, argp, sizeof(tr)))
			goto out;
		r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, &tr, sizeof(tr)))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_GUEST_DEBUG: {
		struct kvm_guest_debug dbg;

		r = -EFAULT;
		if (copy_from_user(&dbg, argp, sizeof(dbg)))
			goto out;
		r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
		break;
	}
	case KVM_SET_SIGNAL_MASK: {
		struct kvm_signal_mask __user *sigmask_arg = argp;
		struct kvm_signal_mask kvm_sigmask;
		sigset_t sigset, *p;

		/* A NULL argument leaves p NULL, which deactivates the mask. */
		p = NULL;
		if (argp) {
			r = -EFAULT;
			if (copy_from_user(&kvm_sigmask, argp,
					   sizeof(kvm_sigmask)))
				goto out;
			r = -EINVAL;
			if (kvm_sigmask.len != sizeof(sigset))
				goto out;
			r = -EFAULT;
			if (copy_from_user(&sigset, sigmask_arg->sigset,
					   sizeof(sigset)))
				goto out;
			p = &sigset;
		}
		r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
		break;
	}
	case KVM_GET_FPU: {
		fpu = kzalloc_obj(struct kvm_fpu);
		r = -ENOMEM;
		if (!fpu)
			goto out;
		r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_FPU: {
		fpu = memdup_user(argp, sizeof(*fpu));
		if (IS_ERR(fpu)) {
			r = PTR_ERR(fpu);
			/* Don't hand an ERR_PTR to the kfree() at the common exit. */
			fpu = NULL;
			goto out;
		}
		r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
		break;
	}
	case KVM_GET_STATS_FD: {
		r = kvm_vcpu_ioctl_get_stats_fd(vcpu);
		break;
	}
#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY
	case KVM_PRE_FAULT_MEMORY: {
		struct kvm_pre_fault_memory range;

		r = -EFAULT;
		if (copy_from_user(&range, argp, sizeof(range)))
			break;
		r = kvm_vcpu_pre_fault_memory(vcpu, &range);
		/* Pass back leftover range. */
		if (copy_to_user(argp, &range, sizeof(range)))
			r = -EFAULT;
		break;
	}
#endif
	default:
		r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
	}
out:
	mutex_unlock(&vcpu->mutex);
	/* kfree(NULL) is a no-op, so these are safe for cases that never allocated. */
	kfree(fpu);
	kfree(kvm_sregs);
	return r;
}
  3932. #ifdef CONFIG_KVM_COMPAT
  3933. static long kvm_vcpu_compat_ioctl(struct file *filp,
  3934. unsigned int ioctl, unsigned long arg)
  3935. {
  3936. struct kvm_vcpu *vcpu = filp->private_data;
  3937. void __user *argp = compat_ptr(arg);
  3938. int r;
  3939. if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
  3940. return -EIO;
  3941. switch (ioctl) {
  3942. case KVM_SET_SIGNAL_MASK: {
  3943. struct kvm_signal_mask __user *sigmask_arg = argp;
  3944. struct kvm_signal_mask kvm_sigmask;
  3945. sigset_t sigset;
  3946. if (argp) {
  3947. r = -EFAULT;
  3948. if (copy_from_user(&kvm_sigmask, argp,
  3949. sizeof(kvm_sigmask)))
  3950. goto out;
  3951. r = -EINVAL;
  3952. if (kvm_sigmask.len != sizeof(compat_sigset_t))
  3953. goto out;
  3954. r = -EFAULT;
  3955. if (get_compat_sigset(&sigset,
  3956. (compat_sigset_t __user *)sigmask_arg->sigset))
  3957. goto out;
  3958. r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
  3959. } else
  3960. r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
  3961. break;
  3962. }
  3963. default:
  3964. r = kvm_vcpu_ioctl(filp, ioctl, arg);
  3965. }
  3966. out:
  3967. return r;
  3968. }
  3969. #endif
  3970. static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma)
  3971. {
  3972. struct kvm_device *dev = filp->private_data;
  3973. if (dev->ops->mmap)
  3974. return dev->ops->mmap(dev, vma);
  3975. return -ENODEV;
  3976. }
  3977. static int kvm_device_ioctl_attr(struct kvm_device *dev,
  3978. int (*accessor)(struct kvm_device *dev,
  3979. struct kvm_device_attr *attr),
  3980. unsigned long arg)
  3981. {
  3982. struct kvm_device_attr attr;
  3983. if (!accessor)
  3984. return -EPERM;
  3985. if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
  3986. return -EFAULT;
  3987. return accessor(dev, &attr);
  3988. }
  3989. static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
  3990. unsigned long arg)
  3991. {
  3992. struct kvm_device *dev = filp->private_data;
  3993. if (dev->kvm->mm != current->mm || dev->kvm->vm_dead)
  3994. return -EIO;
  3995. switch (ioctl) {
  3996. case KVM_SET_DEVICE_ATTR:
  3997. return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
  3998. case KVM_GET_DEVICE_ATTR:
  3999. return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg);
  4000. case KVM_HAS_DEVICE_ATTR:
  4001. return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg);
  4002. default:
  4003. if (dev->ops->ioctl)
  4004. return dev->ops->ioctl(dev, ioctl, arg);
  4005. return -ENOTTY;
  4006. }
  4007. }
  4008. static int kvm_device_release(struct inode *inode, struct file *filp)
  4009. {
  4010. struct kvm_device *dev = filp->private_data;
  4011. struct kvm *kvm = dev->kvm;
  4012. if (dev->ops->release) {
  4013. mutex_lock(&kvm->lock);
  4014. list_del_rcu(&dev->vm_node);
  4015. synchronize_rcu();
  4016. dev->ops->release(dev);
  4017. mutex_unlock(&kvm->lock);
  4018. }
  4019. kvm_put_kvm(kvm);
  4020. return 0;
  4021. }
/*
 * File operations for KVM device file descriptors created by
 * kvm_ioctl_create_device().  The same handler, kvm_device_ioctl(), is passed
 * to KVM_COMPAT() for the compat ioctl hook.  kvm_device_from_filp()
 * identifies device files by comparing f_op against this table.
 */
static struct file_operations kvm_device_fops = {
	.unlocked_ioctl = kvm_device_ioctl,
	.release = kvm_device_release,
	KVM_COMPAT(kvm_device_ioctl),
	.mmap = kvm_device_mmap,
};
  4028. struct kvm_device *kvm_device_from_filp(struct file *filp)
  4029. {
  4030. if (filp->f_op != &kvm_device_fops)
  4031. return NULL;
  4032. return filp->private_data;
  4033. }
/*
 * Device ops, indexed by device type.  Looked up by
 * kvm_ioctl_create_device(); entries are filled in by
 * kvm_register_device_ops() and cleared by kvm_unregister_device_ops().  The
 * two MPIC entries are populated statically when CONFIG_KVM_MPIC is set.
 */
static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
#ifdef CONFIG_KVM_MPIC
	[KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops,
	[KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops,
#endif
};
  4040. int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type)
  4041. {
  4042. if (type >= ARRAY_SIZE(kvm_device_ops_table))
  4043. return -ENOSPC;
  4044. if (kvm_device_ops_table[type] != NULL)
  4045. return -EEXIST;
  4046. kvm_device_ops_table[type] = ops;
  4047. return 0;
  4048. }
  4049. void kvm_unregister_device_ops(u32 type)
  4050. {
  4051. if (kvm_device_ops_table[type] != NULL)
  4052. kvm_device_ops_table[type] = NULL;
  4053. }
/*
 * Create a device of type cd->type in @kvm and return its fd in cd->fd.
 *
 * With KVM_CREATE_DEVICE_TEST set in cd->flags, only checks that the type is
 * registered and returns 0 without creating anything.
 *
 * Returns 0 on success, -ENODEV if the type is out of range or has no ops
 * registered, -ENOMEM if the device cannot be allocated, or the error from
 * the type's create op or from anon_inode_getfd().
 */
static int kvm_ioctl_create_device(struct kvm *kvm,
				   struct kvm_create_device *cd)
{
	const struct kvm_device_ops *ops;
	struct kvm_device *dev;
	bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
	int type;
	int ret;

	if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
		return -ENODEV;

	/* Clamp the index so it cannot be used out of bounds under speculation. */
	type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table));
	ops = kvm_device_ops_table[type];
	if (ops == NULL)
		return -ENODEV;

	if (test)
		return 0;

	dev = kzalloc_obj(*dev, GFP_KERNEL_ACCOUNT);
	if (!dev)
		return -ENOMEM;

	dev->ops = ops;
	dev->kvm = kvm;

	/* The create op and the insertion into kvm->devices run under kvm->lock. */
	mutex_lock(&kvm->lock);
	ret = ops->create(dev, type);
	if (ret < 0) {
		mutex_unlock(&kvm->lock);
		kfree(dev);
		return ret;
	}
	list_add_rcu(&dev->vm_node, &kvm->devices);
	mutex_unlock(&kvm->lock);

	/* The optional init op is called after kvm->lock has been dropped. */
	if (ops->init)
		ops->init(dev);

	/* Reference on the VM held on behalf of the fd; kvm_device_release() drops it. */
	kvm_get_kvm(kvm);
	ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
	if (ret < 0) {
		/* No fd was created: drop the reference and unlink the device again. */
		kvm_put_kvm_no_destroy(kvm);
		mutex_lock(&kvm->lock);
		list_del_rcu(&dev->vm_node);
		synchronize_rcu();
		if (ops->release)
			ops->release(dev);
		mutex_unlock(&kvm->lock);
		/*
		 * NOTE(review): dev is not kfree()d on this path; presumably the
		 * release/destroy ops free it -- confirm against the ops contract.
		 */
		if (ops->destroy)
			ops->destroy(dev);
		return ret;
	}

	cd->fd = ret;
	return 0;
}
/*
 * Answer a capability query for the capabilities handled in generic code.
 *
 * Returns the capability's value: 1 for plain "supported" capabilities, a
 * capability-specific number otherwise (e.g. a page offset, a limit, or a
 * bitmask), and 0 where the relevant config option is disabled.  Anything not
 * listed here is passed on to kvm_vm_ioctl_check_extension().
 *
 * The KVM_CAP_MULTI_ADDRESS_SPACE case checks @kvm for NULL, so callers may
 * apparently pass a NULL VM -- confirm against the callers.
 */
static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
{
	switch (arg) {
	case KVM_CAP_SYNC_MMU:
	case KVM_CAP_USER_MEMORY:
	case KVM_CAP_USER_MEMORY2:
	case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
	case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
	case KVM_CAP_INTERNAL_ERROR_DATA:
#ifdef CONFIG_HAVE_KVM_MSI
	case KVM_CAP_SIGNAL_MSI:
#endif
#ifdef CONFIG_HAVE_KVM_IRQCHIP
	case KVM_CAP_IRQFD:
#endif
	case KVM_CAP_IOEVENTFD_ANY_LENGTH:
	case KVM_CAP_CHECK_EXTENSION_VM:
	case KVM_CAP_ENABLE_CAP_VM:
	case KVM_CAP_HALT_POLL:
		return 1;
#ifdef CONFIG_KVM_MMIO
	case KVM_CAP_COALESCED_MMIO:
		/* The value is the page offset of the coalesced MMIO ring. */
		return KVM_COALESCED_MMIO_PAGE_OFFSET;
	case KVM_CAP_COALESCED_PIO:
		return 1;
#endif
#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
	case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
		return KVM_DIRTY_LOG_MANUAL_CAPS;
#endif
#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
	case KVM_CAP_IRQ_ROUTING:
		return KVM_MAX_IRQ_ROUTES;
#endif
#if KVM_MAX_NR_ADDRESS_SPACES > 1
	case KVM_CAP_MULTI_ADDRESS_SPACE:
		/* Per-VM count when a VM is given, otherwise the compile-time maximum. */
		if (kvm)
			return kvm_arch_nr_memslot_as_ids(kvm);
		return KVM_MAX_NR_ADDRESS_SPACES;
#endif
	case KVM_CAP_NR_MEMSLOTS:
		return KVM_USER_MEM_SLOTS;
	case KVM_CAP_DIRTY_LOG_RING:
		/* The value is the maximum ring size in bytes, or 0 if unsupported. */
#ifdef CONFIG_HAVE_KVM_DIRTY_RING_TSO
		return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
#else
		return 0;
#endif
	case KVM_CAP_DIRTY_LOG_RING_ACQ_REL:
#ifdef CONFIG_HAVE_KVM_DIRTY_RING_ACQ_REL
		return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
#else
		return 0;
#endif
#ifdef CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP
	case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP:
#endif
	case KVM_CAP_BINARY_STATS_FD:
	case KVM_CAP_SYSTEM_EVENT_DATA:
	case KVM_CAP_DEVICE_CTRL:
		return 1;
#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
	case KVM_CAP_MEMORY_ATTRIBUTES:
		return kvm_supported_mem_attributes(kvm);
#endif
#ifdef CONFIG_KVM_GUEST_MEMFD
	case KVM_CAP_GUEST_MEMFD:
		return 1;
	case KVM_CAP_GUEST_MEMFD_FLAGS:
		return kvm_gmem_get_supported_flags(kvm);
#endif
	default:
		break;
	}
	/* Not a generic capability: let arch code answer. */
	return kvm_vm_ioctl_check_extension(kvm, arg);
}
  4179. static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size)
  4180. {
  4181. int r;
  4182. if (!KVM_DIRTY_LOG_PAGE_OFFSET)
  4183. return -EINVAL;
  4184. /* the size should be power of 2 */
  4185. if (!size || (size & (size - 1)))
  4186. return -EINVAL;
  4187. /* Should be bigger to keep the reserved entries, or a page */
  4188. if (size < kvm_dirty_ring_get_rsvd_entries(kvm) *
  4189. sizeof(struct kvm_dirty_gfn) || size < PAGE_SIZE)
  4190. return -EINVAL;
  4191. if (size > KVM_DIRTY_RING_MAX_ENTRIES *
  4192. sizeof(struct kvm_dirty_gfn))
  4193. return -E2BIG;
  4194. /* We only allow it to set once */
  4195. if (kvm->dirty_ring_size)
  4196. return -EINVAL;
  4197. mutex_lock(&kvm->lock);
  4198. if (kvm->created_vcpus) {
  4199. /* We don't allow to change this value after vcpu created */
  4200. r = -EINVAL;
  4201. } else {
  4202. kvm->dirty_ring_size = size;
  4203. r = 0;
  4204. }
  4205. mutex_unlock(&kvm->lock);
  4206. return r;
  4207. }
  4208. static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm)
  4209. {
  4210. unsigned long i;
  4211. struct kvm_vcpu *vcpu;
  4212. int cleared = 0, r;
  4213. if (!kvm->dirty_ring_size)
  4214. return -EINVAL;
  4215. mutex_lock(&kvm->slots_lock);
  4216. kvm_for_each_vcpu(i, vcpu, kvm) {
  4217. r = kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring, &cleared);
  4218. if (r)
  4219. break;
  4220. }
  4221. mutex_unlock(&kvm->slots_lock);
  4222. if (cleared)
  4223. kvm_flush_remote_tlbs(kvm);
  4224. return cleared;
  4225. }
/*
 * Weak default for enabling a VM capability that generic code does not
 * handle (see the default case of kvm_vm_ioctl_enable_cap_generic()): every
 * capability is rejected with -EINVAL.  Being a weak symbol, it is replaced
 * by any non-weak definition of the same function.
 */
int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
						  struct kvm_enable_cap *cap)
{
	return -EINVAL;
}
  4231. bool kvm_are_all_memslots_empty(struct kvm *kvm)
  4232. {
  4233. int i;
  4234. lockdep_assert_held(&kvm->slots_lock);
  4235. for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
  4236. if (!kvm_memslots_empty(__kvm_memslots(kvm, i)))
  4237. return false;
  4238. }
  4239. return true;
  4240. }
  4241. EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_are_all_memslots_empty);
/*
 * Enable a VM capability that is implemented in generic code; capabilities
 * not listed here are forwarded to kvm_vm_ioctl_enable_cap().
 *
 * Returns 0 on success or a negative errno (-EINVAL for unsupported flags or
 * arguments, or whatever the per-capability helper returns).
 */
static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
					   struct kvm_enable_cap *cap)
{
	switch (cap->cap) {
#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
	case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: {
		u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE;

		/* The remaining option bits are accepted only together with the ENABLE bit. */
		if (cap->args[0] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE)
			allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS;

		if (cap->flags || (cap->args[0] & ~allowed_options))
			return -EINVAL;
		kvm->manual_dirty_log_protect = cap->args[0];
		return 0;
	}
#endif
	case KVM_CAP_HALT_POLL: {
		/* The value must fit in an unsigned int. */
		if (cap->flags || cap->args[0] != (unsigned int)cap->args[0])
			return -EINVAL;

		kvm->max_halt_poll_ns = cap->args[0];

		/*
		 * Ensure kvm->override_halt_poll_ns does not become visible
		 * before kvm->max_halt_poll_ns.
		 *
		 * Pairs with the smp_rmb() in kvm_vcpu_max_halt_poll_ns().
		 */
		smp_wmb();
		kvm->override_halt_poll_ns = true;

		return 0;
	}
	case KVM_CAP_DIRTY_LOG_RING:
	case KVM_CAP_DIRTY_LOG_RING_ACQ_REL:
		/* A zero capability value means this ring flavor isn't supported. */
		if (!kvm_vm_ioctl_check_extension_generic(kvm, cap->cap))
			return -EINVAL;

		return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]);
	case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP: {
		int r = -EINVAL;

		/* Requires config support, an already-enabled dirty ring, and no flags. */
		if (!IS_ENABLED(CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP) ||
		    !kvm->dirty_ring_size || cap->flags)
			return r;

		mutex_lock(&kvm->slots_lock);

		/*
		 * For simplicity, allow enabling ring+bitmap if and only if
		 * there are no memslots, e.g. to ensure all memslots allocate
		 * a bitmap after the capability is enabled.
		 */
		if (kvm_are_all_memslots_empty(kvm)) {
			kvm->dirty_ring_with_bitmap = true;
			r = 0;
		}

		mutex_unlock(&kvm->slots_lock);

		return r;
	}
	default:
		return kvm_vm_ioctl_enable_cap(kvm, cap);
	}
}
  4298. static ssize_t kvm_vm_stats_read(struct file *file, char __user *user_buffer,
  4299. size_t size, loff_t *offset)
  4300. {
  4301. struct kvm *kvm = file->private_data;
  4302. return kvm_stats_read(kvm->stats_id, &kvm_vm_stats_header,
  4303. &kvm_vm_stats_desc[0], &kvm->stat,
  4304. sizeof(kvm->stat), user_buffer, size, offset);
  4305. }
  4306. static int kvm_vm_stats_release(struct inode *inode, struct file *file)
  4307. {
  4308. struct kvm *kvm = file->private_data;
  4309. kvm_put_kvm(kvm);
  4310. return 0;
  4311. }
/*
 * File operations for the read-only VM stats file created by
 * kvm_vm_ioctl_get_stats_fd().
 */
static const struct file_operations kvm_vm_stats_fops = {
	.owner = THIS_MODULE,
	.read = kvm_vm_stats_read,
	.release = kvm_vm_stats_release,
	.llseek = noop_llseek,
};
  4318. static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm)
  4319. {
  4320. int fd;
  4321. struct file *file;
  4322. fd = get_unused_fd_flags(O_CLOEXEC);
  4323. if (fd < 0)
  4324. return fd;
  4325. file = anon_inode_getfile_fmode("kvm-vm-stats",
  4326. &kvm_vm_stats_fops, kvm, O_RDONLY, FMODE_PREAD);
  4327. if (IS_ERR(file)) {
  4328. put_unused_fd(fd);
  4329. return PTR_ERR(file);
  4330. }
  4331. kvm_get_kvm(kvm);
  4332. fd_install(fd, file);
  4333. return fd;
  4334. }
/*
 * Build-time check that @field has the same offset and the same size in
 * struct kvm_userspace_memory_region and struct kvm_userspace_memory_region2,
 * so the older struct can be copied into the start of the newer one.
 */
#define SANITY_CHECK_MEM_REGION_FIELD(field)					\
do {										\
	BUILD_BUG_ON(offsetof(struct kvm_userspace_memory_region, field) !=		\
		     offsetof(struct kvm_userspace_memory_region2, field));	\
	BUILD_BUG_ON(sizeof_field(struct kvm_userspace_memory_region, field) !=		\
		     sizeof_field(struct kvm_userspace_memory_region2, field));	\
} while (0)
  4342. static long kvm_vm_ioctl(struct file *filp,
  4343. unsigned int ioctl, unsigned long arg)
  4344. {
  4345. struct kvm *kvm = filp->private_data;
  4346. void __user *argp = (void __user *)arg;
  4347. int r;
  4348. if (kvm->mm != current->mm || kvm->vm_dead)
  4349. return -EIO;
  4350. switch (ioctl) {
  4351. case KVM_CREATE_VCPU:
  4352. r = kvm_vm_ioctl_create_vcpu(kvm, arg);
  4353. break;
  4354. case KVM_ENABLE_CAP: {
  4355. struct kvm_enable_cap cap;
  4356. r = -EFAULT;
  4357. if (copy_from_user(&cap, argp, sizeof(cap)))
  4358. goto out;
  4359. r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
  4360. break;
  4361. }
  4362. case KVM_SET_USER_MEMORY_REGION2:
  4363. case KVM_SET_USER_MEMORY_REGION: {
  4364. struct kvm_userspace_memory_region2 mem;
  4365. unsigned long size;
  4366. if (ioctl == KVM_SET_USER_MEMORY_REGION) {
  4367. /*
  4368. * Fields beyond struct kvm_userspace_memory_region shouldn't be
  4369. * accessed, but avoid leaking kernel memory in case of a bug.
  4370. */
  4371. memset(&mem, 0, sizeof(mem));
  4372. size = sizeof(struct kvm_userspace_memory_region);
  4373. } else {
  4374. size = sizeof(struct kvm_userspace_memory_region2);
  4375. }
  4376. /* Ensure the common parts of the two structs are identical. */
  4377. SANITY_CHECK_MEM_REGION_FIELD(slot);
  4378. SANITY_CHECK_MEM_REGION_FIELD(flags);
  4379. SANITY_CHECK_MEM_REGION_FIELD(guest_phys_addr);
  4380. SANITY_CHECK_MEM_REGION_FIELD(memory_size);
  4381. SANITY_CHECK_MEM_REGION_FIELD(userspace_addr);
  4382. r = -EFAULT;
  4383. if (copy_from_user(&mem, argp, size))
  4384. goto out;
  4385. r = -EINVAL;
  4386. if (ioctl == KVM_SET_USER_MEMORY_REGION &&
  4387. (mem.flags & ~KVM_SET_USER_MEMORY_REGION_V1_FLAGS))
  4388. goto out;
  4389. r = kvm_vm_ioctl_set_memory_region(kvm, &mem);
  4390. break;
  4391. }
  4392. case KVM_GET_DIRTY_LOG: {
  4393. struct kvm_dirty_log log;
  4394. r = -EFAULT;
  4395. if (copy_from_user(&log, argp, sizeof(log)))
  4396. goto out;
  4397. r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
  4398. break;
  4399. }
  4400. #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
  4401. case KVM_CLEAR_DIRTY_LOG: {
  4402. struct kvm_clear_dirty_log log;
  4403. r = -EFAULT;
  4404. if (copy_from_user(&log, argp, sizeof(log)))
  4405. goto out;
  4406. r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
  4407. break;
  4408. }
  4409. #endif
  4410. #ifdef CONFIG_KVM_MMIO
  4411. case KVM_REGISTER_COALESCED_MMIO: {
  4412. struct kvm_coalesced_mmio_zone zone;
  4413. r = -EFAULT;
  4414. if (copy_from_user(&zone, argp, sizeof(zone)))
  4415. goto out;
  4416. r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
  4417. break;
  4418. }
  4419. case KVM_UNREGISTER_COALESCED_MMIO: {
  4420. struct kvm_coalesced_mmio_zone zone;
  4421. r = -EFAULT;
  4422. if (copy_from_user(&zone, argp, sizeof(zone)))
  4423. goto out;
  4424. r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
  4425. break;
  4426. }
  4427. #endif
  4428. case KVM_IRQFD: {
  4429. struct kvm_irqfd data;
  4430. r = -EFAULT;
  4431. if (copy_from_user(&data, argp, sizeof(data)))
  4432. goto out;
  4433. r = kvm_irqfd(kvm, &data);
  4434. break;
  4435. }
  4436. case KVM_IOEVENTFD: {
  4437. struct kvm_ioeventfd data;
  4438. r = -EFAULT;
  4439. if (copy_from_user(&data, argp, sizeof(data)))
  4440. goto out;
  4441. r = kvm_ioeventfd(kvm, &data);
  4442. break;
  4443. }
  4444. #ifdef CONFIG_HAVE_KVM_MSI
  4445. case KVM_SIGNAL_MSI: {
  4446. struct kvm_msi msi;
  4447. r = -EFAULT;
  4448. if (copy_from_user(&msi, argp, sizeof(msi)))
  4449. goto out;
  4450. r = kvm_send_userspace_msi(kvm, &msi);
  4451. break;
  4452. }
  4453. #endif
  4454. #ifdef __KVM_HAVE_IRQ_LINE
  4455. case KVM_IRQ_LINE_STATUS:
  4456. case KVM_IRQ_LINE: {
  4457. struct kvm_irq_level irq_event;
  4458. r = -EFAULT;
  4459. if (copy_from_user(&irq_event, argp, sizeof(irq_event)))
  4460. goto out;
  4461. r = kvm_vm_ioctl_irq_line(kvm, &irq_event,
  4462. ioctl == KVM_IRQ_LINE_STATUS);
  4463. if (r)
  4464. goto out;
  4465. r = -EFAULT;
  4466. if (ioctl == KVM_IRQ_LINE_STATUS) {
  4467. if (copy_to_user(argp, &irq_event, sizeof(irq_event)))
  4468. goto out;
  4469. }
  4470. r = 0;
  4471. break;
  4472. }
  4473. #endif
  4474. #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
  4475. case KVM_SET_GSI_ROUTING: {
  4476. struct kvm_irq_routing routing;
  4477. struct kvm_irq_routing __user *urouting;
  4478. struct kvm_irq_routing_entry *entries = NULL;
  4479. r = -EFAULT;
  4480. if (copy_from_user(&routing, argp, sizeof(routing)))
  4481. goto out;
  4482. r = -EINVAL;
  4483. if (!kvm_arch_can_set_irq_routing(kvm))
  4484. goto out;
  4485. if (routing.nr > KVM_MAX_IRQ_ROUTES)
  4486. goto out;
  4487. if (routing.flags)
  4488. goto out;
  4489. if (routing.nr) {
  4490. urouting = argp;
  4491. entries = vmemdup_array_user(urouting->entries,
  4492. routing.nr, sizeof(*entries));
  4493. if (IS_ERR(entries)) {
  4494. r = PTR_ERR(entries);
  4495. goto out;
  4496. }
  4497. }
  4498. r = kvm_set_irq_routing(kvm, entries, routing.nr,
  4499. routing.flags);
  4500. kvfree(entries);
  4501. break;
  4502. }
  4503. #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
  4504. #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
  4505. case KVM_SET_MEMORY_ATTRIBUTES: {
  4506. struct kvm_memory_attributes attrs;
  4507. r = -EFAULT;
  4508. if (copy_from_user(&attrs, argp, sizeof(attrs)))
  4509. goto out;
  4510. r = kvm_vm_ioctl_set_mem_attributes(kvm, &attrs);
  4511. break;
  4512. }
  4513. #endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
  4514. case KVM_CREATE_DEVICE: {
  4515. struct kvm_create_device cd;
  4516. r = -EFAULT;
  4517. if (copy_from_user(&cd, argp, sizeof(cd)))
  4518. goto out;
  4519. r = kvm_ioctl_create_device(kvm, &cd);
  4520. if (r)
  4521. goto out;
  4522. r = -EFAULT;
  4523. if (copy_to_user(argp, &cd, sizeof(cd)))
  4524. goto out;
  4525. r = 0;
  4526. break;
  4527. }
  4528. case KVM_CHECK_EXTENSION:
  4529. r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
  4530. break;
  4531. case KVM_RESET_DIRTY_RINGS:
  4532. r = kvm_vm_ioctl_reset_dirty_pages(kvm);
  4533. break;
  4534. case KVM_GET_STATS_FD:
  4535. r = kvm_vm_ioctl_get_stats_fd(kvm);
  4536. break;
  4537. #ifdef CONFIG_KVM_GUEST_MEMFD
  4538. case KVM_CREATE_GUEST_MEMFD: {
  4539. struct kvm_create_guest_memfd guest_memfd;
  4540. r = -EFAULT;
  4541. if (copy_from_user(&guest_memfd, argp, sizeof(guest_memfd)))
  4542. goto out;
  4543. r = kvm_gmem_create(kvm, &guest_memfd);
  4544. break;
  4545. }
  4546. #endif
  4547. default:
  4548. r = kvm_arch_vm_ioctl(filp, ioctl, arg);
  4549. }
  4550. out:
  4551. return r;
  4552. }
  4553. #ifdef CONFIG_KVM_COMPAT
  4554. struct compat_kvm_dirty_log {
  4555. __u32 slot;
  4556. __u32 padding1;
  4557. union {
  4558. compat_uptr_t dirty_bitmap; /* one bit per page */
  4559. __u64 padding2;
  4560. };
  4561. };
  4562. struct compat_kvm_clear_dirty_log {
  4563. __u32 slot;
  4564. __u32 num_pages;
  4565. __u64 first_page;
  4566. union {
  4567. compat_uptr_t dirty_bitmap; /* one bit per page */
  4568. __u64 padding2;
  4569. };
  4570. };
  4571. long __weak kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl,
  4572. unsigned long arg)
  4573. {
  4574. return -ENOTTY;
  4575. }
  4576. static long kvm_vm_compat_ioctl(struct file *filp,
  4577. unsigned int ioctl, unsigned long arg)
  4578. {
  4579. struct kvm *kvm = filp->private_data;
  4580. int r;
  4581. if (kvm->mm != current->mm || kvm->vm_dead)
  4582. return -EIO;
  4583. r = kvm_arch_vm_compat_ioctl(filp, ioctl, arg);
  4584. if (r != -ENOTTY)
  4585. return r;
  4586. switch (ioctl) {
  4587. #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
  4588. case KVM_CLEAR_DIRTY_LOG: {
  4589. struct compat_kvm_clear_dirty_log compat_log;
  4590. struct kvm_clear_dirty_log log;
  4591. if (copy_from_user(&compat_log, (void __user *)arg,
  4592. sizeof(compat_log)))
  4593. return -EFAULT;
  4594. log.slot = compat_log.slot;
  4595. log.num_pages = compat_log.num_pages;
  4596. log.first_page = compat_log.first_page;
  4597. log.padding2 = compat_log.padding2;
  4598. log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
  4599. r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
  4600. break;
  4601. }
  4602. #endif
  4603. case KVM_GET_DIRTY_LOG: {
  4604. struct compat_kvm_dirty_log compat_log;
  4605. struct kvm_dirty_log log;
  4606. if (copy_from_user(&compat_log, (void __user *)arg,
  4607. sizeof(compat_log)))
  4608. return -EFAULT;
  4609. log.slot = compat_log.slot;
  4610. log.padding1 = compat_log.padding1;
  4611. log.padding2 = compat_log.padding2;
  4612. log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
  4613. r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
  4614. break;
  4615. }
  4616. default:
  4617. r = kvm_vm_ioctl(filp, ioctl, arg);
  4618. }
  4619. return r;
  4620. }
  4621. #endif
  4622. static struct file_operations kvm_vm_fops = {
  4623. .release = kvm_vm_release,
  4624. .unlocked_ioctl = kvm_vm_ioctl,
  4625. .llseek = noop_llseek,
  4626. KVM_COMPAT(kvm_vm_compat_ioctl),
  4627. };
  4628. bool file_is_kvm(struct file *file)
  4629. {
  4630. return file && file->f_op == &kvm_vm_fops;
  4631. }
  4632. EXPORT_SYMBOL_FOR_KVM_INTERNAL(file_is_kvm);
  4633. static int kvm_dev_ioctl_create_vm(unsigned long type)
  4634. {
  4635. char fdname[ITOA_MAX_LEN + 1];
  4636. int r, fd;
  4637. struct kvm *kvm;
  4638. struct file *file;
  4639. fd = get_unused_fd_flags(O_CLOEXEC);
  4640. if (fd < 0)
  4641. return fd;
  4642. snprintf(fdname, sizeof(fdname), "%d", fd);
  4643. kvm = kvm_create_vm(type, fdname);
  4644. if (IS_ERR(kvm)) {
  4645. r = PTR_ERR(kvm);
  4646. goto put_fd;
  4647. }
  4648. file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
  4649. if (IS_ERR(file)) {
  4650. r = PTR_ERR(file);
  4651. goto put_kvm;
  4652. }
  4653. /*
  4654. * Don't call kvm_put_kvm anymore at this point; file->f_op is
  4655. * already set, with ->release() being kvm_vm_release(). In error
  4656. * cases it will be called by the final fput(file) and will take
  4657. * care of doing kvm_put_kvm(kvm).
  4658. */
  4659. kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
  4660. fd_install(fd, file);
  4661. return fd;
  4662. put_kvm:
  4663. kvm_put_kvm(kvm);
  4664. put_fd:
  4665. put_unused_fd(fd);
  4666. return r;
  4667. }
  4668. static long kvm_dev_ioctl(struct file *filp,
  4669. unsigned int ioctl, unsigned long arg)
  4670. {
  4671. int r = -EINVAL;
  4672. switch (ioctl) {
  4673. case KVM_GET_API_VERSION:
  4674. if (arg)
  4675. goto out;
  4676. r = KVM_API_VERSION;
  4677. break;
  4678. case KVM_CREATE_VM:
  4679. r = kvm_dev_ioctl_create_vm(arg);
  4680. break;
  4681. case KVM_CHECK_EXTENSION:
  4682. r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
  4683. break;
  4684. case KVM_GET_VCPU_MMAP_SIZE:
  4685. if (arg)
  4686. goto out;
  4687. r = PAGE_SIZE; /* struct kvm_run */
  4688. #ifdef CONFIG_X86
  4689. r += PAGE_SIZE; /* pio data page */
  4690. #endif
  4691. #ifdef CONFIG_KVM_MMIO
  4692. r += PAGE_SIZE; /* coalesced mmio ring page */
  4693. #endif
  4694. break;
  4695. default:
  4696. return kvm_arch_dev_ioctl(filp, ioctl, arg);
  4697. }
  4698. out:
  4699. return r;
  4700. }
  4701. static struct file_operations kvm_chardev_ops = {
  4702. .unlocked_ioctl = kvm_dev_ioctl,
  4703. .llseek = noop_llseek,
  4704. KVM_COMPAT(kvm_dev_ioctl),
  4705. };
  4706. static struct miscdevice kvm_dev = {
  4707. KVM_MINOR,
  4708. "kvm",
  4709. &kvm_chardev_ops,
  4710. };
  4711. #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
  4712. bool enable_virt_at_load = true;
  4713. module_param(enable_virt_at_load, bool, 0444);
  4714. EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_virt_at_load);
  4715. __visible bool kvm_rebooting;
  4716. EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_rebooting);
  4717. static DEFINE_PER_CPU(bool, virtualization_enabled);
  4718. static DEFINE_MUTEX(kvm_usage_lock);
  4719. static int kvm_usage_count;
  4720. __weak void kvm_arch_enable_virtualization(void)
  4721. {
  4722. }
  4723. __weak void kvm_arch_disable_virtualization(void)
  4724. {
  4725. }
  4726. static int kvm_enable_virtualization_cpu(void)
  4727. {
  4728. if (__this_cpu_read(virtualization_enabled))
  4729. return 0;
  4730. if (kvm_arch_enable_virtualization_cpu()) {
  4731. pr_info("kvm: enabling virtualization on CPU%d failed\n",
  4732. raw_smp_processor_id());
  4733. return -EIO;
  4734. }
  4735. __this_cpu_write(virtualization_enabled, true);
  4736. return 0;
  4737. }
  4738. static int kvm_online_cpu(unsigned int cpu)
  4739. {
  4740. /*
  4741. * Abort the CPU online process if hardware virtualization cannot
  4742. * be enabled. Otherwise running VMs would encounter unrecoverable
  4743. * errors when scheduled to this CPU.
  4744. */
  4745. return kvm_enable_virtualization_cpu();
  4746. }
  4747. static void kvm_disable_virtualization_cpu(void *ign)
  4748. {
  4749. if (!__this_cpu_read(virtualization_enabled))
  4750. return;
  4751. kvm_arch_disable_virtualization_cpu();
  4752. __this_cpu_write(virtualization_enabled, false);
  4753. }
  4754. static int kvm_offline_cpu(unsigned int cpu)
  4755. {
  4756. kvm_disable_virtualization_cpu(NULL);
  4757. return 0;
  4758. }
  4759. static void kvm_shutdown(void *data)
  4760. {
  4761. /*
  4762. * Disable hardware virtualization and set kvm_rebooting to indicate
  4763. * that KVM has asynchronously disabled hardware virtualization, i.e.
  4764. * that relevant errors and exceptions aren't entirely unexpected.
  4765. * Some flavors of hardware virtualization need to be disabled before
  4766. * transferring control to firmware (to perform shutdown/reboot), e.g.
  4767. * on x86, virtualization can block INIT interrupts, which are used by
  4768. * firmware to pull APs back under firmware control. Note, this path
  4769. * is used for both shutdown and reboot scenarios, i.e. neither name is
  4770. * 100% comprehensive.
  4771. */
  4772. pr_info("kvm: exiting hardware virtualization\n");
  4773. kvm_rebooting = true;
  4774. on_each_cpu(kvm_disable_virtualization_cpu, NULL, 1);
  4775. }
  4776. static int kvm_suspend(void *data)
  4777. {
  4778. /*
  4779. * Secondary CPUs and CPU hotplug are disabled across the suspend/resume
  4780. * callbacks, i.e. no need to acquire kvm_usage_lock to ensure the usage
  4781. * count is stable. Assert that kvm_usage_lock is not held to ensure
  4782. * the system isn't suspended while KVM is enabling hardware. Hardware
  4783. * enabling can be preempted, but the task cannot be frozen until it has
  4784. * dropped all locks (userspace tasks are frozen via a fake signal).
  4785. */
  4786. lockdep_assert_not_held(&kvm_usage_lock);
  4787. lockdep_assert_irqs_disabled();
  4788. kvm_disable_virtualization_cpu(NULL);
  4789. return 0;
  4790. }
  4791. static void kvm_resume(void *data)
  4792. {
  4793. lockdep_assert_not_held(&kvm_usage_lock);
  4794. lockdep_assert_irqs_disabled();
  4795. WARN_ON_ONCE(kvm_enable_virtualization_cpu());
  4796. }
  4797. static const struct syscore_ops kvm_syscore_ops = {
  4798. .suspend = kvm_suspend,
  4799. .resume = kvm_resume,
  4800. .shutdown = kvm_shutdown,
  4801. };
  4802. static struct syscore kvm_syscore = {
  4803. .ops = &kvm_syscore_ops,
  4804. };
  4805. int kvm_enable_virtualization(void)
  4806. {
  4807. int r;
  4808. guard(mutex)(&kvm_usage_lock);
  4809. if (kvm_usage_count++)
  4810. return 0;
  4811. kvm_arch_enable_virtualization();
  4812. r = cpuhp_setup_state(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
  4813. kvm_online_cpu, kvm_offline_cpu);
  4814. if (r)
  4815. goto err_cpuhp;
  4816. register_syscore(&kvm_syscore);
  4817. /*
  4818. * Undo virtualization enabling and bail if the system is going down.
  4819. * If userspace initiated a forced reboot, e.g. reboot -f, then it's
  4820. * possible for an in-flight operation to enable virtualization after
  4821. * syscore_shutdown() is called, i.e. without kvm_shutdown() being
  4822. * invoked. Note, this relies on system_state being set _before_
  4823. * kvm_shutdown(), e.g. to ensure either kvm_shutdown() is invoked
  4824. * or this CPU observes the impending shutdown. Which is why KVM uses
  4825. * a syscore ops hook instead of registering a dedicated reboot
  4826. * notifier (the latter runs before system_state is updated).
  4827. */
  4828. if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF ||
  4829. system_state == SYSTEM_RESTART) {
  4830. r = -EBUSY;
  4831. goto err_rebooting;
  4832. }
  4833. return 0;
  4834. err_rebooting:
  4835. unregister_syscore(&kvm_syscore);
  4836. cpuhp_remove_state(CPUHP_AP_KVM_ONLINE);
  4837. err_cpuhp:
  4838. kvm_arch_disable_virtualization();
  4839. --kvm_usage_count;
  4840. return r;
  4841. }
  4842. EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_enable_virtualization);
  4843. void kvm_disable_virtualization(void)
  4844. {
  4845. guard(mutex)(&kvm_usage_lock);
  4846. if (--kvm_usage_count)
  4847. return;
  4848. unregister_syscore(&kvm_syscore);
  4849. cpuhp_remove_state(CPUHP_AP_KVM_ONLINE);
  4850. kvm_arch_disable_virtualization();
  4851. }
  4852. EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_disable_virtualization);
  4853. static int kvm_init_virtualization(void)
  4854. {
  4855. if (enable_virt_at_load)
  4856. return kvm_enable_virtualization();
  4857. return 0;
  4858. }
  4859. static void kvm_uninit_virtualization(void)
  4860. {
  4861. if (enable_virt_at_load)
  4862. kvm_disable_virtualization();
  4863. }
  4864. #else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
  4865. static int kvm_init_virtualization(void)
  4866. {
  4867. return 0;
  4868. }
  4869. static void kvm_uninit_virtualization(void)
  4870. {
  4871. }
  4872. #endif /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
  4873. static void kvm_iodevice_destructor(struct kvm_io_device *dev)
  4874. {
  4875. if (dev->ops->destructor)
  4876. dev->ops->destructor(dev);
  4877. }
  4878. static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
  4879. {
  4880. int i;
  4881. for (i = 0; i < bus->dev_count; i++) {
  4882. struct kvm_io_device *pos = bus->range[i].dev;
  4883. kvm_iodevice_destructor(pos);
  4884. }
  4885. kfree(bus);
  4886. }
  4887. static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1,
  4888. const struct kvm_io_range *r2)
  4889. {
  4890. gpa_t addr1 = r1->addr;
  4891. gpa_t addr2 = r2->addr;
  4892. if (addr1 < addr2)
  4893. return -1;
  4894. /* If r2->len == 0, match the exact address. If r2->len != 0,
  4895. * accept any overlapping write. Any order is acceptable for
  4896. * overlapping ranges, because kvm_io_bus_get_first_dev ensures
  4897. * we process all of them.
  4898. */
  4899. if (r2->len) {
  4900. addr1 += r1->len;
  4901. addr2 += r2->len;
  4902. }
  4903. if (addr1 > addr2)
  4904. return 1;
  4905. return 0;
  4906. }
  4907. static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
  4908. {
  4909. return kvm_io_bus_cmp(p1, p2);
  4910. }
  4911. static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
  4912. gpa_t addr, int len)
  4913. {
  4914. struct kvm_io_range *range, key;
  4915. int off;
  4916. key = (struct kvm_io_range) {
  4917. .addr = addr,
  4918. .len = len,
  4919. };
  4920. range = bsearch(&key, bus->range, bus->dev_count,
  4921. sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
  4922. if (range == NULL)
  4923. return -ENOENT;
  4924. off = range - bus->range;
  4925. while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0)
  4926. off--;
  4927. return off;
  4928. }
  4929. static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
  4930. struct kvm_io_range *range, const void *val)
  4931. {
  4932. int idx;
  4933. idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
  4934. if (idx < 0)
  4935. return -EOPNOTSUPP;
  4936. while (idx < bus->dev_count &&
  4937. kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
  4938. if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
  4939. range->len, val))
  4940. return idx;
  4941. idx++;
  4942. }
  4943. return -EOPNOTSUPP;
  4944. }
  4945. static struct kvm_io_bus *kvm_get_bus_srcu(struct kvm *kvm, enum kvm_bus idx)
  4946. {
  4947. /*
  4948. * Ensure that any updates to kvm_buses[] observed by the previous vCPU
  4949. * machine instruction are also visible to the vCPU machine instruction
  4950. * that triggered this call.
  4951. */
  4952. smp_mb__after_srcu_read_lock();
  4953. return srcu_dereference(kvm->buses[idx], &kvm->srcu);
  4954. }
  4955. int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
  4956. int len, const void *val)
  4957. {
  4958. struct kvm_io_bus *bus;
  4959. struct kvm_io_range range;
  4960. int r;
  4961. range = (struct kvm_io_range) {
  4962. .addr = addr,
  4963. .len = len,
  4964. };
  4965. bus = kvm_get_bus_srcu(vcpu->kvm, bus_idx);
  4966. if (!bus)
  4967. return -ENOMEM;
  4968. r = __kvm_io_bus_write(vcpu, bus, &range, val);
  4969. return r < 0 ? r : 0;
  4970. }
  4971. EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_io_bus_write);
  4972. int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
  4973. gpa_t addr, int len, const void *val, long cookie)
  4974. {
  4975. struct kvm_io_bus *bus;
  4976. struct kvm_io_range range;
  4977. range = (struct kvm_io_range) {
  4978. .addr = addr,
  4979. .len = len,
  4980. };
  4981. bus = kvm_get_bus_srcu(vcpu->kvm, bus_idx);
  4982. if (!bus)
  4983. return -ENOMEM;
  4984. /* First try the device referenced by cookie. */
  4985. if ((cookie >= 0) && (cookie < bus->dev_count) &&
  4986. (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
  4987. if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len,
  4988. val))
  4989. return cookie;
  4990. /*
  4991. * cookie contained garbage; fall back to search and return the
  4992. * correct cookie value.
  4993. */
  4994. return __kvm_io_bus_write(vcpu, bus, &range, val);
  4995. }
  4996. static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
  4997. struct kvm_io_range *range, void *val)
  4998. {
  4999. int idx;
  5000. idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
  5001. if (idx < 0)
  5002. return -EOPNOTSUPP;
  5003. while (idx < bus->dev_count &&
  5004. kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
  5005. if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
  5006. range->len, val))
  5007. return idx;
  5008. idx++;
  5009. }
  5010. return -EOPNOTSUPP;
  5011. }
  5012. int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
  5013. int len, void *val)
  5014. {
  5015. struct kvm_io_bus *bus;
  5016. struct kvm_io_range range;
  5017. int r;
  5018. range = (struct kvm_io_range) {
  5019. .addr = addr,
  5020. .len = len,
  5021. };
  5022. bus = kvm_get_bus_srcu(vcpu->kvm, bus_idx);
  5023. if (!bus)
  5024. return -ENOMEM;
  5025. r = __kvm_io_bus_read(vcpu, bus, &range, val);
  5026. return r < 0 ? r : 0;
  5027. }
  5028. EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_io_bus_read);
  5029. static void __free_bus(struct rcu_head *rcu)
  5030. {
  5031. struct kvm_io_bus *bus = container_of(rcu, struct kvm_io_bus, rcu);
  5032. kfree(bus);
  5033. }
  5034. int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
  5035. int len, struct kvm_io_device *dev)
  5036. {
  5037. int i;
  5038. struct kvm_io_bus *new_bus, *bus;
  5039. struct kvm_io_range range;
  5040. lockdep_assert_held(&kvm->slots_lock);
  5041. bus = kvm_get_bus(kvm, bus_idx);
  5042. if (!bus)
  5043. return -ENOMEM;
  5044. /* exclude ioeventfd which is limited by maximum fd */
  5045. if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
  5046. return -ENOSPC;
  5047. new_bus = kmalloc_flex(*bus, range, bus->dev_count + 1,
  5048. GFP_KERNEL_ACCOUNT);
  5049. if (!new_bus)
  5050. return -ENOMEM;
  5051. range = (struct kvm_io_range) {
  5052. .addr = addr,
  5053. .len = len,
  5054. .dev = dev,
  5055. };
  5056. for (i = 0; i < bus->dev_count; i++)
  5057. if (kvm_io_bus_cmp(&bus->range[i], &range) > 0)
  5058. break;
  5059. memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
  5060. new_bus->dev_count++;
  5061. new_bus->range[i] = range;
  5062. memcpy(new_bus->range + i + 1, bus->range + i,
  5063. (bus->dev_count - i) * sizeof(struct kvm_io_range));
  5064. rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
  5065. call_srcu(&kvm->srcu, &bus->rcu, __free_bus);
  5066. return 0;
  5067. }
  5068. int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
  5069. struct kvm_io_device *dev)
  5070. {
  5071. int i;
  5072. struct kvm_io_bus *new_bus, *bus;
  5073. lockdep_assert_held(&kvm->slots_lock);
  5074. bus = kvm_get_bus(kvm, bus_idx);
  5075. if (!bus)
  5076. return 0;
  5077. for (i = 0; i < bus->dev_count; i++) {
  5078. if (bus->range[i].dev == dev) {
  5079. break;
  5080. }
  5081. }
  5082. if (i == bus->dev_count)
  5083. return 0;
  5084. new_bus = kmalloc_flex(*bus, range, bus->dev_count - 1,
  5085. GFP_KERNEL_ACCOUNT);
  5086. if (new_bus) {
  5087. memcpy(new_bus, bus, struct_size(bus, range, i));
  5088. new_bus->dev_count--;
  5089. memcpy(new_bus->range + i, bus->range + i + 1,
  5090. flex_array_size(new_bus, range, new_bus->dev_count - i));
  5091. }
  5092. rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
  5093. synchronize_srcu_expedited(&kvm->srcu);
  5094. /*
  5095. * If NULL bus is installed, destroy the old bus, including all the
  5096. * attached devices. Otherwise, destroy the caller's device only.
  5097. */
  5098. if (!new_bus) {
  5099. pr_err("kvm: failed to shrink bus, removing it completely\n");
  5100. kvm_io_bus_destroy(bus);
  5101. return -ENOMEM;
  5102. }
  5103. kvm_iodevice_destructor(dev);
  5104. kfree(bus);
  5105. return 0;
  5106. }
  5107. struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
  5108. gpa_t addr)
  5109. {
  5110. struct kvm_io_bus *bus;
  5111. int dev_idx, srcu_idx;
  5112. struct kvm_io_device *iodev = NULL;
  5113. srcu_idx = srcu_read_lock(&kvm->srcu);
  5114. bus = kvm_get_bus_srcu(kvm, bus_idx);
  5115. if (!bus)
  5116. goto out_unlock;
  5117. dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1);
  5118. if (dev_idx < 0)
  5119. goto out_unlock;
  5120. iodev = bus->range[dev_idx].dev;
  5121. out_unlock:
  5122. srcu_read_unlock(&kvm->srcu, srcu_idx);
  5123. return iodev;
  5124. }
  5125. EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_io_bus_get_dev);
  5126. static int kvm_debugfs_open(struct inode *inode, struct file *file,
  5127. int (*get)(void *, u64 *), int (*set)(void *, u64),
  5128. const char *fmt)
  5129. {
  5130. int ret;
  5131. struct kvm_stat_data *stat_data = inode->i_private;
  5132. /*
  5133. * The debugfs files are a reference to the kvm struct which
  5134. * is still valid when kvm_destroy_vm is called. kvm_get_kvm_safe
  5135. * avoids the race between open and the removal of the debugfs directory.
  5136. */
  5137. if (!kvm_get_kvm_safe(stat_data->kvm))
  5138. return -ENOENT;
  5139. ret = simple_attr_open(inode, file, get,
  5140. kvm_stats_debugfs_mode(stat_data->desc) & 0222
  5141. ? set : NULL, fmt);
  5142. if (ret)
  5143. kvm_put_kvm(stat_data->kvm);
  5144. return ret;
  5145. }
  5146. static int kvm_debugfs_release(struct inode *inode, struct file *file)
  5147. {
  5148. struct kvm_stat_data *stat_data = inode->i_private;
  5149. simple_attr_release(inode, file);
  5150. kvm_put_kvm(stat_data->kvm);
  5151. return 0;
  5152. }
  5153. static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val)
  5154. {
  5155. *val = *(u64 *)((void *)(&kvm->stat) + offset);
  5156. return 0;
  5157. }
  5158. static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset)
  5159. {
  5160. *(u64 *)((void *)(&kvm->stat) + offset) = 0;
  5161. return 0;
  5162. }
  5163. static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val)
  5164. {
  5165. unsigned long i;
  5166. struct kvm_vcpu *vcpu;
  5167. *val = 0;
  5168. kvm_for_each_vcpu(i, vcpu, kvm)
  5169. *val += *(u64 *)((void *)(&vcpu->stat) + offset);
  5170. return 0;
  5171. }
  5172. static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset)
  5173. {
  5174. unsigned long i;
  5175. struct kvm_vcpu *vcpu;
  5176. kvm_for_each_vcpu(i, vcpu, kvm)
  5177. *(u64 *)((void *)(&vcpu->stat) + offset) = 0;
  5178. return 0;
  5179. }
  5180. static int kvm_stat_data_get(void *data, u64 *val)
  5181. {
  5182. int r = -EFAULT;
  5183. struct kvm_stat_data *stat_data = data;
  5184. switch (stat_data->kind) {
  5185. case KVM_STAT_VM:
  5186. r = kvm_get_stat_per_vm(stat_data->kvm,
  5187. stat_data->desc->offset, val);
  5188. break;
  5189. case KVM_STAT_VCPU:
  5190. r = kvm_get_stat_per_vcpu(stat_data->kvm,
  5191. stat_data->desc->offset, val);
  5192. break;
  5193. }
  5194. return r;
  5195. }
  5196. static int kvm_stat_data_clear(void *data, u64 val)
  5197. {
  5198. int r = -EFAULT;
  5199. struct kvm_stat_data *stat_data = data;
  5200. if (val)
  5201. return -EINVAL;
  5202. switch (stat_data->kind) {
  5203. case KVM_STAT_VM:
  5204. r = kvm_clear_stat_per_vm(stat_data->kvm,
  5205. stat_data->desc->offset);
  5206. break;
  5207. case KVM_STAT_VCPU:
  5208. r = kvm_clear_stat_per_vcpu(stat_data->kvm,
  5209. stat_data->desc->offset);
  5210. break;
  5211. }
  5212. return r;
  5213. }
  5214. static int kvm_stat_data_open(struct inode *inode, struct file *file)
  5215. {
  5216. __simple_attr_check_format("%llu\n", 0ull);
  5217. return kvm_debugfs_open(inode, file, kvm_stat_data_get,
  5218. kvm_stat_data_clear, "%llu\n");
  5219. }
  5220. static const struct file_operations stat_fops_per_vm = {
  5221. .owner = THIS_MODULE,
  5222. .open = kvm_stat_data_open,
  5223. .release = kvm_debugfs_release,
  5224. .read = simple_attr_read,
  5225. .write = simple_attr_write,
  5226. };
  5227. static int vm_stat_get(void *_offset, u64 *val)
  5228. {
  5229. unsigned offset = (long)_offset;
  5230. struct kvm *kvm;
  5231. u64 tmp_val;
  5232. *val = 0;
  5233. mutex_lock(&kvm_lock);
  5234. list_for_each_entry(kvm, &vm_list, vm_list) {
  5235. kvm_get_stat_per_vm(kvm, offset, &tmp_val);
  5236. *val += tmp_val;
  5237. }
  5238. mutex_unlock(&kvm_lock);
  5239. return 0;
  5240. }
  5241. static int vm_stat_clear(void *_offset, u64 val)
  5242. {
  5243. unsigned offset = (long)_offset;
  5244. struct kvm *kvm;
  5245. if (val)
  5246. return -EINVAL;
  5247. mutex_lock(&kvm_lock);
  5248. list_for_each_entry(kvm, &vm_list, vm_list) {
  5249. kvm_clear_stat_per_vm(kvm, offset);
  5250. }
  5251. mutex_unlock(&kvm_lock);
  5252. return 0;
  5253. }
  5254. DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n");
  5255. DEFINE_SIMPLE_ATTRIBUTE(vm_stat_readonly_fops, vm_stat_get, NULL, "%llu\n");
  5256. static int vcpu_stat_get(void *_offset, u64 *val)
  5257. {
  5258. unsigned offset = (long)_offset;
  5259. struct kvm *kvm;
  5260. u64 tmp_val;
  5261. *val = 0;
  5262. mutex_lock(&kvm_lock);
  5263. list_for_each_entry(kvm, &vm_list, vm_list) {
  5264. kvm_get_stat_per_vcpu(kvm, offset, &tmp_val);
  5265. *val += tmp_val;
  5266. }
  5267. mutex_unlock(&kvm_lock);
  5268. return 0;
  5269. }
  5270. static int vcpu_stat_clear(void *_offset, u64 val)
  5271. {
  5272. unsigned offset = (long)_offset;
  5273. struct kvm *kvm;
  5274. if (val)
  5275. return -EINVAL;
  5276. mutex_lock(&kvm_lock);
  5277. list_for_each_entry(kvm, &vm_list, vm_list) {
  5278. kvm_clear_stat_per_vcpu(kvm, offset);
  5279. }
  5280. mutex_unlock(&kvm_lock);
  5281. return 0;
  5282. }
  5283. DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear,
  5284. "%llu\n");
  5285. DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_readonly_fops, vcpu_stat_get, NULL, "%llu\n");
  5286. static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
  5287. {
  5288. struct kobj_uevent_env *env;
  5289. unsigned long long created, active;
  5290. if (!kvm_dev.this_device || !kvm)
  5291. return;
  5292. mutex_lock(&kvm_lock);
  5293. if (type == KVM_EVENT_CREATE_VM) {
  5294. kvm_createvm_count++;
  5295. kvm_active_vms++;
  5296. } else if (type == KVM_EVENT_DESTROY_VM) {
  5297. kvm_active_vms--;
  5298. }
  5299. created = kvm_createvm_count;
  5300. active = kvm_active_vms;
  5301. mutex_unlock(&kvm_lock);
  5302. env = kzalloc_obj(*env);
  5303. if (!env)
  5304. return;
  5305. add_uevent_var(env, "CREATED=%llu", created);
  5306. add_uevent_var(env, "COUNT=%llu", active);
  5307. if (type == KVM_EVENT_CREATE_VM) {
  5308. add_uevent_var(env, "EVENT=create");
  5309. kvm->userspace_pid = task_pid_nr(current);
  5310. } else if (type == KVM_EVENT_DESTROY_VM) {
  5311. add_uevent_var(env, "EVENT=destroy");
  5312. }
  5313. add_uevent_var(env, "PID=%d", kvm->userspace_pid);
  5314. if (!IS_ERR(kvm->debugfs_dentry)) {
  5315. char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL);
  5316. if (p) {
  5317. tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
  5318. if (!IS_ERR(tmp))
  5319. add_uevent_var(env, "STATS_PATH=%s", tmp);
  5320. kfree(p);
  5321. }
  5322. }
  5323. /* no need for checks, since we are adding at most only 5 keys */
  5324. env->envp[env->envp_idx++] = NULL;
  5325. kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp);
  5326. kfree(env);
  5327. }
  5328. static void kvm_init_debug(void)
  5329. {
  5330. const struct file_operations *fops;
  5331. const struct kvm_stats_desc *pdesc;
  5332. int i;
  5333. kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
  5334. for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
  5335. pdesc = &kvm_vm_stats_desc[i];
  5336. if (kvm_stats_debugfs_mode(pdesc) & 0222)
  5337. fops = &vm_stat_fops;
  5338. else
  5339. fops = &vm_stat_readonly_fops;
  5340. debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
  5341. kvm_debugfs_dir,
  5342. (void *)(long)pdesc->offset, fops);
  5343. }
  5344. for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
  5345. pdesc = &kvm_vcpu_stats_desc[i];
  5346. if (kvm_stats_debugfs_mode(pdesc) & 0222)
  5347. fops = &vcpu_stat_fops;
  5348. else
  5349. fops = &vcpu_stat_readonly_fops;
  5350. debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
  5351. kvm_debugfs_dir,
  5352. (void *)(long)pdesc->offset, fops);
  5353. }
  5354. }
  5355. static inline
  5356. struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
  5357. {
  5358. return container_of(pn, struct kvm_vcpu, preempt_notifier);
  5359. }
  5360. static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
  5361. {
  5362. struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
  5363. WRITE_ONCE(vcpu->preempted, false);
  5364. WRITE_ONCE(vcpu->ready, false);
  5365. __this_cpu_write(kvm_running_vcpu, vcpu);
  5366. kvm_arch_vcpu_load(vcpu, cpu);
  5367. WRITE_ONCE(vcpu->scheduled_out, false);
  5368. }
  5369. static void kvm_sched_out(struct preempt_notifier *pn,
  5370. struct task_struct *next)
  5371. {
  5372. struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
  5373. WRITE_ONCE(vcpu->scheduled_out, true);
  5374. if (task_is_runnable(current) && vcpu->wants_to_run) {
  5375. WRITE_ONCE(vcpu->preempted, true);
  5376. WRITE_ONCE(vcpu->ready, true);
  5377. }
  5378. kvm_arch_vcpu_put(vcpu);
  5379. __this_cpu_write(kvm_running_vcpu, NULL);
  5380. }
  5381. /**
  5382. * kvm_get_running_vcpu - get the vcpu running on the current CPU.
  5383. *
  5384. * We can disable preemption locally around accessing the per-CPU variable,
  5385. * and use the resolved vcpu pointer after enabling preemption again,
  5386. * because even if the current thread is migrated to another CPU, reading
  5387. * the per-CPU value later will give us the same value as we update the
  5388. * per-CPU variable in the preempt notifier handlers.
  5389. */
  5390. struct kvm_vcpu *kvm_get_running_vcpu(void)
  5391. {
  5392. struct kvm_vcpu *vcpu;
  5393. preempt_disable();
  5394. vcpu = __this_cpu_read(kvm_running_vcpu);
  5395. preempt_enable();
  5396. return vcpu;
  5397. }
  5398. EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_running_vcpu);
  5399. /**
  5400. * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus.
  5401. */
  5402. struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void)
  5403. {
  5404. return &kvm_running_vcpu;
  5405. }
  5406. #ifdef CONFIG_GUEST_PERF_EVENTS
  5407. static unsigned int kvm_guest_state(void)
  5408. {
  5409. struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
  5410. unsigned int state;
  5411. if (!kvm_arch_pmi_in_guest(vcpu))
  5412. return 0;
  5413. state = PERF_GUEST_ACTIVE;
  5414. if (!kvm_arch_vcpu_in_kernel(vcpu))
  5415. state |= PERF_GUEST_USER;
  5416. return state;
  5417. }
  5418. static unsigned long kvm_guest_get_ip(void)
  5419. {
  5420. struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
  5421. /* Retrieving the IP must be guarded by a call to kvm_guest_state(). */
  5422. if (WARN_ON_ONCE(!kvm_arch_pmi_in_guest(vcpu)))
  5423. return 0;
  5424. return kvm_arch_vcpu_get_ip(vcpu);
  5425. }
  5426. static struct perf_guest_info_callbacks kvm_guest_cbs = {
  5427. .state = kvm_guest_state,
  5428. .get_ip = kvm_guest_get_ip,
  5429. .handle_intel_pt_intr = NULL,
  5430. .handle_mediated_pmi = NULL,
  5431. };
  5432. void __kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void),
  5433. void (*mediated_pmi_handler)(void))
  5434. {
  5435. kvm_guest_cbs.handle_intel_pt_intr = pt_intr_handler;
  5436. kvm_guest_cbs.handle_mediated_pmi = mediated_pmi_handler;
  5437. perf_register_guest_info_callbacks(&kvm_guest_cbs);
  5438. }
  5439. void kvm_unregister_perf_callbacks(void)
  5440. {
  5441. perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
  5442. }
  5443. #endif
  5444. int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
  5445. {
  5446. int r;
  5447. int cpu;
  5448. /* A kmem cache lets us meet the alignment requirements of fx_save. */
  5449. if (!vcpu_align)
  5450. vcpu_align = __alignof__(struct kvm_vcpu);
  5451. kvm_vcpu_cache =
  5452. kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
  5453. SLAB_ACCOUNT,
  5454. offsetof(struct kvm_vcpu, arch),
  5455. offsetofend(struct kvm_vcpu, stats_id)
  5456. - offsetof(struct kvm_vcpu, arch),
  5457. NULL);
  5458. if (!kvm_vcpu_cache)
  5459. return -ENOMEM;
  5460. for_each_possible_cpu(cpu) {
  5461. if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu),
  5462. GFP_KERNEL, cpu_to_node(cpu))) {
  5463. r = -ENOMEM;
  5464. goto err_cpu_kick_mask;
  5465. }
  5466. }
  5467. r = kvm_irqfd_init();
  5468. if (r)
  5469. goto err_irqfd;
  5470. r = kvm_async_pf_init();
  5471. if (r)
  5472. goto err_async_pf;
  5473. kvm_chardev_ops.owner = module;
  5474. kvm_vm_fops.owner = module;
  5475. kvm_vcpu_fops.owner = module;
  5476. kvm_device_fops.owner = module;
  5477. kvm_preempt_ops.sched_in = kvm_sched_in;
  5478. kvm_preempt_ops.sched_out = kvm_sched_out;
  5479. kvm_init_debug();
  5480. r = kvm_vfio_ops_init();
  5481. if (WARN_ON_ONCE(r))
  5482. goto err_vfio;
  5483. r = kvm_gmem_init(module);
  5484. if (r)
  5485. goto err_gmem;
  5486. r = kvm_init_virtualization();
  5487. if (r)
  5488. goto err_virt;
  5489. /*
  5490. * Registration _must_ be the very last thing done, as this exposes
  5491. * /dev/kvm to userspace, i.e. all infrastructure must be setup!
  5492. */
  5493. r = misc_register(&kvm_dev);
  5494. if (r) {
  5495. pr_err("kvm: misc device register failed\n");
  5496. goto err_register;
  5497. }
  5498. return 0;
  5499. err_register:
  5500. kvm_uninit_virtualization();
  5501. err_virt:
  5502. kvm_gmem_exit();
  5503. err_gmem:
  5504. kvm_vfio_ops_exit();
  5505. err_vfio:
  5506. kvm_async_pf_deinit();
  5507. err_async_pf:
  5508. kvm_irqfd_exit();
  5509. err_irqfd:
  5510. err_cpu_kick_mask:
  5511. for_each_possible_cpu(cpu)
  5512. free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
  5513. kmem_cache_destroy(kvm_vcpu_cache);
  5514. return r;
  5515. }
  5516. EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_init);
  5517. void kvm_exit(void)
  5518. {
  5519. int cpu;
  5520. /*
  5521. * Note, unregistering /dev/kvm doesn't strictly need to come first,
  5522. * fops_get(), a.k.a. try_module_get(), prevents acquiring references
  5523. * to KVM while the module is being stopped.
  5524. */
  5525. misc_deregister(&kvm_dev);
  5526. kvm_uninit_virtualization();
  5527. debugfs_remove_recursive(kvm_debugfs_dir);
  5528. for_each_possible_cpu(cpu)
  5529. free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
  5530. kmem_cache_destroy(kvm_vcpu_cache);
  5531. kvm_gmem_exit();
  5532. kvm_vfio_ops_exit();
  5533. kvm_async_pf_deinit();
  5534. kvm_irqfd_exit();
  5535. }
  5536. EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_exit);