ublk_drv.c 138 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527
7527852795280528152825283528452855286528752885289529052915292529352945295529652975298529953005301530253035304530553065307530853095310531153125313531453155316531753185319532053215322532353245325532653275328532953305331533253335334533553365337533853395340534153425343534453455346534753485349535053515352535353545355535653575358535953605361536253635364536553665367536853695370537153725373537453755376537753785379538053815382538353845385538653875388538953905391539253935394539553965397539853995400540154025403540454055406540754085409541054115412541354145415541654175418541954205421542254235424542554265427542854295430543154325433543454355436543754385439544054415442544354445445544654475448544954505451545254535454545554565457545854595460
  1. // SPDX-License-Identifier: GPL-2.0-or-later
  /*
   * Userspace block device - a block device whose IO is handled in userspace
   *
   * Make full use of the io_uring passthrough command for communicating with
   * the ublk userspace daemon (ublksrvd) to handle basic IO requests.
   *
   * Copyright 2022 Ming Lei <ming.lei@redhat.com>
   *
   * (part of code stolen from loop.c)
   */
  12. #include <linux/module.h>
  13. #include <linux/moduleparam.h>
  14. #include <linux/sched.h>
  15. #include <linux/fs.h>
  16. #include <linux/pagemap.h>
  17. #include <linux/file.h>
  18. #include <linux/stat.h>
  19. #include <linux/errno.h>
  20. #include <linux/major.h>
  21. #include <linux/wait.h>
  22. #include <linux/blkdev.h>
  23. #include <linux/init.h>
  24. #include <linux/swap.h>
  25. #include <linux/slab.h>
  26. #include <linux/compat.h>
  27. #include <linux/mutex.h>
  28. #include <linux/writeback.h>
  29. #include <linux/completion.h>
  30. #include <linux/highmem.h>
  31. #include <linux/sysfs.h>
  32. #include <linux/miscdevice.h>
  33. #include <linux/falloc.h>
  34. #include <linux/uio.h>
  35. #include <linux/ioprio.h>
  36. #include <linux/sched/mm.h>
  37. #include <linux/uaccess.h>
  38. #include <linux/cdev.h>
  39. #include <linux/io_uring/cmd.h>
  40. #include <linux/blk-mq.h>
  41. #include <linux/delay.h>
  42. #include <linux/mm.h>
  43. #include <asm/page.h>
  44. #include <linux/task_work.h>
  45. #include <linux/namei.h>
  46. #include <linux/kref.h>
  47. #include <linux/kfifo.h>
  48. #include <linux/blk-integrity.h>
  49. #include <uapi/linux/fs.h>
  50. #include <uapi/linux/ublk_cmd.h>
  /* 2^MINORBITS: how many distinct values a MINORBITS-wide minor field holds */
  #define UBLK_MINORS		(1U << MINORBITS)

  /*
   * All-ones u16 value.
   * NOTE(review): presumably the "no buffer index" sentinel - confirm at users.
   */
  #define UBLK_INVALID_BUF_IDX	((u16)-1)
  /*
   * Private mirrors of ioctl-encoded commands: each name below is the
   * _IOC_NR() field of the matching UBLK_U_* command.
   */
  #define UBLK_CMD_DEL_DEV_ASYNC		_IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC)
  #define UBLK_CMD_UPDATE_SIZE		_IOC_NR(UBLK_U_CMD_UPDATE_SIZE)
  #define UBLK_CMD_QUIESCE_DEV		_IOC_NR(UBLK_U_CMD_QUIESCE_DEV)
  #define UBLK_CMD_TRY_STOP_DEV		_IOC_NR(UBLK_U_CMD_TRY_STOP_DEV)

  #define UBLK_IO_REGISTER_IO_BUF		_IOC_NR(UBLK_U_IO_REGISTER_IO_BUF)
  #define UBLK_IO_UNREGISTER_IO_BUF	_IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF)
  /*
   * Union of all UBLK_F_* feature flags; every UBLK_F_* flag has to be
   * listed here.
   *
   * UBLK_F_INTEGRITY only contributes when CONFIG_BLK_DEV_INTEGRITY is
   * enabled, otherwise that term evaluates to 0.
   */
  #define UBLK_F_ALL \
  	(UBLK_F_SUPPORT_ZERO_COPY | \
  	 UBLK_F_URING_CMD_COMP_IN_TASK | \
  	 UBLK_F_NEED_GET_DATA | \
  	 UBLK_F_USER_RECOVERY | \
  	 UBLK_F_USER_RECOVERY_REISSUE | \
  	 UBLK_F_UNPRIVILEGED_DEV | \
  	 UBLK_F_CMD_IOCTL_ENCODE | \
  	 UBLK_F_USER_COPY | \
  	 UBLK_F_ZONED | \
  	 UBLK_F_USER_RECOVERY_FAIL_IO | \
  	 UBLK_F_UPDATE_SIZE | \
  	 UBLK_F_AUTO_BUF_REG | \
  	 UBLK_F_QUIESCE | \
  	 UBLK_F_PER_IO_DAEMON | \
  	 UBLK_F_BUF_REG_OFF_DAEMON | \
  	 (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) ? UBLK_F_INTEGRITY : 0) | \
  	 UBLK_F_SAFE_STOP_DEV | \
  	 UBLK_F_BATCH_IO | \
  	 UBLK_F_NO_AUTO_PART_SCAN)
  /* Union of the three UBLK_F_USER_RECOVERY* feature flags */
  #define UBLK_F_ALL_RECOVERY_FLAGS \
  	(UBLK_F_USER_RECOVERY | \
  	 UBLK_F_USER_RECOVERY_REISSUE | \
  	 UBLK_F_USER_RECOVERY_FAIL_IO)
  /* Union of all UBLK_PARAM_TYPE_* bits; every such type should be listed here */
  #define UBLK_PARAM_TYPE_ALL \
  	(UBLK_PARAM_TYPE_BASIC | \
  	 UBLK_PARAM_TYPE_DISCARD | \
  	 UBLK_PARAM_TYPE_DEVT | \
  	 UBLK_PARAM_TYPE_ZONED | \
  	 UBLK_PARAM_TYPE_DMA_ALIGN | \
  	 UBLK_PARAM_TYPE_SEGMENT | \
  	 UBLK_PARAM_TYPE_INTEGRITY)
  /* Union of the UBLK_BATCH_F_* flags listed below */
  #define UBLK_BATCH_F_ALL \
  	(UBLK_BATCH_F_HAS_ZONE_LBA | \
  	 UBLK_BATCH_F_HAS_BUF_ADDR | \
  	 UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK)
  93. /* ublk batch fetch uring_cmd */
  94. struct ublk_batch_fetch_cmd {
  95. struct list_head node;
  96. struct io_uring_cmd *cmd;
  97. unsigned short buf_group;
  98. };
  99. struct ublk_uring_cmd_pdu {
  100. /*
  101. * Store requests in same batch temporarily for queuing them to
  102. * daemon context.
  103. *
  104. * It should have been stored to request payload, but we do want
  105. * to avoid extra pre-allocation, and uring_cmd payload is always
  106. * free for us
  107. */
  108. union {
  109. struct request *req;
  110. struct request *req_list;
  111. };
  112. /*
  113. * The following two are valid in this cmd whole lifetime, and
  114. * setup in ublk uring_cmd handler
  115. */
  116. struct ublk_queue *ubq;
  117. union {
  118. u16 tag;
  119. struct ublk_batch_fetch_cmd *fcmd; /* batch io only */
  120. };
  121. };
  122. struct ublk_batch_io_data {
  123. struct ublk_device *ub;
  124. struct io_uring_cmd *cmd;
  125. struct ublk_batch_io header;
  126. unsigned int issue_flags;
  127. struct io_comp_batch *iob;
  128. };
  /*
   * The io command is active: its sqe cmd has been received and its cqe
   * has not been posted yet.
   *
   * While the flag is set, the io command is owned by the ublk driver and
   * is waiting for an incoming blk-mq request from the ublk block device.
   *
   * Once the flag is cleared, the io command is going to be completed and
   * is owned by the ublk server.
   */
  #define UBLK_IO_FLAG_ACTIVE		0x01

  /*
   * The io command has been completed via cqe and is being handled by
   * ublksrv, but has not been committed yet.
   *
   * Basically mutually exclusive with UBLK_IO_FLAG_ACTIVE, so the pair can
   * be used for cross verification.
   */
  #define UBLK_IO_FLAG_OWNED_BY_SRV	0x02

  /*
   * Set when the io command needs to get the data buffer address from
   * ublksrv.
   *
   * For a WRITE request, the bio data can be copied into that buffer once
   * the io command has been issued again and this flag has been cleared.
   */
  #define UBLK_IO_FLAG_NEED_GET_DATA	0x08

  /*
   * The request buffer was registered automatically, so it has to be
   * unregistered before this request is completed.
   *
   * During exit, io_uring unregisters the buffer automatically for us.
   */
  #define UBLK_IO_FLAG_AUTO_BUF_REG	0x10

  /* atomic RW with ubq->cancel_lock */
  #define UBLK_IO_FLAG_CANCELED		0x80000000

  /*
   * The refcount starts from a large number so that it covers any
   * registered buffers. UBLK_IO_COMMIT_AND_FETCH_REQ releases these
   * references, minus the ones for buffers registered on the io daemon
   * task.
   */
  #define UBLK_REFCOUNT_INIT		(REFCOUNT_MAX / 2)

  /* used for UBLK_F_BATCH_IO only */
  #define UBLK_BATCH_IO_UNUSED_TAG	((unsigned short)-1)
  172. union ublk_io_buf {
  173. __u64 addr;
  174. struct ublk_auto_buf_reg auto_reg;
  175. };
  176. struct ublk_io {
  177. union ublk_io_buf buf;
  178. unsigned int flags;
  179. int res;
  180. union {
  181. /* valid if UBLK_IO_FLAG_ACTIVE is set */
  182. struct io_uring_cmd *cmd;
  183. /* valid if UBLK_IO_FLAG_OWNED_BY_SRV is set */
  184. struct request *req;
  185. };
  186. struct task_struct *task;
  187. /*
  188. * The number of uses of this I/O by the ublk server
  189. * if user copy or zero copy are enabled:
  190. * - UBLK_REFCOUNT_INIT from dispatch to the server
  191. * until UBLK_IO_COMMIT_AND_FETCH_REQ
  192. * - 1 for each inflight ublk_ch_{read,write}_iter() call not on task
  193. * - 1 for each io_uring registered buffer not registered on task
  194. * The I/O can only be completed once all references are dropped.
  195. * User copy and buffer registration operations are only permitted
  196. * if the reference count is nonzero.
  197. */
  198. refcount_t ref;
  199. /* Count of buffers registered on task and not yet unregistered */
  200. unsigned task_registered_buffers;
  201. void *buf_ctx_handle;
  202. spinlock_t lock;
  203. } ____cacheline_aligned_in_smp;
  204. struct ublk_queue {
  205. int q_id;
  206. int q_depth;
  207. unsigned long flags;
  208. struct ublksrv_io_desc *io_cmd_buf;
  209. bool force_abort;
  210. bool canceling;
  211. bool fail_io; /* copy of dev->state == UBLK_S_DEV_FAIL_IO */
  212. spinlock_t cancel_lock;
  213. struct ublk_device *dev;
  214. u32 nr_io_ready;
  215. /*
  216. * For supporting UBLK_F_BATCH_IO only.
  217. *
  218. * Inflight ublk request tag is saved in this fifo
  219. *
  220. * There are multiple writer from ublk_queue_rq() or ublk_queue_rqs(),
  221. * so lock is required for storing request tag to fifo
  222. *
  223. * Make sure just one reader for fetching request from task work
  224. * function to ublk server, so no need to grab the lock in reader
  225. * side.
  226. *
  227. * Batch I/O State Management:
  228. *
  229. * The batch I/O system uses implicit state management based on the
  230. * combination of three key variables below.
  231. *
  232. * - IDLE: list_empty(&fcmd_head) && !active_fcmd
  233. * No fetch commands available, events queue in evts_fifo
  234. *
  235. * - READY: !list_empty(&fcmd_head) && !active_fcmd
  236. * Fetch commands available but none processing events
  237. *
  238. * - ACTIVE: active_fcmd
  239. * One fetch command actively processing events from evts_fifo
  240. *
  241. * Key Invariants:
  242. * - At most one active_fcmd at any time (single reader)
  243. * - active_fcmd is always from fcmd_head list when non-NULL
  244. * - evts_fifo can be read locklessly by the single active reader
  245. * - All state transitions require evts_lock protection
  246. * - Multiple writers to evts_fifo require lock protection
  247. */
  248. struct {
  249. DECLARE_KFIFO_PTR(evts_fifo, unsigned short);
  250. spinlock_t evts_lock;
  251. /* List of fetch commands available to process events */
  252. struct list_head fcmd_head;
  253. /* Currently active fetch command (NULL = none active) */
  254. struct ublk_batch_fetch_cmd *active_fcmd;
  255. }____cacheline_aligned_in_smp;
  256. struct ublk_io ios[] __counted_by(q_depth);
  257. };
  258. struct ublk_device {
  259. struct gendisk *ub_disk;
  260. struct ublksrv_ctrl_dev_info dev_info;
  261. struct blk_mq_tag_set tag_set;
  262. struct cdev cdev;
  263. struct device cdev_dev;
  264. #define UB_STATE_OPEN 0
  265. #define UB_STATE_USED 1
  266. #define UB_STATE_DELETED 2
  267. unsigned long state;
  268. int ub_number;
  269. struct mutex mutex;
  270. spinlock_t lock;
  271. struct mm_struct *mm;
  272. struct ublk_params params;
  273. struct completion completion;
  274. u32 nr_queue_ready;
  275. bool unprivileged_daemons;
  276. struct mutex cancel_mutex;
  277. bool canceling;
  278. pid_t ublksrv_tgid;
  279. struct delayed_work exit_work;
  280. struct work_struct partition_scan_work;
  281. bool block_open; /* protected by open_mutex */
  282. struct ublk_queue *queues[];
  283. };
  284. /* header of ublk_params */
  285. struct ublk_params_header {
  286. __u32 len;
  287. __u32 types;
  288. };
  289. static void ublk_io_release(void *priv);
  290. static void ublk_stop_dev_unlocked(struct ublk_device *ub);
  291. static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq);
  292. static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
  293. u16 q_id, u16 tag, struct ublk_io *io);
  294. static inline unsigned int ublk_req_build_flags(struct request *req);
  295. static void ublk_batch_dispatch(struct ublk_queue *ubq,
  296. const struct ublk_batch_io_data *data,
  297. struct ublk_batch_fetch_cmd *fcmd);
  298. static inline bool ublk_dev_support_batch_io(const struct ublk_device *ub)
  299. {
  300. return ub->dev_info.flags & UBLK_F_BATCH_IO;
  301. }
  302. static inline bool ublk_support_batch_io(const struct ublk_queue *ubq)
  303. {
  304. return ubq->flags & UBLK_F_BATCH_IO;
  305. }
  306. static inline void ublk_io_lock(struct ublk_io *io)
  307. {
  308. spin_lock(&io->lock);
  309. }
  310. static inline void ublk_io_unlock(struct ublk_io *io)
  311. {
  312. spin_unlock(&io->lock);
  313. }
  314. /* Initialize the event queue */
  315. static inline int ublk_io_evts_init(struct ublk_queue *q, unsigned int size,
  316. int numa_node)
  317. {
  318. spin_lock_init(&q->evts_lock);
  319. return kfifo_alloc_node(&q->evts_fifo, size, GFP_KERNEL, numa_node);
  320. }
  321. /* Check if event queue is empty */
  322. static inline bool ublk_io_evts_empty(const struct ublk_queue *q)
  323. {
  324. return kfifo_is_empty(&q->evts_fifo);
  325. }
  326. static inline void ublk_io_evts_deinit(struct ublk_queue *q)
  327. {
  328. WARN_ON_ONCE(!kfifo_is_empty(&q->evts_fifo));
  329. kfifo_free(&q->evts_fifo);
  330. }
  331. static inline struct ublksrv_io_desc *
  332. ublk_get_iod(const struct ublk_queue *ubq, unsigned tag)
  333. {
  334. return &ubq->io_cmd_buf[tag];
  335. }
  336. static inline bool ublk_support_zero_copy(const struct ublk_queue *ubq)
  337. {
  338. return ubq->flags & UBLK_F_SUPPORT_ZERO_COPY;
  339. }
  340. static inline bool ublk_dev_support_zero_copy(const struct ublk_device *ub)
  341. {
  342. return ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY;
  343. }
  344. static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq)
  345. {
  346. return ubq->flags & UBLK_F_AUTO_BUF_REG;
  347. }
  348. static inline bool ublk_dev_support_auto_buf_reg(const struct ublk_device *ub)
  349. {
  350. return ub->dev_info.flags & UBLK_F_AUTO_BUF_REG;
  351. }
  352. static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
  353. {
  354. return ubq->flags & UBLK_F_USER_COPY;
  355. }
  356. static inline bool ublk_dev_support_user_copy(const struct ublk_device *ub)
  357. {
  358. return ub->dev_info.flags & UBLK_F_USER_COPY;
  359. }
  360. static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
  361. {
  362. return ub->dev_info.flags & UBLK_F_ZONED;
  363. }
  364. static inline bool ublk_queue_is_zoned(const struct ublk_queue *ubq)
  365. {
  366. return ubq->flags & UBLK_F_ZONED;
  367. }
  368. static inline bool ublk_dev_support_integrity(const struct ublk_device *ub)
  369. {
  370. return ub->dev_info.flags & UBLK_F_INTEGRITY;
  371. }
  372. #ifdef CONFIG_BLK_DEV_ZONED
/*
 * Side-band description of a REQ_OP_DRV_IN request issued by
 * ublk_report_zones(); ublk_setup_iod_zoned() copies it into the io
 * descriptor handed to the ublk server.
 */
struct ublk_zoned_report_desc {
	/* first sector of the report, copied to iod->start_sector */
	__u64 sector;
	/* ublk opcode; only UBLK_IO_OP_REPORT_ZONES is accepted */
	__u32 operation;
	/* number of zones requested, copied to iod->nr_zones */
	__u32 nr_zones;
};
/* Report descriptors, keyed by the request pointer cast to unsigned long. */
static DEFINE_XARRAY(ublk_zoned_report_descs);
  379. static int ublk_zoned_insert_report_desc(const struct request *req,
  380. struct ublk_zoned_report_desc *desc)
  381. {
  382. return xa_insert(&ublk_zoned_report_descs, (unsigned long)req,
  383. desc, GFP_KERNEL);
  384. }
  385. static struct ublk_zoned_report_desc *ublk_zoned_erase_report_desc(
  386. const struct request *req)
  387. {
  388. return xa_erase(&ublk_zoned_report_descs, (unsigned long)req);
  389. }
  390. static struct ublk_zoned_report_desc *ublk_zoned_get_report_desc(
  391. const struct request *req)
  392. {
  393. return xa_load(&ublk_zoned_report_descs, (unsigned long)req);
  394. }
  395. static int ublk_get_nr_zones(const struct ublk_device *ub)
  396. {
  397. const struct ublk_param_basic *p = &ub->params.basic;
  398. /* Zone size is a power of 2 */
  399. return p->dev_sectors >> ilog2(p->chunk_sectors);
  400. }
  401. static int ublk_revalidate_disk_zones(struct ublk_device *ub)
  402. {
  403. return blk_revalidate_disk_zones(ub->ub_disk);
  404. }
  405. static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
  406. {
  407. const struct ublk_param_zoned *p = &ub->params.zoned;
  408. int nr_zones;
  409. if (!ublk_dev_is_zoned(ub))
  410. return -EINVAL;
  411. if (!p->max_zone_append_sectors)
  412. return -EINVAL;
  413. nr_zones = ublk_get_nr_zones(ub);
  414. if (p->max_active_zones > nr_zones)
  415. return -EINVAL;
  416. if (p->max_open_zones > nr_zones)
  417. return -EINVAL;
  418. return 0;
  419. }
  420. static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
  421. {
  422. ub->ub_disk->nr_zones = ublk_get_nr_zones(ub);
  423. }
  424. /* Based on virtblk_alloc_report_buffer */
  425. static void *ublk_alloc_report_buffer(struct ublk_device *ublk,
  426. unsigned int nr_zones, size_t *buflen)
  427. {
  428. struct request_queue *q = ublk->ub_disk->queue;
  429. size_t bufsize;
  430. void *buf;
  431. nr_zones = min_t(unsigned int, nr_zones,
  432. ublk->ub_disk->nr_zones);
  433. bufsize = nr_zones * sizeof(struct blk_zone);
  434. bufsize =
  435. min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT);
  436. while (bufsize >= sizeof(struct blk_zone)) {
  437. buf = kvmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
  438. if (buf) {
  439. *buflen = bufsize;
  440. return buf;
  441. }
  442. bufsize >>= 1;
  443. }
  444. *buflen = 0;
  445. return NULL;
  446. }
  447. static int ublk_report_zones(struct gendisk *disk, sector_t sector,
  448. unsigned int nr_zones, struct blk_report_zones_args *args)
  449. {
  450. struct ublk_device *ub = disk->private_data;
  451. unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors;
  452. unsigned int first_zone = sector >> ilog2(zone_size_sectors);
  453. unsigned int done_zones = 0;
  454. unsigned int max_zones_per_request;
  455. int ret;
  456. struct blk_zone *buffer;
  457. size_t buffer_length;
  458. nr_zones = min_t(unsigned int, ub->ub_disk->nr_zones - first_zone,
  459. nr_zones);
  460. buffer = ublk_alloc_report_buffer(ub, nr_zones, &buffer_length);
  461. if (!buffer)
  462. return -ENOMEM;
  463. max_zones_per_request = buffer_length / sizeof(struct blk_zone);
  464. while (done_zones < nr_zones) {
  465. unsigned int remaining_zones = nr_zones - done_zones;
  466. unsigned int zones_in_request =
  467. min_t(unsigned int, remaining_zones, max_zones_per_request);
  468. struct request *req;
  469. struct ublk_zoned_report_desc desc;
  470. blk_status_t status;
  471. memset(buffer, 0, buffer_length);
  472. req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
  473. if (IS_ERR(req)) {
  474. ret = PTR_ERR(req);
  475. goto out;
  476. }
  477. desc.operation = UBLK_IO_OP_REPORT_ZONES;
  478. desc.sector = sector;
  479. desc.nr_zones = zones_in_request;
  480. ret = ublk_zoned_insert_report_desc(req, &desc);
  481. if (ret)
  482. goto free_req;
  483. ret = blk_rq_map_kern(req, buffer, buffer_length, GFP_KERNEL);
  484. if (ret)
  485. goto erase_desc;
  486. status = blk_execute_rq(req, 0);
  487. ret = blk_status_to_errno(status);
  488. erase_desc:
  489. ublk_zoned_erase_report_desc(req);
  490. free_req:
  491. blk_mq_free_request(req);
  492. if (ret)
  493. goto out;
  494. for (unsigned int i = 0; i < zones_in_request; i++) {
  495. struct blk_zone *zone = buffer + i;
  496. /* A zero length zone means no more zones in this response */
  497. if (!zone->len)
  498. break;
  499. ret = disk_report_zone(disk, zone, i, args);
  500. if (ret)
  501. goto out;
  502. done_zones++;
  503. sector += zone_size_sectors;
  504. }
  505. }
  506. ret = done_zones;
  507. out:
  508. kvfree(buffer);
  509. return ret;
  510. }
  511. static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
  512. struct request *req)
  513. {
  514. struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
  515. struct ublk_io *io = &ubq->ios[req->tag];
  516. struct ublk_zoned_report_desc *desc;
  517. u32 ublk_op;
  518. switch (req_op(req)) {
  519. case REQ_OP_ZONE_OPEN:
  520. ublk_op = UBLK_IO_OP_ZONE_OPEN;
  521. break;
  522. case REQ_OP_ZONE_CLOSE:
  523. ublk_op = UBLK_IO_OP_ZONE_CLOSE;
  524. break;
  525. case REQ_OP_ZONE_FINISH:
  526. ublk_op = UBLK_IO_OP_ZONE_FINISH;
  527. break;
  528. case REQ_OP_ZONE_RESET:
  529. ublk_op = UBLK_IO_OP_ZONE_RESET;
  530. break;
  531. case REQ_OP_ZONE_APPEND:
  532. ublk_op = UBLK_IO_OP_ZONE_APPEND;
  533. break;
  534. case REQ_OP_ZONE_RESET_ALL:
  535. ublk_op = UBLK_IO_OP_ZONE_RESET_ALL;
  536. break;
  537. case REQ_OP_DRV_IN:
  538. desc = ublk_zoned_get_report_desc(req);
  539. if (!desc)
  540. return BLK_STS_IOERR;
  541. ublk_op = desc->operation;
  542. switch (ublk_op) {
  543. case UBLK_IO_OP_REPORT_ZONES:
  544. iod->op_flags = ublk_op | ublk_req_build_flags(req);
  545. iod->nr_zones = desc->nr_zones;
  546. iod->start_sector = desc->sector;
  547. return BLK_STS_OK;
  548. default:
  549. return BLK_STS_IOERR;
  550. }
  551. case REQ_OP_DRV_OUT:
  552. /* We do not support drv_out */
  553. return BLK_STS_NOTSUPP;
  554. default:
  555. return BLK_STS_IOERR;
  556. }
  557. iod->op_flags = ublk_op | ublk_req_build_flags(req);
  558. iod->nr_sectors = blk_rq_sectors(req);
  559. iod->start_sector = blk_rq_pos(req);
  560. iod->addr = io->buf.addr;
  561. return BLK_STS_OK;
  562. }
  563. #else
/* Without CONFIG_BLK_DEV_ZONED there is no ->report_zones callback. */
#define ublk_report_zones (NULL)
/* !CONFIG_BLK_DEV_ZONED stub: zoned parameters are never acceptable. */
static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
{
	return -EOPNOTSUPP;
}
/* !CONFIG_BLK_DEV_ZONED stub: nothing to apply. */
static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
{
}
/* !CONFIG_BLK_DEV_ZONED stub: there are no zones to revalidate, report success. */
static int ublk_revalidate_disk_zones(struct ublk_device *ub)
{
	return 0;
}
/* !CONFIG_BLK_DEV_ZONED stub: every zone-related request is unsupported. */
static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
		struct request *req)
{
	return BLK_STS_NOTSUPP;
}
  581. #endif
/* Forward declaration; used by ublk_put_req_ref() before the definition. */
static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io,
		bool need_map, struct io_comp_batch *iob);

/* NOTE(review): presumably the char device region of ublk devices; confirm at module init */
static dev_t ublk_chr_devt;
static const struct class ublk_chr_class = {
	.name = "ublk-char",
};

/* NOTE(review): the names suggest ublk_idr_lock guards ublk_index_idr; confirm at the users */
static DEFINE_IDR(ublk_index_idr);
static DEFINE_SPINLOCK(ublk_idr_lock);
static wait_queue_head_t ublk_idr_wq; /* wait until one idr is freed */

/* Protects unprivileged_ublks_added (per its own comment), among other users */
static DEFINE_MUTEX(ublk_ctl_mutex);
  592. static struct ublk_batch_fetch_cmd *
  593. ublk_batch_alloc_fcmd(struct io_uring_cmd *cmd)
  594. {
  595. struct ublk_batch_fetch_cmd *fcmd = kzalloc_obj(*fcmd, GFP_NOIO);
  596. if (fcmd) {
  597. fcmd->cmd = cmd;
  598. fcmd->buf_group = READ_ONCE(cmd->sqe->buf_index);
  599. }
  600. return fcmd;
  601. }
/* Free a fetch command obtained from ublk_batch_alloc_fcmd(). */
static void ublk_batch_free_fcmd(struct ublk_batch_fetch_cmd *fcmd)
{
	kfree(fcmd);
}
/*
 * Mark the queue as having no active fetch command. WRITE_ONCE() is used for
 * the store; ublk_batch_deinit_fetch_buf() calls this with evts_lock held.
 */
static void __ublk_release_fcmd(struct ublk_queue *ubq)
{
	WRITE_ONCE(ubq->active_fcmd, NULL);
}
  610. /*
  611. * Nothing can move on, so clear ->active_fcmd, and the caller should stop
  612. * dispatching
  613. */
  614. static void ublk_batch_deinit_fetch_buf(struct ublk_queue *ubq,
  615. const struct ublk_batch_io_data *data,
  616. struct ublk_batch_fetch_cmd *fcmd,
  617. int res)
  618. {
  619. spin_lock(&ubq->evts_lock);
  620. list_del_init(&fcmd->node);
  621. WARN_ON_ONCE(fcmd != ubq->active_fcmd);
  622. __ublk_release_fcmd(ubq);
  623. spin_unlock(&ubq->evts_lock);
  624. io_uring_cmd_done(fcmd->cmd, res, data->issue_flags);
  625. ublk_batch_free_fcmd(fcmd);
  626. }
  627. static int ublk_batch_fetch_post_cqe(struct ublk_batch_fetch_cmd *fcmd,
  628. struct io_br_sel *sel,
  629. unsigned int issue_flags)
  630. {
  631. if (io_uring_mshot_cmd_post_cqe(fcmd->cmd, sel, issue_flags))
  632. return -ENOBUFS;
  633. return 0;
  634. }
  635. static ssize_t ublk_batch_copy_io_tags(struct ublk_batch_fetch_cmd *fcmd,
  636. void __user *buf, const u16 *tag_buf,
  637. unsigned int len)
  638. {
  639. if (copy_to_user(buf, tag_buf, len))
  640. return -EFAULT;
  641. return len;
  642. }
/* NOTE(review): presumably the upper bound on ublk devices; UBLK_MINORS is defined elsewhere */
#define UBLK_MAX_UBLKS UBLK_MINORS

/*
 * Max unprivileged ublk devices allowed to add
 *
 * It can be extended to one per-user limit in future or even controlled
 * by cgroup.
 */
static unsigned int unprivileged_ublks_max = 64;
static unsigned int unprivileged_ublks_added; /* protected by ublk_ctl_mutex */

/* Tentative definition; NOTE(review): presumably initialised further down in the file */
static struct miscdevice ublk_misc;
  653. static inline unsigned ublk_pos_to_hwq(loff_t pos)
  654. {
  655. return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_QID_OFF) &
  656. UBLK_QID_BITS_MASK;
  657. }
  658. static inline unsigned ublk_pos_to_buf_off(loff_t pos)
  659. {
  660. return (pos - UBLKSRV_IO_BUF_OFFSET) & UBLK_IO_BUF_BITS_MASK;
  661. }
  662. static inline unsigned ublk_pos_to_tag(loff_t pos)
  663. {
  664. return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_TAG_OFF) &
  665. UBLK_TAG_BITS_MASK;
  666. }
  667. static void ublk_dev_param_basic_apply(struct ublk_device *ub)
  668. {
  669. const struct ublk_param_basic *p = &ub->params.basic;
  670. if (p->attrs & UBLK_ATTR_READ_ONLY)
  671. set_disk_ro(ub->ub_disk, true);
  672. set_capacity(ub->ub_disk, p->dev_sectors);
  673. }
  674. static int ublk_integrity_flags(u32 flags)
  675. {
  676. int ret_flags = 0;
  677. if (flags & LBMD_PI_CAP_INTEGRITY) {
  678. flags &= ~LBMD_PI_CAP_INTEGRITY;
  679. ret_flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
  680. }
  681. if (flags & LBMD_PI_CAP_REFTAG) {
  682. flags &= ~LBMD_PI_CAP_REFTAG;
  683. ret_flags |= BLK_INTEGRITY_REF_TAG;
  684. }
  685. return flags ? -EINVAL : ret_flags;
  686. }
  687. static int ublk_integrity_pi_tuple_size(u8 csum_type)
  688. {
  689. switch (csum_type) {
  690. case LBMD_PI_CSUM_NONE:
  691. return 0;
  692. case LBMD_PI_CSUM_IP:
  693. case LBMD_PI_CSUM_CRC16_T10DIF:
  694. return 8;
  695. case LBMD_PI_CSUM_CRC64_NVME:
  696. return 16;
  697. default:
  698. return -EINVAL;
  699. }
  700. }
  701. static enum blk_integrity_checksum ublk_integrity_csum_type(u8 csum_type)
  702. {
  703. switch (csum_type) {
  704. case LBMD_PI_CSUM_NONE:
  705. return BLK_INTEGRITY_CSUM_NONE;
  706. case LBMD_PI_CSUM_IP:
  707. return BLK_INTEGRITY_CSUM_IP;
  708. case LBMD_PI_CSUM_CRC16_T10DIF:
  709. return BLK_INTEGRITY_CSUM_CRC;
  710. case LBMD_PI_CSUM_CRC64_NVME:
  711. return BLK_INTEGRITY_CSUM_CRC64;
  712. default:
  713. WARN_ON_ONCE(1);
  714. return BLK_INTEGRITY_CSUM_NONE;
  715. }
  716. }
  717. static int ublk_validate_params(const struct ublk_device *ub)
  718. {
  719. /* basic param is the only one which must be set */
  720. if (ub->params.types & UBLK_PARAM_TYPE_BASIC) {
  721. const struct ublk_param_basic *p = &ub->params.basic;
  722. if (p->logical_bs_shift > PAGE_SHIFT || p->logical_bs_shift < 9)
  723. return -EINVAL;
  724. if (p->logical_bs_shift > p->physical_bs_shift)
  725. return -EINVAL;
  726. if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9))
  727. return -EINVAL;
  728. if (ublk_dev_is_zoned(ub) && !p->chunk_sectors)
  729. return -EINVAL;
  730. } else
  731. return -EINVAL;
  732. if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
  733. const struct ublk_param_discard *p = &ub->params.discard;
  734. /* So far, only support single segment discard */
  735. if (p->max_discard_sectors && p->max_discard_segments != 1)
  736. return -EINVAL;
  737. if (!p->discard_granularity)
  738. return -EINVAL;
  739. }
  740. /* dev_t is read-only */
  741. if (ub->params.types & UBLK_PARAM_TYPE_DEVT)
  742. return -EINVAL;
  743. if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
  744. return ublk_dev_param_zoned_validate(ub);
  745. else if (ublk_dev_is_zoned(ub))
  746. return -EINVAL;
  747. if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN) {
  748. const struct ublk_param_dma_align *p = &ub->params.dma;
  749. if (p->alignment >= PAGE_SIZE)
  750. return -EINVAL;
  751. if (!is_power_of_2(p->alignment + 1))
  752. return -EINVAL;
  753. }
  754. if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) {
  755. const struct ublk_param_segment *p = &ub->params.seg;
  756. if (!is_power_of_2(p->seg_boundary_mask + 1))
  757. return -EINVAL;
  758. if (p->seg_boundary_mask + 1 < UBLK_MIN_SEGMENT_SIZE)
  759. return -EINVAL;
  760. if (p->max_segment_size < UBLK_MIN_SEGMENT_SIZE)
  761. return -EINVAL;
  762. }
  763. if (ub->params.types & UBLK_PARAM_TYPE_INTEGRITY) {
  764. const struct ublk_param_integrity *p = &ub->params.integrity;
  765. int pi_tuple_size = ublk_integrity_pi_tuple_size(p->csum_type);
  766. int flags = ublk_integrity_flags(p->flags);
  767. if (!ublk_dev_support_integrity(ub))
  768. return -EINVAL;
  769. if (flags < 0)
  770. return flags;
  771. if (pi_tuple_size < 0)
  772. return pi_tuple_size;
  773. if (!p->metadata_size)
  774. return -EINVAL;
  775. if (p->csum_type == LBMD_PI_CSUM_NONE &&
  776. p->flags & LBMD_PI_CAP_REFTAG)
  777. return -EINVAL;
  778. if (p->pi_offset + pi_tuple_size > p->metadata_size)
  779. return -EINVAL;
  780. if (p->interval_exp < SECTOR_SHIFT ||
  781. p->interval_exp > ub->params.basic.logical_bs_shift)
  782. return -EINVAL;
  783. }
  784. return 0;
  785. }
  786. static void ublk_apply_params(struct ublk_device *ub)
  787. {
  788. ublk_dev_param_basic_apply(ub);
  789. if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
  790. ublk_dev_param_zoned_apply(ub);
  791. }
  792. static inline bool ublk_need_map_io(const struct ublk_queue *ubq)
  793. {
  794. return !ublk_support_user_copy(ubq) && !ublk_support_zero_copy(ubq) &&
  795. !ublk_support_auto_buf_reg(ubq);
  796. }
  797. static inline bool ublk_dev_need_map_io(const struct ublk_device *ub)
  798. {
  799. return !ublk_dev_support_user_copy(ub) &&
  800. !ublk_dev_support_zero_copy(ub) &&
  801. !ublk_dev_support_auto_buf_reg(ub);
  802. }
  803. static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
  804. {
  805. /*
  806. * read()/write() is involved in user copy, so request reference
  807. * has to be grabbed
  808. *
  809. * for zero copy, request buffer need to be registered to io_uring
  810. * buffer table, so reference is needed
  811. *
  812. * For auto buffer register, ublk server still may issue
  813. * UBLK_IO_COMMIT_AND_FETCH_REQ before one registered buffer is used up,
  814. * so reference is required too.
  815. */
  816. return ublk_support_user_copy(ubq) || ublk_support_zero_copy(ubq) ||
  817. ublk_support_auto_buf_reg(ubq);
  818. }
  819. static inline bool ublk_dev_need_req_ref(const struct ublk_device *ub)
  820. {
  821. return ublk_dev_support_user_copy(ub) ||
  822. ublk_dev_support_zero_copy(ub) ||
  823. ublk_dev_support_auto_buf_reg(ub);
  824. }
  825. /*
  826. * ublk IO Reference Counting Design
  827. * ==================================
  828. *
  829. * For user-copy and zero-copy modes, ublk uses a split reference model with
  830. * two counters that together track IO lifetime:
  831. *
  832. * - io->ref: refcount for off-task buffer registrations and user-copy ops
  833. * - io->task_registered_buffers: count of buffers registered on the IO task
  834. *
  835. * Key Invariant:
  836. * --------------
  837. * When IO is dispatched to the ublk server (UBLK_IO_FLAG_OWNED_BY_SRV set),
  838. * the sum (io->ref + io->task_registered_buffers) must equal UBLK_REFCOUNT_INIT
  839. * when no active references exist. After IO completion, both counters become
  840. * zero. For I/Os not currently dispatched to the ublk server, both ref and
  841. * task_registered_buffers are 0.
  842. *
  843. * This invariant is checked by ublk_check_and_reset_active_ref() during daemon
  844. * exit to determine if all references have been released.
  845. *
  846. * Why Split Counters:
  847. * -------------------
  848. * Buffers registered on the IO daemon task can use the lightweight
  849. * task_registered_buffers counter (simple increment/decrement) instead of
  850. * atomic refcount operations. The ublk_io_release() callback checks if
  851. * current == io->task to decide which counter to update.
  852. *
  853. * This optimization only applies before IO completion. At completion,
  854. * ublk_sub_req_ref() collapses task_registered_buffers into the atomic ref.
  855. * After that, all subsequent buffer unregistrations must use the atomic ref
  856. * since they may be releasing the last reference.
  857. *
  858. * Reference Lifecycle:
  859. * --------------------
  860. * 1. ublk_init_req_ref(): Sets io->ref = UBLK_REFCOUNT_INIT at IO dispatch
  861. *
  862. * 2. During IO processing:
  863. * - On-task buffer reg: task_registered_buffers++ (no ref change)
  864. * - Off-task buffer reg: ref++ via ublk_get_req_ref()
  865. * - Buffer unregister callback (ublk_io_release):
  866. * * If on-task: task_registered_buffers--
  867. * * If off-task: ref-- via ublk_put_req_ref()
  868. *
  869. * 3. ublk_sub_req_ref() at IO completion:
  870. * - Computes: sub_refs = UBLK_REFCOUNT_INIT - task_registered_buffers
  871. * - Subtracts sub_refs from ref and zeroes task_registered_buffers
  872. * - This effectively collapses task_registered_buffers into the atomic ref,
  873. * accounting for the initial UBLK_REFCOUNT_INIT minus any on-task
  874. * buffers that were already counted
  875. *
  876. * Example (zero-copy, register on-task, unregister off-task):
  877. * - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 0
  878. * - Register buffer on-task: task_registered_buffers = 1
  879. * - Unregister off-task: ref-- (UBLK_REFCOUNT_INIT - 1), task_registered_buffers stays 1
  880. * - Completion via ublk_sub_req_ref():
  881. * sub_refs = UBLK_REFCOUNT_INIT - 1,
  882. * ref = (UBLK_REFCOUNT_INIT - 1) - (UBLK_REFCOUNT_INIT - 1) = 0
  883. *
  884. * Example (auto buffer registration):
  885. * Auto buffer registration sets task_registered_buffers = 1 at dispatch.
  886. *
  887. * - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 1
  888. * - Buffer unregister: task_registered_buffers-- (becomes 0)
  889. * - Completion via ublk_sub_req_ref():
  890. * sub_refs = UBLK_REFCOUNT_INIT - 0, ref becomes 0
  891. *
  892. * Example (zero-copy, ublk server killed):
  893. * When daemon is killed, io_uring cleanup unregisters buffers off-task.
  894. * ublk_check_and_reset_active_ref() waits for the invariant to hold.
  895. *
  896. * - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 0
  897. * - Register buffer on-task: task_registered_buffers = 1
  898. * - Daemon killed, io_uring cleanup unregisters buffer (off-task):
  899. * ref-- (UBLK_REFCOUNT_INIT - 1), task_registered_buffers stays 1
  900. * - Daemon exit check: sum = (UBLK_REFCOUNT_INIT - 1) + 1 = UBLK_REFCOUNT_INIT
 * - Sum equals UBLK_REFCOUNT_INIT, so both counters are zeroed by
 * ublk_check_and_reset_active_ref() and ublk_abort_queue() can then
 * proceed to abort pending requests
  904. *
  905. * Batch IO Special Case:
  906. * ----------------------
  907. * In batch IO mode, io->task is NULL. This means ublk_io_release() always
  908. * takes the off-task path (ublk_put_req_ref), decrementing io->ref. The
  909. * task_registered_buffers counter still tracks registered buffers for the
  910. * invariant check, even though the callback doesn't decrement it.
  911. *
  912. * Note: updating task_registered_buffers is protected by io->lock.
  913. */
  914. static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
  915. struct ublk_io *io)
  916. {
  917. if (ublk_need_req_ref(ubq))
  918. refcount_set(&io->ref, UBLK_REFCOUNT_INIT);
  919. }
  920. static inline bool ublk_get_req_ref(struct ublk_io *io)
  921. {
  922. return refcount_inc_not_zero(&io->ref);
  923. }
  924. static inline void ublk_put_req_ref(struct ublk_io *io, struct request *req)
  925. {
  926. if (!refcount_dec_and_test(&io->ref))
  927. return;
  928. /* ublk_need_map_io() and ublk_need_req_ref() are mutually exclusive */
  929. __ublk_complete_rq(req, io, false, NULL);
  930. }
  931. static inline bool ublk_sub_req_ref(struct ublk_io *io)
  932. {
  933. unsigned sub_refs = UBLK_REFCOUNT_INIT - io->task_registered_buffers;
  934. io->task_registered_buffers = 0;
  935. return refcount_sub_and_test(sub_refs, &io->ref);
  936. }
  937. static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
  938. {
  939. return ubq->flags & UBLK_F_NEED_GET_DATA;
  940. }
  941. static inline bool ublk_dev_need_get_data(const struct ublk_device *ub)
  942. {
  943. return ub->dev_info.flags & UBLK_F_NEED_GET_DATA;
  944. }
  945. /* Called in slow path only, keep it noinline for trace purpose */
  946. static noinline struct ublk_device *ublk_get_device(struct ublk_device *ub)
  947. {
  948. if (kobject_get_unless_zero(&ub->cdev_dev.kobj))
  949. return ub;
  950. return NULL;
  951. }
  952. /* Called in slow path only, keep it noinline for trace purpose */
  953. static noinline void ublk_put_device(struct ublk_device *ub)
  954. {
  955. put_device(&ub->cdev_dev);
  956. }
  957. static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev,
  958. int qid)
  959. {
  960. return dev->queues[qid];
  961. }
  962. static inline bool ublk_rq_has_data(const struct request *rq)
  963. {
  964. return bio_has_data(rq->bio);
  965. }
  966. static inline struct ublksrv_io_desc *
  967. ublk_queue_cmd_buf(struct ublk_device *ub, int q_id)
  968. {
  969. return ublk_get_queue(ub, q_id)->io_cmd_buf;
  970. }
  971. static inline int __ublk_queue_cmd_buf_size(int depth)
  972. {
  973. return round_up(depth * sizeof(struct ublksrv_io_desc), PAGE_SIZE);
  974. }
  975. static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub)
  976. {
  977. return __ublk_queue_cmd_buf_size(ub->dev_info.queue_depth);
  978. }
  979. static int ublk_max_cmd_buf_size(void)
  980. {
  981. return __ublk_queue_cmd_buf_size(UBLK_MAX_QUEUE_DEPTH);
  982. }
  983. /*
  984. * Should I/O outstanding to the ublk server when it exits be reissued?
  985. * If not, outstanding I/O will get errors.
  986. */
  987. static inline bool ublk_nosrv_should_reissue_outstanding(struct ublk_device *ub)
  988. {
  989. return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) &&
  990. (ub->dev_info.flags & UBLK_F_USER_RECOVERY_REISSUE);
  991. }
  992. /*
  993. * Should I/O issued while there is no ublk server queue? If not, I/O
  994. * issued while there is no ublk server will get errors.
  995. */
  996. static inline bool ublk_nosrv_dev_should_queue_io(struct ublk_device *ub)
  997. {
  998. return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) &&
  999. !(ub->dev_info.flags & UBLK_F_USER_RECOVERY_FAIL_IO);
  1000. }
  1001. /*
  1002. * Same as ublk_nosrv_dev_should_queue_io, but uses a queue-local copy
  1003. * of the device flags for smaller cache footprint - better for fast
  1004. * paths.
  1005. */
  1006. static inline bool ublk_nosrv_should_queue_io(struct ublk_queue *ubq)
  1007. {
  1008. return (ubq->flags & UBLK_F_USER_RECOVERY) &&
  1009. !(ubq->flags & UBLK_F_USER_RECOVERY_FAIL_IO);
  1010. }
  1011. /*
  1012. * Should ublk devices be stopped (i.e. no recovery possible) when the
  1013. * ublk server exits? If not, devices can be used again by a future
  1014. * incarnation of a ublk server via the start_recovery/end_recovery
  1015. * commands.
  1016. */
  1017. static inline bool ublk_nosrv_should_stop_dev(struct ublk_device *ub)
  1018. {
  1019. return !(ub->dev_info.flags & UBLK_F_USER_RECOVERY);
  1020. }
  1021. static inline bool ublk_dev_in_recoverable_state(struct ublk_device *ub)
  1022. {
  1023. return ub->dev_info.state == UBLK_S_DEV_QUIESCED ||
  1024. ub->dev_info.state == UBLK_S_DEV_FAIL_IO;
  1025. }
  1026. static void ublk_free_disk(struct gendisk *disk)
  1027. {
  1028. struct ublk_device *ub = disk->private_data;
  1029. clear_bit(UB_STATE_USED, &ub->state);
  1030. ublk_put_device(ub);
  1031. }
  1032. static void ublk_store_owner_uid_gid(unsigned int *owner_uid,
  1033. unsigned int *owner_gid)
  1034. {
  1035. kuid_t uid;
  1036. kgid_t gid;
  1037. current_uid_gid(&uid, &gid);
  1038. *owner_uid = from_kuid(&init_user_ns, uid);
  1039. *owner_gid = from_kgid(&init_user_ns, gid);
  1040. }
  1041. static int ublk_open(struct gendisk *disk, blk_mode_t mode)
  1042. {
  1043. struct ublk_device *ub = disk->private_data;
  1044. if (capable(CAP_SYS_ADMIN))
  1045. return 0;
  1046. /*
  1047. * If it is one unprivileged device, only owner can open
  1048. * the disk. Otherwise it could be one trap made by one
  1049. * evil user who grants this disk's privileges to other
  1050. * users deliberately.
  1051. *
  1052. * This way is reasonable too given anyone can create
  1053. * unprivileged device, and no need other's grant.
  1054. */
  1055. if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV) {
  1056. unsigned int curr_uid, curr_gid;
  1057. ublk_store_owner_uid_gid(&curr_uid, &curr_gid);
  1058. if (curr_uid != ub->dev_info.owner_uid || curr_gid !=
  1059. ub->dev_info.owner_gid)
  1060. return -EPERM;
  1061. }
  1062. if (ub->block_open)
  1063. return -ENXIO;
  1064. return 0;
  1065. }
/*
 * Block device operations of a ublk disk. ->report_zones is
 * ublk_report_zones, which is defined as NULL when CONFIG_BLK_DEV_ZONED is
 * not set.
 */
static const struct block_device_operations ub_fops = {
	.owner = THIS_MODULE,
	.open = ublk_open,
	.free_disk = ublk_free_disk,
	.report_zones = ublk_report_zones,
};
  1072. static bool ublk_copy_user_bvec(const struct bio_vec *bv, unsigned *offset,
  1073. struct iov_iter *uiter, int dir, size_t *done)
  1074. {
  1075. unsigned len;
  1076. void *bv_buf;
  1077. size_t copied;
  1078. if (*offset >= bv->bv_len) {
  1079. *offset -= bv->bv_len;
  1080. return true;
  1081. }
  1082. len = bv->bv_len - *offset;
  1083. bv_buf = kmap_local_page(bv->bv_page) + bv->bv_offset + *offset;
  1084. if (dir == ITER_DEST)
  1085. copied = copy_to_iter(bv_buf, len, uiter);
  1086. else
  1087. copied = copy_from_iter(bv_buf, len, uiter);
  1088. kunmap_local(bv_buf);
  1089. *done += copied;
  1090. if (copied < len)
  1091. return false;
  1092. *offset = 0;
  1093. return true;
  1094. }
  1095. /*
  1096. * Copy data between request pages and io_iter, and 'offset'
  1097. * is the start point of linear offset of request.
  1098. */
  1099. static size_t ublk_copy_user_pages(const struct request *req,
  1100. unsigned offset, struct iov_iter *uiter, int dir)
  1101. {
  1102. struct req_iterator iter;
  1103. struct bio_vec bv;
  1104. size_t done = 0;
  1105. rq_for_each_segment(bv, req, iter) {
  1106. if (!ublk_copy_user_bvec(&bv, &offset, uiter, dir, &done))
  1107. break;
  1108. }
  1109. return done;
  1110. }
  1111. #ifdef CONFIG_BLK_DEV_INTEGRITY
  1112. static size_t ublk_copy_user_integrity(const struct request *req,
  1113. unsigned offset, struct iov_iter *uiter, int dir)
  1114. {
  1115. size_t done = 0;
  1116. struct bio *bio = req->bio;
  1117. struct bvec_iter iter;
  1118. struct bio_vec iv;
  1119. if (!blk_integrity_rq(req))
  1120. return 0;
  1121. bio_for_each_integrity_vec(iv, bio, iter) {
  1122. if (!ublk_copy_user_bvec(&iv, &offset, uiter, dir, &done))
  1123. break;
  1124. }
  1125. return done;
  1126. }
  1127. #else /* #ifdef CONFIG_BLK_DEV_INTEGRITY */
/* !CONFIG_BLK_DEV_INTEGRITY stub: there is no metadata, so nothing is copied. */
static size_t ublk_copy_user_integrity(const struct request *req,
		unsigned offset, struct iov_iter *uiter, int dir)
{
	return 0;
}
  1133. #endif /* #ifdef CONFIG_BLK_DEV_INTEGRITY */
  1134. static inline bool ublk_need_map_req(const struct request *req)
  1135. {
  1136. return ublk_rq_has_data(req) && req_op(req) == REQ_OP_WRITE;
  1137. }
  1138. static inline bool ublk_need_unmap_req(const struct request *req)
  1139. {
  1140. return ublk_rq_has_data(req) &&
  1141. (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN);
  1142. }
  1143. static unsigned int ublk_map_io(const struct ublk_queue *ubq,
  1144. const struct request *req,
  1145. const struct ublk_io *io)
  1146. {
  1147. const unsigned int rq_bytes = blk_rq_bytes(req);
  1148. if (!ublk_need_map_io(ubq))
  1149. return rq_bytes;
  1150. /*
  1151. * no zero copy, we delay copy WRITE request data into ublksrv
  1152. * context and the big benefit is that pinning pages in current
  1153. * context is pretty fast, see ublk_pin_user_pages
  1154. */
  1155. if (ublk_need_map_req(req)) {
  1156. struct iov_iter iter;
  1157. const int dir = ITER_DEST;
  1158. import_ubuf(dir, u64_to_user_ptr(io->buf.addr), rq_bytes, &iter);
  1159. return ublk_copy_user_pages(req, 0, &iter, dir);
  1160. }
  1161. return rq_bytes;
  1162. }
  1163. static unsigned int ublk_unmap_io(bool need_map,
  1164. const struct request *req,
  1165. const struct ublk_io *io)
  1166. {
  1167. const unsigned int rq_bytes = blk_rq_bytes(req);
  1168. if (!need_map)
  1169. return rq_bytes;
  1170. if (ublk_need_unmap_req(req)) {
  1171. struct iov_iter iter;
  1172. const int dir = ITER_SOURCE;
  1173. WARN_ON_ONCE(io->res > rq_bytes);
  1174. import_ubuf(dir, u64_to_user_ptr(io->buf.addr), io->res, &iter);
  1175. return ublk_copy_user_pages(req, 0, &iter, dir);
  1176. }
  1177. return rq_bytes;
  1178. }
  1179. static inline unsigned int ublk_req_build_flags(struct request *req)
  1180. {
  1181. unsigned flags = 0;
  1182. if (req->cmd_flags & REQ_FAILFAST_DEV)
  1183. flags |= UBLK_IO_F_FAILFAST_DEV;
  1184. if (req->cmd_flags & REQ_FAILFAST_TRANSPORT)
  1185. flags |= UBLK_IO_F_FAILFAST_TRANSPORT;
  1186. if (req->cmd_flags & REQ_FAILFAST_DRIVER)
  1187. flags |= UBLK_IO_F_FAILFAST_DRIVER;
  1188. if (req->cmd_flags & REQ_META)
  1189. flags |= UBLK_IO_F_META;
  1190. if (req->cmd_flags & REQ_FUA)
  1191. flags |= UBLK_IO_F_FUA;
  1192. if (req->cmd_flags & REQ_NOUNMAP)
  1193. flags |= UBLK_IO_F_NOUNMAP;
  1194. if (req->cmd_flags & REQ_SWAP)
  1195. flags |= UBLK_IO_F_SWAP;
  1196. if (blk_integrity_rq(req))
  1197. flags |= UBLK_IO_F_INTEGRITY;
  1198. return flags;
  1199. }
  1200. static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
  1201. {
  1202. struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
  1203. struct ublk_io *io = &ubq->ios[req->tag];
  1204. u32 ublk_op;
  1205. switch (req_op(req)) {
  1206. case REQ_OP_READ:
  1207. ublk_op = UBLK_IO_OP_READ;
  1208. break;
  1209. case REQ_OP_WRITE:
  1210. ublk_op = UBLK_IO_OP_WRITE;
  1211. break;
  1212. case REQ_OP_FLUSH:
  1213. ublk_op = UBLK_IO_OP_FLUSH;
  1214. break;
  1215. case REQ_OP_DISCARD:
  1216. ublk_op = UBLK_IO_OP_DISCARD;
  1217. break;
  1218. case REQ_OP_WRITE_ZEROES:
  1219. ublk_op = UBLK_IO_OP_WRITE_ZEROES;
  1220. break;
  1221. default:
  1222. if (ublk_queue_is_zoned(ubq))
  1223. return ublk_setup_iod_zoned(ubq, req);
  1224. return BLK_STS_IOERR;
  1225. }
  1226. /* need to translate since kernel may change */
  1227. iod->op_flags = ublk_op | ublk_req_build_flags(req);
  1228. iod->nr_sectors = blk_rq_sectors(req);
  1229. iod->start_sector = blk_rq_pos(req);
  1230. iod->addr = io->buf.addr;
  1231. return BLK_STS_OK;
  1232. }
  1233. static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu(
  1234. struct io_uring_cmd *ioucmd)
  1235. {
  1236. return io_uring_cmd_to_pdu(ioucmd, struct ublk_uring_cmd_pdu);
  1237. }
  1238. static void ublk_end_request(struct request *req, blk_status_t error)
  1239. {
  1240. local_bh_disable();
  1241. blk_mq_end_request(req, error);
  1242. local_bh_enable();
  1243. }
  1244. /* todo: handle partial completion */
  1245. static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io,
  1246. bool need_map, struct io_comp_batch *iob)
  1247. {
  1248. unsigned int unmapped_bytes;
  1249. blk_status_t res = BLK_STS_OK;
  1250. bool requeue;
  1251. /* failed read IO if nothing is read */
  1252. if (!io->res && req_op(req) == REQ_OP_READ)
  1253. io->res = -EIO;
  1254. if (io->res < 0) {
  1255. res = errno_to_blk_status(io->res);
  1256. goto exit;
  1257. }
  1258. /*
  1259. * FLUSH, DISCARD or WRITE_ZEROES usually won't return bytes returned, so end them
  1260. * directly.
  1261. *
  1262. * Both the two needn't unmap.
  1263. */
  1264. if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE &&
  1265. req_op(req) != REQ_OP_DRV_IN)
  1266. goto exit;
  1267. /* for READ request, writing data in iod->addr to rq buffers */
  1268. unmapped_bytes = ublk_unmap_io(need_map, req, io);
  1269. /*
  1270. * Extremely impossible since we got data filled in just before
  1271. *
  1272. * Re-read simply for this unlikely case.
  1273. */
  1274. if (unlikely(unmapped_bytes < io->res))
  1275. io->res = unmapped_bytes;
  1276. /*
  1277. * Run bio->bi_end_io() with softirqs disabled. If the final fput
  1278. * happens off this path, then that will prevent ublk's blkdev_release()
  1279. * from being called on current's task work, see fput() implementation.
  1280. *
  1281. * Otherwise, ublk server may not provide forward progress in case of
  1282. * reading the partition table from bdev_open() with disk->open_mutex
  1283. * held, and causes dead lock as we could already be holding
  1284. * disk->open_mutex here.
  1285. *
  1286. * Preferably we would not be doing IO with a mutex held that is also
  1287. * used for release, but this work-around will suffice for now.
  1288. */
  1289. local_bh_disable();
  1290. requeue = blk_update_request(req, BLK_STS_OK, io->res);
  1291. local_bh_enable();
  1292. if (requeue)
  1293. blk_mq_requeue_request(req, true);
  1294. else if (likely(!blk_should_fake_timeout(req->q))) {
  1295. if (blk_mq_add_to_batch(req, iob, false, blk_mq_end_request_batch))
  1296. return;
  1297. __blk_mq_end_request(req, BLK_STS_OK);
  1298. }
  1299. return;
  1300. exit:
  1301. ublk_end_request(req, res);
  1302. }
  1303. static struct io_uring_cmd *__ublk_prep_compl_io_cmd(struct ublk_io *io,
  1304. struct request *req)
  1305. {
  1306. /* read cmd first because req will overwrite it */
  1307. struct io_uring_cmd *cmd = io->cmd;
  1308. /* mark this cmd owned by ublksrv */
  1309. io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;
  1310. /*
  1311. * clear ACTIVE since we are done with this sqe/cmd slot
  1312. * We can only accept io cmd in case of being not active.
  1313. */
  1314. io->flags &= ~UBLK_IO_FLAG_ACTIVE;
  1315. io->req = req;
  1316. return cmd;
  1317. }
  /*
   * Hand @req over to ublksrv: prepare the io slot and complete its uring
   * command with result @res.
   */
  static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req,
  				 int res, unsigned issue_flags)
  {
  	struct io_uring_cmd *done_cmd;

  	done_cmd = __ublk_prep_compl_io_cmd(io, req);

  	/* tell ublksrv one io request is coming */
  	io_uring_cmd_done(done_cmd, res, issue_flags);
  }
  1325. #define UBLK_REQUEUE_DELAY_MS 3
  1326. static inline void __ublk_abort_rq(struct ublk_queue *ubq,
  1327. struct request *rq)
  1328. {
  1329. /* We cannot process this rq so just requeue it. */
  1330. if (ublk_nosrv_dev_should_queue_io(ubq->dev))
  1331. blk_mq_requeue_request(rq, false);
  1332. else
  1333. ublk_end_request(rq, BLK_STS_IOERR);
  1334. }
  1335. static void
  1336. ublk_auto_buf_reg_fallback(const struct ublk_queue *ubq, unsigned tag)
  1337. {
  1338. struct ublksrv_io_desc *iod = ublk_get_iod(ubq, tag);
  1339. iod->op_flags |= UBLK_IO_F_NEED_REG_BUF;
  1340. }
  /* Outcome of ublk_auto_buf_register(). */
  enum auto_buf_reg_res {
  	/* registration failed and the request has been ended */
  	AUTO_BUF_REG_FAIL = 0,
  	/* registration failed, UBLK_IO_F_NEED_REG_BUF was set instead */
  	AUTO_BUF_REG_FALLBACK = 1,
  	/* the request buffer was registered */
  	AUTO_BUF_REG_OK = 2,
  };
  1346. /*
  1347. * Setup io state after auto buffer registration.
  1348. *
  1349. * Must be called after ublk_auto_buf_register() is done.
  1350. * Caller must hold io->lock in batch context.
  1351. */
  1352. static void ublk_auto_buf_io_setup(const struct ublk_queue *ubq,
  1353. struct request *req, struct ublk_io *io,
  1354. struct io_uring_cmd *cmd,
  1355. enum auto_buf_reg_res res)
  1356. {
  1357. if (res == AUTO_BUF_REG_OK) {
  1358. io->task_registered_buffers = 1;
  1359. io->buf_ctx_handle = io_uring_cmd_ctx_handle(cmd);
  1360. io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG;
  1361. }
  1362. ublk_init_req_ref(ubq, io);
  1363. __ublk_prep_compl_io_cmd(io, req);
  1364. }
  1365. /* Register request bvec to io_uring for auto buffer registration. */
  1366. static enum auto_buf_reg_res
  1367. ublk_auto_buf_register(const struct ublk_queue *ubq, struct request *req,
  1368. struct ublk_io *io, struct io_uring_cmd *cmd,
  1369. unsigned int issue_flags)
  1370. {
  1371. int ret;
  1372. ret = io_buffer_register_bvec(cmd, req, ublk_io_release,
  1373. io->buf.auto_reg.index, issue_flags);
  1374. if (ret) {
  1375. if (io->buf.auto_reg.flags & UBLK_AUTO_BUF_REG_FALLBACK) {
  1376. ublk_auto_buf_reg_fallback(ubq, req->tag);
  1377. return AUTO_BUF_REG_FALLBACK;
  1378. }
  1379. ublk_end_request(req, BLK_STS_IOERR);
  1380. return AUTO_BUF_REG_FAIL;
  1381. }
  1382. return AUTO_BUF_REG_OK;
  1383. }
  1384. /*
  1385. * Dispatch IO to userspace with auto buffer registration.
  1386. *
  1387. * Only called in non-batch context from task work, io->lock not held.
  1388. */
  1389. static void ublk_auto_buf_dispatch(const struct ublk_queue *ubq,
  1390. struct request *req, struct ublk_io *io,
  1391. struct io_uring_cmd *cmd,
  1392. unsigned int issue_flags)
  1393. {
  1394. enum auto_buf_reg_res res = ublk_auto_buf_register(ubq, req, io, cmd,
  1395. issue_flags);
  1396. if (res != AUTO_BUF_REG_FAIL) {
  1397. ublk_auto_buf_io_setup(ubq, req, io, cmd, res);
  1398. io_uring_cmd_done(cmd, UBLK_IO_RES_OK, issue_flags);
  1399. }
  1400. }
  1401. static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req,
  1402. struct ublk_io *io)
  1403. {
  1404. unsigned mapped_bytes = ublk_map_io(ubq, req, io);
  1405. /* partially mapped, update io descriptor */
  1406. if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
  1407. /*
  1408. * Nothing mapped, retry until we succeed.
  1409. *
  1410. * We may never succeed in mapping any bytes here because
  1411. * of OOM. TODO: reserve one buffer with single page pinned
  1412. * for providing forward progress guarantee.
  1413. */
  1414. if (unlikely(!mapped_bytes)) {
  1415. blk_mq_requeue_request(req, false);
  1416. blk_mq_delay_kick_requeue_list(req->q,
  1417. UBLK_REQUEUE_DELAY_MS);
  1418. return false;
  1419. }
  1420. ublk_get_iod(ubq, req->tag)->nr_sectors =
  1421. mapped_bytes >> 9;
  1422. }
  1423. return true;
  1424. }
  1425. static void ublk_dispatch_req(struct ublk_queue *ubq, struct request *req)
  1426. {
  1427. unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
  1428. int tag = req->tag;
  1429. struct ublk_io *io = &ubq->ios[tag];
  1430. pr_devel("%s: complete: qid %d tag %d io_flags %x addr %llx\n",
  1431. __func__, ubq->q_id, req->tag, io->flags,
  1432. ublk_get_iod(ubq, req->tag)->addr);
  1433. /*
  1434. * Task is exiting if either:
  1435. *
  1436. * (1) current != io->task.
  1437. * io_uring_cmd_complete_in_task() tries to run task_work
  1438. * in a workqueue if cmd's task is PF_EXITING.
  1439. *
  1440. * (2) current->flags & PF_EXITING.
  1441. */
  1442. if (unlikely(current != io->task || current->flags & PF_EXITING)) {
  1443. __ublk_abort_rq(ubq, req);
  1444. return;
  1445. }
  1446. if (ublk_need_get_data(ubq) && ublk_need_map_req(req)) {
  1447. /*
  1448. * We have not handled UBLK_IO_NEED_GET_DATA command yet,
  1449. * so immediately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv
  1450. * and notify it.
  1451. */
  1452. io->flags |= UBLK_IO_FLAG_NEED_GET_DATA;
  1453. pr_devel("%s: need get data. qid %d tag %d io_flags %x\n",
  1454. __func__, ubq->q_id, req->tag, io->flags);
  1455. ublk_complete_io_cmd(io, req, UBLK_IO_RES_NEED_GET_DATA,
  1456. issue_flags);
  1457. return;
  1458. }
  1459. if (!ublk_start_io(ubq, req, io))
  1460. return;
  1461. if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) {
  1462. ublk_auto_buf_dispatch(ubq, req, io, io->cmd, issue_flags);
  1463. } else {
  1464. ublk_init_req_ref(ubq, io);
  1465. ublk_complete_io_cmd(io, req, UBLK_IO_RES_OK, issue_flags);
  1466. }
  1467. }
  1468. static bool __ublk_batch_prep_dispatch(struct ublk_queue *ubq,
  1469. const struct ublk_batch_io_data *data,
  1470. unsigned short tag)
  1471. {
  1472. struct ublk_device *ub = data->ub;
  1473. struct ublk_io *io = &ubq->ios[tag];
  1474. struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
  1475. enum auto_buf_reg_res res = AUTO_BUF_REG_FALLBACK;
  1476. struct io_uring_cmd *cmd = data->cmd;
  1477. if (!ublk_start_io(ubq, req, io))
  1478. return false;
  1479. if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) {
  1480. res = ublk_auto_buf_register(ubq, req, io, cmd,
  1481. data->issue_flags);
  1482. if (res == AUTO_BUF_REG_FAIL)
  1483. return false;
  1484. }
  1485. ublk_io_lock(io);
  1486. ublk_auto_buf_io_setup(ubq, req, io, cmd, res);
  1487. ublk_io_unlock(io);
  1488. return true;
  1489. }
  1490. static bool ublk_batch_prep_dispatch(struct ublk_queue *ubq,
  1491. const struct ublk_batch_io_data *data,
  1492. unsigned short *tag_buf,
  1493. unsigned int len)
  1494. {
  1495. bool has_unused = false;
  1496. unsigned int i;
  1497. for (i = 0; i < len; i++) {
  1498. unsigned short tag = tag_buf[i];
  1499. if (!__ublk_batch_prep_dispatch(ubq, data, tag)) {
  1500. tag_buf[i] = UBLK_BATCH_IO_UNUSED_TAG;
  1501. has_unused = true;
  1502. }
  1503. }
  1504. return has_unused;
  1505. }
  1506. /*
  1507. * Filter out UBLK_BATCH_IO_UNUSED_TAG entries from tag_buf.
  1508. * Returns the new length after filtering.
  1509. */
  1510. static unsigned int ublk_filter_unused_tags(unsigned short *tag_buf,
  1511. unsigned int len)
  1512. {
  1513. unsigned int i, j;
  1514. for (i = 0, j = 0; i < len; i++) {
  1515. if (tag_buf[i] != UBLK_BATCH_IO_UNUSED_TAG) {
  1516. if (i != j)
  1517. tag_buf[j] = tag_buf[i];
  1518. j++;
  1519. }
  1520. }
  1521. return j;
  1522. }
  1523. #define MAX_NR_TAG 128
  1524. static int __ublk_batch_dispatch(struct ublk_queue *ubq,
  1525. const struct ublk_batch_io_data *data,
  1526. struct ublk_batch_fetch_cmd *fcmd)
  1527. {
  1528. const unsigned int tag_sz = sizeof(unsigned short);
  1529. unsigned short tag_buf[MAX_NR_TAG];
  1530. struct io_br_sel sel;
  1531. size_t len = 0;
  1532. bool needs_filter;
  1533. int ret;
  1534. WARN_ON_ONCE(data->cmd != fcmd->cmd);
  1535. sel = io_uring_cmd_buffer_select(fcmd->cmd, fcmd->buf_group, &len,
  1536. data->issue_flags);
  1537. if (sel.val < 0)
  1538. return sel.val;
  1539. if (!sel.addr)
  1540. return -ENOBUFS;
  1541. /* single reader needn't lock and sizeof(kfifo element) is 2 bytes */
  1542. len = min(len, sizeof(tag_buf)) / tag_sz;
  1543. len = kfifo_out(&ubq->evts_fifo, tag_buf, len);
  1544. needs_filter = ublk_batch_prep_dispatch(ubq, data, tag_buf, len);
  1545. /* Filter out unused tags before posting to userspace */
  1546. if (unlikely(needs_filter)) {
  1547. int new_len = ublk_filter_unused_tags(tag_buf, len);
  1548. /* return actual length if all are failed or requeued */
  1549. if (!new_len) {
  1550. /* release the selected buffer */
  1551. sel.val = 0;
  1552. WARN_ON_ONCE(!io_uring_mshot_cmd_post_cqe(fcmd->cmd,
  1553. &sel, data->issue_flags));
  1554. return len;
  1555. }
  1556. len = new_len;
  1557. }
  1558. sel.val = ublk_batch_copy_io_tags(fcmd, sel.addr, tag_buf, len * tag_sz);
  1559. ret = ublk_batch_fetch_post_cqe(fcmd, &sel, data->issue_flags);
  1560. if (unlikely(ret < 0)) {
  1561. int i, res;
  1562. /*
  1563. * Undo prep state for all IOs since userspace never received them.
  1564. * This restores IOs to pre-prepared state so they can be cleanly
  1565. * re-prepared when tags are pulled from FIFO again.
  1566. */
  1567. for (i = 0; i < len; i++) {
  1568. struct ublk_io *io = &ubq->ios[tag_buf[i]];
  1569. int index = -1;
  1570. ublk_io_lock(io);
  1571. if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG)
  1572. index = io->buf.auto_reg.index;
  1573. io->flags &= ~(UBLK_IO_FLAG_OWNED_BY_SRV | UBLK_IO_FLAG_AUTO_BUF_REG);
  1574. io->flags |= UBLK_IO_FLAG_ACTIVE;
  1575. ublk_io_unlock(io);
  1576. if (index != -1)
  1577. io_buffer_unregister_bvec(data->cmd, index,
  1578. data->issue_flags);
  1579. }
  1580. res = kfifo_in_spinlocked_noirqsave(&ubq->evts_fifo,
  1581. tag_buf, len, &ubq->evts_lock);
  1582. pr_warn_ratelimited("%s: copy tags or post CQE failure, move back "
  1583. "tags(%d %zu) ret %d\n", __func__, res, len,
  1584. ret);
  1585. }
  1586. return ret;
  1587. }
  1588. static struct ublk_batch_fetch_cmd *__ublk_acquire_fcmd(
  1589. struct ublk_queue *ubq)
  1590. {
  1591. struct ublk_batch_fetch_cmd *fcmd;
  1592. lockdep_assert_held(&ubq->evts_lock);
  1593. /*
  1594. * Ordering updating ubq->evts_fifo and checking ubq->active_fcmd.
  1595. *
  1596. * The pair is the smp_mb() in ublk_batch_dispatch().
  1597. *
  1598. * If ubq->active_fcmd is observed as non-NULL, the new added tags
  1599. * can be visisible in ublk_batch_dispatch() with the barrier pairing.
  1600. */
  1601. smp_mb();
  1602. if (READ_ONCE(ubq->active_fcmd)) {
  1603. fcmd = NULL;
  1604. } else {
  1605. fcmd = list_first_entry_or_null(&ubq->fcmd_head,
  1606. struct ublk_batch_fetch_cmd, node);
  1607. WRITE_ONCE(ubq->active_fcmd, fcmd);
  1608. }
  1609. return fcmd;
  1610. }
  1611. static void ublk_batch_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
  1612. {
  1613. unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
  1614. struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
  1615. struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
  1616. struct ublk_batch_fetch_cmd *fcmd = pdu->fcmd;
  1617. struct ublk_batch_io_data data = {
  1618. .ub = pdu->ubq->dev,
  1619. .cmd = fcmd->cmd,
  1620. .issue_flags = issue_flags,
  1621. };
  1622. WARN_ON_ONCE(pdu->ubq->active_fcmd != fcmd);
  1623. ublk_batch_dispatch(pdu->ubq, &data, fcmd);
  1624. }
  1625. static void
  1626. ublk_batch_dispatch(struct ublk_queue *ubq,
  1627. const struct ublk_batch_io_data *data,
  1628. struct ublk_batch_fetch_cmd *fcmd)
  1629. {
  1630. struct ublk_batch_fetch_cmd *new_fcmd;
  1631. unsigned tried = 0;
  1632. int ret = 0;
  1633. again:
  1634. while (!ublk_io_evts_empty(ubq)) {
  1635. ret = __ublk_batch_dispatch(ubq, data, fcmd);
  1636. if (ret <= 0)
  1637. break;
  1638. }
  1639. if (ret < 0) {
  1640. ublk_batch_deinit_fetch_buf(ubq, data, fcmd, ret);
  1641. return;
  1642. }
  1643. __ublk_release_fcmd(ubq);
  1644. /*
  1645. * Order clearing ubq->active_fcmd from __ublk_release_fcmd() and
  1646. * checking ubq->evts_fifo.
  1647. *
  1648. * The pair is the smp_mb() in __ublk_acquire_fcmd().
  1649. */
  1650. smp_mb();
  1651. if (likely(ublk_io_evts_empty(ubq)))
  1652. return;
  1653. spin_lock(&ubq->evts_lock);
  1654. new_fcmd = __ublk_acquire_fcmd(ubq);
  1655. spin_unlock(&ubq->evts_lock);
  1656. if (!new_fcmd)
  1657. return;
  1658. /* Avoid lockup by allowing to handle at most 32 batches */
  1659. if (new_fcmd == fcmd && tried++ < 32)
  1660. goto again;
  1661. io_uring_cmd_complete_in_task(new_fcmd->cmd, ublk_batch_tw_cb);
  1662. }
  1663. static void ublk_cmd_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
  1664. {
  1665. struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
  1666. struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
  1667. struct ublk_queue *ubq = pdu->ubq;
  1668. ublk_dispatch_req(ubq, pdu->req);
  1669. }
  1670. static void ublk_batch_queue_cmd(struct ublk_queue *ubq, struct request *rq, bool last)
  1671. {
  1672. unsigned short tag = rq->tag;
  1673. struct ublk_batch_fetch_cmd *fcmd = NULL;
  1674. spin_lock(&ubq->evts_lock);
  1675. kfifo_put(&ubq->evts_fifo, tag);
  1676. if (last)
  1677. fcmd = __ublk_acquire_fcmd(ubq);
  1678. spin_unlock(&ubq->evts_lock);
  1679. if (fcmd)
  1680. io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb);
  1681. }
  1682. static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
  1683. {
  1684. struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd;
  1685. struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
  1686. pdu->req = rq;
  1687. io_uring_cmd_complete_in_task(cmd, ublk_cmd_tw_cb);
  1688. }
  1689. static void ublk_cmd_list_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
  1690. {
  1691. struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
  1692. struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
  1693. struct request *rq = pdu->req_list;
  1694. struct request *next;
  1695. do {
  1696. next = rq->rq_next;
  1697. rq->rq_next = NULL;
  1698. ublk_dispatch_req(rq->mq_hctx->driver_data, rq);
  1699. rq = next;
  1700. } while (rq);
  1701. }
  1702. static void ublk_queue_cmd_list(struct ublk_io *io, struct rq_list *l)
  1703. {
  1704. struct io_uring_cmd *cmd = io->cmd;
  1705. struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
  1706. pdu->req_list = rq_list_peek(l);
  1707. rq_list_init(l);
  1708. io_uring_cmd_complete_in_task(cmd, ublk_cmd_list_tw_cb);
  1709. }
  1710. static enum blk_eh_timer_return ublk_timeout(struct request *rq)
  1711. {
  1712. struct ublk_queue *ubq = rq->mq_hctx->driver_data;
  1713. pid_t tgid = ubq->dev->ublksrv_tgid;
  1714. struct task_struct *p;
  1715. struct pid *pid;
  1716. if (!(ubq->flags & UBLK_F_UNPRIVILEGED_DEV))
  1717. return BLK_EH_RESET_TIMER;
  1718. if (unlikely(!tgid))
  1719. return BLK_EH_RESET_TIMER;
  1720. rcu_read_lock();
  1721. pid = find_vpid(tgid);
  1722. p = pid_task(pid, PIDTYPE_PID);
  1723. if (p)
  1724. send_sig(SIGKILL, p, 0);
  1725. rcu_read_unlock();
  1726. return BLK_EH_DONE;
  1727. }
  1728. static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq,
  1729. bool check_cancel)
  1730. {
  1731. blk_status_t res;
  1732. if (unlikely(READ_ONCE(ubq->fail_io)))
  1733. return BLK_STS_TARGET;
  1734. /* With recovery feature enabled, force_abort is set in
  1735. * ublk_stop_dev() before calling del_gendisk(). We have to
  1736. * abort all requeued and new rqs here to let del_gendisk()
  1737. * move on. Besides, we cannot not call io_uring_cmd_complete_in_task()
  1738. * to avoid UAF on io_uring ctx.
  1739. *
  1740. * Note: force_abort is guaranteed to be seen because it is set
  1741. * before request queue is unqiuesced.
  1742. */
  1743. if (ublk_nosrv_should_queue_io(ubq) &&
  1744. unlikely(READ_ONCE(ubq->force_abort)))
  1745. return BLK_STS_IOERR;
  1746. if (check_cancel && unlikely(ubq->canceling))
  1747. return BLK_STS_IOERR;
  1748. /* fill iod to slot in io cmd buffer */
  1749. res = ublk_setup_iod(ubq, rq);
  1750. if (unlikely(res != BLK_STS_OK))
  1751. return BLK_STS_IOERR;
  1752. blk_mq_start_request(rq);
  1753. return BLK_STS_OK;
  1754. }
  1755. /*
  1756. * Common helper for queue_rq that handles request preparation and
  1757. * cancellation checks. Returns status and sets should_queue to indicate
  1758. * whether the caller should proceed with queuing the request.
  1759. */
  1760. static inline blk_status_t __ublk_queue_rq_common(struct ublk_queue *ubq,
  1761. struct request *rq,
  1762. bool *should_queue)
  1763. {
  1764. blk_status_t res;
  1765. res = ublk_prep_req(ubq, rq, false);
  1766. if (res != BLK_STS_OK) {
  1767. *should_queue = false;
  1768. return res;
  1769. }
  1770. /*
  1771. * ->canceling has to be handled after ->force_abort and ->fail_io
  1772. * is dealt with, otherwise this request may not be failed in case
  1773. * of recovery, and cause hang when deleting disk
  1774. */
  1775. if (unlikely(ubq->canceling)) {
  1776. *should_queue = false;
  1777. __ublk_abort_rq(ubq, rq);
  1778. return BLK_STS_OK;
  1779. }
  1780. *should_queue = true;
  1781. return BLK_STS_OK;
  1782. }
  1783. static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
  1784. const struct blk_mq_queue_data *bd)
  1785. {
  1786. struct ublk_queue *ubq = hctx->driver_data;
  1787. struct request *rq = bd->rq;
  1788. bool should_queue;
  1789. blk_status_t res;
  1790. res = __ublk_queue_rq_common(ubq, rq, &should_queue);
  1791. if (!should_queue)
  1792. return res;
  1793. ublk_queue_cmd(ubq, rq);
  1794. return BLK_STS_OK;
  1795. }
  1796. static blk_status_t ublk_batch_queue_rq(struct blk_mq_hw_ctx *hctx,
  1797. const struct blk_mq_queue_data *bd)
  1798. {
  1799. struct ublk_queue *ubq = hctx->driver_data;
  1800. struct request *rq = bd->rq;
  1801. bool should_queue;
  1802. blk_status_t res;
  1803. res = __ublk_queue_rq_common(ubq, rq, &should_queue);
  1804. if (!should_queue)
  1805. return res;
  1806. ublk_batch_queue_cmd(ubq, rq, bd->last);
  1807. return BLK_STS_OK;
  1808. }
  1809. static inline bool ublk_belong_to_same_batch(const struct ublk_io *io,
  1810. const struct ublk_io *io2)
  1811. {
  1812. return (io_uring_cmd_ctx_handle(io->cmd) ==
  1813. io_uring_cmd_ctx_handle(io2->cmd)) &&
  1814. (io->task == io2->task);
  1815. }
  1816. static void ublk_commit_rqs(struct blk_mq_hw_ctx *hctx)
  1817. {
  1818. struct ublk_queue *ubq = hctx->driver_data;
  1819. struct ublk_batch_fetch_cmd *fcmd;
  1820. spin_lock(&ubq->evts_lock);
  1821. fcmd = __ublk_acquire_fcmd(ubq);
  1822. spin_unlock(&ubq->evts_lock);
  1823. if (fcmd)
  1824. io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb);
  1825. }
  1826. static void ublk_queue_rqs(struct rq_list *rqlist)
  1827. {
  1828. struct rq_list requeue_list = { };
  1829. struct rq_list submit_list = { };
  1830. struct ublk_io *io = NULL;
  1831. struct request *req;
  1832. while ((req = rq_list_pop(rqlist))) {
  1833. struct ublk_queue *this_q = req->mq_hctx->driver_data;
  1834. struct ublk_io *this_io = &this_q->ios[req->tag];
  1835. if (ublk_prep_req(this_q, req, true) != BLK_STS_OK) {
  1836. rq_list_add_tail(&requeue_list, req);
  1837. continue;
  1838. }
  1839. if (io && !ublk_belong_to_same_batch(io, this_io) &&
  1840. !rq_list_empty(&submit_list))
  1841. ublk_queue_cmd_list(io, &submit_list);
  1842. io = this_io;
  1843. rq_list_add_tail(&submit_list, req);
  1844. }
  1845. if (!rq_list_empty(&submit_list))
  1846. ublk_queue_cmd_list(io, &submit_list);
  1847. *rqlist = requeue_list;
  1848. }
  1849. static void ublk_batch_queue_cmd_list(struct ublk_queue *ubq, struct rq_list *l)
  1850. {
  1851. unsigned short tags[MAX_NR_TAG];
  1852. struct ublk_batch_fetch_cmd *fcmd;
  1853. struct request *rq;
  1854. unsigned cnt = 0;
  1855. spin_lock(&ubq->evts_lock);
  1856. rq_list_for_each(l, rq) {
  1857. tags[cnt++] = (unsigned short)rq->tag;
  1858. if (cnt >= MAX_NR_TAG) {
  1859. kfifo_in(&ubq->evts_fifo, tags, cnt);
  1860. cnt = 0;
  1861. }
  1862. }
  1863. if (cnt)
  1864. kfifo_in(&ubq->evts_fifo, tags, cnt);
  1865. fcmd = __ublk_acquire_fcmd(ubq);
  1866. spin_unlock(&ubq->evts_lock);
  1867. rq_list_init(l);
  1868. if (fcmd)
  1869. io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb);
  1870. }
  1871. static void ublk_batch_queue_rqs(struct rq_list *rqlist)
  1872. {
  1873. struct rq_list requeue_list = { };
  1874. struct rq_list submit_list = { };
  1875. struct ublk_queue *ubq = NULL;
  1876. struct request *req;
  1877. while ((req = rq_list_pop(rqlist))) {
  1878. struct ublk_queue *this_q = req->mq_hctx->driver_data;
  1879. if (ublk_prep_req(this_q, req, true) != BLK_STS_OK) {
  1880. rq_list_add_tail(&requeue_list, req);
  1881. continue;
  1882. }
  1883. if (ubq && this_q != ubq && !rq_list_empty(&submit_list))
  1884. ublk_batch_queue_cmd_list(ubq, &submit_list);
  1885. ubq = this_q;
  1886. rq_list_add_tail(&submit_list, req);
  1887. }
  1888. if (!rq_list_empty(&submit_list))
  1889. ublk_batch_queue_cmd_list(ubq, &submit_list);
  1890. *rqlist = requeue_list;
  1891. }
  1892. static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
  1893. unsigned int hctx_idx)
  1894. {
  1895. struct ublk_device *ub = driver_data;
  1896. struct ublk_queue *ubq = ublk_get_queue(ub, hctx->queue_num);
  1897. hctx->driver_data = ubq;
  1898. return 0;
  1899. }
  1900. static const struct blk_mq_ops ublk_mq_ops = {
  1901. .queue_rq = ublk_queue_rq,
  1902. .queue_rqs = ublk_queue_rqs,
  1903. .init_hctx = ublk_init_hctx,
  1904. .timeout = ublk_timeout,
  1905. };
  1906. static const struct blk_mq_ops ublk_batch_mq_ops = {
  1907. .commit_rqs = ublk_commit_rqs,
  1908. .queue_rq = ublk_batch_queue_rq,
  1909. .queue_rqs = ublk_batch_queue_rqs,
  1910. .init_hctx = ublk_init_hctx,
  1911. .timeout = ublk_timeout,
  1912. };
  1913. static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
  1914. {
  1915. int i;
  1916. ubq->nr_io_ready = 0;
  1917. for (i = 0; i < ubq->q_depth; i++) {
  1918. struct ublk_io *io = &ubq->ios[i];
  1919. /*
  1920. * UBLK_IO_FLAG_CANCELED is kept for avoiding to touch
  1921. * io->cmd
  1922. */
  1923. io->flags &= UBLK_IO_FLAG_CANCELED;
  1924. io->cmd = NULL;
  1925. io->buf.addr = 0;
  1926. /*
  1927. * old task is PF_EXITING, put it now
  1928. *
  1929. * It could be NULL in case of closing one quiesced
  1930. * device.
  1931. */
  1932. if (io->task) {
  1933. put_task_struct(io->task);
  1934. io->task = NULL;
  1935. }
  1936. WARN_ON_ONCE(refcount_read(&io->ref));
  1937. WARN_ON_ONCE(io->task_registered_buffers);
  1938. }
  1939. }
  1940. static int ublk_ch_open(struct inode *inode, struct file *filp)
  1941. {
  1942. struct ublk_device *ub = container_of(inode->i_cdev,
  1943. struct ublk_device, cdev);
  1944. if (test_and_set_bit(UB_STATE_OPEN, &ub->state))
  1945. return -EBUSY;
  1946. filp->private_data = ub;
  1947. ub->ublksrv_tgid = current->tgid;
  1948. return 0;
  1949. }
  1950. static void ublk_reset_ch_dev(struct ublk_device *ub)
  1951. {
  1952. int i;
  1953. for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
  1954. ublk_queue_reinit(ub, ublk_get_queue(ub, i));
  1955. /* set to NULL, otherwise new tasks cannot mmap io_cmd_buf */
  1956. ub->mm = NULL;
  1957. ub->nr_queue_ready = 0;
  1958. ub->unprivileged_daemons = false;
  1959. ub->ublksrv_tgid = -1;
  1960. }
  1961. static struct gendisk *ublk_get_disk(struct ublk_device *ub)
  1962. {
  1963. struct gendisk *disk;
  1964. spin_lock(&ub->lock);
  1965. disk = ub->ub_disk;
  1966. if (disk)
  1967. get_device(disk_to_dev(disk));
  1968. spin_unlock(&ub->lock);
  1969. return disk;
  1970. }
  /* Drop the device reference of @disk; a NULL @disk is ignored. */
  static void ublk_put_disk(struct gendisk *disk)
  {
  	if (!disk)
  		return;

  	put_device(disk_to_dev(disk));
  }
  1976. static void ublk_partition_scan_work(struct work_struct *work)
  1977. {
  1978. struct ublk_device *ub =
  1979. container_of(work, struct ublk_device, partition_scan_work);
  1980. /* Hold disk reference to prevent UAF during concurrent teardown */
  1981. struct gendisk *disk = ublk_get_disk(ub);
  1982. if (!disk)
  1983. return;
  1984. if (WARN_ON_ONCE(!test_and_clear_bit(GD_SUPPRESS_PART_SCAN,
  1985. &disk->state)))
  1986. goto out;
  1987. mutex_lock(&disk->open_mutex);
  1988. bdev_disk_changed(disk, false);
  1989. mutex_unlock(&disk->open_mutex);
  1990. out:
  1991. ublk_put_disk(disk);
  1992. }
  1993. /*
  1994. * Use this function to ensure that ->canceling is consistently set for
  1995. * the device and all queues. Do not set these flags directly.
  1996. *
  1997. * Caller must ensure that:
  1998. * - cancel_mutex is held. This ensures that there is no concurrent
  1999. * access to ub->canceling and no concurrent writes to ubq->canceling.
  2000. * - there are no concurrent reads of ubq->canceling from the queue_rq
  2001. * path. This can be done by quiescing the queue, or through other
  2002. * means.
  2003. */
  2004. static void ublk_set_canceling(struct ublk_device *ub, bool canceling)
  2005. __must_hold(&ub->cancel_mutex)
  2006. {
  2007. int i;
  2008. ub->canceling = canceling;
  2009. for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
  2010. ublk_get_queue(ub, i)->canceling = canceling;
  2011. }
  2012. static bool ublk_check_and_reset_active_ref(struct ublk_device *ub)
  2013. {
  2014. int i, j;
  2015. if (!ublk_dev_need_req_ref(ub))
  2016. return false;
  2017. for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
  2018. struct ublk_queue *ubq = ublk_get_queue(ub, i);
  2019. for (j = 0; j < ubq->q_depth; j++) {
  2020. struct ublk_io *io = &ubq->ios[j];
  2021. unsigned int refs = refcount_read(&io->ref) +
  2022. io->task_registered_buffers;
  2023. /*
  2024. * UBLK_REFCOUNT_INIT or zero means no active
  2025. * reference
  2026. */
  2027. if (refs != UBLK_REFCOUNT_INIT && refs != 0)
  2028. return true;
  2029. /* reset to zero if the io hasn't active references */
  2030. refcount_set(&io->ref, 0);
  2031. io->task_registered_buffers = 0;
  2032. }
  2033. }
  2034. return false;
  2035. }
  2036. static void ublk_ch_release_work_fn(struct work_struct *work)
  2037. {
  2038. struct ublk_device *ub =
  2039. container_of(work, struct ublk_device, exit_work.work);
  2040. struct gendisk *disk;
  2041. int i;
  2042. /*
  2043. * For zero-copy and auto buffer register modes, I/O references
  2044. * might not be dropped naturally when the daemon is killed, but
  2045. * io_uring guarantees that registered bvec kernel buffers are
  2046. * unregistered finally when freeing io_uring context, then the
  2047. * active references are dropped.
  2048. *
  2049. * Wait until active references are dropped for avoiding use-after-free
  2050. *
  2051. * registered buffer may be unregistered in io_ring's release hander,
  2052. * so have to wait by scheduling work function for avoiding the two
  2053. * file release dependency.
  2054. */
  2055. if (ublk_check_and_reset_active_ref(ub)) {
  2056. schedule_delayed_work(&ub->exit_work, 1);
  2057. return;
  2058. }
  2059. /*
  2060. * disk isn't attached yet, either device isn't live, or it has
  2061. * been removed already, so we needn't to do anything
  2062. */
  2063. disk = ublk_get_disk(ub);
  2064. if (!disk)
  2065. goto out;
  2066. /*
  2067. * All uring_cmd are done now, so abort any request outstanding to
  2068. * the ublk server
  2069. *
  2070. * This can be done in lockless way because ublk server has been
  2071. * gone
  2072. *
  2073. * More importantly, we have to provide forward progress guarantee
  2074. * without holding ub->mutex, otherwise control task grabbing
  2075. * ub->mutex triggers deadlock
  2076. *
  2077. * All requests may be inflight, so ->canceling may not be set, set
  2078. * it now.
  2079. */
  2080. mutex_lock(&ub->cancel_mutex);
  2081. ublk_set_canceling(ub, true);
  2082. for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
  2083. ublk_abort_queue(ub, ublk_get_queue(ub, i));
  2084. mutex_unlock(&ub->cancel_mutex);
  2085. blk_mq_kick_requeue_list(disk->queue);
  2086. /*
  2087. * All infligh requests have been completed or requeued and any new
  2088. * request will be failed or requeued via `->canceling` now, so it is
  2089. * fine to grab ub->mutex now.
  2090. */
  2091. mutex_lock(&ub->mutex);
  2092. /* double check after grabbing lock */
  2093. if (!ub->ub_disk)
  2094. goto unlock;
  2095. /*
  2096. * Transition the device to the nosrv state. What exactly this
  2097. * means depends on the recovery flags
  2098. */
  2099. if (ublk_nosrv_should_stop_dev(ub)) {
  2100. /*
  2101. * Allow any pending/future I/O to pass through quickly
  2102. * with an error. This is needed because del_gendisk
  2103. * waits for all pending I/O to complete
  2104. */
  2105. for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
  2106. WRITE_ONCE(ublk_get_queue(ub, i)->force_abort, true);
  2107. ublk_stop_dev_unlocked(ub);
  2108. } else {
  2109. if (ublk_nosrv_dev_should_queue_io(ub)) {
  2110. /* ->canceling is set and all requests are aborted */
  2111. ub->dev_info.state = UBLK_S_DEV_QUIESCED;
  2112. } else {
  2113. ub->dev_info.state = UBLK_S_DEV_FAIL_IO;
  2114. for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
  2115. WRITE_ONCE(ublk_get_queue(ub, i)->fail_io, true);
  2116. }
  2117. }
  2118. unlock:
  2119. mutex_unlock(&ub->mutex);
  2120. ublk_put_disk(disk);
  2121. /* all uring_cmd has been done now, reset device & ubq */
  2122. ublk_reset_ch_dev(ub);
  2123. out:
  2124. clear_bit(UB_STATE_OPEN, &ub->state);
  2125. /* put the reference grabbed in ublk_ch_release() */
  2126. ublk_put_device(ub);
  2127. }
  2128. static int ublk_ch_release(struct inode *inode, struct file *filp)
  2129. {
  2130. struct ublk_device *ub = filp->private_data;
  2131. /*
  2132. * Grab ublk device reference, so it won't be gone until we are
  2133. * really released from work function.
  2134. */
  2135. ublk_get_device(ub);
  2136. INIT_DELAYED_WORK(&ub->exit_work, ublk_ch_release_work_fn);
  2137. schedule_delayed_work(&ub->exit_work, 0);
  2138. return 0;
  2139. }
  2140. /* map pre-allocated per-queue cmd buffer to ublksrv daemon */
  2141. static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
  2142. {
  2143. struct ublk_device *ub = filp->private_data;
  2144. size_t sz = vma->vm_end - vma->vm_start;
  2145. unsigned max_sz = ublk_max_cmd_buf_size();
  2146. unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT;
  2147. int q_id, ret = 0;
  2148. spin_lock(&ub->lock);
  2149. if (!ub->mm)
  2150. ub->mm = current->mm;
  2151. if (current->mm != ub->mm)
  2152. ret = -EINVAL;
  2153. spin_unlock(&ub->lock);
  2154. if (ret)
  2155. return ret;
  2156. if (vma->vm_flags & VM_WRITE)
  2157. return -EPERM;
  2158. end = UBLKSRV_CMD_BUF_OFFSET + ub->dev_info.nr_hw_queues * max_sz;
  2159. if (phys_off < UBLKSRV_CMD_BUF_OFFSET || phys_off >= end)
  2160. return -EINVAL;
  2161. q_id = (phys_off - UBLKSRV_CMD_BUF_OFFSET) / max_sz;
  2162. pr_devel("%s: qid %d, pid %d, addr %lx pg_off %lx sz %lu\n",
  2163. __func__, q_id, current->pid, vma->vm_start,
  2164. phys_off, (unsigned long)sz);
  2165. if (sz != ublk_queue_cmd_buf_size(ub))
  2166. return -EINVAL;
  2167. pfn = virt_to_phys(ublk_queue_cmd_buf(ub, q_id)) >> PAGE_SHIFT;
  2168. return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
  2169. }
  2170. static void __ublk_fail_req(struct ublk_device *ub, struct ublk_io *io,
  2171. struct request *req)
  2172. {
  2173. WARN_ON_ONCE(!ublk_dev_support_batch_io(ub) &&
  2174. io->flags & UBLK_IO_FLAG_ACTIVE);
  2175. if (ublk_nosrv_should_reissue_outstanding(ub))
  2176. blk_mq_requeue_request(req, false);
  2177. else {
  2178. io->res = -EIO;
  2179. __ublk_complete_rq(req, io, ublk_dev_need_map_io(ub), NULL);
  2180. }
  2181. }
  2182. /*
  2183. * Request tag may just be filled to event kfifo, not get chance to
  2184. * dispatch, abort these requests too
  2185. */
  2186. static void ublk_abort_batch_queue(struct ublk_device *ub,
  2187. struct ublk_queue *ubq)
  2188. {
  2189. unsigned short tag;
  2190. while (kfifo_out(&ubq->evts_fifo, &tag, 1)) {
  2191. struct request *req = blk_mq_tag_to_rq(
  2192. ub->tag_set.tags[ubq->q_id], tag);
  2193. if (!WARN_ON_ONCE(!req || !blk_mq_request_started(req)))
  2194. __ublk_fail_req(ub, &ubq->ios[tag], req);
  2195. }
  2196. }
  2197. /*
  2198. * Called from ublk char device release handler, when any uring_cmd is
  2199. * done, meantime request queue is "quiesced" since all inflight requests
  2200. * can't be completed because ublk server is dead.
  2201. *
  2202. * So no one can hold our request IO reference any more, simply ignore the
  2203. * reference, and complete the request immediately
  2204. */
  2205. static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
  2206. {
  2207. int i;
  2208. for (i = 0; i < ubq->q_depth; i++) {
  2209. struct ublk_io *io = &ubq->ios[i];
  2210. if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
  2211. __ublk_fail_req(ub, io, io->req);
  2212. }
  2213. if (ublk_support_batch_io(ubq))
  2214. ublk_abort_batch_queue(ub, ubq);
  2215. }
  2216. static void ublk_start_cancel(struct ublk_device *ub)
  2217. {
  2218. struct gendisk *disk = ublk_get_disk(ub);
  2219. /* Our disk has been dead */
  2220. if (!disk)
  2221. return;
  2222. mutex_lock(&ub->cancel_mutex);
  2223. if (ub->canceling)
  2224. goto out;
  2225. /*
  2226. * Now we are serialized with ublk_queue_rq()
  2227. *
  2228. * Make sure that ubq->canceling is set when queue is frozen,
  2229. * because ublk_queue_rq() has to rely on this flag for avoiding to
  2230. * touch completed uring_cmd
  2231. */
  2232. blk_mq_quiesce_queue(disk->queue);
  2233. ublk_set_canceling(ub, true);
  2234. blk_mq_unquiesce_queue(disk->queue);
  2235. out:
  2236. mutex_unlock(&ub->cancel_mutex);
  2237. ublk_put_disk(disk);
  2238. }
  2239. static void ublk_cancel_cmd(struct ublk_queue *ubq, unsigned tag,
  2240. unsigned int issue_flags)
  2241. {
  2242. struct ublk_io *io = &ubq->ios[tag];
  2243. struct ublk_device *ub = ubq->dev;
  2244. struct request *req;
  2245. bool done;
  2246. if (!(io->flags & UBLK_IO_FLAG_ACTIVE))
  2247. return;
  2248. /*
  2249. * Don't try to cancel this command if the request is started for
  2250. * avoiding race between io_uring_cmd_done() and
  2251. * io_uring_cmd_complete_in_task().
  2252. *
  2253. * Either the started request will be aborted via __ublk_abort_rq(),
  2254. * then this uring_cmd is canceled next time, or it will be done in
  2255. * task work function ublk_dispatch_req() because io_uring guarantees
  2256. * that ublk_dispatch_req() is always called
  2257. */
  2258. req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
  2259. if (req && blk_mq_request_started(req) && req->tag == tag)
  2260. return;
  2261. spin_lock(&ubq->cancel_lock);
  2262. done = !!(io->flags & UBLK_IO_FLAG_CANCELED);
  2263. if (!done)
  2264. io->flags |= UBLK_IO_FLAG_CANCELED;
  2265. spin_unlock(&ubq->cancel_lock);
  2266. if (!done)
  2267. io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, issue_flags);
  2268. }
  2269. /*
  2270. * Cancel a batch fetch command if it hasn't been claimed by another path.
  2271. *
  2272. * An fcmd can only be cancelled if:
  2273. * 1. It's not the active_fcmd (which is currently being processed)
  2274. * 2. It's still on the list (!list_empty check) - once removed from the list,
  2275. * the fcmd is considered claimed and will be freed by whoever removed it
  2276. *
  2277. * Use list_del_init() so subsequent list_empty() checks work correctly.
  2278. */
  2279. static void ublk_batch_cancel_cmd(struct ublk_queue *ubq,
  2280. struct ublk_batch_fetch_cmd *fcmd,
  2281. unsigned int issue_flags)
  2282. {
  2283. bool done;
  2284. spin_lock(&ubq->evts_lock);
  2285. done = (READ_ONCE(ubq->active_fcmd) != fcmd) && !list_empty(&fcmd->node);
  2286. if (done)
  2287. list_del_init(&fcmd->node);
  2288. spin_unlock(&ubq->evts_lock);
  2289. if (done) {
  2290. io_uring_cmd_done(fcmd->cmd, UBLK_IO_RES_ABORT, issue_flags);
  2291. ublk_batch_free_fcmd(fcmd);
  2292. }
  2293. }
  2294. static void ublk_batch_cancel_queue(struct ublk_queue *ubq)
  2295. {
  2296. struct ublk_batch_fetch_cmd *fcmd;
  2297. LIST_HEAD(fcmd_list);
  2298. spin_lock(&ubq->evts_lock);
  2299. ubq->force_abort = true;
  2300. list_splice_init(&ubq->fcmd_head, &fcmd_list);
  2301. fcmd = READ_ONCE(ubq->active_fcmd);
  2302. if (fcmd)
  2303. list_move(&fcmd->node, &ubq->fcmd_head);
  2304. spin_unlock(&ubq->evts_lock);
  2305. while (!list_empty(&fcmd_list)) {
  2306. fcmd = list_first_entry(&fcmd_list,
  2307. struct ublk_batch_fetch_cmd, node);
  2308. ublk_batch_cancel_cmd(ubq, fcmd, IO_URING_F_UNLOCKED);
  2309. }
  2310. }
  2311. static void ublk_batch_cancel_fn(struct io_uring_cmd *cmd,
  2312. unsigned int issue_flags)
  2313. {
  2314. struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
  2315. struct ublk_batch_fetch_cmd *fcmd = pdu->fcmd;
  2316. struct ublk_queue *ubq = pdu->ubq;
  2317. ublk_start_cancel(ubq->dev);
  2318. ublk_batch_cancel_cmd(ubq, fcmd, issue_flags);
  2319. }
  2320. /*
  2321. * The ublk char device won't be closed when calling cancel fn, so both
  2322. * ublk device and queue are guaranteed to be live
  2323. *
  2324. * Two-stage cancel:
  2325. *
  2326. * - make every active uring_cmd done in ->cancel_fn()
  2327. *
  2328. * - aborting inflight ublk IO requests in ublk char device release handler,
  2329. * which depends on 1st stage because device can only be closed iff all
  2330. * uring_cmd are done
  2331. *
  2332. * Do _not_ try to acquire ub->mutex before all inflight requests are
  2333. * aborted, otherwise deadlock may be caused.
  2334. */
  2335. static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
  2336. unsigned int issue_flags)
  2337. {
  2338. struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
  2339. struct ublk_queue *ubq = pdu->ubq;
  2340. struct task_struct *task;
  2341. struct ublk_io *io;
  2342. if (WARN_ON_ONCE(!ubq))
  2343. return;
  2344. if (WARN_ON_ONCE(pdu->tag >= ubq->q_depth))
  2345. return;
  2346. task = io_uring_cmd_get_task(cmd);
  2347. io = &ubq->ios[pdu->tag];
  2348. if (WARN_ON_ONCE(task && task != io->task))
  2349. return;
  2350. ublk_start_cancel(ubq->dev);
  2351. WARN_ON_ONCE(io->cmd != cmd);
  2352. ublk_cancel_cmd(ubq, pdu->tag, issue_flags);
  2353. }
  2354. static inline bool ublk_queue_ready(const struct ublk_queue *ubq)
  2355. {
  2356. return ubq->nr_io_ready == ubq->q_depth;
  2357. }
  2358. static inline bool ublk_dev_ready(const struct ublk_device *ub)
  2359. {
  2360. return ub->nr_queue_ready == ub->dev_info.nr_hw_queues;
  2361. }
  2362. static void ublk_cancel_queue(struct ublk_queue *ubq)
  2363. {
  2364. int i;
  2365. if (ublk_support_batch_io(ubq)) {
  2366. ublk_batch_cancel_queue(ubq);
  2367. return;
  2368. }
  2369. for (i = 0; i < ubq->q_depth; i++)
  2370. ublk_cancel_cmd(ubq, i, IO_URING_F_UNLOCKED);
  2371. }
  2372. /* Cancel all pending commands, must be called after del_gendisk() returns */
  2373. static void ublk_cancel_dev(struct ublk_device *ub)
  2374. {
  2375. int i;
  2376. for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
  2377. ublk_cancel_queue(ublk_get_queue(ub, i));
  2378. }
  2379. static bool ublk_check_inflight_rq(struct request *rq, void *data)
  2380. {
  2381. bool *idle = data;
  2382. if (blk_mq_request_started(rq)) {
  2383. *idle = false;
  2384. return false;
  2385. }
  2386. return true;
  2387. }
  2388. static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub)
  2389. {
  2390. bool idle;
  2391. WARN_ON_ONCE(!blk_queue_quiesced(ub->ub_disk->queue));
  2392. while (true) {
  2393. idle = true;
  2394. blk_mq_tagset_busy_iter(&ub->tag_set,
  2395. ublk_check_inflight_rq, &idle);
  2396. if (idle)
  2397. break;
  2398. msleep(UBLK_REQUEUE_DELAY_MS);
  2399. }
  2400. }
  2401. static void ublk_force_abort_dev(struct ublk_device *ub)
  2402. {
  2403. int i;
  2404. pr_devel("%s: force abort ub: dev_id %d state %s\n",
  2405. __func__, ub->dev_info.dev_id,
  2406. ub->dev_info.state == UBLK_S_DEV_LIVE ?
  2407. "LIVE" : "QUIESCED");
  2408. blk_mq_quiesce_queue(ub->ub_disk->queue);
  2409. if (ub->dev_info.state == UBLK_S_DEV_LIVE)
  2410. ublk_wait_tagset_rqs_idle(ub);
  2411. for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
  2412. ublk_get_queue(ub, i)->force_abort = true;
  2413. blk_mq_unquiesce_queue(ub->ub_disk->queue);
  2414. /* We may have requeued some rqs in ublk_quiesce_queue() */
  2415. blk_mq_kick_requeue_list(ub->ub_disk->queue);
  2416. }
  2417. static struct gendisk *ublk_detach_disk(struct ublk_device *ub)
  2418. {
  2419. struct gendisk *disk;
  2420. /* Sync with ublk_abort_queue() by holding the lock */
  2421. spin_lock(&ub->lock);
  2422. disk = ub->ub_disk;
  2423. ub->dev_info.state = UBLK_S_DEV_DEAD;
  2424. ub->dev_info.ublksrv_pid = -1;
  2425. ub->ub_disk = NULL;
  2426. spin_unlock(&ub->lock);
  2427. return disk;
  2428. }
  2429. static void ublk_stop_dev_unlocked(struct ublk_device *ub)
  2430. __must_hold(&ub->mutex)
  2431. {
  2432. struct gendisk *disk;
  2433. if (ub->dev_info.state == UBLK_S_DEV_DEAD)
  2434. return;
  2435. if (ublk_nosrv_dev_should_queue_io(ub))
  2436. ublk_force_abort_dev(ub);
  2437. del_gendisk(ub->ub_disk);
  2438. disk = ublk_detach_disk(ub);
  2439. put_disk(disk);
  2440. }
  2441. static void ublk_stop_dev(struct ublk_device *ub)
  2442. {
  2443. mutex_lock(&ub->mutex);
  2444. ublk_stop_dev_unlocked(ub);
  2445. mutex_unlock(&ub->mutex);
  2446. cancel_work_sync(&ub->partition_scan_work);
  2447. ublk_cancel_dev(ub);
  2448. }
  2449. /* reset per-queue io flags */
  2450. static void ublk_queue_reset_io_flags(struct ublk_queue *ubq)
  2451. {
  2452. int j;
  2453. /* UBLK_IO_FLAG_CANCELED can be cleared now */
  2454. spin_lock(&ubq->cancel_lock);
  2455. for (j = 0; j < ubq->q_depth; j++)
  2456. ubq->ios[j].flags &= ~UBLK_IO_FLAG_CANCELED;
  2457. ubq->canceling = false;
  2458. spin_unlock(&ubq->cancel_lock);
  2459. ubq->fail_io = false;
  2460. }
  2461. /* device can only be started after all IOs are ready */
  2462. static void ublk_mark_io_ready(struct ublk_device *ub, u16 q_id)
  2463. __must_hold(&ub->mutex)
  2464. {
  2465. struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
  2466. if (!ub->unprivileged_daemons && !capable(CAP_SYS_ADMIN))
  2467. ub->unprivileged_daemons = true;
  2468. ubq->nr_io_ready++;
  2469. /* Check if this specific queue is now fully ready */
  2470. if (ublk_queue_ready(ubq)) {
  2471. ub->nr_queue_ready++;
  2472. /*
  2473. * Reset queue flags as soon as this queue is ready.
  2474. * This clears the canceling flag, allowing batch FETCH commands
  2475. * to succeed during recovery without waiting for all queues.
  2476. */
  2477. ublk_queue_reset_io_flags(ubq);
  2478. }
  2479. /* Check if all queues are ready */
  2480. if (ublk_dev_ready(ub)) {
  2481. /*
  2482. * All queues ready - clear device-level canceling flag
  2483. * and complete the recovery/initialization.
  2484. */
  2485. mutex_lock(&ub->cancel_mutex);
  2486. ub->canceling = false;
  2487. mutex_unlock(&ub->cancel_mutex);
  2488. complete_all(&ub->completion);
  2489. }
  2490. }
  2491. static inline int ublk_check_cmd_op(u32 cmd_op)
  2492. {
  2493. u32 ioc_type = _IOC_TYPE(cmd_op);
  2494. if (!IS_ENABLED(CONFIG_BLKDEV_UBLK_LEGACY_OPCODES) && ioc_type != 'u')
  2495. return -EOPNOTSUPP;
  2496. if (ioc_type != 'u' && ioc_type != 0)
  2497. return -EOPNOTSUPP;
  2498. return 0;
  2499. }
  2500. static inline int ublk_set_auto_buf_reg(struct ublk_io *io, struct io_uring_cmd *cmd)
  2501. {
  2502. struct ublk_auto_buf_reg buf;
  2503. buf = ublk_sqe_addr_to_auto_buf_reg(READ_ONCE(cmd->sqe->addr));
  2504. if (buf.reserved0 || buf.reserved1)
  2505. return -EINVAL;
  2506. if (buf.flags & ~UBLK_AUTO_BUF_REG_F_MASK)
  2507. return -EINVAL;
  2508. io->buf.auto_reg = buf;
  2509. return 0;
  2510. }
  2511. static void ublk_clear_auto_buf_reg(struct ublk_io *io,
  2512. struct io_uring_cmd *cmd,
  2513. u16 *buf_idx)
  2514. {
  2515. if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG) {
  2516. io->flags &= ~UBLK_IO_FLAG_AUTO_BUF_REG;
  2517. /*
  2518. * `UBLK_F_AUTO_BUF_REG` only works iff `UBLK_IO_FETCH_REQ`
  2519. * and `UBLK_IO_COMMIT_AND_FETCH_REQ` are issued from same
  2520. * `io_ring_ctx`.
  2521. *
  2522. * If this uring_cmd's io_ring_ctx isn't same with the
  2523. * one for registering the buffer, it is ublk server's
  2524. * responsibility for unregistering the buffer, otherwise
  2525. * this ublk request gets stuck.
  2526. */
  2527. if (io->buf_ctx_handle == io_uring_cmd_ctx_handle(cmd))
  2528. *buf_idx = io->buf.auto_reg.index;
  2529. }
  2530. }
  2531. static int ublk_handle_auto_buf_reg(struct ublk_io *io,
  2532. struct io_uring_cmd *cmd,
  2533. u16 *buf_idx)
  2534. {
  2535. ublk_clear_auto_buf_reg(io, cmd, buf_idx);
  2536. return ublk_set_auto_buf_reg(io, cmd);
  2537. }
  2538. /* Once we return, `io->req` can't be used any more */
  2539. static inline struct request *
  2540. ublk_fill_io_cmd(struct ublk_io *io, struct io_uring_cmd *cmd)
  2541. {
  2542. struct request *req = io->req;
  2543. io->cmd = cmd;
  2544. io->flags |= UBLK_IO_FLAG_ACTIVE;
  2545. /* now this cmd slot is owned by ublk driver */
  2546. io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
  2547. return req;
  2548. }
  2549. static inline int
  2550. ublk_config_io_buf(const struct ublk_device *ub, struct ublk_io *io,
  2551. struct io_uring_cmd *cmd, unsigned long buf_addr,
  2552. u16 *buf_idx)
  2553. {
  2554. if (ublk_dev_support_auto_buf_reg(ub))
  2555. return ublk_handle_auto_buf_reg(io, cmd, buf_idx);
  2556. io->buf.addr = buf_addr;
  2557. return 0;
  2558. }
  2559. static inline void ublk_prep_cancel(struct io_uring_cmd *cmd,
  2560. unsigned int issue_flags,
  2561. struct ublk_queue *ubq, unsigned int tag)
  2562. {
  2563. struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
  2564. /*
  2565. * Safe to refer to @ubq since ublk_queue won't be died until its
  2566. * commands are completed
  2567. */
  2568. pdu->ubq = ubq;
  2569. pdu->tag = tag;
  2570. io_uring_cmd_mark_cancelable(cmd, issue_flags);
  2571. }
  2572. static void ublk_io_release(void *priv)
  2573. {
  2574. struct request *rq = priv;
  2575. struct ublk_queue *ubq = rq->mq_hctx->driver_data;
  2576. struct ublk_io *io = &ubq->ios[rq->tag];
  2577. /*
  2578. * task_registered_buffers may be 0 if buffers were registered off task
  2579. * but unregistered on task. Or after UBLK_IO_COMMIT_AND_FETCH_REQ.
  2580. */
  2581. if (current == io->task && io->task_registered_buffers)
  2582. io->task_registered_buffers--;
  2583. else
  2584. ublk_put_req_ref(io, rq);
  2585. }
  2586. static int ublk_register_io_buf(struct io_uring_cmd *cmd,
  2587. struct ublk_device *ub,
  2588. u16 q_id, u16 tag,
  2589. struct ublk_io *io,
  2590. unsigned int index, unsigned int issue_flags)
  2591. {
  2592. struct request *req;
  2593. int ret;
  2594. if (!ublk_dev_support_zero_copy(ub))
  2595. return -EINVAL;
  2596. req = __ublk_check_and_get_req(ub, q_id, tag, io);
  2597. if (!req)
  2598. return -EINVAL;
  2599. ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
  2600. issue_flags);
  2601. if (ret) {
  2602. ublk_put_req_ref(io, req);
  2603. return ret;
  2604. }
  2605. return 0;
  2606. }
  2607. static int
  2608. ublk_daemon_register_io_buf(struct io_uring_cmd *cmd,
  2609. struct ublk_device *ub,
  2610. u16 q_id, u16 tag, struct ublk_io *io,
  2611. unsigned index, unsigned issue_flags)
  2612. {
  2613. unsigned new_registered_buffers;
  2614. struct request *req = io->req;
  2615. int ret;
  2616. /*
  2617. * Ensure there are still references for ublk_sub_req_ref() to release.
  2618. * If not, fall back on the thread-safe buffer registration.
  2619. */
  2620. new_registered_buffers = io->task_registered_buffers + 1;
  2621. if (unlikely(new_registered_buffers >= UBLK_REFCOUNT_INIT))
  2622. return ublk_register_io_buf(cmd, ub, q_id, tag, io, index,
  2623. issue_flags);
  2624. if (!ublk_dev_support_zero_copy(ub) || !ublk_rq_has_data(req))
  2625. return -EINVAL;
  2626. ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
  2627. issue_flags);
  2628. if (ret)
  2629. return ret;
  2630. io->task_registered_buffers = new_registered_buffers;
  2631. return 0;
  2632. }
  2633. static int ublk_unregister_io_buf(struct io_uring_cmd *cmd,
  2634. const struct ublk_device *ub,
  2635. unsigned int index, unsigned int issue_flags)
  2636. {
  2637. if (!(ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY))
  2638. return -EINVAL;
  2639. return io_buffer_unregister_bvec(cmd, index, issue_flags);
  2640. }
  2641. static int ublk_check_fetch_buf(const struct ublk_device *ub, __u64 buf_addr)
  2642. {
  2643. if (ublk_dev_need_map_io(ub)) {
  2644. /*
  2645. * FETCH_RQ has to provide IO buffer if NEED GET
  2646. * DATA is not enabled
  2647. */
  2648. if (!buf_addr && !ublk_dev_need_get_data(ub))
  2649. return -EINVAL;
  2650. } else if (buf_addr) {
  2651. /* User copy requires addr to be unset */
  2652. return -EINVAL;
  2653. }
  2654. return 0;
  2655. }
  2656. static int __ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub,
  2657. struct ublk_io *io, u16 q_id)
  2658. {
  2659. /* UBLK_IO_FETCH_REQ is only allowed before dev is setup */
  2660. if (ublk_dev_ready(ub))
  2661. return -EBUSY;
  2662. /* allow each command to be FETCHed at most once */
  2663. if (io->flags & UBLK_IO_FLAG_ACTIVE)
  2664. return -EINVAL;
  2665. WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV);
  2666. ublk_fill_io_cmd(io, cmd);
  2667. if (ublk_dev_support_batch_io(ub))
  2668. WRITE_ONCE(io->task, NULL);
  2669. else
  2670. WRITE_ONCE(io->task, get_task_struct(current));
  2671. return 0;
  2672. }
  2673. static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub,
  2674. struct ublk_io *io, __u64 buf_addr, u16 q_id)
  2675. {
  2676. int ret;
  2677. /*
  2678. * When handling FETCH command for setting up ublk uring queue,
  2679. * ub->mutex is the innermost lock, and we won't block for handling
  2680. * FETCH, so it is fine even for IO_URING_F_NONBLOCK.
  2681. */
  2682. mutex_lock(&ub->mutex);
  2683. ret = __ublk_fetch(cmd, ub, io, q_id);
  2684. if (!ret)
  2685. ret = ublk_config_io_buf(ub, io, cmd, buf_addr, NULL);
  2686. if (!ret)
  2687. ublk_mark_io_ready(ub, q_id);
  2688. mutex_unlock(&ub->mutex);
  2689. return ret;
  2690. }
  2691. static int ublk_check_commit_and_fetch(const struct ublk_device *ub,
  2692. struct ublk_io *io, __u64 buf_addr)
  2693. {
  2694. struct request *req = io->req;
  2695. if (ublk_dev_need_map_io(ub)) {
  2696. /*
  2697. * COMMIT_AND_FETCH_REQ has to provide IO buffer if
  2698. * NEED GET DATA is not enabled or it is Read IO.
  2699. */
  2700. if (!buf_addr && (!ublk_dev_need_get_data(ub) ||
  2701. req_op(req) == REQ_OP_READ))
  2702. return -EINVAL;
  2703. } else if (req_op(req) != REQ_OP_ZONE_APPEND && buf_addr) {
  2704. /*
  2705. * User copy requires addr to be unset when command is
  2706. * not zone append
  2707. */
  2708. return -EINVAL;
  2709. }
  2710. return 0;
  2711. }
  2712. static bool ublk_need_complete_req(const struct ublk_device *ub,
  2713. struct ublk_io *io)
  2714. {
  2715. if (ublk_dev_need_req_ref(ub))
  2716. return ublk_sub_req_ref(io);
  2717. return true;
  2718. }
  2719. static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io,
  2720. struct request *req)
  2721. {
  2722. /*
  2723. * We have handled UBLK_IO_NEED_GET_DATA command,
  2724. * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
  2725. * do the copy work.
  2726. */
  2727. io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
  2728. /* update iod->addr because ublksrv may have passed a new io buffer */
  2729. ublk_get_iod(ubq, req->tag)->addr = io->buf.addr;
  2730. pr_devel("%s: update iod->addr: qid %d tag %d io_flags %x addr %llx\n",
  2731. __func__, ubq->q_id, req->tag, io->flags,
  2732. ublk_get_iod(ubq, req->tag)->addr);
  2733. return ublk_start_io(ubq, req, io);
  2734. }
  2735. static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd,
  2736. unsigned int issue_flags)
  2737. {
  2738. /* May point to userspace-mapped memory */
  2739. const struct ublksrv_io_cmd *ub_src = io_uring_sqe_cmd(cmd->sqe,
  2740. struct ublksrv_io_cmd);
  2741. u16 buf_idx = UBLK_INVALID_BUF_IDX;
  2742. struct ublk_device *ub = cmd->file->private_data;
  2743. struct ublk_queue *ubq;
  2744. struct ublk_io *io = NULL;
  2745. u32 cmd_op = cmd->cmd_op;
  2746. u16 q_id = READ_ONCE(ub_src->q_id);
  2747. u16 tag = READ_ONCE(ub_src->tag);
  2748. s32 result = READ_ONCE(ub_src->result);
  2749. u64 addr = READ_ONCE(ub_src->addr); /* unioned with zone_append_lba */
  2750. struct request *req;
  2751. int ret;
  2752. bool compl;
  2753. WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED);
  2754. pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
  2755. __func__, cmd->cmd_op, q_id, tag, result);
  2756. ret = ublk_check_cmd_op(cmd_op);
  2757. if (ret)
  2758. goto out;
  2759. /*
  2760. * io_buffer_unregister_bvec() doesn't access the ubq or io,
  2761. * so no need to validate the q_id, tag, or task
  2762. */
  2763. if (_IOC_NR(cmd_op) == UBLK_IO_UNREGISTER_IO_BUF)
  2764. return ublk_unregister_io_buf(cmd, ub, addr, issue_flags);
  2765. ret = -EINVAL;
  2766. if (q_id >= ub->dev_info.nr_hw_queues)
  2767. goto out;
  2768. ubq = ublk_get_queue(ub, q_id);
  2769. if (tag >= ub->dev_info.queue_depth)
  2770. goto out;
  2771. io = &ubq->ios[tag];
  2772. /* UBLK_IO_FETCH_REQ can be handled on any task, which sets io->task */
  2773. if (unlikely(_IOC_NR(cmd_op) == UBLK_IO_FETCH_REQ)) {
  2774. ret = ublk_check_fetch_buf(ub, addr);
  2775. if (ret)
  2776. goto out;
  2777. ret = ublk_fetch(cmd, ub, io, addr, q_id);
  2778. if (ret)
  2779. goto out;
  2780. ublk_prep_cancel(cmd, issue_flags, ubq, tag);
  2781. return -EIOCBQUEUED;
  2782. }
  2783. if (READ_ONCE(io->task) != current) {
  2784. /*
  2785. * ublk_register_io_buf() accesses only the io's refcount,
  2786. * so can be handled on any task
  2787. */
  2788. if (_IOC_NR(cmd_op) == UBLK_IO_REGISTER_IO_BUF)
  2789. return ublk_register_io_buf(cmd, ub, q_id, tag, io,
  2790. addr, issue_flags);
  2791. goto out;
  2792. }
  2793. /* there is pending io cmd, something must be wrong */
  2794. if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) {
  2795. ret = -EBUSY;
  2796. goto out;
  2797. }
  2798. /*
  2799. * ensure that the user issues UBLK_IO_NEED_GET_DATA
  2800. * iff the driver have set the UBLK_IO_FLAG_NEED_GET_DATA.
  2801. */
  2802. if ((!!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA))
  2803. ^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA))
  2804. goto out;
  2805. switch (_IOC_NR(cmd_op)) {
  2806. case UBLK_IO_REGISTER_IO_BUF:
  2807. return ublk_daemon_register_io_buf(cmd, ub, q_id, tag, io, addr,
  2808. issue_flags);
  2809. case UBLK_IO_COMMIT_AND_FETCH_REQ:
  2810. ret = ublk_check_commit_and_fetch(ub, io, addr);
  2811. if (ret)
  2812. goto out;
  2813. io->res = result;
  2814. req = ublk_fill_io_cmd(io, cmd);
  2815. ret = ublk_config_io_buf(ub, io, cmd, addr, &buf_idx);
  2816. if (buf_idx != UBLK_INVALID_BUF_IDX)
  2817. io_buffer_unregister_bvec(cmd, buf_idx, issue_flags);
  2818. compl = ublk_need_complete_req(ub, io);
  2819. if (req_op(req) == REQ_OP_ZONE_APPEND)
  2820. req->__sector = addr;
  2821. if (compl)
  2822. __ublk_complete_rq(req, io, ublk_dev_need_map_io(ub), NULL);
  2823. if (ret)
  2824. goto out;
  2825. break;
  2826. case UBLK_IO_NEED_GET_DATA:
  2827. /*
  2828. * ublk_get_data() may fail and fallback to requeue, so keep
  2829. * uring_cmd active first and prepare for handling new requeued
  2830. * request
  2831. */
  2832. req = ublk_fill_io_cmd(io, cmd);
  2833. ret = ublk_config_io_buf(ub, io, cmd, addr, NULL);
  2834. WARN_ON_ONCE(ret);
  2835. if (likely(ublk_get_data(ubq, io, req))) {
  2836. __ublk_prep_compl_io_cmd(io, req);
  2837. return UBLK_IO_RES_OK;
  2838. }
  2839. break;
  2840. default:
  2841. goto out;
  2842. }
  2843. ublk_prep_cancel(cmd, issue_flags, ubq, tag);
  2844. return -EIOCBQUEUED;
  2845. out:
  2846. pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n",
  2847. __func__, cmd_op, tag, ret, io ? io->flags : 0);
  2848. return ret;
  2849. }
  2850. static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
  2851. u16 q_id, u16 tag, struct ublk_io *io)
  2852. {
  2853. struct request *req;
  2854. /*
  2855. * can't use io->req in case of concurrent UBLK_IO_COMMIT_AND_FETCH_REQ,
  2856. * which would overwrite it with io->cmd
  2857. */
  2858. req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag);
  2859. if (!req)
  2860. return NULL;
  2861. if (!ublk_get_req_ref(io))
  2862. return NULL;
  2863. if (unlikely(!blk_mq_request_started(req) || req->tag != tag))
  2864. goto fail_put;
  2865. if (!ublk_rq_has_data(req))
  2866. goto fail_put;
  2867. return req;
  2868. fail_put:
  2869. ublk_put_req_ref(io, req);
  2870. return NULL;
  2871. }
  2872. static void ublk_ch_uring_cmd_cb(struct io_tw_req tw_req, io_tw_token_t tw)
  2873. {
  2874. unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS;
  2875. struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
  2876. int ret = ublk_ch_uring_cmd_local(cmd, issue_flags);
  2877. if (ret != -EIOCBQUEUED)
  2878. io_uring_cmd_done(cmd, ret, issue_flags);
  2879. }
  2880. static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
  2881. {
  2882. if (unlikely(issue_flags & IO_URING_F_CANCEL)) {
  2883. ublk_uring_cmd_cancel_fn(cmd, issue_flags);
  2884. return 0;
  2885. }
  2886. /* well-implemented server won't run into unlocked */
  2887. if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
  2888. io_uring_cmd_complete_in_task(cmd, ublk_ch_uring_cmd_cb);
  2889. return -EIOCBQUEUED;
  2890. }
  2891. return ublk_ch_uring_cmd_local(cmd, issue_flags);
  2892. }
  2893. static inline __u64 ublk_batch_buf_addr(const struct ublk_batch_io *uc,
  2894. const struct ublk_elem_header *elem)
  2895. {
  2896. const void *buf = elem;
  2897. if (uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR)
  2898. return *(const __u64 *)(buf + sizeof(*elem));
  2899. return 0;
  2900. }
  2901. static inline __u64 ublk_batch_zone_lba(const struct ublk_batch_io *uc,
  2902. const struct ublk_elem_header *elem)
  2903. {
  2904. const void *buf = elem;
  2905. if (uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA)
  2906. return *(const __u64 *)(buf + sizeof(*elem) +
  2907. 8 * !!(uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR));
  2908. return -1;
  2909. }
  2910. static struct ublk_auto_buf_reg
  2911. ublk_batch_auto_buf_reg(const struct ublk_batch_io *uc,
  2912. const struct ublk_elem_header *elem)
  2913. {
  2914. struct ublk_auto_buf_reg reg = {
  2915. .index = elem->buf_index,
  2916. .flags = (uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) ?
  2917. UBLK_AUTO_BUF_REG_FALLBACK : 0,
  2918. };
  2919. return reg;
  2920. }
  2921. /*
  2922. * 48 can hold any type of buffer element(8, 16 and 24 bytes) because
  2923. * it is the least common multiple(LCM) of 8, 16 and 24
  2924. */
  2925. #define UBLK_CMD_BATCH_TMP_BUF_SZ (48 * 10)
  2926. struct ublk_batch_io_iter {
  2927. void __user *uaddr;
  2928. unsigned done, total;
  2929. unsigned char elem_bytes;
  2930. /* copy to this buffer from user space */
  2931. unsigned char buf[UBLK_CMD_BATCH_TMP_BUF_SZ];
  2932. };
  2933. static inline int
  2934. __ublk_walk_cmd_buf(struct ublk_queue *ubq,
  2935. struct ublk_batch_io_iter *iter,
  2936. const struct ublk_batch_io_data *data,
  2937. unsigned bytes,
  2938. int (*cb)(struct ublk_queue *q,
  2939. const struct ublk_batch_io_data *data,
  2940. const struct ublk_elem_header *elem))
  2941. {
  2942. unsigned int i;
  2943. int ret = 0;
  2944. for (i = 0; i < bytes; i += iter->elem_bytes) {
  2945. const struct ublk_elem_header *elem =
  2946. (const struct ublk_elem_header *)&iter->buf[i];
  2947. if (unlikely(elem->tag >= data->ub->dev_info.queue_depth)) {
  2948. ret = -EINVAL;
  2949. break;
  2950. }
  2951. ret = cb(ubq, data, elem);
  2952. if (unlikely(ret))
  2953. break;
  2954. }
  2955. iter->done += i;
  2956. return ret;
  2957. }
  2958. static int ublk_walk_cmd_buf(struct ublk_batch_io_iter *iter,
  2959. const struct ublk_batch_io_data *data,
  2960. int (*cb)(struct ublk_queue *q,
  2961. const struct ublk_batch_io_data *data,
  2962. const struct ublk_elem_header *elem))
  2963. {
  2964. struct ublk_queue *ubq = ublk_get_queue(data->ub, data->header.q_id);
  2965. int ret = 0;
  2966. while (iter->done < iter->total) {
  2967. unsigned int len = min(sizeof(iter->buf), iter->total - iter->done);
  2968. if (copy_from_user(iter->buf, iter->uaddr + iter->done, len)) {
  2969. pr_warn("ublk%d: read batch cmd buffer failed\n",
  2970. data->ub->dev_info.dev_id);
  2971. return -EFAULT;
  2972. }
  2973. ret = __ublk_walk_cmd_buf(ubq, iter, data, len, cb);
  2974. if (ret)
  2975. return ret;
  2976. }
  2977. return 0;
  2978. }
  2979. static int ublk_batch_unprep_io(struct ublk_queue *ubq,
  2980. const struct ublk_batch_io_data *data,
  2981. const struct ublk_elem_header *elem)
  2982. {
  2983. struct ublk_io *io = &ubq->ios[elem->tag];
  2984. /*
  2985. * If queue was ready before this decrement, it won't be anymore,
  2986. * so we need to decrement the queue ready count and restore the
  2987. * canceling flag to prevent new requests from being queued.
  2988. */
  2989. if (ublk_queue_ready(ubq)) {
  2990. data->ub->nr_queue_ready--;
  2991. spin_lock(&ubq->cancel_lock);
  2992. ubq->canceling = true;
  2993. spin_unlock(&ubq->cancel_lock);
  2994. }
  2995. ubq->nr_io_ready--;
  2996. ublk_io_lock(io);
  2997. io->flags = 0;
  2998. ublk_io_unlock(io);
  2999. return 0;
  3000. }
  3001. static void ublk_batch_revert_prep_cmd(struct ublk_batch_io_iter *iter,
  3002. const struct ublk_batch_io_data *data)
  3003. {
  3004. int ret;
  3005. /* Re-process only what we've already processed, starting from beginning */
  3006. iter->total = iter->done;
  3007. iter->done = 0;
  3008. ret = ublk_walk_cmd_buf(iter, data, ublk_batch_unprep_io);
  3009. WARN_ON_ONCE(ret);
  3010. }
  3011. static int ublk_batch_prep_io(struct ublk_queue *ubq,
  3012. const struct ublk_batch_io_data *data,
  3013. const struct ublk_elem_header *elem)
  3014. {
  3015. struct ublk_io *io = &ubq->ios[elem->tag];
  3016. const struct ublk_batch_io *uc = &data->header;
  3017. union ublk_io_buf buf = { 0 };
  3018. int ret;
  3019. if (ublk_dev_support_auto_buf_reg(data->ub))
  3020. buf.auto_reg = ublk_batch_auto_buf_reg(uc, elem);
  3021. else if (ublk_dev_need_map_io(data->ub)) {
  3022. buf.addr = ublk_batch_buf_addr(uc, elem);
  3023. ret = ublk_check_fetch_buf(data->ub, buf.addr);
  3024. if (ret)
  3025. return ret;
  3026. }
  3027. ublk_io_lock(io);
  3028. ret = __ublk_fetch(data->cmd, data->ub, io, ubq->q_id);
  3029. if (!ret)
  3030. io->buf = buf;
  3031. ublk_io_unlock(io);
  3032. if (!ret)
  3033. ublk_mark_io_ready(data->ub, ubq->q_id);
  3034. return ret;
  3035. }
  3036. static int ublk_handle_batch_prep_cmd(const struct ublk_batch_io_data *data)
  3037. {
  3038. const struct ublk_batch_io *uc = &data->header;
  3039. struct io_uring_cmd *cmd = data->cmd;
  3040. struct ublk_batch_io_iter iter = {
  3041. .uaddr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)),
  3042. .total = uc->nr_elem * uc->elem_bytes,
  3043. .elem_bytes = uc->elem_bytes,
  3044. };
  3045. int ret;
  3046. mutex_lock(&data->ub->mutex);
  3047. ret = ublk_walk_cmd_buf(&iter, data, ublk_batch_prep_io);
  3048. if (ret && iter.done)
  3049. ublk_batch_revert_prep_cmd(&iter, data);
  3050. mutex_unlock(&data->ub->mutex);
  3051. return ret;
  3052. }
  3053. static int ublk_batch_commit_io_check(const struct ublk_queue *ubq,
  3054. struct ublk_io *io,
  3055. union ublk_io_buf *buf)
  3056. {
  3057. if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
  3058. return -EBUSY;
  3059. /* BATCH_IO doesn't support UBLK_F_NEED_GET_DATA */
  3060. if (ublk_need_map_io(ubq) && !buf->addr)
  3061. return -EINVAL;
  3062. return 0;
  3063. }
  3064. static int ublk_batch_commit_io(struct ublk_queue *ubq,
  3065. const struct ublk_batch_io_data *data,
  3066. const struct ublk_elem_header *elem)
  3067. {
  3068. struct ublk_io *io = &ubq->ios[elem->tag];
  3069. const struct ublk_batch_io *uc = &data->header;
  3070. u16 buf_idx = UBLK_INVALID_BUF_IDX;
  3071. union ublk_io_buf buf = { 0 };
  3072. struct request *req = NULL;
  3073. bool auto_reg = false;
  3074. bool compl = false;
  3075. int ret;
  3076. if (ublk_dev_support_auto_buf_reg(data->ub)) {
  3077. buf.auto_reg = ublk_batch_auto_buf_reg(uc, elem);
  3078. auto_reg = true;
  3079. } else if (ublk_dev_need_map_io(data->ub))
  3080. buf.addr = ublk_batch_buf_addr(uc, elem);
  3081. ublk_io_lock(io);
  3082. ret = ublk_batch_commit_io_check(ubq, io, &buf);
  3083. if (!ret) {
  3084. io->res = elem->result;
  3085. io->buf = buf;
  3086. req = ublk_fill_io_cmd(io, data->cmd);
  3087. if (auto_reg)
  3088. ublk_clear_auto_buf_reg(io, data->cmd, &buf_idx);
  3089. compl = ublk_need_complete_req(data->ub, io);
  3090. }
  3091. ublk_io_unlock(io);
  3092. if (unlikely(ret)) {
  3093. pr_warn_ratelimited("%s: dev %u queue %u io %u: commit failure %d\n",
  3094. __func__, data->ub->dev_info.dev_id, ubq->q_id,
  3095. elem->tag, ret);
  3096. return ret;
  3097. }
  3098. if (buf_idx != UBLK_INVALID_BUF_IDX)
  3099. io_buffer_unregister_bvec(data->cmd, buf_idx, data->issue_flags);
  3100. if (req_op(req) == REQ_OP_ZONE_APPEND)
  3101. req->__sector = ublk_batch_zone_lba(uc, elem);
  3102. if (compl)
  3103. __ublk_complete_rq(req, io, ublk_dev_need_map_io(data->ub), data->iob);
  3104. return 0;
  3105. }
  3106. static int ublk_handle_batch_commit_cmd(struct ublk_batch_io_data *data)
  3107. {
  3108. const struct ublk_batch_io *uc = &data->header;
  3109. struct io_uring_cmd *cmd = data->cmd;
  3110. struct ublk_batch_io_iter iter = {
  3111. .uaddr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)),
  3112. .total = uc->nr_elem * uc->elem_bytes,
  3113. .elem_bytes = uc->elem_bytes,
  3114. };
  3115. DEFINE_IO_COMP_BATCH(iob);
  3116. int ret;
  3117. data->iob = &iob;
  3118. ret = ublk_walk_cmd_buf(&iter, data, ublk_batch_commit_io);
  3119. if (iob.complete)
  3120. iob.complete(&iob);
  3121. return iter.done == 0 ? ret : iter.done;
  3122. }
  3123. static int ublk_check_batch_cmd_flags(const struct ublk_batch_io *uc)
  3124. {
  3125. unsigned elem_bytes = sizeof(struct ublk_elem_header);
  3126. if (uc->flags & ~UBLK_BATCH_F_ALL)
  3127. return -EINVAL;
  3128. /* UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK requires buffer index */
  3129. if ((uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) &&
  3130. (uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR))
  3131. return -EINVAL;
  3132. elem_bytes += (uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA ? sizeof(u64) : 0) +
  3133. (uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR ? sizeof(u64) : 0);
  3134. if (uc->elem_bytes != elem_bytes)
  3135. return -EINVAL;
  3136. return 0;
  3137. }
  3138. static int ublk_check_batch_cmd(const struct ublk_batch_io_data *data)
  3139. {
  3140. const struct ublk_batch_io *uc = &data->header;
  3141. if (uc->q_id >= data->ub->dev_info.nr_hw_queues)
  3142. return -EINVAL;
  3143. if (uc->nr_elem > data->ub->dev_info.queue_depth)
  3144. return -E2BIG;
  3145. if ((uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA) &&
  3146. !ublk_dev_is_zoned(data->ub))
  3147. return -EINVAL;
  3148. if ((uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR) &&
  3149. !ublk_dev_need_map_io(data->ub))
  3150. return -EINVAL;
  3151. if ((uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) &&
  3152. !ublk_dev_support_auto_buf_reg(data->ub))
  3153. return -EINVAL;
  3154. return ublk_check_batch_cmd_flags(uc);
  3155. }
  3156. static int ublk_batch_attach(struct ublk_queue *ubq,
  3157. struct ublk_batch_io_data *data,
  3158. struct ublk_batch_fetch_cmd *fcmd)
  3159. {
  3160. struct ublk_batch_fetch_cmd *new_fcmd = NULL;
  3161. bool free = false;
  3162. struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(data->cmd);
  3163. spin_lock(&ubq->evts_lock);
  3164. if (unlikely(ubq->force_abort || ubq->canceling)) {
  3165. free = true;
  3166. } else {
  3167. list_add_tail(&fcmd->node, &ubq->fcmd_head);
  3168. new_fcmd = __ublk_acquire_fcmd(ubq);
  3169. }
  3170. spin_unlock(&ubq->evts_lock);
  3171. if (unlikely(free)) {
  3172. ublk_batch_free_fcmd(fcmd);
  3173. return -ENODEV;
  3174. }
  3175. pdu->ubq = ubq;
  3176. pdu->fcmd = fcmd;
  3177. io_uring_cmd_mark_cancelable(fcmd->cmd, data->issue_flags);
  3178. if (!new_fcmd)
  3179. goto out;
  3180. /*
  3181. * If the two fetch commands are originated from same io_ring_ctx,
  3182. * run batch dispatch directly. Otherwise, schedule task work for
  3183. * doing it.
  3184. */
  3185. if (io_uring_cmd_ctx_handle(new_fcmd->cmd) ==
  3186. io_uring_cmd_ctx_handle(fcmd->cmd)) {
  3187. data->cmd = new_fcmd->cmd;
  3188. ublk_batch_dispatch(ubq, data, new_fcmd);
  3189. } else {
  3190. io_uring_cmd_complete_in_task(new_fcmd->cmd,
  3191. ublk_batch_tw_cb);
  3192. }
  3193. out:
  3194. return -EIOCBQUEUED;
  3195. }
  3196. static int ublk_handle_batch_fetch_cmd(struct ublk_batch_io_data *data)
  3197. {
  3198. struct ublk_queue *ubq = ublk_get_queue(data->ub, data->header.q_id);
  3199. struct ublk_batch_fetch_cmd *fcmd = ublk_batch_alloc_fcmd(data->cmd);
  3200. if (!fcmd)
  3201. return -ENOMEM;
  3202. return ublk_batch_attach(ubq, data, fcmd);
  3203. }
  3204. static int ublk_validate_batch_fetch_cmd(struct ublk_batch_io_data *data)
  3205. {
  3206. const struct ublk_batch_io *uc = &data->header;
  3207. if (uc->q_id >= data->ub->dev_info.nr_hw_queues)
  3208. return -EINVAL;
  3209. if (!(data->cmd->flags & IORING_URING_CMD_MULTISHOT))
  3210. return -EINVAL;
  3211. if (uc->elem_bytes != sizeof(__u16))
  3212. return -EINVAL;
  3213. if (uc->flags != 0)
  3214. return -EINVAL;
  3215. return 0;
  3216. }
  3217. static int ublk_handle_non_batch_cmd(struct io_uring_cmd *cmd,
  3218. unsigned int issue_flags)
  3219. {
  3220. const struct ublksrv_io_cmd *ub_cmd = io_uring_sqe_cmd(cmd->sqe,
  3221. struct ublksrv_io_cmd);
  3222. struct ublk_device *ub = cmd->file->private_data;
  3223. unsigned tag = READ_ONCE(ub_cmd->tag);
  3224. unsigned q_id = READ_ONCE(ub_cmd->q_id);
  3225. unsigned index = READ_ONCE(ub_cmd->addr);
  3226. struct ublk_queue *ubq;
  3227. struct ublk_io *io;
  3228. if (cmd->cmd_op == UBLK_U_IO_UNREGISTER_IO_BUF)
  3229. return ublk_unregister_io_buf(cmd, ub, index, issue_flags);
  3230. if (q_id >= ub->dev_info.nr_hw_queues)
  3231. return -EINVAL;
  3232. if (tag >= ub->dev_info.queue_depth)
  3233. return -EINVAL;
  3234. if (cmd->cmd_op != UBLK_U_IO_REGISTER_IO_BUF)
  3235. return -EOPNOTSUPP;
  3236. ubq = ublk_get_queue(ub, q_id);
  3237. io = &ubq->ios[tag];
  3238. return ublk_register_io_buf(cmd, ub, q_id, tag, io, index,
  3239. issue_flags);
  3240. }
  3241. static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd,
  3242. unsigned int issue_flags)
  3243. {
  3244. const struct ublk_batch_io *uc = io_uring_sqe_cmd(cmd->sqe,
  3245. struct ublk_batch_io);
  3246. struct ublk_device *ub = cmd->file->private_data;
  3247. struct ublk_batch_io_data data = {
  3248. .ub = ub,
  3249. .cmd = cmd,
  3250. .header = (struct ublk_batch_io) {
  3251. .q_id = READ_ONCE(uc->q_id),
  3252. .flags = READ_ONCE(uc->flags),
  3253. .nr_elem = READ_ONCE(uc->nr_elem),
  3254. .elem_bytes = READ_ONCE(uc->elem_bytes),
  3255. },
  3256. .issue_flags = issue_flags,
  3257. };
  3258. u32 cmd_op = cmd->cmd_op;
  3259. int ret = -EINVAL;
  3260. if (unlikely(issue_flags & IO_URING_F_CANCEL)) {
  3261. ublk_batch_cancel_fn(cmd, issue_flags);
  3262. return 0;
  3263. }
  3264. switch (cmd_op) {
  3265. case UBLK_U_IO_PREP_IO_CMDS:
  3266. ret = ublk_check_batch_cmd(&data);
  3267. if (ret)
  3268. goto out;
  3269. ret = ublk_handle_batch_prep_cmd(&data);
  3270. break;
  3271. case UBLK_U_IO_COMMIT_IO_CMDS:
  3272. ret = ublk_check_batch_cmd(&data);
  3273. if (ret)
  3274. goto out;
  3275. ret = ublk_handle_batch_commit_cmd(&data);
  3276. break;
  3277. case UBLK_U_IO_FETCH_IO_CMDS:
  3278. ret = ublk_validate_batch_fetch_cmd(&data);
  3279. if (ret)
  3280. goto out;
  3281. ret = ublk_handle_batch_fetch_cmd(&data);
  3282. break;
  3283. default:
  3284. ret = ublk_handle_non_batch_cmd(cmd, issue_flags);
  3285. break;
  3286. }
  3287. out:
  3288. return ret;
  3289. }
  3290. static inline bool ublk_check_ubuf_dir(const struct request *req,
  3291. int ubuf_dir)
  3292. {
  3293. /* copy ubuf to request pages */
  3294. if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) &&
  3295. ubuf_dir == ITER_SOURCE)
  3296. return true;
  3297. /* copy request pages to ubuf */
  3298. if ((req_op(req) == REQ_OP_WRITE ||
  3299. req_op(req) == REQ_OP_ZONE_APPEND) &&
  3300. ubuf_dir == ITER_DEST)
  3301. return true;
  3302. return false;
  3303. }
  3304. static ssize_t
  3305. ublk_user_copy(struct kiocb *iocb, struct iov_iter *iter, int dir)
  3306. {
  3307. struct ublk_device *ub = iocb->ki_filp->private_data;
  3308. struct ublk_queue *ubq;
  3309. struct request *req;
  3310. struct ublk_io *io;
  3311. unsigned data_len;
  3312. bool is_integrity;
  3313. bool on_daemon;
  3314. size_t buf_off;
  3315. u16 tag, q_id;
  3316. ssize_t ret;
  3317. if (!user_backed_iter(iter))
  3318. return -EACCES;
  3319. if (ub->dev_info.state == UBLK_S_DEV_DEAD)
  3320. return -EACCES;
  3321. tag = ublk_pos_to_tag(iocb->ki_pos);
  3322. q_id = ublk_pos_to_hwq(iocb->ki_pos);
  3323. buf_off = ublk_pos_to_buf_off(iocb->ki_pos);
  3324. is_integrity = !!(iocb->ki_pos & UBLKSRV_IO_INTEGRITY_FLAG);
  3325. if (unlikely(!ublk_dev_support_integrity(ub) && is_integrity))
  3326. return -EINVAL;
  3327. if (q_id >= ub->dev_info.nr_hw_queues)
  3328. return -EINVAL;
  3329. ubq = ublk_get_queue(ub, q_id);
  3330. if (!ublk_dev_support_user_copy(ub))
  3331. return -EACCES;
  3332. if (tag >= ub->dev_info.queue_depth)
  3333. return -EINVAL;
  3334. io = &ubq->ios[tag];
  3335. on_daemon = current == READ_ONCE(io->task);
  3336. if (on_daemon) {
  3337. /* On daemon, io can't be completed concurrently, so skip ref */
  3338. if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
  3339. return -EINVAL;
  3340. req = io->req;
  3341. if (!ublk_rq_has_data(req))
  3342. return -EINVAL;
  3343. } else {
  3344. req = __ublk_check_and_get_req(ub, q_id, tag, io);
  3345. if (!req)
  3346. return -EINVAL;
  3347. }
  3348. if (is_integrity) {
  3349. struct blk_integrity *bi = &req->q->limits.integrity;
  3350. data_len = bio_integrity_bytes(bi, blk_rq_sectors(req));
  3351. } else {
  3352. data_len = blk_rq_bytes(req);
  3353. }
  3354. if (buf_off > data_len) {
  3355. ret = -EINVAL;
  3356. goto out;
  3357. }
  3358. if (!ublk_check_ubuf_dir(req, dir)) {
  3359. ret = -EACCES;
  3360. goto out;
  3361. }
  3362. if (is_integrity)
  3363. ret = ublk_copy_user_integrity(req, buf_off, iter, dir);
  3364. else
  3365. ret = ublk_copy_user_pages(req, buf_off, iter, dir);
  3366. out:
  3367. if (!on_daemon)
  3368. ublk_put_req_ref(io, req);
  3369. return ret;
  3370. }
  3371. static ssize_t ublk_ch_read_iter(struct kiocb *iocb, struct iov_iter *to)
  3372. {
  3373. return ublk_user_copy(iocb, to, ITER_DEST);
  3374. }
  3375. static ssize_t ublk_ch_write_iter(struct kiocb *iocb, struct iov_iter *from)
  3376. {
  3377. return ublk_user_copy(iocb, from, ITER_SOURCE);
  3378. }
  3379. static const struct file_operations ublk_ch_fops = {
  3380. .owner = THIS_MODULE,
  3381. .open = ublk_ch_open,
  3382. .release = ublk_ch_release,
  3383. .read_iter = ublk_ch_read_iter,
  3384. .write_iter = ublk_ch_write_iter,
  3385. .uring_cmd = ublk_ch_uring_cmd,
  3386. .mmap = ublk_ch_mmap,
  3387. };
  3388. static const struct file_operations ublk_ch_batch_io_fops = {
  3389. .owner = THIS_MODULE,
  3390. .open = ublk_ch_open,
  3391. .release = ublk_ch_release,
  3392. .read_iter = ublk_ch_read_iter,
  3393. .write_iter = ublk_ch_write_iter,
  3394. .uring_cmd = ublk_ch_batch_io_uring_cmd,
  3395. .mmap = ublk_ch_mmap,
  3396. };
  3397. static void __ublk_deinit_queue(struct ublk_device *ub, struct ublk_queue *ubq)
  3398. {
  3399. int size, i;
  3400. size = ublk_queue_cmd_buf_size(ub);
  3401. for (i = 0; i < ubq->q_depth; i++) {
  3402. struct ublk_io *io = &ubq->ios[i];
  3403. if (io->task)
  3404. put_task_struct(io->task);
  3405. WARN_ON_ONCE(refcount_read(&io->ref));
  3406. WARN_ON_ONCE(io->task_registered_buffers);
  3407. }
  3408. if (ubq->io_cmd_buf)
  3409. free_pages((unsigned long)ubq->io_cmd_buf, get_order(size));
  3410. if (ublk_dev_support_batch_io(ub))
  3411. ublk_io_evts_deinit(ubq);
  3412. kvfree(ubq);
  3413. }
  3414. static void ublk_deinit_queue(struct ublk_device *ub, int q_id)
  3415. {
  3416. struct ublk_queue *ubq = ub->queues[q_id];
  3417. if (!ubq)
  3418. return;
  3419. __ublk_deinit_queue(ub, ubq);
  3420. ub->queues[q_id] = NULL;
  3421. }
  3422. static int ublk_get_queue_numa_node(struct ublk_device *ub, int q_id)
  3423. {
  3424. unsigned int cpu;
  3425. /* Find first CPU mapped to this queue */
  3426. for_each_possible_cpu(cpu) {
  3427. if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[cpu] == q_id)
  3428. return cpu_to_node(cpu);
  3429. }
  3430. return NUMA_NO_NODE;
  3431. }
  3432. static int ublk_init_queue(struct ublk_device *ub, int q_id)
  3433. {
  3434. int depth = ub->dev_info.queue_depth;
  3435. gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO;
  3436. struct ublk_queue *ubq;
  3437. struct page *page;
  3438. int numa_node;
  3439. int size, i, ret;
  3440. /* Determine NUMA node based on queue's CPU affinity */
  3441. numa_node = ublk_get_queue_numa_node(ub, q_id);
  3442. /* Allocate queue structure on local NUMA node */
  3443. ubq = kvzalloc_node(struct_size(ubq, ios, depth), GFP_KERNEL,
  3444. numa_node);
  3445. if (!ubq)
  3446. return -ENOMEM;
  3447. spin_lock_init(&ubq->cancel_lock);
  3448. ubq->flags = ub->dev_info.flags;
  3449. ubq->q_id = q_id;
  3450. ubq->q_depth = depth;
  3451. size = ublk_queue_cmd_buf_size(ub);
  3452. /* Allocate I/O command buffer on local NUMA node */
  3453. page = alloc_pages_node(numa_node, gfp_flags, get_order(size));
  3454. if (!page) {
  3455. kvfree(ubq);
  3456. return -ENOMEM;
  3457. }
  3458. ubq->io_cmd_buf = page_address(page);
  3459. for (i = 0; i < ubq->q_depth; i++)
  3460. spin_lock_init(&ubq->ios[i].lock);
  3461. if (ublk_dev_support_batch_io(ub)) {
  3462. ret = ublk_io_evts_init(ubq, ubq->q_depth, numa_node);
  3463. if (ret)
  3464. goto fail;
  3465. INIT_LIST_HEAD(&ubq->fcmd_head);
  3466. }
  3467. ub->queues[q_id] = ubq;
  3468. ubq->dev = ub;
  3469. return 0;
  3470. fail:
  3471. __ublk_deinit_queue(ub, ubq);
  3472. return ret;
  3473. }
  3474. static void ublk_deinit_queues(struct ublk_device *ub)
  3475. {
  3476. int i;
  3477. for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
  3478. ublk_deinit_queue(ub, i);
  3479. }
  3480. static int ublk_init_queues(struct ublk_device *ub)
  3481. {
  3482. int i, ret;
  3483. for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
  3484. ret = ublk_init_queue(ub, i);
  3485. if (ret)
  3486. goto fail;
  3487. }
  3488. init_completion(&ub->completion);
  3489. return 0;
  3490. fail:
  3491. ublk_deinit_queues(ub);
  3492. return ret;
  3493. }
  3494. static int ublk_alloc_dev_number(struct ublk_device *ub, int idx)
  3495. {
  3496. int i = idx;
  3497. int err;
  3498. spin_lock(&ublk_idr_lock);
  3499. /* allocate id, if @id >= 0, we're requesting that specific id */
  3500. if (i >= 0) {
  3501. err = idr_alloc(&ublk_index_idr, ub, i, i + 1, GFP_NOWAIT);
  3502. if (err == -ENOSPC)
  3503. err = -EEXIST;
  3504. } else {
  3505. err = idr_alloc(&ublk_index_idr, ub, 0, UBLK_MAX_UBLKS,
  3506. GFP_NOWAIT);
  3507. }
  3508. spin_unlock(&ublk_idr_lock);
  3509. if (err >= 0)
  3510. ub->ub_number = err;
  3511. return err;
  3512. }
  3513. static void ublk_free_dev_number(struct ublk_device *ub)
  3514. {
  3515. spin_lock(&ublk_idr_lock);
  3516. idr_remove(&ublk_index_idr, ub->ub_number);
  3517. wake_up_all(&ublk_idr_wq);
  3518. spin_unlock(&ublk_idr_lock);
  3519. }
  3520. static void ublk_cdev_rel(struct device *dev)
  3521. {
  3522. struct ublk_device *ub = container_of(dev, struct ublk_device, cdev_dev);
  3523. blk_mq_free_tag_set(&ub->tag_set);
  3524. ublk_deinit_queues(ub);
  3525. ublk_free_dev_number(ub);
  3526. mutex_destroy(&ub->mutex);
  3527. mutex_destroy(&ub->cancel_mutex);
  3528. kfree(ub);
  3529. }
  3530. static int ublk_add_chdev(struct ublk_device *ub)
  3531. {
  3532. struct device *dev = &ub->cdev_dev;
  3533. int minor = ub->ub_number;
  3534. int ret;
  3535. dev->parent = ublk_misc.this_device;
  3536. dev->devt = MKDEV(MAJOR(ublk_chr_devt), minor);
  3537. dev->class = &ublk_chr_class;
  3538. dev->release = ublk_cdev_rel;
  3539. device_initialize(dev);
  3540. ret = dev_set_name(dev, "ublkc%d", minor);
  3541. if (ret)
  3542. goto fail;
  3543. if (ublk_dev_support_batch_io(ub))
  3544. cdev_init(&ub->cdev, &ublk_ch_batch_io_fops);
  3545. else
  3546. cdev_init(&ub->cdev, &ublk_ch_fops);
  3547. ret = cdev_device_add(&ub->cdev, dev);
  3548. if (ret)
  3549. goto fail;
  3550. if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV)
  3551. unprivileged_ublks_added++;
  3552. return 0;
  3553. fail:
  3554. put_device(dev);
  3555. return ret;
  3556. }
  3557. /* align max io buffer size with PAGE_SIZE */
  3558. static void ublk_align_max_io_size(struct ublk_device *ub)
  3559. {
  3560. unsigned int max_io_bytes = ub->dev_info.max_io_buf_bytes;
  3561. ub->dev_info.max_io_buf_bytes =
  3562. round_down(max_io_bytes, PAGE_SIZE);
  3563. }
  3564. static int ublk_add_tag_set(struct ublk_device *ub)
  3565. {
  3566. if (ublk_dev_support_batch_io(ub))
  3567. ub->tag_set.ops = &ublk_batch_mq_ops;
  3568. else
  3569. ub->tag_set.ops = &ublk_mq_ops;
  3570. ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues;
  3571. ub->tag_set.queue_depth = ub->dev_info.queue_depth;
  3572. ub->tag_set.numa_node = NUMA_NO_NODE;
  3573. ub->tag_set.driver_data = ub;
  3574. return blk_mq_alloc_tag_set(&ub->tag_set);
  3575. }
  3576. static void ublk_remove(struct ublk_device *ub)
  3577. {
  3578. bool unprivileged;
  3579. ublk_stop_dev(ub);
  3580. cdev_device_del(&ub->cdev, &ub->cdev_dev);
  3581. unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV;
  3582. ublk_put_device(ub);
  3583. if (unprivileged)
  3584. unprivileged_ublks_added--;
  3585. }
  3586. static struct ublk_device *ublk_get_device_from_id(int idx)
  3587. {
  3588. struct ublk_device *ub = NULL;
  3589. if (idx < 0)
  3590. return NULL;
  3591. spin_lock(&ublk_idr_lock);
  3592. ub = idr_find(&ublk_index_idr, idx);
  3593. if (ub)
  3594. ub = ublk_get_device(ub);
  3595. spin_unlock(&ublk_idr_lock);
  3596. return ub;
  3597. }
  3598. static bool ublk_validate_user_pid(struct ublk_device *ub, pid_t ublksrv_pid)
  3599. {
  3600. rcu_read_lock();
  3601. ublksrv_pid = pid_nr(find_vpid(ublksrv_pid));
  3602. rcu_read_unlock();
  3603. return ub->ublksrv_tgid == ublksrv_pid;
  3604. }
  3605. static int ublk_ctrl_start_dev(struct ublk_device *ub,
  3606. const struct ublksrv_ctrl_cmd *header)
  3607. {
  3608. const struct ublk_param_basic *p = &ub->params.basic;
  3609. int ublksrv_pid = (int)header->data[0];
  3610. struct queue_limits lim = {
  3611. .logical_block_size = 1 << p->logical_bs_shift,
  3612. .physical_block_size = 1 << p->physical_bs_shift,
  3613. .io_min = 1 << p->io_min_shift,
  3614. .io_opt = 1 << p->io_opt_shift,
  3615. .max_hw_sectors = p->max_sectors,
  3616. .chunk_sectors = p->chunk_sectors,
  3617. .virt_boundary_mask = p->virt_boundary_mask,
  3618. .max_segments = USHRT_MAX,
  3619. .max_segment_size = UINT_MAX,
  3620. .dma_alignment = 3,
  3621. };
  3622. struct gendisk *disk;
  3623. int ret = -EINVAL;
  3624. if (ublksrv_pid <= 0)
  3625. return -EINVAL;
  3626. if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC))
  3627. return -EINVAL;
  3628. if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
  3629. const struct ublk_param_discard *pd = &ub->params.discard;
  3630. lim.discard_alignment = pd->discard_alignment;
  3631. lim.discard_granularity = pd->discard_granularity;
  3632. lim.max_hw_discard_sectors = pd->max_discard_sectors;
  3633. lim.max_write_zeroes_sectors = pd->max_write_zeroes_sectors;
  3634. lim.max_discard_segments = pd->max_discard_segments;
  3635. }
  3636. if (ub->params.types & UBLK_PARAM_TYPE_ZONED) {
  3637. const struct ublk_param_zoned *p = &ub->params.zoned;
  3638. if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
  3639. return -EOPNOTSUPP;
  3640. lim.features |= BLK_FEAT_ZONED;
  3641. lim.max_active_zones = p->max_active_zones;
  3642. lim.max_open_zones = p->max_open_zones;
  3643. lim.max_hw_zone_append_sectors = p->max_zone_append_sectors;
  3644. }
  3645. if (ub->params.basic.attrs & UBLK_ATTR_VOLATILE_CACHE) {
  3646. lim.features |= BLK_FEAT_WRITE_CACHE;
  3647. if (ub->params.basic.attrs & UBLK_ATTR_FUA)
  3648. lim.features |= BLK_FEAT_FUA;
  3649. }
  3650. if (ub->params.basic.attrs & UBLK_ATTR_ROTATIONAL)
  3651. lim.features |= BLK_FEAT_ROTATIONAL;
  3652. if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN)
  3653. lim.dma_alignment = ub->params.dma.alignment;
  3654. if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) {
  3655. lim.seg_boundary_mask = ub->params.seg.seg_boundary_mask;
  3656. lim.max_segment_size = ub->params.seg.max_segment_size;
  3657. lim.max_segments = ub->params.seg.max_segments;
  3658. }
  3659. if (ub->params.types & UBLK_PARAM_TYPE_INTEGRITY) {
  3660. const struct ublk_param_integrity *p = &ub->params.integrity;
  3661. int pi_tuple_size = ublk_integrity_pi_tuple_size(p->csum_type);
  3662. lim.max_integrity_segments =
  3663. p->max_integrity_segments ?: USHRT_MAX;
  3664. lim.integrity = (struct blk_integrity) {
  3665. .flags = ublk_integrity_flags(p->flags),
  3666. .csum_type = ublk_integrity_csum_type(p->csum_type),
  3667. .metadata_size = p->metadata_size,
  3668. .pi_offset = p->pi_offset,
  3669. .interval_exp = p->interval_exp,
  3670. .tag_size = p->tag_size,
  3671. .pi_tuple_size = pi_tuple_size,
  3672. };
  3673. }
  3674. if (wait_for_completion_interruptible(&ub->completion) != 0)
  3675. return -EINTR;
  3676. if (!ublk_validate_user_pid(ub, ublksrv_pid))
  3677. return -EINVAL;
  3678. mutex_lock(&ub->mutex);
  3679. /* device may become not ready in case of F_BATCH */
  3680. if (!ublk_dev_ready(ub)) {
  3681. ret = -EINVAL;
  3682. goto out_unlock;
  3683. }
  3684. if (ub->dev_info.state == UBLK_S_DEV_LIVE ||
  3685. test_bit(UB_STATE_USED, &ub->state)) {
  3686. ret = -EEXIST;
  3687. goto out_unlock;
  3688. }
  3689. disk = blk_mq_alloc_disk(&ub->tag_set, &lim, NULL);
  3690. if (IS_ERR(disk)) {
  3691. ret = PTR_ERR(disk);
  3692. goto out_unlock;
  3693. }
  3694. sprintf(disk->disk_name, "ublkb%d", ub->ub_number);
  3695. disk->fops = &ub_fops;
  3696. disk->private_data = ub;
  3697. ub->dev_info.ublksrv_pid = ub->ublksrv_tgid;
  3698. ub->ub_disk = disk;
  3699. ublk_apply_params(ub);
  3700. /*
  3701. * Suppress partition scan to avoid potential IO hang.
  3702. *
  3703. * If ublk server error occurs during partition scan, the IO may
  3704. * wait while holding ub->mutex, which can deadlock with other
  3705. * operations that need the mutex. Defer partition scan to async
  3706. * work.
  3707. * For unprivileged daemons, keep GD_SUPPRESS_PART_SCAN set
  3708. * permanently.
  3709. */
  3710. set_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
  3711. ublk_get_device(ub);
  3712. ub->dev_info.state = UBLK_S_DEV_LIVE;
  3713. if (ublk_dev_is_zoned(ub)) {
  3714. ret = ublk_revalidate_disk_zones(ub);
  3715. if (ret)
  3716. goto out_put_cdev;
  3717. }
  3718. ret = add_disk(disk);
  3719. if (ret)
  3720. goto out_put_cdev;
  3721. set_bit(UB_STATE_USED, &ub->state);
  3722. /* Skip partition scan if disabled by user */
  3723. if (ub->dev_info.flags & UBLK_F_NO_AUTO_PART_SCAN) {
  3724. /* Not clear for unprivileged daemons, see comment above */
  3725. if (!ub->unprivileged_daemons)
  3726. clear_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
  3727. } else {
  3728. /* Schedule async partition scan for trusted daemons */
  3729. if (!ub->unprivileged_daemons)
  3730. schedule_work(&ub->partition_scan_work);
  3731. }
  3732. out_put_cdev:
  3733. if (ret) {
  3734. ublk_detach_disk(ub);
  3735. ublk_put_device(ub);
  3736. }
  3737. if (ret)
  3738. put_disk(disk);
  3739. out_unlock:
  3740. mutex_unlock(&ub->mutex);
  3741. return ret;
  3742. }
  3743. static int ublk_ctrl_get_queue_affinity(struct ublk_device *ub,
  3744. const struct ublksrv_ctrl_cmd *header)
  3745. {
  3746. void __user *argp = (void __user *)(unsigned long)header->addr;
  3747. cpumask_var_t cpumask;
  3748. unsigned long queue;
  3749. unsigned int retlen;
  3750. unsigned int i;
  3751. int ret;
  3752. if (header->len * BITS_PER_BYTE < nr_cpu_ids)
  3753. return -EINVAL;
  3754. if (header->len & (sizeof(unsigned long)-1))
  3755. return -EINVAL;
  3756. if (!header->addr)
  3757. return -EINVAL;
  3758. queue = header->data[0];
  3759. if (queue >= ub->dev_info.nr_hw_queues)
  3760. return -EINVAL;
  3761. if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
  3762. return -ENOMEM;
  3763. for_each_possible_cpu(i) {
  3764. if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[i] == queue)
  3765. cpumask_set_cpu(i, cpumask);
  3766. }
  3767. ret = -EFAULT;
  3768. retlen = min_t(unsigned short, header->len, cpumask_size());
  3769. if (copy_to_user(argp, cpumask, retlen))
  3770. goto out_free_cpumask;
  3771. if (retlen != header->len &&
  3772. clear_user(argp + retlen, header->len - retlen))
  3773. goto out_free_cpumask;
  3774. ret = 0;
  3775. out_free_cpumask:
  3776. free_cpumask_var(cpumask);
  3777. return ret;
  3778. }
  3779. static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info)
  3780. {
  3781. pr_devel("%s: dev id %d flags %llx\n", __func__,
  3782. info->dev_id, info->flags);
  3783. pr_devel("\t nr_hw_queues %d queue_depth %d\n",
  3784. info->nr_hw_queues, info->queue_depth);
  3785. }
  3786. static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
  3787. {
  3788. void __user *argp = (void __user *)(unsigned long)header->addr;
  3789. struct ublksrv_ctrl_dev_info info;
  3790. struct ublk_device *ub;
  3791. int ret = -EINVAL;
  3792. if (header->len < sizeof(info) || !header->addr)
  3793. return -EINVAL;
  3794. if (header->queue_id != (u16)-1) {
  3795. pr_warn("%s: queue_id is wrong %x\n",
  3796. __func__, header->queue_id);
  3797. return -EINVAL;
  3798. }
  3799. if (copy_from_user(&info, argp, sizeof(info)))
  3800. return -EFAULT;
  3801. if (info.queue_depth > UBLK_MAX_QUEUE_DEPTH || !info.queue_depth ||
  3802. info.nr_hw_queues > UBLK_MAX_NR_QUEUES || !info.nr_hw_queues)
  3803. return -EINVAL;
  3804. if (capable(CAP_SYS_ADMIN))
  3805. info.flags &= ~UBLK_F_UNPRIVILEGED_DEV;
  3806. else if (!(info.flags & UBLK_F_UNPRIVILEGED_DEV))
  3807. return -EPERM;
  3808. /* forbid nonsense combinations of recovery flags */
  3809. switch (info.flags & UBLK_F_ALL_RECOVERY_FLAGS) {
  3810. case 0:
  3811. case UBLK_F_USER_RECOVERY:
  3812. case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_REISSUE):
  3813. case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_FAIL_IO):
  3814. break;
  3815. default:
  3816. pr_warn("%s: invalid recovery flags %llx\n", __func__,
  3817. info.flags & UBLK_F_ALL_RECOVERY_FLAGS);
  3818. return -EINVAL;
  3819. }
  3820. if ((info.flags & UBLK_F_QUIESCE) && !(info.flags & UBLK_F_USER_RECOVERY)) {
  3821. pr_warn("UBLK_F_QUIESCE requires UBLK_F_USER_RECOVERY\n");
  3822. return -EINVAL;
  3823. }
  3824. /*
  3825. * unprivileged device can't be trusted, but RECOVERY and
  3826. * RECOVERY_REISSUE still may hang error handling, so can't
  3827. * support recovery features for unprivileged ublk now
  3828. *
  3829. * TODO: provide forward progress for RECOVERY handler, so that
  3830. * unprivileged device can benefit from it
  3831. */
  3832. if (info.flags & UBLK_F_UNPRIVILEGED_DEV) {
  3833. info.flags &= ~(UBLK_F_USER_RECOVERY_REISSUE |
  3834. UBLK_F_USER_RECOVERY);
  3835. /*
  3836. * For USER_COPY, we depends on userspace to fill request
  3837. * buffer by pwrite() to ublk char device, which can't be
  3838. * used for unprivileged device
  3839. *
  3840. * Same with zero copy or auto buffer register.
  3841. */
  3842. if (info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
  3843. UBLK_F_AUTO_BUF_REG))
  3844. return -EINVAL;
  3845. }
  3846. /* User copy is required to access integrity buffer */
  3847. if (info.flags & UBLK_F_INTEGRITY && !(info.flags & UBLK_F_USER_COPY))
  3848. return -EINVAL;
  3849. /* the created device is always owned by current user */
  3850. ublk_store_owner_uid_gid(&info.owner_uid, &info.owner_gid);
  3851. if (header->dev_id != info.dev_id) {
  3852. pr_warn("%s: dev id not match %u %u\n",
  3853. __func__, header->dev_id, info.dev_id);
  3854. return -EINVAL;
  3855. }
  3856. if (header->dev_id != U32_MAX && header->dev_id >= UBLK_MAX_UBLKS) {
  3857. pr_warn("%s: dev id is too large. Max supported is %d\n",
  3858. __func__, UBLK_MAX_UBLKS - 1);
  3859. return -EINVAL;
  3860. }
  3861. ublk_dump_dev_info(&info);
  3862. ret = mutex_lock_killable(&ublk_ctl_mutex);
  3863. if (ret)
  3864. return ret;
  3865. ret = -EACCES;
  3866. if ((info.flags & UBLK_F_UNPRIVILEGED_DEV) &&
  3867. unprivileged_ublks_added >= unprivileged_ublks_max)
  3868. goto out_unlock;
  3869. ret = -ENOMEM;
  3870. ub = kzalloc_flex(*ub, queues, info.nr_hw_queues);
  3871. if (!ub)
  3872. goto out_unlock;
  3873. mutex_init(&ub->mutex);
  3874. spin_lock_init(&ub->lock);
  3875. mutex_init(&ub->cancel_mutex);
  3876. INIT_WORK(&ub->partition_scan_work, ublk_partition_scan_work);
  3877. ret = ublk_alloc_dev_number(ub, header->dev_id);
  3878. if (ret < 0)
  3879. goto out_free_ub;
  3880. memcpy(&ub->dev_info, &info, sizeof(info));
  3881. /* update device id */
  3882. ub->dev_info.dev_id = ub->ub_number;
  3883. /*
  3884. * 64bit flags will be copied back to userspace as feature
  3885. * negotiation result, so have to clear flags which driver
  3886. * doesn't support yet, then userspace can get correct flags
  3887. * (features) to handle.
  3888. */
  3889. ub->dev_info.flags &= UBLK_F_ALL;
  3890. ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE |
  3891. UBLK_F_URING_CMD_COMP_IN_TASK |
  3892. UBLK_F_PER_IO_DAEMON |
  3893. UBLK_F_BUF_REG_OFF_DAEMON |
  3894. UBLK_F_SAFE_STOP_DEV;
  3895. /* So far, UBLK_F_PER_IO_DAEMON won't be exposed for BATCH_IO */
  3896. if (ublk_dev_support_batch_io(ub))
  3897. ub->dev_info.flags &= ~UBLK_F_PER_IO_DAEMON;
  3898. /* GET_DATA isn't needed any more with USER_COPY or ZERO COPY */
  3899. if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
  3900. UBLK_F_AUTO_BUF_REG))
  3901. ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
  3902. /* UBLK_F_BATCH_IO doesn't support GET_DATA */
  3903. if (ublk_dev_support_batch_io(ub))
  3904. ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
  3905. /*
  3906. * Zoned storage support requires reuse `ublksrv_io_cmd->addr` for
  3907. * returning write_append_lba, which is only allowed in case of
  3908. * user copy or zero copy
  3909. */
  3910. if (ublk_dev_is_zoned(ub) &&
  3911. (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || !(ub->dev_info.flags &
  3912. (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY)))) {
  3913. ret = -EINVAL;
  3914. goto out_free_dev_number;
  3915. }
  3916. ub->dev_info.nr_hw_queues = min_t(unsigned int,
  3917. ub->dev_info.nr_hw_queues, nr_cpu_ids);
  3918. ublk_align_max_io_size(ub);
  3919. ret = ublk_add_tag_set(ub);
  3920. if (ret)
  3921. goto out_free_dev_number;
  3922. ret = ublk_init_queues(ub);
  3923. if (ret)
  3924. goto out_free_tag_set;
  3925. ret = -EFAULT;
  3926. if (copy_to_user(argp, &ub->dev_info, sizeof(info)))
  3927. goto out_deinit_queues;
  3928. /*
  3929. * Add the char dev so that ublksrv daemon can be setup.
  3930. * ublk_add_chdev() will cleanup everything if it fails.
  3931. */
  3932. ret = ublk_add_chdev(ub);
  3933. goto out_unlock;
  3934. out_deinit_queues:
  3935. ublk_deinit_queues(ub);
  3936. out_free_tag_set:
  3937. blk_mq_free_tag_set(&ub->tag_set);
  3938. out_free_dev_number:
  3939. ublk_free_dev_number(ub);
  3940. out_free_ub:
  3941. mutex_destroy(&ub->mutex);
  3942. mutex_destroy(&ub->cancel_mutex);
  3943. kfree(ub);
  3944. out_unlock:
  3945. mutex_unlock(&ublk_ctl_mutex);
  3946. return ret;
  3947. }
  3948. static inline bool ublk_idr_freed(int id)
  3949. {
  3950. void *ptr;
  3951. spin_lock(&ublk_idr_lock);
  3952. ptr = idr_find(&ublk_index_idr, id);
  3953. spin_unlock(&ublk_idr_lock);
  3954. return ptr == NULL;
  3955. }
  3956. static int ublk_ctrl_del_dev(struct ublk_device **p_ub, bool wait)
  3957. {
  3958. struct ublk_device *ub = *p_ub;
  3959. int idx = ub->ub_number;
  3960. int ret;
  3961. ret = mutex_lock_killable(&ublk_ctl_mutex);
  3962. if (ret)
  3963. return ret;
  3964. if (!test_bit(UB_STATE_DELETED, &ub->state)) {
  3965. ublk_remove(ub);
  3966. set_bit(UB_STATE_DELETED, &ub->state);
  3967. }
  3968. /* Mark the reference as consumed */
  3969. *p_ub = NULL;
  3970. ublk_put_device(ub);
  3971. mutex_unlock(&ublk_ctl_mutex);
  3972. /*
  3973. * Wait until the idr is removed, then it can be reused after
  3974. * DEL_DEV command is returned.
  3975. *
  3976. * If we returns because of user interrupt, future delete command
  3977. * may come:
  3978. *
  3979. * - the device number isn't freed, this device won't or needn't
  3980. * be deleted again, since UB_STATE_DELETED is set, and device
  3981. * will be released after the last reference is dropped
  3982. *
  3983. * - the device number is freed already, we will not find this
  3984. * device via ublk_get_device_from_id()
  3985. */
  3986. if (wait && wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx)))
  3987. return -EINTR;
  3988. return 0;
  3989. }
  3990. static inline void ublk_ctrl_cmd_dump(u32 cmd_op,
  3991. const struct ublksrv_ctrl_cmd *header)
  3992. {
  3993. pr_devel("%s: cmd_op %x, dev id %d qid %d data %llx buf %llx len %u\n",
  3994. __func__, cmd_op, header->dev_id, header->queue_id,
  3995. header->data[0], header->addr, header->len);
  3996. }
  /* UBLK_CMD_STOP_DEV: control-path entry point, simply forwards to ublk_stop_dev(). */
  static void ublk_ctrl_stop_dev(struct ublk_device *ub)
  {
  	ublk_stop_dev(ub);
  }
  4001. static int ublk_ctrl_try_stop_dev(struct ublk_device *ub)
  4002. {
  4003. struct gendisk *disk;
  4004. int ret = 0;
  4005. disk = ublk_get_disk(ub);
  4006. if (!disk)
  4007. return -ENODEV;
  4008. mutex_lock(&disk->open_mutex);
  4009. if (disk_openers(disk) > 0) {
  4010. ret = -EBUSY;
  4011. goto unlock;
  4012. }
  4013. ub->block_open = true;
  4014. /* release open_mutex as del_gendisk() will reacquire it */
  4015. mutex_unlock(&disk->open_mutex);
  4016. ublk_ctrl_stop_dev(ub);
  4017. goto out;
  4018. unlock:
  4019. mutex_unlock(&disk->open_mutex);
  4020. out:
  4021. ublk_put_disk(disk);
  4022. return ret;
  4023. }
  4024. static int ublk_ctrl_get_dev_info(struct ublk_device *ub,
  4025. const struct ublksrv_ctrl_cmd *header)
  4026. {
  4027. struct task_struct *p;
  4028. struct pid *pid;
  4029. struct ublksrv_ctrl_dev_info dev_info;
  4030. pid_t init_ublksrv_tgid = ub->dev_info.ublksrv_pid;
  4031. void __user *argp = (void __user *)(unsigned long)header->addr;
  4032. if (header->len < sizeof(struct ublksrv_ctrl_dev_info) || !header->addr)
  4033. return -EINVAL;
  4034. memcpy(&dev_info, &ub->dev_info, sizeof(dev_info));
  4035. dev_info.ublksrv_pid = -1;
  4036. if (init_ublksrv_tgid > 0) {
  4037. rcu_read_lock();
  4038. pid = find_pid_ns(init_ublksrv_tgid, &init_pid_ns);
  4039. p = pid_task(pid, PIDTYPE_TGID);
  4040. if (p) {
  4041. int vnr = task_tgid_vnr(p);
  4042. if (vnr)
  4043. dev_info.ublksrv_pid = vnr;
  4044. }
  4045. rcu_read_unlock();
  4046. }
  4047. if (copy_to_user(argp, &dev_info, sizeof(dev_info)))
  4048. return -EFAULT;
  4049. return 0;
  4050. }
  4051. /* TYPE_DEVT is readonly, so fill it up before returning to userspace */
  4052. static void ublk_ctrl_fill_params_devt(struct ublk_device *ub)
  4053. {
  4054. ub->params.devt.char_major = MAJOR(ub->cdev_dev.devt);
  4055. ub->params.devt.char_minor = MINOR(ub->cdev_dev.devt);
  4056. if (ub->ub_disk) {
  4057. ub->params.devt.disk_major = MAJOR(disk_devt(ub->ub_disk));
  4058. ub->params.devt.disk_minor = MINOR(disk_devt(ub->ub_disk));
  4059. } else {
  4060. ub->params.devt.disk_major = 0;
  4061. ub->params.devt.disk_minor = 0;
  4062. }
  4063. ub->params.types |= UBLK_PARAM_TYPE_DEVT;
  4064. }
  4065. static int ublk_ctrl_get_params(struct ublk_device *ub,
  4066. const struct ublksrv_ctrl_cmd *header)
  4067. {
  4068. void __user *argp = (void __user *)(unsigned long)header->addr;
  4069. struct ublk_params_header ph;
  4070. int ret;
  4071. if (header->len <= sizeof(ph) || !header->addr)
  4072. return -EINVAL;
  4073. if (copy_from_user(&ph, argp, sizeof(ph)))
  4074. return -EFAULT;
  4075. if (ph.len > header->len || !ph.len)
  4076. return -EINVAL;
  4077. if (ph.len > sizeof(struct ublk_params))
  4078. ph.len = sizeof(struct ublk_params);
  4079. mutex_lock(&ub->mutex);
  4080. ublk_ctrl_fill_params_devt(ub);
  4081. if (copy_to_user(argp, &ub->params, ph.len))
  4082. ret = -EFAULT;
  4083. else
  4084. ret = 0;
  4085. mutex_unlock(&ub->mutex);
  4086. return ret;
  4087. }
  4088. static int ublk_ctrl_set_params(struct ublk_device *ub,
  4089. const struct ublksrv_ctrl_cmd *header)
  4090. {
  4091. void __user *argp = (void __user *)(unsigned long)header->addr;
  4092. struct ublk_params_header ph;
  4093. int ret = -EFAULT;
  4094. if (header->len <= sizeof(ph) || !header->addr)
  4095. return -EINVAL;
  4096. if (copy_from_user(&ph, argp, sizeof(ph)))
  4097. return -EFAULT;
  4098. if (ph.len > header->len || !ph.len || !ph.types)
  4099. return -EINVAL;
  4100. if (ph.len > sizeof(struct ublk_params))
  4101. ph.len = sizeof(struct ublk_params);
  4102. mutex_lock(&ub->mutex);
  4103. if (test_bit(UB_STATE_USED, &ub->state)) {
  4104. /*
  4105. * Parameters can only be changed when device hasn't
  4106. * been started yet
  4107. */
  4108. ret = -EACCES;
  4109. } else if (copy_from_user(&ub->params, argp, ph.len)) {
  4110. ret = -EFAULT;
  4111. } else {
  4112. /* clear all we don't support yet */
  4113. ub->params.types &= UBLK_PARAM_TYPE_ALL;
  4114. ret = ublk_validate_params(ub);
  4115. if (ret)
  4116. ub->params.types = 0;
  4117. }
  4118. mutex_unlock(&ub->mutex);
  4119. return ret;
  4120. }
  4121. static int ublk_ctrl_start_recovery(struct ublk_device *ub)
  4122. {
  4123. int ret = -EINVAL;
  4124. mutex_lock(&ub->mutex);
  4125. if (ublk_nosrv_should_stop_dev(ub))
  4126. goto out_unlock;
  4127. /*
  4128. * START_RECOVERY is only allowd after:
  4129. *
  4130. * (1) UB_STATE_OPEN is not set, which means the dying process is exited
  4131. * and related io_uring ctx is freed so file struct of /dev/ublkcX is
  4132. * released.
  4133. *
  4134. * and one of the following holds
  4135. *
  4136. * (2) UBLK_S_DEV_QUIESCED is set, which means the quiesce_work:
  4137. * (a)has quiesced request queue
  4138. * (b)has requeued every inflight rqs whose io_flags is ACTIVE
  4139. * (c)has requeued/aborted every inflight rqs whose io_flags is NOT ACTIVE
  4140. * (d)has completed/camceled all ioucmds owned by ther dying process
  4141. *
  4142. * (3) UBLK_S_DEV_FAIL_IO is set, which means the queue is not
  4143. * quiesced, but all I/O is being immediately errored
  4144. */
  4145. if (test_bit(UB_STATE_OPEN, &ub->state) || !ublk_dev_in_recoverable_state(ub)) {
  4146. ret = -EBUSY;
  4147. goto out_unlock;
  4148. }
  4149. pr_devel("%s: start recovery for dev id %d\n", __func__, ub->ub_number);
  4150. init_completion(&ub->completion);
  4151. ret = 0;
  4152. out_unlock:
  4153. mutex_unlock(&ub->mutex);
  4154. return ret;
  4155. }
  4156. static int ublk_ctrl_end_recovery(struct ublk_device *ub,
  4157. const struct ublksrv_ctrl_cmd *header)
  4158. {
  4159. int ublksrv_pid = (int)header->data[0];
  4160. int ret = -EINVAL;
  4161. pr_devel("%s: Waiting for all FETCH_REQs, dev id %d...\n", __func__,
  4162. header->dev_id);
  4163. if (wait_for_completion_interruptible(&ub->completion))
  4164. return -EINTR;
  4165. pr_devel("%s: All FETCH_REQs received, dev id %d\n", __func__,
  4166. header->dev_id);
  4167. if (!ublk_validate_user_pid(ub, ublksrv_pid))
  4168. return -EINVAL;
  4169. mutex_lock(&ub->mutex);
  4170. if (ublk_nosrv_should_stop_dev(ub))
  4171. goto out_unlock;
  4172. if (!ublk_dev_in_recoverable_state(ub)) {
  4173. ret = -EBUSY;
  4174. goto out_unlock;
  4175. }
  4176. ub->dev_info.ublksrv_pid = ub->ublksrv_tgid;
  4177. ub->dev_info.state = UBLK_S_DEV_LIVE;
  4178. pr_devel("%s: new ublksrv_pid %d, dev id %d\n",
  4179. __func__, ublksrv_pid, header->dev_id);
  4180. blk_mq_kick_requeue_list(ub->ub_disk->queue);
  4181. ret = 0;
  4182. out_unlock:
  4183. mutex_unlock(&ub->mutex);
  4184. return ret;
  4185. }
  4186. static int ublk_ctrl_get_features(const struct ublksrv_ctrl_cmd *header)
  4187. {
  4188. void __user *argp = (void __user *)(unsigned long)header->addr;
  4189. u64 features = UBLK_F_ALL;
  4190. if (header->len != UBLK_FEATURES_LEN || !header->addr)
  4191. return -EINVAL;
  4192. if (copy_to_user(argp, &features, UBLK_FEATURES_LEN))
  4193. return -EFAULT;
  4194. return 0;
  4195. }
  4196. static int ublk_ctrl_set_size(struct ublk_device *ub, const struct ublksrv_ctrl_cmd *header)
  4197. {
  4198. struct ublk_param_basic *p = &ub->params.basic;
  4199. u64 new_size = header->data[0];
  4200. int ret = 0;
  4201. mutex_lock(&ub->mutex);
  4202. if (!ub->ub_disk) {
  4203. ret = -ENODEV;
  4204. goto out;
  4205. }
  4206. p->dev_sectors = new_size;
  4207. set_capacity_and_notify(ub->ub_disk, p->dev_sectors);
  4208. out:
  4209. mutex_unlock(&ub->mutex);
  4210. return ret;
  4211. }
  /* Iteration context for ublk_count_busy_req(). */
  struct count_busy {
  	/* only requests whose hctx driver_data equals this queue are counted */
  	const struct ublk_queue *ubq;
  	/* number of matching requests found so far */
  	unsigned int nr_busy;
  };
  4216. static bool ublk_count_busy_req(struct request *rq, void *data)
  4217. {
  4218. struct count_busy *idle = data;
  4219. if (!blk_mq_request_started(rq) && rq->mq_hctx->driver_data == idle->ubq)
  4220. idle->nr_busy += 1;
  4221. return true;
  4222. }
  4223. /* uring_cmd is guaranteed to be active if the associated request is idle */
  4224. static bool ubq_has_idle_io(const struct ublk_queue *ubq)
  4225. {
  4226. struct count_busy data = {
  4227. .ubq = ubq,
  4228. };
  4229. blk_mq_tagset_busy_iter(&ubq->dev->tag_set, ublk_count_busy_req, &data);
  4230. return data.nr_busy < ubq->q_depth;
  4231. }
  4232. /* Wait until each hw queue has at least one idle IO */
  4233. static int ublk_wait_for_idle_io(struct ublk_device *ub,
  4234. unsigned int timeout_ms)
  4235. {
  4236. unsigned int elapsed = 0;
  4237. int ret;
  4238. /*
  4239. * For UBLK_F_BATCH_IO ublk server can get notified with existing
  4240. * or new fetch command, so needn't wait any more
  4241. */
  4242. if (ublk_dev_support_batch_io(ub))
  4243. return 0;
  4244. while (elapsed < timeout_ms && !signal_pending(current)) {
  4245. unsigned int queues_cancelable = 0;
  4246. int i;
  4247. for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
  4248. struct ublk_queue *ubq = ublk_get_queue(ub, i);
  4249. queues_cancelable += !!ubq_has_idle_io(ubq);
  4250. }
  4251. /*
  4252. * Each queue needs at least one active command for
  4253. * notifying ublk server
  4254. */
  4255. if (queues_cancelable == ub->dev_info.nr_hw_queues)
  4256. break;
  4257. msleep(UBLK_REQUEUE_DELAY_MS);
  4258. elapsed += UBLK_REQUEUE_DELAY_MS;
  4259. }
  4260. if (signal_pending(current))
  4261. ret = -EINTR;
  4262. else if (elapsed >= timeout_ms)
  4263. ret = -EBUSY;
  4264. else
  4265. ret = 0;
  4266. return ret;
  4267. }
  4268. static int ublk_ctrl_quiesce_dev(struct ublk_device *ub,
  4269. const struct ublksrv_ctrl_cmd *header)
  4270. {
  4271. /* zero means wait forever */
  4272. u64 timeout_ms = header->data[0];
  4273. struct gendisk *disk;
  4274. int ret = -ENODEV;
  4275. if (!(ub->dev_info.flags & UBLK_F_QUIESCE))
  4276. return -EOPNOTSUPP;
  4277. mutex_lock(&ub->mutex);
  4278. disk = ublk_get_disk(ub);
  4279. if (!disk)
  4280. goto unlock;
  4281. if (ub->dev_info.state == UBLK_S_DEV_DEAD)
  4282. goto put_disk;
  4283. ret = 0;
  4284. /* already in expected state */
  4285. if (ub->dev_info.state != UBLK_S_DEV_LIVE)
  4286. goto put_disk;
  4287. /* Mark the device as canceling */
  4288. mutex_lock(&ub->cancel_mutex);
  4289. blk_mq_quiesce_queue(disk->queue);
  4290. ublk_set_canceling(ub, true);
  4291. blk_mq_unquiesce_queue(disk->queue);
  4292. mutex_unlock(&ub->cancel_mutex);
  4293. if (!timeout_ms)
  4294. timeout_ms = UINT_MAX;
  4295. ret = ublk_wait_for_idle_io(ub, timeout_ms);
  4296. put_disk:
  4297. ublk_put_disk(disk);
  4298. unlock:
  4299. mutex_unlock(&ub->mutex);
  4300. /* Cancel pending uring_cmd */
  4301. if (!ret)
  4302. ublk_cancel_dev(ub);
  4303. return ret;
  4304. }
  4305. /*
  4306. * All control commands are sent via /dev/ublk-control, so we have to check
  4307. * the destination device's permission
  4308. */
  4309. static int ublk_char_dev_permission(struct ublk_device *ub,
  4310. const char *dev_path, int mask)
  4311. {
  4312. int err;
  4313. struct path path;
  4314. struct kstat stat;
  4315. err = kern_path(dev_path, LOOKUP_FOLLOW, &path);
  4316. if (err)
  4317. return err;
  4318. err = vfs_getattr(&path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT);
  4319. if (err)
  4320. goto exit;
  4321. err = -EPERM;
  4322. if (stat.rdev != ub->cdev_dev.devt || !S_ISCHR(stat.mode))
  4323. goto exit;
  4324. err = inode_permission(&nop_mnt_idmap,
  4325. d_backing_inode(path.dentry), mask);
  4326. exit:
  4327. path_put(&path);
  4328. return err;
  4329. }
  4330. static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
  4331. u32 cmd_op, struct ublksrv_ctrl_cmd *header)
  4332. {
  4333. bool unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV;
  4334. void __user *argp = (void __user *)(unsigned long)header->addr;
  4335. char *dev_path = NULL;
  4336. int ret = 0;
  4337. int mask;
  4338. if (!unprivileged) {
  4339. if (!capable(CAP_SYS_ADMIN))
  4340. return -EPERM;
  4341. /*
  4342. * The new added command of UBLK_CMD_GET_DEV_INFO2 includes
  4343. * char_dev_path in payload too, since userspace may not
  4344. * know if the specified device is created as unprivileged
  4345. * mode.
  4346. */
  4347. if (_IOC_NR(cmd_op) != UBLK_CMD_GET_DEV_INFO2)
  4348. return 0;
  4349. }
  4350. /*
  4351. * User has to provide the char device path for unprivileged ublk
  4352. *
  4353. * header->addr always points to the dev path buffer, and
  4354. * header->dev_path_len records length of dev path buffer.
  4355. */
  4356. if (!header->dev_path_len || header->dev_path_len > PATH_MAX)
  4357. return -EINVAL;
  4358. if (header->len < header->dev_path_len)
  4359. return -EINVAL;
  4360. dev_path = memdup_user_nul(argp, header->dev_path_len);
  4361. if (IS_ERR(dev_path))
  4362. return PTR_ERR(dev_path);
  4363. ret = -EINVAL;
  4364. switch (_IOC_NR(cmd_op)) {
  4365. case UBLK_CMD_GET_DEV_INFO:
  4366. case UBLK_CMD_GET_DEV_INFO2:
  4367. case UBLK_CMD_GET_QUEUE_AFFINITY:
  4368. case UBLK_CMD_GET_PARAMS:
  4369. case (_IOC_NR(UBLK_U_CMD_GET_FEATURES)):
  4370. mask = MAY_READ;
  4371. break;
  4372. case UBLK_CMD_START_DEV:
  4373. case UBLK_CMD_STOP_DEV:
  4374. case UBLK_CMD_ADD_DEV:
  4375. case UBLK_CMD_DEL_DEV:
  4376. case UBLK_CMD_SET_PARAMS:
  4377. case UBLK_CMD_START_USER_RECOVERY:
  4378. case UBLK_CMD_END_USER_RECOVERY:
  4379. case UBLK_CMD_UPDATE_SIZE:
  4380. case UBLK_CMD_QUIESCE_DEV:
  4381. case UBLK_CMD_TRY_STOP_DEV:
  4382. mask = MAY_READ | MAY_WRITE;
  4383. break;
  4384. default:
  4385. goto exit;
  4386. }
  4387. ret = ublk_char_dev_permission(ub, dev_path, mask);
  4388. if (!ret) {
  4389. header->len -= header->dev_path_len;
  4390. header->addr += header->dev_path_len;
  4391. }
  4392. pr_devel("%s: dev id %d cmd_op %x uid %d gid %d path %s ret %d\n",
  4393. __func__, ub->ub_number, cmd_op,
  4394. ub->dev_info.owner_uid, ub->dev_info.owner_gid,
  4395. dev_path, ret);
  4396. exit:
  4397. kfree(dev_path);
  4398. return ret;
  4399. }
  4400. static bool ublk_ctrl_uring_cmd_may_sleep(u32 cmd_op)
  4401. {
  4402. switch (_IOC_NR(cmd_op)) {
  4403. case UBLK_CMD_GET_QUEUE_AFFINITY:
  4404. case UBLK_CMD_GET_DEV_INFO:
  4405. case UBLK_CMD_GET_DEV_INFO2:
  4406. case _IOC_NR(UBLK_U_CMD_GET_FEATURES):
  4407. return false;
  4408. default:
  4409. return true;
  4410. }
  4411. }
  4412. static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
  4413. unsigned int issue_flags)
  4414. {
  4415. /* May point to userspace-mapped memory */
  4416. const struct ublksrv_ctrl_cmd *ub_src = io_uring_sqe128_cmd(cmd->sqe,
  4417. struct ublksrv_ctrl_cmd);
  4418. struct ublksrv_ctrl_cmd header;
  4419. struct ublk_device *ub = NULL;
  4420. u32 cmd_op = cmd->cmd_op;
  4421. int ret = -EINVAL;
  4422. if (ublk_ctrl_uring_cmd_may_sleep(cmd_op) &&
  4423. issue_flags & IO_URING_F_NONBLOCK)
  4424. return -EAGAIN;
  4425. if (!(issue_flags & IO_URING_F_SQE128))
  4426. return -EINVAL;
  4427. header.dev_id = READ_ONCE(ub_src->dev_id);
  4428. header.queue_id = READ_ONCE(ub_src->queue_id);
  4429. header.len = READ_ONCE(ub_src->len);
  4430. header.addr = READ_ONCE(ub_src->addr);
  4431. header.data[0] = READ_ONCE(ub_src->data[0]);
  4432. header.dev_path_len = READ_ONCE(ub_src->dev_path_len);
  4433. ublk_ctrl_cmd_dump(cmd_op, &header);
  4434. ret = ublk_check_cmd_op(cmd_op);
  4435. if (ret)
  4436. goto out;
  4437. if (cmd_op == UBLK_U_CMD_GET_FEATURES) {
  4438. ret = ublk_ctrl_get_features(&header);
  4439. goto out;
  4440. }
  4441. if (_IOC_NR(cmd_op) != UBLK_CMD_ADD_DEV) {
  4442. ret = -ENODEV;
  4443. ub = ublk_get_device_from_id(header.dev_id);
  4444. if (!ub)
  4445. goto out;
  4446. ret = ublk_ctrl_uring_cmd_permission(ub, cmd_op, &header);
  4447. if (ret)
  4448. goto put_dev;
  4449. }
  4450. switch (_IOC_NR(cmd_op)) {
  4451. case UBLK_CMD_START_DEV:
  4452. ret = ublk_ctrl_start_dev(ub, &header);
  4453. break;
  4454. case UBLK_CMD_STOP_DEV:
  4455. ublk_ctrl_stop_dev(ub);
  4456. ret = 0;
  4457. break;
  4458. case UBLK_CMD_GET_DEV_INFO:
  4459. case UBLK_CMD_GET_DEV_INFO2:
  4460. ret = ublk_ctrl_get_dev_info(ub, &header);
  4461. break;
  4462. case UBLK_CMD_ADD_DEV:
  4463. ret = ublk_ctrl_add_dev(&header);
  4464. break;
  4465. case UBLK_CMD_DEL_DEV:
  4466. ret = ublk_ctrl_del_dev(&ub, true);
  4467. break;
  4468. case UBLK_CMD_DEL_DEV_ASYNC:
  4469. ret = ublk_ctrl_del_dev(&ub, false);
  4470. break;
  4471. case UBLK_CMD_GET_QUEUE_AFFINITY:
  4472. ret = ublk_ctrl_get_queue_affinity(ub, &header);
  4473. break;
  4474. case UBLK_CMD_GET_PARAMS:
  4475. ret = ublk_ctrl_get_params(ub, &header);
  4476. break;
  4477. case UBLK_CMD_SET_PARAMS:
  4478. ret = ublk_ctrl_set_params(ub, &header);
  4479. break;
  4480. case UBLK_CMD_START_USER_RECOVERY:
  4481. ret = ublk_ctrl_start_recovery(ub);
  4482. break;
  4483. case UBLK_CMD_END_USER_RECOVERY:
  4484. ret = ublk_ctrl_end_recovery(ub, &header);
  4485. break;
  4486. case UBLK_CMD_UPDATE_SIZE:
  4487. ret = ublk_ctrl_set_size(ub, &header);
  4488. break;
  4489. case UBLK_CMD_QUIESCE_DEV:
  4490. ret = ublk_ctrl_quiesce_dev(ub, &header);
  4491. break;
  4492. case UBLK_CMD_TRY_STOP_DEV:
  4493. ret = ublk_ctrl_try_stop_dev(ub);
  4494. break;
  4495. default:
  4496. ret = -EOPNOTSUPP;
  4497. break;
  4498. }
  4499. put_dev:
  4500. if (ub)
  4501. ublk_put_device(ub);
  4502. out:
  4503. pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n",
  4504. __func__, ret, cmd_op, header.dev_id, header.queue_id);
  4505. return ret;
  4506. }
  4507. static const struct file_operations ublk_ctl_fops = {
  4508. .open = nonseekable_open,
  4509. .uring_cmd = ublk_ctrl_uring_cmd,
  4510. .owner = THIS_MODULE,
  4511. .llseek = noop_llseek,
  4512. };
  4513. static struct miscdevice ublk_misc = {
  4514. .minor = MISC_DYNAMIC_MINOR,
  4515. .name = "ublk-control",
  4516. .fops = &ublk_ctl_fops,
  4517. };
  4518. static int __init ublk_init(void)
  4519. {
  4520. int ret;
  4521. BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET +
  4522. UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET);
  4523. /*
  4524. * Ensure UBLKSRV_IO_BUF_OFFSET + UBLKSRV_IO_BUF_TOTAL_SIZE
  4525. * doesn't overflow into UBLKSRV_IO_INTEGRITY_FLAG
  4526. */
  4527. BUILD_BUG_ON(UBLKSRV_IO_BUF_OFFSET + UBLKSRV_IO_BUF_TOTAL_SIZE >=
  4528. UBLKSRV_IO_INTEGRITY_FLAG);
  4529. BUILD_BUG_ON(sizeof(struct ublk_auto_buf_reg) != 8);
  4530. init_waitqueue_head(&ublk_idr_wq);
  4531. ret = misc_register(&ublk_misc);
  4532. if (ret)
  4533. return ret;
  4534. ret = alloc_chrdev_region(&ublk_chr_devt, 0, UBLK_MINORS, "ublk-char");
  4535. if (ret)
  4536. goto unregister_mis;
  4537. ret = class_register(&ublk_chr_class);
  4538. if (ret)
  4539. goto free_chrdev_region;
  4540. return 0;
  4541. free_chrdev_region:
  4542. unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
  4543. unregister_mis:
  4544. misc_deregister(&ublk_misc);
  4545. return ret;
  4546. }
  4547. static void __exit ublk_exit(void)
  4548. {
  4549. struct ublk_device *ub;
  4550. int id;
  4551. idr_for_each_entry(&ublk_index_idr, ub, id)
  4552. ublk_remove(ub);
  4553. class_unregister(&ublk_chr_class);
  4554. misc_deregister(&ublk_misc);
  4555. idr_destroy(&ublk_index_idr);
  4556. unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
  4557. }
  /* Module entry and exit points. */
  module_init(ublk_init);
  module_exit(ublk_exit);
  4560. static int ublk_set_max_unprivileged_ublks(const char *buf,
  4561. const struct kernel_param *kp)
  4562. {
  4563. return param_set_uint_minmax(buf, kp, 0, UBLK_MAX_UBLKS);
  4564. }
  4565. static int ublk_get_max_unprivileged_ublks(char *buf,
  4566. const struct kernel_param *kp)
  4567. {
  4568. return sysfs_emit(buf, "%u\n", unprivileged_ublks_max);
  4569. }
  4570. static const struct kernel_param_ops ublk_max_unprivileged_ublks_ops = {
  4571. .set = ublk_set_max_unprivileged_ublks,
  4572. .get = ublk_get_max_unprivileged_ublks,
  4573. };
  4574. module_param_cb(ublks_max, &ublk_max_unprivileged_ublks_ops,
  4575. &unprivileged_ublks_max, 0644);
  4576. MODULE_PARM_DESC(ublks_max, "max number of unprivileged ublk devices allowed to add(default: 64)");
  4577. MODULE_AUTHOR("Ming Lei <ming.lei@redhat.com>");
  4578. MODULE_DESCRIPTION("Userspace block device");
  4579. MODULE_LICENSE("GPL");