skbuff.c 187 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527
75278527952805281528252835284528552865287528852895290529152925293529452955296529752985299530053015302530353045305530653075308530953105311531253135314531553165317531853195320532153225323532453255326532753285329533053315332533353345335533653375338533953405341534253435344534553465347534853495350535153525353535453555356535753585359536053615362536353645365536653675368536953705371537253735374537553765377537853795380538153825383538453855386538753885389539053915392539353945395539653975398539954005401540254035404540554065407540854095410541154125413541454155416541754185419542054215422542354245425542654275428542954305431543254335434543554365437543854395440544154425443544454455446544754485449545054515452545354545455545654575458545954605461546254635464546554665467546854695470547154725473547454755476547754785479548054815482548354845485548654875488548954905491549254935494549554965497549854995500550155025503550455055506550755085509551055115512551355145515551655175518551955205521552255235524552555265527552855295530553155325533553455355536553755385539554055415542554355445545554655475548554955505551555255535554555555565557555855595560556155625563556455655566556755685569557055715572557355745575557655775578557955805581558255835584558555865587558855895590559155925593559455955596559755985599560056015602560356045605560656075608560956105611561256135614561556165617561856195620562156225623562456255626562756285629563056315632563356345635563656375638563956405641564256435644564556465647564856495650565156525653565456555656565756585659566056615662566356645665566656675668566956705671567256735674567556765677567856795680568156825683568456855686568756885689569056915692569356945695569656975698569957005701570257035704570557065707570857095710571157125713571457155716571757185719572057215722572357245725572657275728572957305731573257335734573557365737573857395740574157425743574457455746574757485749575057515752575357545755575657575758575957605761576257635764576557665767576857695770577157725773577457755776577
75778577957805781578257835784578557865787578857895790579157925793579457955796579757985799580058015802580358045805580658075808580958105811581258135814581558165817581858195820582158225823582458255826582758285829583058315832583358345835583658375838583958405841584258435844584558465847584858495850585158525853585458555856585758585859586058615862586358645865586658675868586958705871587258735874587558765877587858795880588158825883588458855886588758885889589058915892589358945895589658975898589959005901590259035904590559065907590859095910591159125913591459155916591759185919592059215922592359245925592659275928592959305931593259335934593559365937593859395940594159425943594459455946594759485949595059515952595359545955595659575958595959605961596259635964596559665967596859695970597159725973597459755976597759785979598059815982598359845985598659875988598959905991599259935994599559965997599859996000600160026003600460056006600760086009601060116012601360146015601660176018601960206021602260236024602560266027602860296030603160326033603460356036603760386039604060416042604360446045604660476048604960506051605260536054605560566057605860596060606160626063606460656066606760686069607060716072607360746075607660776078607960806081608260836084608560866087608860896090609160926093609460956096609760986099610061016102610361046105610661076108610961106111611261136114611561166117611861196120612161226123612461256126612761286129613061316132613361346135613661376138613961406141614261436144614561466147614861496150615161526153615461556156615761586159616061616162616361646165616661676168616961706171617261736174617561766177617861796180618161826183618461856186618761886189619061916192619361946195619661976198619962006201620262036204620562066207620862096210621162126213621462156216621762186219622062216222622362246225622662276228622962306231623262336234623562366237623862396240624162426243624462456246624762486249625062516252625362546255625662576258625962606261626262636264626562666267626862696270627162726273627462756276627
76278627962806281628262836284628562866287628862896290629162926293629462956296629762986299630063016302630363046305630663076308630963106311631263136314631563166317631863196320632163226323632463256326632763286329633063316332633363346335633663376338633963406341634263436344634563466347634863496350635163526353635463556356635763586359636063616362636363646365636663676368636963706371637263736374637563766377637863796380638163826383638463856386638763886389639063916392639363946395639663976398639964006401640264036404640564066407640864096410641164126413641464156416641764186419642064216422642364246425642664276428642964306431643264336434643564366437643864396440644164426443644464456446644764486449645064516452645364546455645664576458645964606461646264636464646564666467646864696470647164726473647464756476647764786479648064816482648364846485648664876488648964906491649264936494649564966497649864996500650165026503650465056506650765086509651065116512651365146515651665176518651965206521652265236524652565266527652865296530653165326533653465356536653765386539654065416542654365446545654665476548654965506551655265536554655565566557655865596560656165626563656465656566656765686569657065716572657365746575657665776578657965806581658265836584658565866587658865896590659165926593659465956596659765986599660066016602660366046605660666076608660966106611661266136614661566166617661866196620662166226623662466256626662766286629663066316632663366346635663666376638663966406641664266436644664566466647664866496650665166526653665466556656665766586659666066616662666366646665666666676668666966706671667266736674667566766677667866796680668166826683668466856686668766886689669066916692669366946695669666976698669967006701670267036704670567066707670867096710671167126713671467156716671767186719672067216722672367246725672667276728672967306731673267336734673567366737673867396740674167426743674467456746674767486749675067516752675367546755675667576758675967606761676267636764676567666767676867696770677167726773677467756776677
76778677967806781678267836784678567866787678867896790679167926793679467956796679767986799680068016802680368046805680668076808680968106811681268136814681568166817681868196820682168226823682468256826682768286829683068316832683368346835683668376838683968406841684268436844684568466847684868496850685168526853685468556856685768586859686068616862686368646865686668676868686968706871687268736874687568766877687868796880688168826883688468856886688768886889689068916892689368946895689668976898689969006901690269036904690569066907690869096910691169126913691469156916691769186919692069216922692369246925692669276928692969306931693269336934693569366937693869396940694169426943694469456946694769486949695069516952695369546955695669576958695969606961696269636964696569666967696869696970697169726973697469756976697769786979698069816982698369846985698669876988698969906991699269936994699569966997699869997000700170027003700470057006700770087009701070117012701370147015701670177018701970207021702270237024702570267027702870297030703170327033703470357036703770387039704070417042704370447045704670477048704970507051705270537054705570567057705870597060706170627063706470657066706770687069707070717072707370747075707670777078707970807081708270837084708570867087708870897090709170927093709470957096709770987099710071017102710371047105710671077108710971107111711271137114711571167117711871197120712171227123712471257126712771287129713071317132713371347135713671377138713971407141714271437144714571467147714871497150715171527153715471557156715771587159716071617162716371647165716671677168716971707171717271737174717571767177717871797180718171827183718471857186718771887189719071917192719371947195719671977198719972007201720272037204720572067207720872097210721172127213721472157216721772187219722072217222722372247225722672277228722972307231723272337234723572367237723872397240724172427243724472457246724772487249725072517252725372547255725672577258725972607261726272637264726572667267726872697270727172727273727472757276727
77278727972807281728272837284728572867287728872897290729172927293729472957296729772987299730073017302730373047305730673077308730973107311731273137314731573167317731873197320732173227323732473257326732773287329733073317332733373347335733673377338733973407341734273437344734573467347734873497350735173527353735473557356735773587359736073617362736373647365736673677368736973707371737273737374737573767377737873797380738173827383738473857386738773887389739073917392739373947395739673977398739974007401740274037404740574067407740874097410741174127413741474157416741774187419742074217422742374247425742674277428742974307431743274337434743574367437743874397440744174427443744474457446744774487449745074517452745374547455745674577458745974607461746274637464746574667467746874697470747174727473747474757476747774787479748074817482748374847485748674877488748974907491749274937494749574967497
  1. // SPDX-License-Identifier: GPL-2.0-or-later
  2. /*
  3. * Routines having to do with the 'struct sk_buff' memory handlers.
  4. *
  5. * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk>
  6. * Florian La Roche <rzsfl@rz.uni-sb.de>
  7. *
  8. * Fixes:
  9. * Alan Cox : Fixed the worst of the load
  10. * balancer bugs.
  11. * Dave Platt : Interrupt stacking fix.
  12. * Richard Kooijman : Timestamp fixes.
  13. * Alan Cox : Changed buffer format.
  14. * Alan Cox : destructor hook for AF_UNIX etc.
  15. * Linus Torvalds : Better skb_clone.
  16. * Alan Cox : Added skb_copy.
  17. * Alan Cox : Added all the changed routines Linus
  18. * only put in the headers
  19. * Ray VanTassle : Fixed --skb->lock in free
  20. * Alan Cox : skb_copy copy arp field
  21. * Andi Kleen : slabified it.
  22. * Robert Olsson : Removed skb_head_pool
  23. *
  24. * NOTE:
  25. * The __skb_ routines should be called with interrupts
  26. * disabled, or you better be *real* sure that the operation is atomic
  27. * with respect to whatever list is being frobbed (e.g. via lock_sock()
  28. * or via disabling bottom half handlers, etc).
  29. */
  30. /*
  31. * The functions in this file will not compile correctly with gcc 2.4.x
  32. */
  33. #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  34. #include <linux/module.h>
  35. #include <linux/types.h>
  36. #include <linux/kernel.h>
  37. #include <linux/mm.h>
  38. #include <linux/interrupt.h>
  39. #include <linux/in.h>
  40. #include <linux/inet.h>
  41. #include <linux/slab.h>
  42. #include <linux/tcp.h>
  43. #include <linux/udp.h>
  44. #include <linux/sctp.h>
  45. #include <linux/netdevice.h>
  46. #ifdef CONFIG_NET_CLS_ACT
  47. #include <net/pkt_sched.h>
  48. #endif
  49. #include <linux/string.h>
  50. #include <linux/skbuff.h>
  51. #include <linux/skbuff_ref.h>
  52. #include <linux/splice.h>
  53. #include <linux/cache.h>
  54. #include <linux/rtnetlink.h>
  55. #include <linux/init.h>
  56. #include <linux/scatterlist.h>
  57. #include <linux/errqueue.h>
  58. #include <linux/prefetch.h>
  59. #include <linux/bitfield.h>
  60. #include <linux/if_vlan.h>
  61. #include <linux/mpls.h>
  62. #include <linux/kcov.h>
  63. #include <linux/iov_iter.h>
  64. #include <linux/crc32.h>
  65. #include <net/protocol.h>
  66. #include <net/dst.h>
  67. #include <net/sock.h>
  68. #include <net/checksum.h>
  69. #include <net/gro.h>
  70. #include <net/gso.h>
  71. #include <net/hotdata.h>
  72. #include <net/ip6_checksum.h>
  73. #include <net/xfrm.h>
  74. #include <net/mpls.h>
  75. #include <net/mptcp.h>
  76. #include <net/mctp.h>
  77. #include <net/can.h>
  78. #include <net/page_pool/helpers.h>
  79. #include <net/psp/types.h>
  80. #include <net/dropreason.h>
  81. #include <net/xdp_sock.h>
  82. #include <linux/uaccess.h>
  83. #include <trace/events/skb.h>
  84. #include <linux/highmem.h>
  85. #include <linux/capability.h>
  86. #include <linux/user_namespace.h>
  87. #include <linux/indirect_call_wrapper.h>
  88. #include <linux/textsearch.h>
  89. #include "dev.h"
  90. #include "devmem.h"
  91. #include "netmem_priv.h"
  92. #include "sock_destructor.h"
#ifdef CONFIG_SKB_EXTENSIONS
/* Slab cache pointer that only exists with CONFIG_SKB_EXTENSIONS.
 * NOTE(review): presumably backs skb extension objects; it is set up and
 * used outside this part of the file - confirm there.
 */
static struct kmem_cache *skbuff_ext_cache __ro_after_init;
#endif

/* GRO_MAX_HEAD plus the NET_SKB_PAD and NET_IP_ALIGN padding. */
#define GRO_MAX_HEAD_PAD (GRO_MAX_HEAD + NET_SKB_PAD + NET_IP_ALIGN)

/* The larger of MAX_TCP_HEADER and GRO_MAX_HEAD_PAD, passed through
 * SKB_HEAD_ALIGN().
 */
#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(max(MAX_TCP_HEADER, \
					       GRO_MAX_HEAD_PAD))

/* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two.
 * This should ensure that SKB_SMALL_HEAD_HEADROOM is a unique
 * size, and we can differentiate heads from skb_small_head_cache
 * vs system slabs by looking at their size (skb_end_offset()).
 *
 * When SKB_SMALL_HEAD_SIZE happens to be a power of two, one extra
 * L1_CACHE_BYTES is added to move it off that value.
 */
#define SKB_SMALL_HEAD_CACHE_SIZE \
	(is_power_of_2(SKB_SMALL_HEAD_SIZE) ? \
	 (SKB_SMALL_HEAD_SIZE + L1_CACHE_BYTES) : \
	 SKB_SMALL_HEAD_SIZE)

/* SKB_SMALL_HEAD_CACHE_SIZE as seen through SKB_WITH_OVERHEAD(). */
#define SKB_SMALL_HEAD_HEADROOM \
	SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE)
/* kcm_write_msgs() relies on casting paged frags to bio_vec to use
 * iov_iter_bvec(). These static asserts ensure the cast is valid as long as
 * the netmem is a page.
 *
 * Each bio_vec field is compared with its skb_frag_t counterpart for both
 * offset and size: bv_page/netmem, bv_len/len and bv_offset/offset.
 */
static_assert(offsetof(struct bio_vec, bv_page) ==
	      offsetof(skb_frag_t, netmem));
static_assert(sizeof_field(struct bio_vec, bv_page) ==
	      sizeof_field(skb_frag_t, netmem));
static_assert(offsetof(struct bio_vec, bv_len) == offsetof(skb_frag_t, len));
static_assert(sizeof_field(struct bio_vec, bv_len) ==
	      sizeof_field(skb_frag_t, len));
static_assert(offsetof(struct bio_vec, bv_offset) ==
	      offsetof(skb_frag_t, offset));
static_assert(sizeof_field(struct bio_vec, bv_offset) ==
	      sizeof_field(skb_frag_t, offset));
/* Core drop-reason name table. FN() produces a designated initializer that
 * stores a reason's name, as a string, at index SKB_DROP_REASON_<reason>.
 * NOTE(review): DEFINE_DROP_REASON() is defined elsewhere; presumably it
 * applies FN() once per core drop reason - confirm in its header.
 */
#undef FN
#define FN(reason) [SKB_DROP_REASON_##reason] = #reason,
static const char * const drop_reasons[] = {
	[SKB_CONSUMED] = "CONSUMED",
	DEFINE_DROP_REASON(FN, FN)
};

/* Descriptor pairing the core name table with its number of entries. */
static const struct drop_reason_list drop_reasons_core = {
	.reasons = drop_reasons,
	.n_reasons = ARRAY_SIZE(drop_reasons),
};

/* One RCU-managed list pointer per drop reason subsystem. Only the core slot
 * is filled in statically; the remaining slots start out NULL.
 */
const struct drop_reason_list __rcu *
drop_reasons_by_subsys[SKB_DROP_REASON_SUBSYS_NUM] = {
	[SKB_DROP_REASON_SUBSYS_CORE] = RCU_INITIALIZER(&drop_reasons_core),
};
EXPORT_SYMBOL(drop_reasons_by_subsys);
  140. /**
  141. * drop_reasons_register_subsys - register another drop reason subsystem
  142. * @subsys: the subsystem to register, must not be the core
  143. * @list: the list of drop reasons within the subsystem, must point to
  144. * a statically initialized list
  145. */
  146. void drop_reasons_register_subsys(enum skb_drop_reason_subsys subsys,
  147. const struct drop_reason_list *list)
  148. {
  149. if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE ||
  150. subsys >= ARRAY_SIZE(drop_reasons_by_subsys),
  151. "invalid subsystem %d\n", subsys))
  152. return;
  153. /* must point to statically allocated memory, so INIT is OK */
  154. RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], list);
  155. }
  156. EXPORT_SYMBOL_GPL(drop_reasons_register_subsys);
  157. /**
  158. * drop_reasons_unregister_subsys - unregister a drop reason subsystem
  159. * @subsys: the subsystem to remove, must not be the core
  160. *
  161. * Note: This will synchronize_rcu() to ensure no users when it returns.
  162. */
  163. void drop_reasons_unregister_subsys(enum skb_drop_reason_subsys subsys)
  164. {
  165. if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE ||
  166. subsys >= ARRAY_SIZE(drop_reasons_by_subsys),
  167. "invalid subsystem %d\n", subsys))
  168. return;
  169. RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], NULL);
  170. synchronize_rcu();
  171. }
  172. EXPORT_SYMBOL_GPL(drop_reasons_unregister_subsys);
  173. /**
  174. * skb_panic - private function for out-of-line support
  175. * @skb: buffer
  176. * @sz: size
  177. * @addr: address
  178. * @msg: skb_over_panic or skb_under_panic
  179. *
  180. * Out-of-line support for skb_put() and skb_push().
  181. * Called via the wrapper skb_over_panic() or skb_under_panic().
  182. * Keep out of line to prevent kernel bloat.
  183. * __builtin_return_address is not used because it is not always reliable.
  184. */
  185. static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr,
  186. const char msg[])
  187. {
  188. pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n",
  189. msg, addr, skb->len, sz, skb->head, skb->data,
  190. (unsigned long)skb->tail, (unsigned long)skb->end,
  191. skb->dev ? skb->dev->name : "<NULL>");
  192. BUG();
  193. }
/* Report through skb_panic(), using this function's own name as the
 * message tag.
 */
static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
	skb_panic(skb, sz, addr, __func__);
}
/* Report through skb_panic(), using this function's own name as the
 * message tag.
 */
static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
	skb_panic(skb, sz, addr, __func__);
}
/* Sizing of the per-CPU skb head cache: capacity of the pointer array and
 * the batch size used when refilling it (BULK).
 * NOTE(review): FREE is presumably the batch size for flushing the cache;
 * its user is outside this part of the file - confirm there.
 */
#define NAPI_SKB_CACHE_SIZE 128
#define NAPI_SKB_CACHE_BULK 32
#define NAPI_SKB_CACHE_FREE 32

/* Per-CPU NAPI allocation state.
 * bh_lock:   local lock (nested-BH variant) held around accesses below
 * page:      page fragment cache used for head/frag data
 * skb_count: number of valid entries in skb_cache[]
 * skb_cache: stack of cached sk_buff heads, popped from the top
 */
struct napi_alloc_cache {
	local_lock_t bh_lock;
	struct page_frag_cache page;
	unsigned int skb_count;
	void *skb_cache[NAPI_SKB_CACHE_SIZE];
};

/* Page fragment cache used by the netdev allocation helpers when they run
 * in hardirq context or with IRQs disabled.
 */
static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache) = {
	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
};
  215. void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
  216. {
  217. struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
  218. void *data;
  219. fragsz = SKB_DATA_ALIGN(fragsz);
  220. local_lock_nested_bh(&napi_alloc_cache.bh_lock);
  221. data = __page_frag_alloc_align(&nc->page, fragsz,
  222. GFP_ATOMIC | __GFP_NOWARN, align_mask);
  223. local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
  224. return data;
  225. }
  226. EXPORT_SYMBOL(__napi_alloc_frag_align);
  227. void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
  228. {
  229. void *data;
  230. if (in_hardirq() || irqs_disabled()) {
  231. struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache);
  232. fragsz = SKB_DATA_ALIGN(fragsz);
  233. data = __page_frag_alloc_align(nc, fragsz,
  234. GFP_ATOMIC | __GFP_NOWARN,
  235. align_mask);
  236. } else {
  237. local_bh_disable();
  238. data = __napi_alloc_frag_align(fragsz, align_mask);
  239. local_bh_enable();
  240. }
  241. return data;
  242. }
  243. EXPORT_SYMBOL(__netdev_alloc_frag_align);
/* Cache kmem_cache_size(net_hotdata.skbuff_cache) to help the compiler
 * remove dead code (and skbuff_cache_size) when CONFIG_KASAN is unset.
 * In this part of the file it is only read, as the size argument of
 * kasan_mempool_unpoison_object(); it is assigned elsewhere.
 */
static u32 skbuff_cache_size __read_mostly;
  248. static inline struct sk_buff *napi_skb_cache_get(bool alloc)
  249. {
  250. struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
  251. struct sk_buff *skb;
  252. local_lock_nested_bh(&napi_alloc_cache.bh_lock);
  253. if (unlikely(!nc->skb_count)) {
  254. if (alloc)
  255. nc->skb_count = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
  256. GFP_ATOMIC | __GFP_NOWARN,
  257. NAPI_SKB_CACHE_BULK,
  258. nc->skb_cache);
  259. if (unlikely(!nc->skb_count)) {
  260. local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
  261. return NULL;
  262. }
  263. }
  264. skb = nc->skb_cache[--nc->skb_count];
  265. if (nc->skb_count)
  266. prefetch(nc->skb_cache[nc->skb_count - 1]);
  267. local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
  268. kasan_mempool_unpoison_object(skb, skbuff_cache_size);
  269. return skb;
  270. }
  271. /*
  272. * Only clear those fields we need to clear, not those that we will
  273. * actually initialise later. Hence, don't put any more fields after
  274. * the tail pointer in struct sk_buff!
  275. */
  276. static inline void skbuff_clear(struct sk_buff *skb)
  277. {
  278. /* Replace memset(skb, 0, offsetof(struct sk_buff, tail))
  279. * with two smaller memset(), with a barrier() between them.
  280. * This forces the compiler to inline both calls.
  281. */
  282. BUILD_BUG_ON(offsetof(struct sk_buff, tail) <= 128);
  283. memset(skb, 0, 128);
  284. barrier();
  285. memset((void *)skb + 128, 0, offsetof(struct sk_buff, tail) - 128);
  286. }
  287. /**
  288. * napi_skb_cache_get_bulk - obtain a number of zeroed skb heads from the cache
  289. * @skbs: pointer to an at least @n-sized array to fill with skb pointers
  290. * @n: number of entries to provide
  291. *
  292. * Tries to obtain @n &sk_buff entries from the NAPI percpu cache and writes
  293. * the pointers into the provided array @skbs. If there are less entries
  294. * available, tries to replenish the cache and bulk-allocates the diff from
  295. * the MM layer if needed.
  296. * The heads are being zeroed with either memset() or %__GFP_ZERO, so they are
  297. * ready for {,__}build_skb_around() and don't have any data buffers attached.
  298. * Must be called *only* from the BH context.
  299. *
  300. * Return: number of successfully allocated skbs (@n if no actual allocation
  301. * needed or kmem_cache_alloc_bulk() didn't fail).
  302. */
  303. u32 napi_skb_cache_get_bulk(void **skbs, u32 n)
  304. {
  305. struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
  306. u32 bulk, total = n;
  307. local_lock_nested_bh(&napi_alloc_cache.bh_lock);
  308. if (nc->skb_count >= n)
  309. goto get;
  310. /* No enough cached skbs. Try refilling the cache first */
  311. bulk = min(NAPI_SKB_CACHE_SIZE - nc->skb_count, NAPI_SKB_CACHE_BULK);
  312. nc->skb_count += kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
  313. GFP_ATOMIC | __GFP_NOWARN, bulk,
  314. &nc->skb_cache[nc->skb_count]);
  315. if (likely(nc->skb_count >= n))
  316. goto get;
  317. /* Still not enough. Bulk-allocate the missing part directly, zeroed */
  318. n -= kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
  319. GFP_ATOMIC | __GFP_ZERO | __GFP_NOWARN,
  320. n - nc->skb_count, &skbs[nc->skb_count]);
  321. if (likely(nc->skb_count >= n))
  322. goto get;
  323. /* kmem_cache didn't allocate the number we need, limit the output */
  324. total -= n - nc->skb_count;
  325. n = nc->skb_count;
  326. get:
  327. for (u32 base = nc->skb_count - n, i = 0; i < n; i++) {
  328. skbs[i] = nc->skb_cache[base + i];
  329. kasan_mempool_unpoison_object(skbs[i], skbuff_cache_size);
  330. skbuff_clear(skbs[i]);
  331. }
  332. nc->skb_count -= n;
  333. local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
  334. return total;
  335. }
  336. EXPORT_SYMBOL_GPL(napi_skb_cache_get_bulk);
  337. static inline void __finalize_skb_around(struct sk_buff *skb, void *data,
  338. unsigned int size)
  339. {
  340. struct skb_shared_info *shinfo;
  341. size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
  342. /* Assumes caller memset cleared SKB */
  343. skb->truesize = SKB_TRUESIZE(size);
  344. refcount_set(&skb->users, 1);
  345. skb->head = data;
  346. skb->data = data;
  347. skb_reset_tail_pointer(skb);
  348. skb_set_end_offset(skb, size);
  349. skb->mac_header = (typeof(skb->mac_header))~0U;
  350. skb->transport_header = (typeof(skb->transport_header))~0U;
  351. skb->alloc_cpu = raw_smp_processor_id();
  352. /* make sure we initialize shinfo sequentially */
  353. shinfo = skb_shinfo(skb);
  354. memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
  355. atomic_set(&shinfo->dataref, 1);
  356. skb_set_kcov_handle(skb, kcov_common_handle());
  357. }
  358. static inline void *__slab_build_skb(void *data, unsigned int *size)
  359. {
  360. void *resized;
  361. /* Must find the allocation size (and grow it to match). */
  362. *size = ksize(data);
  363. /* krealloc() will immediately return "data" when
  364. * "ksize(data)" is requested: it is the existing upper
  365. * bounds. As a result, GFP_ATOMIC will be ignored. Note
  366. * that this "new" pointer needs to be passed back to the
  367. * caller for use so the __alloc_size hinting will be
  368. * tracked correctly.
  369. */
  370. resized = krealloc(data, *size, GFP_ATOMIC);
  371. WARN_ON_ONCE(resized != data);
  372. return resized;
  373. }
  374. /* build_skb() variant which can operate on slab buffers.
  375. * Note that this should be used sparingly as slab buffers
  376. * cannot be combined efficiently by GRO!
  377. */
  378. struct sk_buff *slab_build_skb(void *data)
  379. {
  380. struct sk_buff *skb;
  381. unsigned int size;
  382. skb = kmem_cache_alloc(net_hotdata.skbuff_cache,
  383. GFP_ATOMIC | __GFP_NOWARN);
  384. if (unlikely(!skb))
  385. return NULL;
  386. skbuff_clear(skb);
  387. data = __slab_build_skb(data, &size);
  388. __finalize_skb_around(skb, data, size);
  389. return skb;
  390. }
  391. EXPORT_SYMBOL(slab_build_skb);
  392. /* Caller must provide SKB that is memset cleared */
  393. static void __build_skb_around(struct sk_buff *skb, void *data,
  394. unsigned int frag_size)
  395. {
  396. unsigned int size = frag_size;
  397. /* frag_size == 0 is considered deprecated now. Callers
  398. * using slab buffer should use slab_build_skb() instead.
  399. */
  400. if (WARN_ONCE(size == 0, "Use slab_build_skb() instead"))
  401. data = __slab_build_skb(data, &size);
  402. __finalize_skb_around(skb, data, size);
  403. }
  404. /**
  405. * __build_skb - build a network buffer
  406. * @data: data buffer provided by caller
  407. * @frag_size: size of data (must not be 0)
  408. *
  409. * Allocate a new &sk_buff. Caller provides space holding head and
  410. * skb_shared_info. @data must have been allocated from the page
  411. * allocator or vmalloc(). (A @frag_size of 0 to indicate a kmalloc()
  412. * allocation is deprecated, and callers should use slab_build_skb()
  413. * instead.)
  414. * The return is the new skb buffer.
  415. * On a failure the return is %NULL, and @data is not freed.
  416. * Notes :
  417. * Before IO, driver allocates only data buffer where NIC put incoming frame
  418. * Driver should add room at head (NET_SKB_PAD) and
  419. * MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info))
  420. * After IO, driver calls build_skb(), to allocate sk_buff and populate it
  421. * before giving packet to stack.
  422. * RX rings only contains data buffers, not full skbs.
  423. */
  424. struct sk_buff *__build_skb(void *data, unsigned int frag_size)
  425. {
  426. struct sk_buff *skb;
  427. skb = kmem_cache_alloc(net_hotdata.skbuff_cache,
  428. GFP_ATOMIC | __GFP_NOWARN);
  429. if (unlikely(!skb))
  430. return NULL;
  431. skbuff_clear(skb);
  432. __build_skb_around(skb, data, frag_size);
  433. return skb;
  434. }
  435. /* build_skb() is wrapper over __build_skb(), that specifically
  436. * takes care of skb->head and skb->pfmemalloc
  437. */
  438. struct sk_buff *build_skb(void *data, unsigned int frag_size)
  439. {
  440. struct sk_buff *skb = __build_skb(data, frag_size);
  441. if (likely(skb && frag_size)) {
  442. skb->head_frag = 1;
  443. skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
  444. }
  445. return skb;
  446. }
  447. EXPORT_SYMBOL(build_skb);
  448. /**
  449. * build_skb_around - build a network buffer around provided skb
  450. * @skb: sk_buff provide by caller, must be memset cleared
  451. * @data: data buffer provided by caller
  452. * @frag_size: size of data
  453. */
  454. struct sk_buff *build_skb_around(struct sk_buff *skb,
  455. void *data, unsigned int frag_size)
  456. {
  457. if (unlikely(!skb))
  458. return NULL;
  459. __build_skb_around(skb, data, frag_size);
  460. if (frag_size) {
  461. skb->head_frag = 1;
  462. skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
  463. }
  464. return skb;
  465. }
  466. EXPORT_SYMBOL(build_skb_around);
  467. /**
  468. * __napi_build_skb - build a network buffer
  469. * @data: data buffer provided by caller
  470. * @frag_size: size of data
  471. *
  472. * Version of __build_skb() that uses NAPI percpu caches to obtain
  473. * skbuff_head instead of inplace allocation.
  474. *
  475. * Returns a new &sk_buff on success, %NULL on allocation failure.
  476. */
  477. static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size)
  478. {
  479. struct sk_buff *skb;
  480. skb = napi_skb_cache_get(true);
  481. if (unlikely(!skb))
  482. return NULL;
  483. skbuff_clear(skb);
  484. __build_skb_around(skb, data, frag_size);
  485. return skb;
  486. }
  487. /**
  488. * napi_build_skb - build a network buffer
  489. * @data: data buffer provided by caller
  490. * @frag_size: size of data
  491. *
  492. * Version of __napi_build_skb() that takes care of skb->head_frag
  493. * and skb->pfmemalloc when the data is a page or page fragment.
  494. *
  495. * Returns a new &sk_buff on success, %NULL on allocation failure.
  496. */
  497. struct sk_buff *napi_build_skb(void *data, unsigned int frag_size)
  498. {
  499. struct sk_buff *skb = __napi_build_skb(data, frag_size);
  500. if (likely(skb) && frag_size) {
  501. skb->head_frag = 1;
  502. skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
  503. }
  504. return skb;
  505. }
  506. EXPORT_SYMBOL(napi_build_skb);
  507. static void *kmalloc_pfmemalloc(size_t obj_size, gfp_t flags, int node)
  508. {
  509. if (!gfp_pfmemalloc_allowed(flags))
  510. return NULL;
  511. if (!obj_size)
  512. return kmem_cache_alloc_node(net_hotdata.skb_small_head_cache,
  513. flags, node);
  514. return kmalloc_node_track_caller(obj_size, flags, node);
  515. }
  516. /*
  517. * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
  518. * the caller if emergency pfmemalloc reserves are being used. If it is and
  519. * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
  520. * may be used. Otherwise, the packet data may be discarded until enough
  521. * memory is free
  522. */
  523. static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node,
  524. struct sk_buff *skb)
  525. {
  526. size_t obj_size;
  527. void *obj;
  528. obj_size = SKB_HEAD_ALIGN(*size);
  529. if (obj_size <= SKB_SMALL_HEAD_CACHE_SIZE &&
  530. !(flags & KMALLOC_NOT_NORMAL_BITS)) {
  531. obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache,
  532. flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
  533. node);
  534. *size = SKB_SMALL_HEAD_CACHE_SIZE;
  535. if (likely(obj))
  536. goto out;
  537. /* Try again but now we are using pfmemalloc reserves */
  538. if (skb)
  539. skb->pfmemalloc = true;
  540. return kmalloc_pfmemalloc(0, flags, node);
  541. }
  542. obj_size = kmalloc_size_roundup(obj_size);
  543. /* The following cast might truncate high-order bits of obj_size, this
  544. * is harmless because kmalloc(obj_size >= 2^32) will fail anyway.
  545. */
  546. *size = (unsigned int)obj_size;
  547. /*
  548. * Try a regular allocation, when that fails and we're not entitled
  549. * to the reserves, fail.
  550. */
  551. obj = kmalloc_node_track_caller(obj_size,
  552. flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
  553. node);
  554. if (likely(obj))
  555. goto out;
  556. /* Try again but now we are using pfmemalloc reserves */
  557. if (skb)
  558. skb->pfmemalloc = true;
  559. obj = kmalloc_pfmemalloc(obj_size, flags, node);
  560. out:
  561. return obj;
  562. }
  563. /* Allocate a new skbuff. We do this ourselves so we can fill in a few
  564. * 'private' fields and also do memory statistics to find all the
  565. * [BEEP] leaks.
  566. *
  567. */
  568. /**
  569. * __alloc_skb - allocate a network buffer
  570. * @size: size to allocate
  571. * @gfp_mask: allocation mask
  572. * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
  573. * instead of head cache and allocate a cloned (child) skb.
  574. * If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
  575. * allocations in case the data is required for writeback
  576. * @node: numa node to allocate memory on
  577. *
  578. * Allocate a new &sk_buff. The returned buffer has no headroom and a
  579. * tail room of at least size bytes. The object has a reference count
  580. * of one. The return is the buffer. On a failure the return is %NULL.
  581. *
  582. * Buffers may only be allocated from interrupts using a @gfp_mask of
  583. * %GFP_ATOMIC.
  584. */
  585. struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
  586. int flags, int node)
  587. {
  588. struct sk_buff *skb = NULL;
  589. struct kmem_cache *cache;
  590. u8 *data;
  591. if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
  592. gfp_mask |= __GFP_MEMALLOC;
  593. if (flags & SKB_ALLOC_FCLONE) {
  594. cache = net_hotdata.skbuff_fclone_cache;
  595. goto fallback;
  596. }
  597. cache = net_hotdata.skbuff_cache;
  598. if (unlikely(node != NUMA_NO_NODE && node != numa_mem_id()))
  599. goto fallback;
  600. if (flags & SKB_ALLOC_NAPI) {
  601. skb = napi_skb_cache_get(true);
  602. if (unlikely(!skb))
  603. return NULL;
  604. } else if (!in_hardirq() && !irqs_disabled()) {
  605. local_bh_disable();
  606. skb = napi_skb_cache_get(false);
  607. local_bh_enable();
  608. }
  609. if (!skb) {
  610. fallback:
  611. skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node);
  612. if (unlikely(!skb))
  613. return NULL;
  614. }
  615. skbuff_clear(skb);
  616. /* We do our best to align skb_shared_info on a separate cache
  617. * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
  618. * aligned memory blocks, unless SLUB/SLAB debug is enabled.
  619. * Both skb->head and skb_shared_info are cache line aligned.
  620. */
  621. data = kmalloc_reserve(&size, gfp_mask, node, skb);
  622. if (unlikely(!data))
  623. goto nodata;
  624. /* kmalloc_size_roundup() might give us more room than requested.
  625. * Put skb_shared_info exactly at the end of allocated zone,
  626. * to allow max possible filling before reallocation.
  627. */
  628. __finalize_skb_around(skb, data, size);
  629. if (flags & SKB_ALLOC_FCLONE) {
  630. struct sk_buff_fclones *fclones;
  631. fclones = container_of(skb, struct sk_buff_fclones, skb1);
  632. /* skb->fclone is a 2bits field.
  633. * Replace expensive RMW (skb->fclone = SKB_FCLONE_ORIG)
  634. * with a single OR.
  635. */
  636. BUILD_BUG_ON(SKB_FCLONE_UNAVAILABLE != 0);
  637. DEBUG_NET_WARN_ON_ONCE(skb->fclone != SKB_FCLONE_UNAVAILABLE);
  638. skb->fclone |= SKB_FCLONE_ORIG;
  639. refcount_set(&fclones->fclone_ref, 1);
  640. }
  641. return skb;
  642. nodata:
  643. kmem_cache_free(cache, skb);
  644. return NULL;
  645. }
  646. EXPORT_SYMBOL(__alloc_skb);
  647. /**
  648. * __netdev_alloc_skb - allocate an skbuff for rx on a specific device
  649. * @dev: network device to receive on
  650. * @len: length to allocate
  651. * @gfp_mask: get_free_pages mask, passed to alloc_skb
  652. *
  653. * Allocate a new &sk_buff and assign it a usage count of one. The
  654. * buffer has NET_SKB_PAD headroom built in. Users should allocate
  655. * the headroom they think they need without accounting for the
  656. * built in space. The built in space is used for optimisations.
  657. *
  658. * %NULL is returned if there is no free memory.
  659. */
  660. struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
  661. gfp_t gfp_mask)
  662. {
  663. struct page_frag_cache *nc;
  664. struct sk_buff *skb;
  665. bool pfmemalloc;
  666. void *data;
  667. len += NET_SKB_PAD;
  668. /* If requested length is either too small or too big,
  669. * we use kmalloc() for skb->head allocation.
  670. */
  671. if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) ||
  672. len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
  673. (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
  674. skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
  675. if (!skb)
  676. goto skb_fail;
  677. goto skb_success;
  678. }
  679. len = SKB_HEAD_ALIGN(len);
  680. if (sk_memalloc_socks())
  681. gfp_mask |= __GFP_MEMALLOC;
  682. if (in_hardirq() || irqs_disabled()) {
  683. nc = this_cpu_ptr(&netdev_alloc_cache);
  684. data = page_frag_alloc(nc, len, gfp_mask);
  685. pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
  686. } else {
  687. local_bh_disable();
  688. local_lock_nested_bh(&napi_alloc_cache.bh_lock);
  689. nc = this_cpu_ptr(&napi_alloc_cache.page);
  690. data = page_frag_alloc(nc, len, gfp_mask);
  691. pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
  692. local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
  693. local_bh_enable();
  694. }
  695. if (unlikely(!data))
  696. return NULL;
  697. skb = __build_skb(data, len);
  698. if (unlikely(!skb)) {
  699. skb_free_frag(data);
  700. return NULL;
  701. }
  702. if (pfmemalloc)
  703. skb->pfmemalloc = 1;
  704. skb->head_frag = 1;
  705. skb_success:
  706. skb_reserve(skb, NET_SKB_PAD);
  707. skb->dev = dev;
  708. skb_fail:
  709. return skb;
  710. }
  711. EXPORT_SYMBOL(__netdev_alloc_skb);
  712. /**
  713. * napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
  714. * @napi: napi instance this buffer was allocated for
  715. * @len: length to allocate
  716. *
  717. * Allocate a new sk_buff for use in NAPI receive. This buffer will
  718. * attempt to allocate the head from a special reserved region used
  719. * only for NAPI Rx allocation. By doing this we can save several
  720. * CPU cycles by avoiding having to disable and re-enable IRQs.
  721. *
  722. * %NULL is returned if there is no free memory.
  723. */
  724. struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len)
  725. {
  726. gfp_t gfp_mask = GFP_ATOMIC | __GFP_NOWARN;
  727. struct napi_alloc_cache *nc;
  728. struct sk_buff *skb;
  729. bool pfmemalloc;
  730. void *data;
  731. DEBUG_NET_WARN_ON_ONCE(!in_softirq());
  732. len += NET_SKB_PAD + NET_IP_ALIGN;
  733. /* If requested length is either too small or too big,
  734. * we use kmalloc() for skb->head allocation.
  735. */
  736. if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) ||
  737. len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
  738. (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
  739. skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
  740. NUMA_NO_NODE);
  741. if (!skb)
  742. goto skb_fail;
  743. goto skb_success;
  744. }
  745. len = SKB_HEAD_ALIGN(len);
  746. if (sk_memalloc_socks())
  747. gfp_mask |= __GFP_MEMALLOC;
  748. local_lock_nested_bh(&napi_alloc_cache.bh_lock);
  749. nc = this_cpu_ptr(&napi_alloc_cache);
  750. data = page_frag_alloc(&nc->page, len, gfp_mask);
  751. pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page);
  752. local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
  753. if (unlikely(!data))
  754. return NULL;
  755. skb = __napi_build_skb(data, len);
  756. if (unlikely(!skb)) {
  757. skb_free_frag(data);
  758. return NULL;
  759. }
  760. if (pfmemalloc)
  761. skb->pfmemalloc = 1;
  762. skb->head_frag = 1;
  763. skb_success:
  764. skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
  765. skb->dev = napi->dev;
  766. skb_fail:
  767. return skb;
  768. }
  769. EXPORT_SYMBOL(napi_alloc_skb);
  770. void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, netmem_ref netmem,
  771. int off, int size, unsigned int truesize)
  772. {
  773. DEBUG_NET_WARN_ON_ONCE(size > truesize);
  774. skb_fill_netmem_desc(skb, i, netmem, off, size);
  775. skb->len += size;
  776. skb->data_len += size;
  777. skb->truesize += truesize;
  778. }
  779. EXPORT_SYMBOL(skb_add_rx_frag_netmem);
  780. void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
  781. unsigned int truesize)
  782. {
  783. skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
  784. DEBUG_NET_WARN_ON_ONCE(size > truesize);
  785. skb_frag_size_add(frag, size);
  786. skb->len += size;
  787. skb->data_len += size;
  788. skb->truesize += truesize;
  789. }
  790. EXPORT_SYMBOL(skb_coalesce_rx_frag);
  791. static void skb_drop_list(struct sk_buff **listp)
  792. {
  793. kfree_skb_list(*listp);
  794. *listp = NULL;
  795. }
  796. static inline void skb_drop_fraglist(struct sk_buff *skb)
  797. {
  798. skb_drop_list(&skb_shinfo(skb)->frag_list);
  799. }
  800. static void skb_clone_fraglist(struct sk_buff *skb)
  801. {
  802. struct sk_buff *list;
  803. skb_walk_frags(skb, list)
  804. skb_get(list);
  805. }
  806. int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb,
  807. unsigned int headroom)
  808. {
  809. #if IS_ENABLED(CONFIG_PAGE_POOL)
  810. u32 size, truesize, len, max_head_size, off;
  811. struct sk_buff *skb = *pskb, *nskb;
  812. int err, i, head_off;
  813. void *data;
  814. /* XDP does not support fraglist so we need to linearize
  815. * the skb.
  816. */
  817. if (skb_has_frag_list(skb))
  818. return -EOPNOTSUPP;
  819. max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE - headroom);
  820. if (skb->len > max_head_size + MAX_SKB_FRAGS * PAGE_SIZE)
  821. return -ENOMEM;
  822. size = min_t(u32, skb->len, max_head_size);
  823. truesize = SKB_HEAD_ALIGN(size) + headroom;
  824. data = page_pool_dev_alloc_va(pool, &truesize);
  825. if (!data)
  826. return -ENOMEM;
  827. nskb = napi_build_skb(data, truesize);
  828. if (!nskb) {
  829. page_pool_free_va(pool, data, true);
  830. return -ENOMEM;
  831. }
  832. skb_reserve(nskb, headroom);
  833. skb_copy_header(nskb, skb);
  834. skb_mark_for_recycle(nskb);
  835. err = skb_copy_bits(skb, 0, nskb->data, size);
  836. if (err) {
  837. consume_skb(nskb);
  838. return err;
  839. }
  840. skb_put(nskb, size);
  841. head_off = skb_headroom(nskb) - skb_headroom(skb);
  842. skb_headers_offset_update(nskb, head_off);
  843. off = size;
  844. len = skb->len - off;
  845. for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) {
  846. struct page *page;
  847. u32 page_off;
  848. size = min_t(u32, len, PAGE_SIZE);
  849. truesize = size;
  850. page = page_pool_dev_alloc(pool, &page_off, &truesize);
  851. if (!page) {
  852. consume_skb(nskb);
  853. return -ENOMEM;
  854. }
  855. skb_add_rx_frag(nskb, i, page, page_off, size, truesize);
  856. err = skb_copy_bits(skb, off, page_address(page) + page_off,
  857. size);
  858. if (err) {
  859. consume_skb(nskb);
  860. return err;
  861. }
  862. len -= size;
  863. off += size;
  864. }
  865. consume_skb(skb);
  866. *pskb = nskb;
  867. return 0;
  868. #else
  869. return -EOPNOTSUPP;
  870. #endif
  871. }
  872. EXPORT_SYMBOL(skb_pp_cow_data);
  873. int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,
  874. const struct bpf_prog *prog)
  875. {
  876. if (!prog->aux->xdp_has_frags)
  877. return -EINVAL;
  878. return skb_pp_cow_data(pool, pskb, XDP_PACKET_HEADROOM);
  879. }
  880. EXPORT_SYMBOL(skb_cow_data_for_xdp);
  881. #if IS_ENABLED(CONFIG_PAGE_POOL)
  882. bool napi_pp_put_page(netmem_ref netmem)
  883. {
  884. netmem = netmem_compound_head(netmem);
  885. if (unlikely(!netmem_is_pp(netmem)))
  886. return false;
  887. page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, false);
  888. return true;
  889. }
  890. EXPORT_SYMBOL(napi_pp_put_page);
  891. #endif
  892. static bool skb_pp_recycle(struct sk_buff *skb, void *data)
  893. {
  894. if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)
  895. return false;
  896. return napi_pp_put_page(page_to_netmem(virt_to_page(data)));
  897. }
  898. /**
  899. * skb_pp_frag_ref() - Increase fragment references of a page pool aware skb
  900. * @skb: page pool aware skb
  901. *
  902. * Increase the fragment reference count (pp_ref_count) of a skb. This is
  903. * intended to gain fragment references only for page pool aware skbs,
  904. * i.e. when skb->pp_recycle is true, and not for fragments in a
  905. * non-pp-recycling skb. It has a fallback to increase references on normal
  906. * pages, as page pool aware skbs may also have normal page fragments.
  907. */
  908. static int skb_pp_frag_ref(struct sk_buff *skb)
  909. {
  910. struct skb_shared_info *shinfo;
  911. netmem_ref head_netmem;
  912. int i;
  913. if (!skb->pp_recycle)
  914. return -EINVAL;
  915. shinfo = skb_shinfo(skb);
  916. for (i = 0; i < shinfo->nr_frags; i++) {
  917. head_netmem = netmem_compound_head(shinfo->frags[i].netmem);
  918. if (likely(netmem_is_pp(head_netmem)))
  919. page_pool_ref_netmem(head_netmem);
  920. else
  921. page_ref_inc(netmem_to_page(head_netmem));
  922. }
  923. return 0;
  924. }
  /*
   * Release a kmalloc()ed skb head. @end_offset is accepted but not used
   * by this implementation.
   */
  static void skb_kfree_head(void *head, unsigned int end_offset)
  {
      kfree(head);
  }
  929. static void skb_free_head(struct sk_buff *skb)
  930. {
  931. unsigned char *head = skb->head;
  932. if (skb->head_frag) {
  933. if (skb_pp_recycle(skb, head))
  934. return;
  935. skb_free_frag(head);
  936. } else {
  937. skb_kfree_head(head, skb_end_offset(skb));
  938. }
  939. }
  940. static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason)
  941. {
  942. struct skb_shared_info *shinfo = skb_shinfo(skb);
  943. int i;
  944. if (!skb_data_unref(skb, shinfo))
  945. goto exit;
  946. if (skb_zcopy(skb)) {
  947. bool skip_unref = shinfo->flags & SKBFL_MANAGED_FRAG_REFS;
  948. skb_zcopy_clear(skb, true);
  949. if (skip_unref)
  950. goto free_head;
  951. }
  952. for (i = 0; i < shinfo->nr_frags; i++)
  953. __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
  954. free_head:
  955. if (shinfo->frag_list)
  956. kfree_skb_list_reason(shinfo->frag_list, reason);
  957. skb_free_head(skb);
  958. exit:
  959. /* When we clone an SKB we copy the reycling bit. The pp_recycle
  960. * bit is only set on the head though, so in order to avoid races
  961. * while trying to recycle fragments on __skb_frag_unref() we need
  962. * to make one SKB responsible for triggering the recycle path.
  963. * So disable the recycling bit if an SKB is cloned and we have
  964. * additional references to the fragmented part of the SKB.
  965. * Eventually the last SKB will have the recycling bit set and it's
  966. * dataref set to 0, which will trigger the recycling
  967. */
  968. skb->pp_recycle = 0;
  969. }
  970. /*
  971. * Free an skbuff by memory without cleaning the state.
  972. */
  973. static void kfree_skbmem(struct sk_buff *skb)
  974. {
  975. struct sk_buff_fclones *fclones;
  976. switch (skb->fclone) {
  977. case SKB_FCLONE_UNAVAILABLE:
  978. kmem_cache_free(net_hotdata.skbuff_cache, skb);
  979. return;
  980. case SKB_FCLONE_ORIG:
  981. fclones = container_of(skb, struct sk_buff_fclones, skb1);
  982. /* We usually free the clone (TX completion) before original skb
  983. * This test would have no chance to be true for the clone,
  984. * while here, branch prediction will be good.
  985. */
  986. if (refcount_read(&fclones->fclone_ref) == 1)
  987. goto fastpath;
  988. break;
  989. default: /* SKB_FCLONE_CLONE */
  990. fclones = container_of(skb, struct sk_buff_fclones, skb2);
  991. break;
  992. }
  993. if (!refcount_dec_and_test(&fclones->fclone_ref))
  994. return;
  995. fastpath:
  996. kmem_cache_free(net_hotdata.skbuff_fclone_cache, fclones);
  997. }
  998. void skb_release_head_state(struct sk_buff *skb)
  999. {
  1000. skb_dst_drop(skb);
  1001. if (skb->destructor) {
  1002. DEBUG_NET_WARN_ON_ONCE(in_hardirq());
  1003. #ifdef CONFIG_INET
  1004. INDIRECT_CALL_4(skb->destructor,
  1005. tcp_wfree, __sock_wfree, sock_wfree,
  1006. xsk_destruct_skb,
  1007. skb);
  1008. #else
  1009. INDIRECT_CALL_2(skb->destructor,
  1010. sock_wfree, xsk_destruct_skb,
  1011. skb);
  1012. #endif
  1013. skb->destructor = NULL;
  1014. skb->sk = NULL;
  1015. }
  1016. nf_reset_ct(skb);
  1017. skb_ext_reset(skb);
  1018. }
  1019. /* Free everything but the sk_buff shell. */
  1020. static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason)
  1021. {
  1022. skb_release_head_state(skb);
  1023. if (likely(skb->head))
  1024. skb_release_data(skb, reason);
  1025. }
  1026. /**
  1027. * __kfree_skb - private function
  1028. * @skb: buffer
  1029. *
  1030. * Free an sk_buff. Release anything attached to the buffer.
  1031. * Clean the state. This is an internal helper function. Users should
  1032. * always call kfree_skb
  1033. */
  1034. void __kfree_skb(struct sk_buff *skb)
  1035. {
  1036. skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
  1037. kfree_skbmem(skb);
  1038. }
  1039. EXPORT_SYMBOL(__kfree_skb);
  1040. static __always_inline
  1041. bool __sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb,
  1042. enum skb_drop_reason reason)
  1043. {
  1044. if (unlikely(!skb_unref(skb)))
  1045. return false;
  1046. DEBUG_NET_WARN_ON_ONCE(reason == SKB_NOT_DROPPED_YET ||
  1047. u32_get_bits(reason,
  1048. SKB_DROP_REASON_SUBSYS_MASK) >=
  1049. SKB_DROP_REASON_SUBSYS_NUM);
  1050. if (reason == SKB_CONSUMED)
  1051. trace_consume_skb(skb, __builtin_return_address(0));
  1052. else
  1053. trace_kfree_skb(skb, __builtin_return_address(0), reason, sk);
  1054. return true;
  1055. }
  1056. /**
  1057. * sk_skb_reason_drop - free an sk_buff with special reason
  1058. * @sk: the socket to receive @skb, or NULL if not applicable
  1059. * @skb: buffer to free
  1060. * @reason: reason why this skb is dropped
  1061. *
  1062. * Drop a reference to the buffer and free it if the usage count has hit
  1063. * zero. Meanwhile, pass the receiving socket and drop reason to
  1064. * 'kfree_skb' tracepoint.
  1065. */
  1066. void __fix_address
  1067. sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason reason)
  1068. {
  1069. if (__sk_skb_reason_drop(sk, skb, reason))
  1070. __kfree_skb(skb);
  1071. }
  1072. EXPORT_SYMBOL(sk_skb_reason_drop);
  /* Number of sk_buff shells collected before a bulk slab free is issued */
  #define KFREE_SKB_BULK_SIZE	16

  /* Batch of sk_buff shells waiting for kmem_cache_free_bulk() */
  struct skb_free_array {
      unsigned int skb_count;			/* valid entries in skb_array */
      void *skb_array[KFREE_SKB_BULK_SIZE];
  };
  1078. static void kfree_skb_add_bulk(struct sk_buff *skb,
  1079. struct skb_free_array *sa,
  1080. enum skb_drop_reason reason)
  1081. {
  1082. /* if SKB is a clone, don't handle this case */
  1083. if (unlikely(skb->fclone != SKB_FCLONE_UNAVAILABLE)) {
  1084. __kfree_skb(skb);
  1085. return;
  1086. }
  1087. skb_release_all(skb, reason);
  1088. sa->skb_array[sa->skb_count++] = skb;
  1089. if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) {
  1090. kmem_cache_free_bulk(net_hotdata.skbuff_cache, KFREE_SKB_BULK_SIZE,
  1091. sa->skb_array);
  1092. sa->skb_count = 0;
  1093. }
  1094. }
  1095. void __fix_address
  1096. kfree_skb_list_reason(struct sk_buff *segs, enum skb_drop_reason reason)
  1097. {
  1098. struct skb_free_array sa;
  1099. sa.skb_count = 0;
  1100. while (segs) {
  1101. struct sk_buff *next = segs->next;
  1102. if (__sk_skb_reason_drop(NULL, segs, reason)) {
  1103. skb_poison_list(segs);
  1104. kfree_skb_add_bulk(segs, &sa, reason);
  1105. }
  1106. segs = next;
  1107. }
  1108. if (sa.skb_count)
  1109. kmem_cache_free_bulk(net_hotdata.skbuff_cache, sa.skb_count, sa.skb_array);
  1110. }
  1111. EXPORT_SYMBOL(kfree_skb_list_reason);
  1112. /* Dump skb information and contents.
  1113. *
  1114. * Must only be called from net_ratelimit()-ed paths.
  1115. *
  1116. * Dumps whole packets if full_pkt, only headers otherwise.
  1117. */
  1118. void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt)
  1119. {
  1120. struct skb_shared_info *sh = skb_shinfo(skb);
  1121. struct net_device *dev = skb->dev;
  1122. struct sock *sk = skb->sk;
  1123. struct sk_buff *list_skb;
  1124. bool has_mac, has_trans;
  1125. int headroom, tailroom;
  1126. int i, len, seg_len;
  1127. if (full_pkt)
  1128. len = skb->len;
  1129. else
  1130. len = min_t(int, skb->len, MAX_HEADER + 128);
  1131. headroom = skb_headroom(skb);
  1132. tailroom = skb_tailroom(skb);
  1133. has_mac = skb_mac_header_was_set(skb);
  1134. has_trans = skb_transport_header_was_set(skb);
  1135. printk("%sskb len=%u data_len=%u headroom=%u headlen=%u tailroom=%u\n"
  1136. "end-tail=%u mac=(%d,%d) mac_len=%u net=(%d,%d) trans=%d\n"
  1137. "shinfo(txflags=%u nr_frags=%u gso(size=%hu type=%u segs=%hu))\n"
  1138. "csum(0x%x start=%u offset=%u ip_summed=%u complete_sw=%u valid=%u level=%u)\n"
  1139. "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n"
  1140. "priority=0x%x mark=0x%x alloc_cpu=%u vlan_all=0x%x\n"
  1141. "encapsulation=%d inner(proto=0x%04x, mac=%u, net=%u, trans=%u)\n",
  1142. level, skb->len, skb->data_len, headroom, skb_headlen(skb),
  1143. tailroom, skb->end - skb->tail,
  1144. has_mac ? skb->mac_header : -1,
  1145. has_mac ? skb_mac_header_len(skb) : -1,
  1146. skb->mac_len,
  1147. skb->network_header,
  1148. has_trans ? skb_network_header_len(skb) : -1,
  1149. has_trans ? skb->transport_header : -1,
  1150. sh->tx_flags, sh->nr_frags,
  1151. sh->gso_size, sh->gso_type, sh->gso_segs,
  1152. skb->csum, skb->csum_start, skb->csum_offset, skb->ip_summed,
  1153. skb->csum_complete_sw, skb->csum_valid, skb->csum_level,
  1154. skb->hash, skb->sw_hash, skb->l4_hash,
  1155. ntohs(skb->protocol), skb->pkt_type, skb->skb_iif,
  1156. skb->priority, skb->mark, skb->alloc_cpu, skb->vlan_all,
  1157. skb->encapsulation, skb->inner_protocol, skb->inner_mac_header,
  1158. skb->inner_network_header, skb->inner_transport_header);
  1159. if (dev)
  1160. printk("%sdev name=%s feat=%pNF\n",
  1161. level, dev->name, &dev->features);
  1162. if (sk)
  1163. printk("%ssk family=%hu type=%u proto=%u\n",
  1164. level, sk->sk_family, sk->sk_type, sk->sk_protocol);
  1165. if (full_pkt && headroom)
  1166. print_hex_dump(level, "skb headroom: ", DUMP_PREFIX_OFFSET,
  1167. 16, 1, skb->head, headroom, false);
  1168. seg_len = min_t(int, skb_headlen(skb), len);
  1169. if (seg_len)
  1170. print_hex_dump(level, "skb linear: ", DUMP_PREFIX_OFFSET,
  1171. 16, 1, skb->data, seg_len, false);
  1172. len -= seg_len;
  1173. if (full_pkt && tailroom)
  1174. print_hex_dump(level, "skb tailroom: ", DUMP_PREFIX_OFFSET,
  1175. 16, 1, skb_tail_pointer(skb), tailroom, false);
  1176. for (i = 0; len && i < skb_shinfo(skb)->nr_frags; i++) {
  1177. skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
  1178. u32 p_off, p_len, copied;
  1179. struct page *p;
  1180. u8 *vaddr;
  1181. if (skb_frag_is_net_iov(frag)) {
  1182. printk("%sskb frag %d: not readable\n", level, i);
  1183. len -= skb_frag_size(frag);
  1184. if (!len)
  1185. break;
  1186. continue;
  1187. }
  1188. skb_frag_foreach_page(frag, skb_frag_off(frag),
  1189. skb_frag_size(frag), p, p_off, p_len,
  1190. copied) {
  1191. seg_len = min_t(int, p_len, len);
  1192. vaddr = kmap_atomic(p);
  1193. print_hex_dump(level, "skb frag: ",
  1194. DUMP_PREFIX_OFFSET,
  1195. 16, 1, vaddr + p_off, seg_len, false);
  1196. kunmap_atomic(vaddr);
  1197. len -= seg_len;
  1198. if (!len)
  1199. break;
  1200. }
  1201. }
  1202. if (full_pkt && skb_has_frag_list(skb)) {
  1203. printk("skb fraglist:\n");
  1204. skb_walk_frags(skb, list_skb)
  1205. skb_dump(level, list_skb, true);
  1206. }
  1207. }
  1208. EXPORT_SYMBOL(skb_dump);
  1209. /**
  1210. * skb_tx_error - report an sk_buff xmit error
  1211. * @skb: buffer that triggered an error
  1212. *
  1213. * Report xmit error if a device callback is tracking this skb.
  1214. * skb must be freed afterwards.
  1215. */
  1216. void skb_tx_error(struct sk_buff *skb)
  1217. {
  1218. if (skb) {
  1219. skb_zcopy_downgrade_managed(skb);
  1220. skb_zcopy_clear(skb, true);
  1221. }
  1222. }
  1223. EXPORT_SYMBOL(skb_tx_error);
  #ifdef CONFIG_TRACEPOINTS
  /**
   *	consume_skb - free an skbuff
   *	@skb: buffer to free
   *
   *	Drop a ref to the buffer and free it if the usage count has hit zero.
   *	Works like kfree_skb, except that kfree_skb assumes the frame is
   *	being dropped after a failure and records it as such, whereas this
   *	function fires the consume_skb tracepoint.
   */
  void consume_skb(struct sk_buff *skb)
  {
      if (skb_unref(skb)) {
          trace_consume_skb(skb, __builtin_return_address(0));
          __kfree_skb(skb);
      }
  }
  EXPORT_SYMBOL(consume_skb);
  #endif
  1242. /**
  1243. * __consume_stateless_skb - free an skbuff, assuming it is stateless
  1244. * @skb: buffer to free
  1245. *
  1246. * Alike consume_skb(), but this variant assumes that this is the last
  1247. * skb reference and all the head states have been already dropped
  1248. */
  1249. void __consume_stateless_skb(struct sk_buff *skb)
  1250. {
  1251. trace_consume_skb(skb, __builtin_return_address(0));
  1252. skb_release_data(skb, SKB_CONSUMED);
  1253. kfree_skbmem(skb);
  1254. }
  1255. static void napi_skb_cache_put(struct sk_buff *skb)
  1256. {
  1257. struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
  1258. if (!kasan_mempool_poison_object(skb))
  1259. return;
  1260. local_lock_nested_bh(&napi_alloc_cache.bh_lock);
  1261. nc->skb_cache[nc->skb_count++] = skb;
  1262. if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) {
  1263. u32 i, remaining = NAPI_SKB_CACHE_SIZE - NAPI_SKB_CACHE_FREE;
  1264. for (i = remaining; i < NAPI_SKB_CACHE_SIZE; i++)
  1265. kasan_mempool_unpoison_object(nc->skb_cache[i],
  1266. skbuff_cache_size);
  1267. kmem_cache_free_bulk(net_hotdata.skbuff_cache,
  1268. NAPI_SKB_CACHE_FREE,
  1269. nc->skb_cache + remaining);
  1270. nc->skb_count = remaining;
  1271. }
  1272. local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
  1273. }
  1274. void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason)
  1275. {
  1276. skb_release_all(skb, reason);
  1277. napi_skb_cache_put(skb);
  1278. }
  1279. void napi_skb_free_stolen_head(struct sk_buff *skb)
  1280. {
  1281. if (unlikely(skb->slow_gro)) {
  1282. nf_reset_ct(skb);
  1283. skb_dst_drop(skb);
  1284. skb_ext_put(skb);
  1285. skb_orphan(skb);
  1286. skb->slow_gro = 0;
  1287. }
  1288. napi_skb_cache_put(skb);
  1289. }
  1290. /**
  1291. * napi_consume_skb() - consume skb in NAPI context, try to feed skb cache
  1292. * @skb: buffer to free
  1293. * @budget: NAPI budget
  1294. *
  1295. * Non-zero @budget must come from the @budget argument passed by the core
  1296. * to a NAPI poll function. Note that core may pass budget of 0 to NAPI poll
  1297. * for example when polling for netpoll / netconsole.
  1298. *
  1299. * Passing @budget of 0 is safe from any context, it turns this function
  1300. * into dev_consume_skb_any().
  1301. */
  1302. void napi_consume_skb(struct sk_buff *skb, int budget)
  1303. {
  1304. if (unlikely(!budget || !skb)) {
  1305. dev_consume_skb_any(skb);
  1306. return;
  1307. }
  1308. DEBUG_NET_WARN_ON_ONCE(!in_softirq());
  1309. if (skb->alloc_cpu != smp_processor_id() && !skb_shared(skb)) {
  1310. skb_release_head_state(skb);
  1311. return skb_attempt_defer_free(skb);
  1312. }
  1313. if (!skb_unref(skb))
  1314. return;
  1315. /* if reaching here SKB is ready to free */
  1316. trace_consume_skb(skb, __builtin_return_address(0));
  1317. /* if SKB is a clone, don't handle this case */
  1318. if (skb->fclone != SKB_FCLONE_UNAVAILABLE) {
  1319. __kfree_skb(skb);
  1320. return;
  1321. }
  1322. skb_release_all(skb, SKB_CONSUMED);
  1323. napi_skb_cache_put(skb);
  1324. }
  1325. EXPORT_SYMBOL(napi_consume_skb);
  1326. /* Make sure a field is contained by headers group */
  1327. #define CHECK_SKB_FIELD(field) \
  1328. BUILD_BUG_ON(offsetof(struct sk_buff, field) != \
  1329. offsetof(struct sk_buff, headers.field)); \
  1330. static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
  1331. {
  1332. new->tstamp = old->tstamp;
  1333. /* We do not copy old->sk */
  1334. new->dev = old->dev;
  1335. memcpy(new->cb, old->cb, sizeof(old->cb));
  1336. skb_dst_copy(new, old);
  1337. __skb_ext_copy(new, old);
  1338. __nf_copy(new, old, false);
  1339. /* Note : this field could be in the headers group.
  1340. * It is not yet because we do not want to have a 16 bit hole
  1341. */
  1342. new->queue_mapping = old->queue_mapping;
  1343. memcpy(&new->headers, &old->headers, sizeof(new->headers));
  1344. CHECK_SKB_FIELD(protocol);
  1345. CHECK_SKB_FIELD(csum);
  1346. CHECK_SKB_FIELD(hash);
  1347. CHECK_SKB_FIELD(priority);
  1348. CHECK_SKB_FIELD(skb_iif);
  1349. CHECK_SKB_FIELD(vlan_proto);
  1350. CHECK_SKB_FIELD(vlan_tci);
  1351. CHECK_SKB_FIELD(transport_header);
  1352. CHECK_SKB_FIELD(network_header);
  1353. CHECK_SKB_FIELD(mac_header);
  1354. CHECK_SKB_FIELD(inner_protocol);
  1355. CHECK_SKB_FIELD(inner_transport_header);
  1356. CHECK_SKB_FIELD(inner_network_header);
  1357. CHECK_SKB_FIELD(inner_mac_header);
  1358. CHECK_SKB_FIELD(mark);
  1359. #ifdef CONFIG_NETWORK_SECMARK
  1360. CHECK_SKB_FIELD(secmark);
  1361. #endif
  1362. #ifdef CONFIG_NET_RX_BUSY_POLL
  1363. CHECK_SKB_FIELD(napi_id);
  1364. #endif
  1365. CHECK_SKB_FIELD(alloc_cpu);
  1366. #ifdef CONFIG_XPS
  1367. CHECK_SKB_FIELD(sender_cpu);
  1368. #endif
  1369. #ifdef CONFIG_NET_SCHED
  1370. CHECK_SKB_FIELD(tc_index);
  1371. #endif
  1372. }
  1373. /*
  1374. * You should not add any new code to this function. Add it to
  1375. * __copy_skb_header above instead.
  1376. */
  1377. static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
  1378. {
  1379. #define C(x) n->x = skb->x
  1380. n->next = n->prev = NULL;
  1381. n->sk = NULL;
  1382. __copy_skb_header(n, skb);
  1383. C(len);
  1384. C(data_len);
  1385. C(mac_len);
  1386. n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len;
  1387. n->cloned = 1;
  1388. n->nohdr = 0;
  1389. n->peeked = 0;
  1390. C(pfmemalloc);
  1391. C(pp_recycle);
  1392. n->destructor = NULL;
  1393. C(tail);
  1394. C(end);
  1395. C(head);
  1396. C(head_frag);
  1397. C(data);
  1398. C(truesize);
  1399. refcount_set(&n->users, 1);
  1400. atomic_inc(&(skb_shinfo(skb)->dataref));
  1401. skb->cloned = 1;
  1402. return n;
  1403. #undef C
  1404. }
  1405. /**
  1406. * alloc_skb_for_msg() - allocate sk_buff to wrap frag list forming a msg
  1407. * @first: first sk_buff of the msg
  1408. */
  1409. struct sk_buff *alloc_skb_for_msg(struct sk_buff *first)
  1410. {
  1411. struct sk_buff *n;
  1412. n = alloc_skb(0, GFP_ATOMIC);
  1413. if (!n)
  1414. return NULL;
  1415. n->len = first->len;
  1416. n->data_len = first->len;
  1417. n->truesize = first->truesize;
  1418. skb_shinfo(n)->frag_list = first;
  1419. __copy_skb_header(n, first);
  1420. n->destructor = NULL;
  1421. return n;
  1422. }
  1423. EXPORT_SYMBOL_GPL(alloc_skb_for_msg);
  1424. /**
  1425. * skb_morph - morph one skb into another
  1426. * @dst: the skb to receive the contents
  1427. * @src: the skb to supply the contents
  1428. *
  1429. * This is identical to skb_clone except that the target skb is
  1430. * supplied by the user.
  1431. *
  1432. * The target skb is returned upon exit.
  1433. */
  1434. struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
  1435. {
  1436. skb_release_all(dst, SKB_CONSUMED);
  1437. return __skb_clone(dst, src);
  1438. }
  1439. EXPORT_SYMBOL_GPL(skb_morph);
  1440. int mm_account_pinned_pages(struct mmpin *mmp, size_t size)
  1441. {
  1442. unsigned long max_pg, num_pg, new_pg, old_pg, rlim;
  1443. struct user_struct *user;
  1444. if (capable(CAP_IPC_LOCK) || !size)
  1445. return 0;
  1446. rlim = rlimit(RLIMIT_MEMLOCK);
  1447. if (rlim == RLIM_INFINITY)
  1448. return 0;
  1449. num_pg = (size >> PAGE_SHIFT) + 2; /* worst case */
  1450. max_pg = rlim >> PAGE_SHIFT;
  1451. user = mmp->user ? : current_user();
  1452. old_pg = atomic_long_read(&user->locked_vm);
  1453. do {
  1454. new_pg = old_pg + num_pg;
  1455. if (new_pg > max_pg)
  1456. return -ENOBUFS;
  1457. } while (!atomic_long_try_cmpxchg(&user->locked_vm, &old_pg, new_pg));
  1458. if (!mmp->user) {
  1459. mmp->user = get_uid(user);
  1460. mmp->num_pg = num_pg;
  1461. } else {
  1462. mmp->num_pg += num_pg;
  1463. }
  1464. return 0;
  1465. }
  1466. EXPORT_SYMBOL_GPL(mm_account_pinned_pages);
  1467. void mm_unaccount_pinned_pages(struct mmpin *mmp)
  1468. {
  1469. if (mmp->user) {
  1470. atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm);
  1471. free_uid(mmp->user);
  1472. }
  1473. }
  1474. EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages);
  1475. static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size,
  1476. bool devmem)
  1477. {
  1478. struct ubuf_info_msgzc *uarg;
  1479. struct sk_buff *skb;
  1480. WARN_ON_ONCE(!in_task());
  1481. skb = sock_omalloc(sk, 0, GFP_KERNEL);
  1482. if (!skb)
  1483. return NULL;
  1484. BUILD_BUG_ON(sizeof(*uarg) > sizeof(skb->cb));
  1485. uarg = (void *)skb->cb;
  1486. uarg->mmp.user = NULL;
  1487. if (likely(!devmem) && mm_account_pinned_pages(&uarg->mmp, size)) {
  1488. kfree_skb(skb);
  1489. return NULL;
  1490. }
  1491. uarg->ubuf.ops = &msg_zerocopy_ubuf_ops;
  1492. uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1;
  1493. uarg->len = 1;
  1494. uarg->bytelen = size;
  1495. uarg->zerocopy = 1;
  1496. uarg->ubuf.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN;
  1497. refcount_set(&uarg->ubuf.refcnt, 1);
  1498. sock_hold(sk);
  1499. return &uarg->ubuf;
  1500. }
  1501. static inline struct sk_buff *skb_from_uarg(struct ubuf_info_msgzc *uarg)
  1502. {
  1503. return container_of((void *)uarg, struct sk_buff, cb);
  1504. }
  1505. struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
  1506. struct ubuf_info *uarg, bool devmem)
  1507. {
  1508. if (uarg) {
  1509. struct ubuf_info_msgzc *uarg_zc;
  1510. const u32 byte_limit = 1 << 19; /* limit to a few TSO */
  1511. u32 bytelen, next;
  1512. /* there might be non MSG_ZEROCOPY users */
  1513. if (uarg->ops != &msg_zerocopy_ubuf_ops)
  1514. return NULL;
  1515. /* realloc only when socket is locked (TCP, UDP cork),
  1516. * so uarg->len and sk_zckey access is serialized
  1517. */
  1518. if (!sock_owned_by_user(sk)) {
  1519. WARN_ON_ONCE(1);
  1520. return NULL;
  1521. }
  1522. uarg_zc = uarg_to_msgzc(uarg);
  1523. bytelen = uarg_zc->bytelen + size;
  1524. if (uarg_zc->len == USHRT_MAX - 1 || bytelen > byte_limit) {
  1525. /* TCP can create new skb to attach new uarg */
  1526. if (sk->sk_type == SOCK_STREAM)
  1527. goto new_alloc;
  1528. return NULL;
  1529. }
  1530. next = (u32)atomic_read(&sk->sk_zckey);
  1531. if ((u32)(uarg_zc->id + uarg_zc->len) == next) {
  1532. if (likely(!devmem) &&
  1533. mm_account_pinned_pages(&uarg_zc->mmp, size))
  1534. return NULL;
  1535. uarg_zc->len++;
  1536. uarg_zc->bytelen = bytelen;
  1537. atomic_set(&sk->sk_zckey, ++next);
  1538. /* no extra ref when appending to datagram (MSG_MORE) */
  1539. if (sk->sk_type == SOCK_STREAM)
  1540. net_zcopy_get(uarg);
  1541. return uarg;
  1542. }
  1543. }
  1544. new_alloc:
  1545. return msg_zerocopy_alloc(sk, size, devmem);
  1546. }
  1547. EXPORT_SYMBOL_GPL(msg_zerocopy_realloc);
  1548. static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len)
  1549. {
  1550. struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
  1551. u32 old_lo, old_hi;
  1552. u64 sum_len;
  1553. old_lo = serr->ee.ee_info;
  1554. old_hi = serr->ee.ee_data;
  1555. sum_len = old_hi - old_lo + 1ULL + len;
  1556. if (sum_len >= (1ULL << 32))
  1557. return false;
  1558. if (lo != old_hi + 1)
  1559. return false;
  1560. serr->ee.ee_data += len;
  1561. return true;
  1562. }
  1563. static void __msg_zerocopy_callback(struct ubuf_info_msgzc *uarg)
  1564. {
  1565. struct sk_buff *tail, *skb = skb_from_uarg(uarg);
  1566. struct sock_exterr_skb *serr;
  1567. struct sock *sk = skb->sk;
  1568. struct sk_buff_head *q;
  1569. unsigned long flags;
  1570. bool is_zerocopy;
  1571. u32 lo, hi;
  1572. u16 len;
  1573. mm_unaccount_pinned_pages(&uarg->mmp);
  1574. /* if !len, there was only 1 call, and it was aborted
  1575. * so do not queue a completion notification
  1576. */
  1577. if (!uarg->len || sock_flag(sk, SOCK_DEAD))
  1578. goto release;
  1579. len = uarg->len;
  1580. lo = uarg->id;
  1581. hi = uarg->id + len - 1;
  1582. is_zerocopy = uarg->zerocopy;
  1583. serr = SKB_EXT_ERR(skb);
  1584. memset(serr, 0, sizeof(*serr));
  1585. serr->ee.ee_errno = 0;
  1586. serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY;
  1587. serr->ee.ee_data = hi;
  1588. serr->ee.ee_info = lo;
  1589. if (!is_zerocopy)
  1590. serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED;
  1591. q = &sk->sk_error_queue;
  1592. spin_lock_irqsave(&q->lock, flags);
  1593. tail = skb_peek_tail(q);
  1594. if (!tail || SKB_EXT_ERR(tail)->ee.ee_origin != SO_EE_ORIGIN_ZEROCOPY ||
  1595. !skb_zerocopy_notify_extend(tail, lo, len)) {
  1596. __skb_queue_tail(q, skb);
  1597. skb = NULL;
  1598. }
  1599. spin_unlock_irqrestore(&q->lock, flags);
  1600. sk_error_report(sk);
  1601. release:
  1602. consume_skb(skb);
  1603. sock_put(sk);
  1604. }
  1605. static void msg_zerocopy_complete(struct sk_buff *skb, struct ubuf_info *uarg,
  1606. bool success)
  1607. {
  1608. struct ubuf_info_msgzc *uarg_zc = uarg_to_msgzc(uarg);
  1609. uarg_zc->zerocopy = uarg_zc->zerocopy & success;
  1610. if (refcount_dec_and_test(&uarg->refcnt))
  1611. __msg_zerocopy_callback(uarg_zc);
  1612. }
  1613. void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
  1614. {
  1615. struct sock *sk = skb_from_uarg(uarg_to_msgzc(uarg))->sk;
  1616. atomic_dec(&sk->sk_zckey);
  1617. uarg_to_msgzc(uarg)->len--;
  1618. if (have_uref)
  1619. msg_zerocopy_complete(NULL, uarg, true);
  1620. }
  1621. EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort);
  1622. const struct ubuf_info_ops msg_zerocopy_ubuf_ops = {
  1623. .complete = msg_zerocopy_complete,
  1624. };
  1625. EXPORT_SYMBOL_GPL(msg_zerocopy_ubuf_ops);
  1626. int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
  1627. struct msghdr *msg, int len,
  1628. struct ubuf_info *uarg,
  1629. struct net_devmem_dmabuf_binding *binding)
  1630. {
  1631. int err, orig_len = skb->len;
  1632. if (uarg->ops->link_skb) {
  1633. err = uarg->ops->link_skb(skb, uarg);
  1634. if (err)
  1635. return err;
  1636. } else {
  1637. struct ubuf_info *orig_uarg = skb_zcopy(skb);
  1638. /* An skb can only point to one uarg. This edge case happens
  1639. * when TCP appends to an skb, but zerocopy_realloc triggered
  1640. * a new alloc.
  1641. */
  1642. if (orig_uarg && uarg != orig_uarg)
  1643. return -EEXIST;
  1644. }
  1645. err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len,
  1646. binding);
  1647. if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) {
  1648. struct sock *save_sk = skb->sk;
  1649. /* Streams do not free skb on error. Reset to prev state. */
  1650. iov_iter_revert(&msg->msg_iter, skb->len - orig_len);
  1651. skb->sk = sk;
  1652. ___pskb_trim(skb, orig_len);
  1653. skb->sk = save_sk;
  1654. return err;
  1655. }
  1656. skb_zcopy_set(skb, uarg, NULL);
  1657. return skb->len - orig_len;
  1658. }
  1659. EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream);
  1660. void __skb_zcopy_downgrade_managed(struct sk_buff *skb)
  1661. {
  1662. int i;
  1663. skb_shinfo(skb)->flags &= ~SKBFL_MANAGED_FRAG_REFS;
  1664. for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
  1665. skb_frag_ref(skb, i);
  1666. }
  1667. EXPORT_SYMBOL_GPL(__skb_zcopy_downgrade_managed);
  1668. static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig,
  1669. gfp_t gfp_mask)
  1670. {
  1671. if (skb_zcopy(orig)) {
  1672. if (skb_zcopy(nskb)) {
  1673. /* !gfp_mask callers are verified to !skb_zcopy(nskb) */
  1674. if (!gfp_mask) {
  1675. WARN_ON_ONCE(1);
  1676. return -ENOMEM;
  1677. }
  1678. if (skb_uarg(nskb) == skb_uarg(orig))
  1679. return 0;
  1680. if (skb_copy_ubufs(nskb, GFP_ATOMIC))
  1681. return -EIO;
  1682. }
  1683. skb_zcopy_set(nskb, skb_uarg(orig), NULL);
  1684. }
  1685. return 0;
  1686. }
  1687. /**
  1688. * skb_copy_ubufs - copy userspace skb frags buffers to kernel
  1689. * @skb: the skb to modify
  1690. * @gfp_mask: allocation priority
  1691. *
  1692. * This must be called on skb with SKBFL_ZEROCOPY_ENABLE.
  1693. * It will copy all frags into kernel and drop the reference
  1694. * to userspace pages.
  1695. *
  1696. * If this function is called from an interrupt gfp_mask() must be
  1697. * %GFP_ATOMIC.
  1698. *
  1699. * Returns 0 on success or a negative error code on failure
  1700. * to allocate kernel memory to copy to.
  1701. */
  1702. int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
  1703. {
  1704. int num_frags = skb_shinfo(skb)->nr_frags;
  1705. struct page *page, *head = NULL;
  1706. int i, order, psize, new_frags;
  1707. u32 d_off;
  1708. if (skb_shared(skb) || skb_unclone(skb, gfp_mask))
  1709. return -EINVAL;
  1710. if (!skb_frags_readable(skb))
  1711. return -EFAULT;
  1712. if (!num_frags)
  1713. goto release;
  1714. /* We might have to allocate high order pages, so compute what minimum
  1715. * page order is needed.
  1716. */
  1717. order = 0;
  1718. while ((PAGE_SIZE << order) * MAX_SKB_FRAGS < __skb_pagelen(skb))
  1719. order++;
  1720. psize = (PAGE_SIZE << order);
  1721. new_frags = (__skb_pagelen(skb) + psize - 1) >> (PAGE_SHIFT + order);
  1722. for (i = 0; i < new_frags; i++) {
  1723. page = alloc_pages(gfp_mask | __GFP_COMP, order);
  1724. if (!page) {
  1725. while (head) {
  1726. struct page *next = (struct page *)page_private(head);
  1727. put_page(head);
  1728. head = next;
  1729. }
  1730. return -ENOMEM;
  1731. }
  1732. set_page_private(page, (unsigned long)head);
  1733. head = page;
  1734. }
  1735. page = head;
  1736. d_off = 0;
  1737. for (i = 0; i < num_frags; i++) {
  1738. skb_frag_t *f = &skb_shinfo(skb)->frags[i];
  1739. u32 p_off, p_len, copied;
  1740. struct page *p;
  1741. u8 *vaddr;
  1742. skb_frag_foreach_page(f, skb_frag_off(f), skb_frag_size(f),
  1743. p, p_off, p_len, copied) {
  1744. u32 copy, done = 0;
  1745. vaddr = kmap_atomic(p);
  1746. while (done < p_len) {
  1747. if (d_off == psize) {
  1748. d_off = 0;
  1749. page = (struct page *)page_private(page);
  1750. }
  1751. copy = min_t(u32, psize - d_off, p_len - done);
  1752. memcpy(page_address(page) + d_off,
  1753. vaddr + p_off + done, copy);
  1754. done += copy;
  1755. d_off += copy;
  1756. }
  1757. kunmap_atomic(vaddr);
  1758. }
  1759. }
  1760. /* skb frags release userspace buffers */
  1761. for (i = 0; i < num_frags; i++)
  1762. skb_frag_unref(skb, i);
  1763. /* skb frags point to kernel buffers */
  1764. for (i = 0; i < new_frags - 1; i++) {
  1765. __skb_fill_netmem_desc(skb, i, page_to_netmem(head), 0, psize);
  1766. head = (struct page *)page_private(head);
  1767. }
  1768. __skb_fill_netmem_desc(skb, new_frags - 1, page_to_netmem(head), 0,
  1769. d_off);
  1770. skb_shinfo(skb)->nr_frags = new_frags;
  1771. release:
  1772. skb_zcopy_clear(skb, false);
  1773. return 0;
  1774. }
  1775. EXPORT_SYMBOL_GPL(skb_copy_ubufs);
  1776. /**
  1777. * skb_clone - duplicate an sk_buff
  1778. * @skb: buffer to clone
  1779. * @gfp_mask: allocation priority
  1780. *
  1781. * Duplicate an &sk_buff. The new one is not owned by a socket. Both
  1782. * copies share the same packet data but not structure. The new
  1783. * buffer has a reference count of 1. If the allocation fails the
  1784. * function returns %NULL otherwise the new buffer is returned.
  1785. *
  1786. * If this function is called from an interrupt gfp_mask() must be
  1787. * %GFP_ATOMIC.
  1788. */
  1789. struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
  1790. {
  1791. struct sk_buff_fclones *fclones = container_of(skb,
  1792. struct sk_buff_fclones,
  1793. skb1);
  1794. struct sk_buff *n;
  1795. if (skb_orphan_frags(skb, gfp_mask))
  1796. return NULL;
  1797. if (skb->fclone == SKB_FCLONE_ORIG &&
  1798. refcount_read(&fclones->fclone_ref) == 1) {
  1799. n = &fclones->skb2;
  1800. refcount_set(&fclones->fclone_ref, 2);
  1801. n->fclone = SKB_FCLONE_CLONE;
  1802. } else {
  1803. if (skb_pfmemalloc(skb))
  1804. gfp_mask |= __GFP_MEMALLOC;
  1805. n = kmem_cache_alloc(net_hotdata.skbuff_cache, gfp_mask);
  1806. if (!n)
  1807. return NULL;
  1808. n->fclone = SKB_FCLONE_UNAVAILABLE;
  1809. }
  1810. return __skb_clone(n, skb);
  1811. }
  1812. EXPORT_SYMBOL(skb_clone);
  1813. void skb_headers_offset_update(struct sk_buff *skb, int off)
  1814. {
  1815. /* Only adjust this if it actually is csum_start rather than csum */
  1816. if (skb->ip_summed == CHECKSUM_PARTIAL)
  1817. skb->csum_start += off;
  1818. /* {transport,network,mac}_header and tail are relative to skb->head */
  1819. skb->transport_header += off;
  1820. skb->network_header += off;
  1821. if (skb_mac_header_was_set(skb))
  1822. skb->mac_header += off;
  1823. skb->inner_transport_header += off;
  1824. skb->inner_network_header += off;
  1825. skb->inner_mac_header += off;
  1826. }
  1827. EXPORT_SYMBOL(skb_headers_offset_update);
  1828. void skb_copy_header(struct sk_buff *new, const struct sk_buff *old)
  1829. {
  1830. __copy_skb_header(new, old);
  1831. skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size;
  1832. skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
  1833. skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
  1834. }
  1835. EXPORT_SYMBOL(skb_copy_header);
  1836. static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
  1837. {
  1838. if (skb_pfmemalloc(skb))
  1839. return SKB_ALLOC_RX;
  1840. return 0;
  1841. }
  1842. /**
  1843. * skb_copy - create private copy of an sk_buff
  1844. * @skb: buffer to copy
  1845. * @gfp_mask: allocation priority
  1846. *
  1847. * Make a copy of both an &sk_buff and its data. This is used when the
  1848. * caller wishes to modify the data and needs a private copy of the
  1849. * data to alter. Returns %NULL on failure or the pointer to the buffer
  1850. * on success. The returned buffer has a reference count of 1.
  1851. *
  1852. * As by-product this function converts non-linear &sk_buff to linear
  1853. * one, so that &sk_buff becomes completely private and caller is allowed
  1854. * to modify all the data of returned buffer. This means that this
  1855. * function is not recommended for use in circumstances when only
  1856. * header is going to be modified. Use pskb_copy() instead.
  1857. */
  1858. struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
  1859. {
  1860. struct sk_buff *n;
  1861. unsigned int size;
  1862. int headerlen;
  1863. if (!skb_frags_readable(skb))
  1864. return NULL;
  1865. if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST))
  1866. return NULL;
  1867. headerlen = skb_headroom(skb);
  1868. size = skb_end_offset(skb) + skb->data_len;
  1869. n = __alloc_skb(size, gfp_mask,
  1870. skb_alloc_rx_flag(skb), NUMA_NO_NODE);
  1871. if (!n)
  1872. return NULL;
  1873. /* Set the data pointer */
  1874. skb_reserve(n, headerlen);
  1875. /* Set the tail pointer and length */
  1876. skb_put(n, skb->len);
  1877. BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len));
  1878. skb_copy_header(n, skb);
  1879. return n;
  1880. }
  1881. EXPORT_SYMBOL(skb_copy);
  1882. /**
  1883. * __pskb_copy_fclone - create copy of an sk_buff with private head.
  1884. * @skb: buffer to copy
  1885. * @headroom: headroom of new skb
  1886. * @gfp_mask: allocation priority
  1887. * @fclone: if true allocate the copy of the skb from the fclone
  1888. * cache instead of the head cache; it is recommended to set this
  1889. * to true for the cases where the copy will likely be cloned
  1890. *
  1891. * Make a copy of both an &sk_buff and part of its data, located
  1892. * in header. Fragmented data remain shared. This is used when
  1893. * the caller wishes to modify only header of &sk_buff and needs
  1894. * private copy of the header to alter. Returns %NULL on failure
  1895. * or the pointer to the buffer on success.
  1896. * The returned buffer has a reference count of 1.
  1897. */
  1898. struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
  1899. gfp_t gfp_mask, bool fclone)
  1900. {
  1901. unsigned int size = skb_headlen(skb) + headroom;
  1902. int flags = skb_alloc_rx_flag(skb) | (fclone ? SKB_ALLOC_FCLONE : 0);
  1903. struct sk_buff *n = __alloc_skb(size, gfp_mask, flags, NUMA_NO_NODE);
  1904. if (!n)
  1905. goto out;
  1906. /* Set the data pointer */
  1907. skb_reserve(n, headroom);
  1908. /* Set the tail pointer and length */
  1909. skb_put(n, skb_headlen(skb));
  1910. /* Copy the bytes */
  1911. skb_copy_from_linear_data(skb, n->data, n->len);
  1912. n->truesize += skb->data_len;
  1913. n->data_len = skb->data_len;
  1914. n->len = skb->len;
  1915. if (skb_shinfo(skb)->nr_frags) {
  1916. int i;
  1917. if (skb_orphan_frags(skb, gfp_mask) ||
  1918. skb_zerocopy_clone(n, skb, gfp_mask)) {
  1919. kfree_skb(n);
  1920. n = NULL;
  1921. goto out;
  1922. }
  1923. for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
  1924. skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i];
  1925. skb_frag_ref(skb, i);
  1926. }
  1927. skb_shinfo(n)->nr_frags = i;
  1928. }
  1929. if (skb_has_frag_list(skb)) {
  1930. skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list;
  1931. skb_clone_fraglist(n);
  1932. }
  1933. skb_copy_header(n, skb);
  1934. out:
  1935. return n;
  1936. }
  1937. EXPORT_SYMBOL(__pskb_copy_fclone);
  1938. /**
  1939. * pskb_expand_head - reallocate header of &sk_buff
  1940. * @skb: buffer to reallocate
  1941. * @nhead: room to add at head
  1942. * @ntail: room to add at tail
  1943. * @gfp_mask: allocation priority
  1944. *
  1945. * Expands (or creates identical copy, if @nhead and @ntail are zero)
  1946. * header of @skb. &sk_buff itself is not changed. &sk_buff MUST have
  1947. * reference count of 1. Returns zero in the case of success or error,
  1948. * if expansion failed. In the last case, &sk_buff is not changed.
  1949. *
  1950. * All the pointers pointing into skb header may change and must be
  1951. * reloaded after call to this function.
  1952. *
  1953. * Note: If you skb_push() the start of the buffer after reallocating the
  1954. * header, call skb_postpush_data_move() first to move the metadata out of
  1955. * the way before writing to &sk_buff->data.
  1956. */
  1957. int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
  1958. gfp_t gfp_mask)
  1959. {
  1960. unsigned int osize = skb_end_offset(skb);
  1961. unsigned int size = osize + nhead + ntail;
  1962. long off;
  1963. u8 *data;
  1964. int i;
  1965. BUG_ON(nhead < 0);
  1966. BUG_ON(skb_shared(skb));
  1967. skb_zcopy_downgrade_managed(skb);
  1968. if (skb_pfmemalloc(skb))
  1969. gfp_mask |= __GFP_MEMALLOC;
  1970. data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL);
  1971. if (!data)
  1972. goto nodata;
  1973. size = SKB_WITH_OVERHEAD(size);
  1974. /* Copy only real data... and, alas, header. This should be
  1975. * optimized for the cases when header is void.
  1976. */
  1977. memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head);
  1978. memcpy((struct skb_shared_info *)(data + size),
  1979. skb_shinfo(skb),
  1980. offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));
  1981. /*
  1982. * if shinfo is shared we must drop the old head gracefully, but if it
  1983. * is not we can just drop the old head and let the existing refcount
  1984. * be since all we did is relocate the values
  1985. */
  1986. if (skb_cloned(skb)) {
  1987. if (skb_orphan_frags(skb, gfp_mask))
  1988. goto nofrags;
  1989. if (skb_zcopy(skb))
  1990. refcount_inc(&skb_uarg(skb)->refcnt);
  1991. for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
  1992. skb_frag_ref(skb, i);
  1993. if (skb_has_frag_list(skb))
  1994. skb_clone_fraglist(skb);
  1995. skb_release_data(skb, SKB_CONSUMED);
  1996. } else {
  1997. skb_free_head(skb);
  1998. }
  1999. off = (data + nhead) - skb->head;
  2000. skb->head = data;
  2001. skb->head_frag = 0;
  2002. skb->data += off;
  2003. skb_set_end_offset(skb, size);
  2004. #ifdef NET_SKBUFF_DATA_USES_OFFSET
  2005. off = nhead;
  2006. #endif
  2007. skb->tail += off;
  2008. skb_headers_offset_update(skb, nhead);
  2009. skb->cloned = 0;
  2010. skb->hdr_len = 0;
  2011. skb->nohdr = 0;
  2012. atomic_set(&skb_shinfo(skb)->dataref, 1);
  2013. /* It is not generally safe to change skb->truesize.
  2014. * For the moment, we really care of rx path, or
  2015. * when skb is orphaned (not attached to a socket).
  2016. */
  2017. if (!skb->sk || skb->destructor == sock_edemux)
  2018. skb->truesize += size - osize;
  2019. return 0;
  2020. nofrags:
  2021. skb_kfree_head(data, size);
  2022. nodata:
  2023. return -ENOMEM;
  2024. }
  2025. EXPORT_SYMBOL(pskb_expand_head);
  2026. /* Make private copy of skb with writable head and some headroom */
  2027. struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
  2028. {
  2029. struct sk_buff *skb2;
  2030. int delta = headroom - skb_headroom(skb);
  2031. if (delta <= 0)
  2032. skb2 = pskb_copy(skb, GFP_ATOMIC);
  2033. else {
  2034. skb2 = skb_clone(skb, GFP_ATOMIC);
  2035. if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0,
  2036. GFP_ATOMIC)) {
  2037. kfree_skb(skb2);
  2038. skb2 = NULL;
  2039. }
  2040. }
  2041. return skb2;
  2042. }
  2043. EXPORT_SYMBOL(skb_realloc_headroom);
  2044. /* Note: We plan to rework this in linux-6.4 */
  2045. int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri)
  2046. {
  2047. unsigned int saved_end_offset, saved_truesize;
  2048. struct skb_shared_info *shinfo;
  2049. int res;
  2050. saved_end_offset = skb_end_offset(skb);
  2051. saved_truesize = skb->truesize;
  2052. res = pskb_expand_head(skb, 0, 0, pri);
  2053. if (res)
  2054. return res;
  2055. skb->truesize = saved_truesize;
  2056. if (likely(skb_end_offset(skb) == saved_end_offset))
  2057. return 0;
  2058. /* We can not change skb->end if the original or new value
  2059. * is SKB_SMALL_HEAD_HEADROOM, as it might break skb_kfree_head().
  2060. */
  2061. if (saved_end_offset == SKB_SMALL_HEAD_HEADROOM ||
  2062. skb_end_offset(skb) == SKB_SMALL_HEAD_HEADROOM) {
  2063. /* We think this path should not be taken.
  2064. * Add a temporary trace to warn us just in case.
  2065. */
  2066. pr_err_once("__skb_unclone_keeptruesize() skb_end_offset() %u -> %u\n",
  2067. saved_end_offset, skb_end_offset(skb));
  2068. WARN_ON_ONCE(1);
  2069. return 0;
  2070. }
  2071. shinfo = skb_shinfo(skb);
  2072. /* We are about to change back skb->end,
  2073. * we need to move skb_shinfo() to its new location.
  2074. */
  2075. memmove(skb->head + saved_end_offset,
  2076. shinfo,
  2077. offsetof(struct skb_shared_info, frags[shinfo->nr_frags]));
  2078. skb_set_end_offset(skb, saved_end_offset);
  2079. return 0;
  2080. }
  2081. /**
  2082. * skb_expand_head - reallocate header of &sk_buff
  2083. * @skb: buffer to reallocate
  2084. * @headroom: needed headroom
  2085. *
  2086. * Unlike skb_realloc_headroom, this one does not allocate a new skb
  2087. * if possible; copies skb->sk to new skb as needed
  2088. * and frees original skb in case of failures.
  2089. *
  2090. * It expect increased headroom and generates warning otherwise.
  2091. */
  2092. struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
  2093. {
  2094. int delta = headroom - skb_headroom(skb);
  2095. int osize = skb_end_offset(skb);
  2096. struct sock *sk = skb->sk;
  2097. if (WARN_ONCE(delta <= 0,
  2098. "%s is expecting an increase in the headroom", __func__))
  2099. return skb;
  2100. delta = SKB_DATA_ALIGN(delta);
  2101. /* pskb_expand_head() might crash, if skb is shared. */
  2102. if (skb_shared(skb) || !is_skb_wmem(skb)) {
  2103. struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
  2104. if (unlikely(!nskb))
  2105. goto fail;
  2106. if (sk)
  2107. skb_set_owner_w(nskb, sk);
  2108. consume_skb(skb);
  2109. skb = nskb;
  2110. }
  2111. if (pskb_expand_head(skb, delta, 0, GFP_ATOMIC))
  2112. goto fail;
  2113. if (sk && is_skb_wmem(skb)) {
  2114. delta = skb_end_offset(skb) - osize;
  2115. refcount_add(delta, &sk->sk_wmem_alloc);
  2116. skb->truesize += delta;
  2117. }
  2118. return skb;
  2119. fail:
  2120. kfree_skb(skb);
  2121. return NULL;
  2122. }
  2123. EXPORT_SYMBOL(skb_expand_head);
  2124. /**
  2125. * skb_copy_expand - copy and expand sk_buff
  2126. * @skb: buffer to copy
  2127. * @newheadroom: new free bytes at head
  2128. * @newtailroom: new free bytes at tail
  2129. * @gfp_mask: allocation priority
  2130. *
  2131. * Make a copy of both an &sk_buff and its data and while doing so
  2132. * allocate additional space.
  2133. *
  2134. * This is used when the caller wishes to modify the data and needs a
  2135. * private copy of the data to alter as well as more space for new fields.
  2136. * Returns %NULL on failure or the pointer to the buffer
  2137. * on success. The returned buffer has a reference count of 1.
  2138. *
  2139. * You must pass %GFP_ATOMIC as the allocation priority if this function
  2140. * is called from an interrupt.
  2141. */
  2142. struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
  2143. int newheadroom, int newtailroom,
  2144. gfp_t gfp_mask)
  2145. {
  2146. /*
  2147. * Allocate the copy buffer
  2148. */
  2149. int head_copy_len, head_copy_off;
  2150. struct sk_buff *n;
  2151. int oldheadroom;
  2152. if (!skb_frags_readable(skb))
  2153. return NULL;
  2154. if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST))
  2155. return NULL;
  2156. oldheadroom = skb_headroom(skb);
  2157. n = __alloc_skb(newheadroom + skb->len + newtailroom,
  2158. gfp_mask, skb_alloc_rx_flag(skb),
  2159. NUMA_NO_NODE);
  2160. if (!n)
  2161. return NULL;
  2162. skb_reserve(n, newheadroom);
  2163. /* Set the tail pointer and length */
  2164. skb_put(n, skb->len);
  2165. head_copy_len = oldheadroom;
  2166. head_copy_off = 0;
  2167. if (newheadroom <= head_copy_len)
  2168. head_copy_len = newheadroom;
  2169. else
  2170. head_copy_off = newheadroom - head_copy_len;
  2171. /* Copy the linear header and data. */
  2172. BUG_ON(skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off,
  2173. skb->len + head_copy_len));
  2174. skb_copy_header(n, skb);
  2175. skb_headers_offset_update(n, newheadroom - oldheadroom);
  2176. return n;
  2177. }
  2178. EXPORT_SYMBOL(skb_copy_expand);
  2179. /**
  2180. * __skb_pad - zero pad the tail of an skb
  2181. * @skb: buffer to pad
  2182. * @pad: space to pad
  2183. * @free_on_error: free buffer on error
  2184. *
  2185. * Ensure that a buffer is followed by a padding area that is zero
  2186. * filled. Used by network drivers which may DMA or transfer data
  2187. * beyond the buffer end onto the wire.
  2188. *
  2189. * May return error in out of memory cases. The skb is freed on error
  2190. * if @free_on_error is true.
  2191. */
  2192. int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error)
  2193. {
  2194. int err;
  2195. int ntail;
  2196. /* If the skbuff is non linear tailroom is always zero.. */
  2197. if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) {
  2198. memset(skb->data+skb->len, 0, pad);
  2199. return 0;
  2200. }
  2201. ntail = skb->data_len + pad - (skb->end - skb->tail);
  2202. if (likely(skb_cloned(skb) || ntail > 0)) {
  2203. err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC);
  2204. if (unlikely(err))
  2205. goto free_skb;
  2206. }
  2207. /* FIXME: The use of this function with non-linear skb's really needs
  2208. * to be audited.
  2209. */
  2210. err = skb_linearize(skb);
  2211. if (unlikely(err))
  2212. goto free_skb;
  2213. memset(skb->data + skb->len, 0, pad);
  2214. return 0;
  2215. free_skb:
  2216. if (free_on_error)
  2217. kfree_skb(skb);
  2218. return err;
  2219. }
  2220. EXPORT_SYMBOL(__skb_pad);
  2221. /**
  2222. * pskb_put - add data to the tail of a potentially fragmented buffer
  2223. * @skb: start of the buffer to use
  2224. * @tail: tail fragment of the buffer to use
  2225. * @len: amount of data to add
  2226. *
  2227. * This function extends the used data area of the potentially
  2228. * fragmented buffer. @tail must be the last fragment of @skb -- or
  2229. * @skb itself. If this would exceed the total buffer size the kernel
  2230. * will panic. A pointer to the first byte of the extra data is
  2231. * returned.
  2232. */
  2233. void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len)
  2234. {
  2235. if (tail != skb) {
  2236. skb->data_len += len;
  2237. skb->len += len;
  2238. }
  2239. return skb_put(tail, len);
  2240. }
  2241. EXPORT_SYMBOL_GPL(pskb_put);
  2242. /**
  2243. * skb_put - add data to a buffer
  2244. * @skb: buffer to use
  2245. * @len: amount of data to add
  2246. *
  2247. * This function extends the used data area of the buffer. If this would
  2248. * exceed the total buffer size the kernel will panic. A pointer to the
  2249. * first byte of the extra data is returned.
  2250. */
  2251. void *skb_put(struct sk_buff *skb, unsigned int len)
  2252. {
  2253. void *tmp = skb_tail_pointer(skb);
  2254. SKB_LINEAR_ASSERT(skb);
  2255. skb->tail += len;
  2256. skb->len += len;
  2257. if (unlikely(skb->tail > skb->end))
  2258. skb_over_panic(skb, len, __builtin_return_address(0));
  2259. return tmp;
  2260. }
  2261. EXPORT_SYMBOL(skb_put);
  2262. /**
  2263. * skb_push - add data to the start of a buffer
  2264. * @skb: buffer to use
  2265. * @len: amount of data to add
  2266. *
  2267. * This function extends the used data area of the buffer at the buffer
  2268. * start. If this would exceed the total buffer headroom the kernel will
  2269. * panic. A pointer to the first byte of the extra data is returned.
  2270. */
  2271. void *skb_push(struct sk_buff *skb, unsigned int len)
  2272. {
  2273. skb->data -= len;
  2274. skb->len += len;
  2275. if (unlikely(skb->data < skb->head))
  2276. skb_under_panic(skb, len, __builtin_return_address(0));
  2277. return skb->data;
  2278. }
  2279. EXPORT_SYMBOL(skb_push);
  /**
   *	skb_pull - remove data from the start of a buffer
   *	@skb: buffer to use
   *	@len: amount of data to remove
   *
   *	This function removes data from the start of a buffer, returning
   *	the memory to the headroom. A pointer to the next data in the buffer
   *	is returned. Once the data has been pulled future pushes will overwrite
   *	the old data.
   *
   *	Out-of-line wrapper around skb_pull_inline().
   */
  void *skb_pull(struct sk_buff *skb, unsigned int len)
  {
  	void *next = skb_pull_inline(skb, len);

  	return next;
  }
  EXPORT_SYMBOL(skb_pull);
  2295. /**
  2296. * skb_pull_data - remove data from the start of a buffer returning its
  2297. * original position.
  2298. * @skb: buffer to use
  2299. * @len: amount of data to remove
  2300. *
  2301. * This function removes data from the start of a buffer, returning
  2302. * the memory to the headroom. A pointer to the original data in the buffer
  2303. * is returned after checking if there is enough data to pull. Once the
  2304. * data has been pulled future pushes will overwrite the old data.
  2305. */
  2306. void *skb_pull_data(struct sk_buff *skb, size_t len)
  2307. {
  2308. void *data = skb->data;
  2309. if (skb->len < len)
  2310. return NULL;
  2311. skb_pull(skb, len);
  2312. return data;
  2313. }
  2314. EXPORT_SYMBOL(skb_pull_data);
  2315. /**
  2316. * skb_trim - remove end from a buffer
  2317. * @skb: buffer to alter
  2318. * @len: new length
  2319. *
  2320. * Cut the length of a buffer down by removing data from the tail. If
  2321. * the buffer is already under the length specified it is not modified.
  2322. * The skb must be linear.
  2323. */
  2324. void skb_trim(struct sk_buff *skb, unsigned int len)
  2325. {
  2326. if (skb->len > len)
  2327. __skb_trim(skb, len);
  2328. }
  2329. EXPORT_SYMBOL(skb_trim);
  2330. /* Trims skb to length len. It can change skb pointers.
  2331. */
  2332. int ___pskb_trim(struct sk_buff *skb, unsigned int len)
  2333. {
  2334. struct sk_buff **fragp;
  2335. struct sk_buff *frag;
  2336. int offset = skb_headlen(skb);
  2337. int nfrags = skb_shinfo(skb)->nr_frags;
  2338. int i;
  2339. int err;
  2340. if (skb_cloned(skb) &&
  2341. unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))))
  2342. return err;
  2343. i = 0;
  2344. if (offset >= len)
  2345. goto drop_pages;
  2346. for (; i < nfrags; i++) {
  2347. int end = offset + skb_frag_size(&skb_shinfo(skb)->frags[i]);
  2348. if (end < len) {
  2349. offset = end;
  2350. continue;
  2351. }
  2352. skb_frag_size_set(&skb_shinfo(skb)->frags[i++], len - offset);
  2353. drop_pages:
  2354. skb_shinfo(skb)->nr_frags = i;
  2355. for (; i < nfrags; i++)
  2356. skb_frag_unref(skb, i);
  2357. if (skb_has_frag_list(skb))
  2358. skb_drop_fraglist(skb);
  2359. goto done;
  2360. }
  2361. for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp);
  2362. fragp = &frag->next) {
  2363. int end = offset + frag->len;
  2364. if (skb_shared(frag)) {
  2365. struct sk_buff *nfrag;
  2366. nfrag = skb_clone(frag, GFP_ATOMIC);
  2367. if (unlikely(!nfrag))
  2368. return -ENOMEM;
  2369. nfrag->next = frag->next;
  2370. consume_skb(frag);
  2371. frag = nfrag;
  2372. *fragp = frag;
  2373. }
  2374. if (end < len) {
  2375. offset = end;
  2376. continue;
  2377. }
  2378. if (end > len &&
  2379. unlikely((err = pskb_trim(frag, len - offset))))
  2380. return err;
  2381. if (frag->next)
  2382. skb_drop_list(&frag->next);
  2383. break;
  2384. }
  2385. done:
  2386. if (len > skb_headlen(skb)) {
  2387. skb->data_len -= skb->len - len;
  2388. skb->len = len;
  2389. } else {
  2390. skb->len = len;
  2391. skb->data_len = 0;
  2392. skb_set_tail_pointer(skb, len);
  2393. }
  2394. if (!skb->sk || skb->destructor == sock_edemux)
  2395. skb_condense(skb);
  2396. return 0;
  2397. }
  2398. EXPORT_SYMBOL(___pskb_trim);
  2399. /* Note : use pskb_trim_rcsum() instead of calling this directly
  2400. */
  2401. int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len)
  2402. {
  2403. if (skb->ip_summed == CHECKSUM_COMPLETE) {
  2404. int delta = skb->len - len;
  2405. skb->csum = csum_block_sub(skb->csum,
  2406. skb_checksum(skb, len, delta, 0),
  2407. len);
  2408. } else if (skb->ip_summed == CHECKSUM_PARTIAL) {
  2409. int hdlen = (len > skb_headlen(skb)) ? skb_headlen(skb) : len;
  2410. int offset = skb_checksum_start_offset(skb) + skb->csum_offset;
  2411. if (offset + sizeof(__sum16) > hdlen)
  2412. return -EINVAL;
  2413. }
  2414. return __pskb_trim(skb, len);
  2415. }
  2416. EXPORT_SYMBOL(pskb_trim_rcsum_slow);
  2417. /**
  2418. * __pskb_pull_tail - advance tail of skb header
  2419. * @skb: buffer to reallocate
  2420. * @delta: number of bytes to advance tail
  2421. *
  2422. * The function makes a sense only on a fragmented &sk_buff,
  2423. * it expands header moving its tail forward and copying necessary
  2424. * data from fragmented part.
  2425. *
  2426. * &sk_buff MUST have reference count of 1.
  2427. *
  2428. * Returns %NULL (and &sk_buff does not change) if pull failed
  2429. * or value of new tail of skb in the case of success.
  2430. *
  2431. * All the pointers pointing into skb header may change and must be
  2432. * reloaded after call to this function.
  2433. */
  2434. /* Moves tail of skb head forward, copying data from fragmented part,
  2435. * when it is necessary.
  2436. * 1. It may fail due to malloc failure.
  2437. * 2. It may change skb pointers.
  2438. *
  2439. * It is pretty complicated. Luckily, it is called only in exceptional cases.
  2440. */
  2441. void *__pskb_pull_tail(struct sk_buff *skb, int delta)
  2442. {
  2443. /* If skb has not enough free space at tail, get new one
  2444. * plus 128 bytes for future expansions. If we have enough
  2445. * room at tail, reallocate without expansion only if skb is cloned.
  2446. */
  2447. int i, k, eat = (skb->tail + delta) - skb->end;
  2448. if (!skb_frags_readable(skb))
  2449. return NULL;
  2450. if (eat > 0 || skb_cloned(skb)) {
  2451. if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0,
  2452. GFP_ATOMIC))
  2453. return NULL;
  2454. }
  2455. BUG_ON(skb_copy_bits(skb, skb_headlen(skb),
  2456. skb_tail_pointer(skb), delta));
  2457. /* Optimization: no fragments, no reasons to preestimate
  2458. * size of pulled pages. Superb.
  2459. */
  2460. if (!skb_has_frag_list(skb))
  2461. goto pull_pages;
  2462. /* Estimate size of pulled pages. */
  2463. eat = delta;
  2464. for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
  2465. int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
  2466. if (size >= eat)
  2467. goto pull_pages;
  2468. eat -= size;
  2469. }
  2470. /* If we need update frag list, we are in troubles.
  2471. * Certainly, it is possible to add an offset to skb data,
  2472. * but taking into account that pulling is expected to
  2473. * be very rare operation, it is worth to fight against
  2474. * further bloating skb head and crucify ourselves here instead.
  2475. * Pure masohism, indeed. 8)8)
  2476. */
  2477. if (eat) {
  2478. struct sk_buff *list = skb_shinfo(skb)->frag_list;
  2479. struct sk_buff *clone = NULL;
  2480. struct sk_buff *insp = NULL;
  2481. do {
  2482. if (list->len <= eat) {
  2483. /* Eaten as whole. */
  2484. eat -= list->len;
  2485. list = list->next;
  2486. insp = list;
  2487. } else {
  2488. /* Eaten partially. */
  2489. if (skb_is_gso(skb) && !list->head_frag &&
  2490. skb_headlen(list))
  2491. skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
  2492. if (skb_shared(list)) {
  2493. /* Sucks! We need to fork list. :-( */
  2494. clone = skb_clone(list, GFP_ATOMIC);
  2495. if (!clone)
  2496. return NULL;
  2497. insp = list->next;
  2498. list = clone;
  2499. } else {
  2500. /* This may be pulled without
  2501. * problems. */
  2502. insp = list;
  2503. }
  2504. if (!pskb_pull(list, eat)) {
  2505. kfree_skb(clone);
  2506. return NULL;
  2507. }
  2508. break;
  2509. }
  2510. } while (eat);
  2511. /* Free pulled out fragments. */
  2512. while ((list = skb_shinfo(skb)->frag_list) != insp) {
  2513. skb_shinfo(skb)->frag_list = list->next;
  2514. consume_skb(list);
  2515. }
  2516. /* And insert new clone at head. */
  2517. if (clone) {
  2518. clone->next = list;
  2519. skb_shinfo(skb)->frag_list = clone;
  2520. }
  2521. }
  2522. /* Success! Now we may commit changes to skb data. */
  2523. pull_pages:
  2524. eat = delta;
  2525. k = 0;
  2526. for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
  2527. int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
  2528. if (size <= eat) {
  2529. skb_frag_unref(skb, i);
  2530. eat -= size;
  2531. } else {
  2532. skb_frag_t *frag = &skb_shinfo(skb)->frags[k];
  2533. *frag = skb_shinfo(skb)->frags[i];
  2534. if (eat) {
  2535. skb_frag_off_add(frag, eat);
  2536. skb_frag_size_sub(frag, eat);
  2537. if (!i)
  2538. goto end;
  2539. eat = 0;
  2540. }
  2541. k++;
  2542. }
  2543. }
  2544. skb_shinfo(skb)->nr_frags = k;
  2545. end:
  2546. skb->tail += delta;
  2547. skb->data_len -= delta;
  2548. if (!skb->data_len)
  2549. skb_zcopy_clear(skb, false);
  2550. return skb_tail_pointer(skb);
  2551. }
  2552. EXPORT_SYMBOL(__pskb_pull_tail);
  2553. /**
  2554. * skb_copy_bits - copy bits from skb to kernel buffer
  2555. * @skb: source skb
  2556. * @offset: offset in source
  2557. * @to: destination buffer
  2558. * @len: number of bytes to copy
  2559. *
  2560. * Copy the specified number of bytes from the source skb to the
  2561. * destination buffer.
  2562. *
  2563. * CAUTION ! :
  2564. * If its prototype is ever changed,
  2565. * check arch/{*}/net/{*}.S files,
  2566. * since it is called from BPF assembly code.
  2567. */
  2568. int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
  2569. {
  2570. int start = skb_headlen(skb);
  2571. struct sk_buff *frag_iter;
  2572. int i, copy;
  2573. if (offset > (int)skb->len - len)
  2574. goto fault;
  2575. /* Copy header. */
  2576. if ((copy = start - offset) > 0) {
  2577. if (copy > len)
  2578. copy = len;
  2579. skb_copy_from_linear_data_offset(skb, offset, to, copy);
  2580. if ((len -= copy) == 0)
  2581. return 0;
  2582. offset += copy;
  2583. to += copy;
  2584. }
  2585. if (!skb_frags_readable(skb))
  2586. goto fault;
  2587. for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
  2588. int end;
  2589. skb_frag_t *f = &skb_shinfo(skb)->frags[i];
  2590. WARN_ON(start > offset + len);
  2591. end = start + skb_frag_size(f);
  2592. if ((copy = end - offset) > 0) {
  2593. u32 p_off, p_len, copied;
  2594. struct page *p;
  2595. u8 *vaddr;
  2596. if (copy > len)
  2597. copy = len;
  2598. skb_frag_foreach_page(f,
  2599. skb_frag_off(f) + offset - start,
  2600. copy, p, p_off, p_len, copied) {
  2601. vaddr = kmap_atomic(p);
  2602. memcpy(to + copied, vaddr + p_off, p_len);
  2603. kunmap_atomic(vaddr);
  2604. }
  2605. if ((len -= copy) == 0)
  2606. return 0;
  2607. offset += copy;
  2608. to += copy;
  2609. }
  2610. start = end;
  2611. }
  2612. skb_walk_frags(skb, frag_iter) {
  2613. int end;
  2614. WARN_ON(start > offset + len);
  2615. end = start + frag_iter->len;
  2616. if ((copy = end - offset) > 0) {
  2617. if (copy > len)
  2618. copy = len;
  2619. if (skb_copy_bits(frag_iter, offset - start, to, copy))
  2620. goto fault;
  2621. if ((len -= copy) == 0)
  2622. return 0;
  2623. offset += copy;
  2624. to += copy;
  2625. }
  2626. start = end;
  2627. }
  2628. if (!len)
  2629. return 0;
  2630. fault:
  2631. return -EFAULT;
  2632. }
  2633. EXPORT_SYMBOL(skb_copy_bits);
  2634. /*
  2635. * Callback from splice_to_pipe(), if we need to release some pages
  2636. * at the end of the spd in case we error'ed out in filling the pipe.
  2637. */
  2638. static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i)
  2639. {
  2640. put_page(spd->pages[i]);
  2641. }
  2642. static struct page *linear_to_page(struct page *page, unsigned int *len,
  2643. unsigned int *offset,
  2644. struct sock *sk)
  2645. {
  2646. struct page_frag *pfrag = sk_page_frag(sk);
  2647. if (!sk_page_frag_refill(sk, pfrag))
  2648. return NULL;
  2649. *len = min_t(unsigned int, *len, pfrag->size - pfrag->offset);
  2650. memcpy(page_address(pfrag->page) + pfrag->offset,
  2651. page_address(page) + *offset, *len);
  2652. *offset = pfrag->offset;
  2653. pfrag->offset += *len;
  2654. return pfrag->page;
  2655. }
  2656. static bool spd_can_coalesce(const struct splice_pipe_desc *spd,
  2657. struct page *page,
  2658. unsigned int offset)
  2659. {
  2660. return spd->nr_pages &&
  2661. spd->pages[spd->nr_pages - 1] == page &&
  2662. (spd->partial[spd->nr_pages - 1].offset +
  2663. spd->partial[spd->nr_pages - 1].len == offset);
  2664. }
  2665. /*
  2666. * Fill page/offset/length into spd, if it can hold more pages.
  2667. */
  2668. static bool spd_fill_page(struct splice_pipe_desc *spd, struct page *page,
  2669. unsigned int *len, unsigned int offset, bool linear,
  2670. struct sock *sk)
  2671. {
  2672. if (unlikely(spd->nr_pages == MAX_SKB_FRAGS))
  2673. return true;
  2674. if (linear) {
  2675. page = linear_to_page(page, len, &offset, sk);
  2676. if (!page)
  2677. return true;
  2678. }
  2679. if (spd_can_coalesce(spd, page, offset)) {
  2680. spd->partial[spd->nr_pages - 1].len += *len;
  2681. return false;
  2682. }
  2683. get_page(page);
  2684. spd->pages[spd->nr_pages] = page;
  2685. spd->partial[spd->nr_pages].len = *len;
  2686. spd->partial[spd->nr_pages].offset = offset;
  2687. spd->nr_pages++;
  2688. return false;
  2689. }
  2690. static bool __splice_segment(struct page *page, unsigned int poff,
  2691. unsigned int plen, unsigned int *off,
  2692. unsigned int *len,
  2693. struct splice_pipe_desc *spd, bool linear,
  2694. struct sock *sk)
  2695. {
  2696. if (!*len)
  2697. return true;
  2698. /* skip this segment if already processed */
  2699. if (*off >= plen) {
  2700. *off -= plen;
  2701. return false;
  2702. }
  2703. /* ignore any bits we already processed */
  2704. poff += *off;
  2705. plen -= *off;
  2706. *off = 0;
  2707. do {
  2708. unsigned int flen = min(*len, plen);
  2709. if (spd_fill_page(spd, page, &flen, poff, linear, sk))
  2710. return true;
  2711. poff += flen;
  2712. plen -= flen;
  2713. *len -= flen;
  2714. if (!*len)
  2715. return true;
  2716. } while (plen);
  2717. return false;
  2718. }
  2719. /*
  2720. * Map linear and fragment data from the skb to spd. It reports true if the
  2721. * pipe is full or if we already spliced the requested length.
  2722. */
  2723. static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
  2724. unsigned int *offset, unsigned int *len,
  2725. struct splice_pipe_desc *spd, struct sock *sk)
  2726. {
  2727. struct sk_buff *iter;
  2728. int seg;
  2729. /* map the linear part :
  2730. * If skb->head_frag is set, this 'linear' part is backed by a
  2731. * fragment, and if the head is not shared with any clones then
  2732. * we can avoid a copy since we own the head portion of this page.
  2733. */
  2734. if (__splice_segment(virt_to_page(skb->data),
  2735. (unsigned long) skb->data & (PAGE_SIZE - 1),
  2736. skb_headlen(skb),
  2737. offset, len, spd,
  2738. skb_head_is_locked(skb),
  2739. sk))
  2740. return true;
  2741. /*
  2742. * then map the fragments
  2743. */
  2744. if (!skb_frags_readable(skb))
  2745. return false;
  2746. for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) {
  2747. const skb_frag_t *f = &skb_shinfo(skb)->frags[seg];
  2748. if (WARN_ON_ONCE(!skb_frag_page(f)))
  2749. return false;
  2750. if (__splice_segment(skb_frag_page(f),
  2751. skb_frag_off(f), skb_frag_size(f),
  2752. offset, len, spd, false, sk))
  2753. return true;
  2754. }
  2755. skb_walk_frags(skb, iter) {
  2756. if (*offset >= iter->len) {
  2757. *offset -= iter->len;
  2758. continue;
  2759. }
  2760. /* __skb_splice_bits() only fails if the output has no room
  2761. * left, so no point in going over the frag_list for the error
  2762. * case.
  2763. */
  2764. if (__skb_splice_bits(iter, pipe, offset, len, spd, sk))
  2765. return true;
  2766. }
  2767. return false;
  2768. }
  2769. /*
  2770. * Map data from the skb to a pipe. Should handle both the linear part,
  2771. * the fragments, and the frag list.
  2772. */
  2773. int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
  2774. struct pipe_inode_info *pipe, unsigned int tlen,
  2775. unsigned int flags)
  2776. {
  2777. struct partial_page partial[MAX_SKB_FRAGS];
  2778. struct page *pages[MAX_SKB_FRAGS];
  2779. struct splice_pipe_desc spd = {
  2780. .pages = pages,
  2781. .partial = partial,
  2782. .nr_pages_max = MAX_SKB_FRAGS,
  2783. .ops = &nosteal_pipe_buf_ops,
  2784. .spd_release = sock_spd_release,
  2785. };
  2786. int ret = 0;
  2787. __skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk);
  2788. if (spd.nr_pages)
  2789. ret = splice_to_pipe(pipe, &spd);
  2790. return ret;
  2791. }
  2792. EXPORT_SYMBOL_GPL(skb_splice_bits);
  2793. static int sendmsg_locked(struct sock *sk, struct msghdr *msg)
  2794. {
  2795. struct socket *sock = sk->sk_socket;
  2796. size_t size = msg_data_left(msg);
  2797. if (!sock)
  2798. return -EINVAL;
  2799. if (!sock->ops->sendmsg_locked)
  2800. return sock_no_sendmsg_locked(sk, msg, size);
  2801. return sock->ops->sendmsg_locked(sk, msg, size);
  2802. }
  2803. static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg)
  2804. {
  2805. struct socket *sock = sk->sk_socket;
  2806. if (!sock)
  2807. return -EINVAL;
  2808. return sock_sendmsg(sock, msg);
  2809. }
  2810. typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg);
  2811. static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset,
  2812. int len, sendmsg_func sendmsg, int flags)
  2813. {
  2814. int more_hint = sk_is_tcp(sk) ? MSG_MORE : 0;
  2815. unsigned int orig_len = len;
  2816. struct sk_buff *head = skb;
  2817. unsigned short fragidx;
  2818. int slen, ret;
  2819. do_frag_list:
  2820. /* Deal with head data */
  2821. while (offset < skb_headlen(skb) && len) {
  2822. struct kvec kv;
  2823. struct msghdr msg;
  2824. slen = min_t(int, len, skb_headlen(skb) - offset);
  2825. kv.iov_base = skb->data + offset;
  2826. kv.iov_len = slen;
  2827. memset(&msg, 0, sizeof(msg));
  2828. msg.msg_flags = MSG_DONTWAIT | flags;
  2829. if (slen < len)
  2830. msg.msg_flags |= more_hint;
  2831. iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &kv, 1, slen);
  2832. ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked,
  2833. sendmsg_unlocked, sk, &msg);
  2834. if (ret <= 0)
  2835. goto error;
  2836. offset += ret;
  2837. len -= ret;
  2838. }
  2839. /* All the data was skb head? */
  2840. if (!len)
  2841. goto out;
  2842. /* Make offset relative to start of frags */
  2843. offset -= skb_headlen(skb);
  2844. /* Find where we are in frag list */
  2845. for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags; fragidx++) {
  2846. skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx];
  2847. if (offset < skb_frag_size(frag))
  2848. break;
  2849. offset -= skb_frag_size(frag);
  2850. }
  2851. for (; len && fragidx < skb_shinfo(skb)->nr_frags; fragidx++) {
  2852. skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx];
  2853. slen = min_t(size_t, len, skb_frag_size(frag) - offset);
  2854. while (slen) {
  2855. struct bio_vec bvec;
  2856. struct msghdr msg = {
  2857. .msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT |
  2858. flags,
  2859. };
  2860. if (slen < len)
  2861. msg.msg_flags |= more_hint;
  2862. bvec_set_page(&bvec, skb_frag_page(frag), slen,
  2863. skb_frag_off(frag) + offset);
  2864. iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1,
  2865. slen);
  2866. ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked,
  2867. sendmsg_unlocked, sk, &msg);
  2868. if (ret <= 0)
  2869. goto error;
  2870. len -= ret;
  2871. offset += ret;
  2872. slen -= ret;
  2873. }
  2874. offset = 0;
  2875. }
  2876. if (len) {
  2877. /* Process any frag lists */
  2878. if (skb == head) {
  2879. if (skb_has_frag_list(skb)) {
  2880. skb = skb_shinfo(skb)->frag_list;
  2881. goto do_frag_list;
  2882. }
  2883. } else if (skb->next) {
  2884. skb = skb->next;
  2885. goto do_frag_list;
  2886. }
  2887. }
  2888. out:
  2889. return orig_len - len;
  2890. error:
  2891. return orig_len == len ? ret : orig_len - len;
  2892. }
  2893. /* Send skb data on a socket. Socket must be locked. */
  2894. int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
  2895. int len)
  2896. {
  2897. return __skb_send_sock(sk, skb, offset, len, sendmsg_locked, 0);
  2898. }
  2899. EXPORT_SYMBOL_GPL(skb_send_sock_locked);
  2900. int skb_send_sock_locked_with_flags(struct sock *sk, struct sk_buff *skb,
  2901. int offset, int len, int flags)
  2902. {
  2903. return __skb_send_sock(sk, skb, offset, len, sendmsg_locked, flags);
  2904. }
  2905. EXPORT_SYMBOL_GPL(skb_send_sock_locked_with_flags);
  2906. /* Send skb data on a socket. Socket must be unlocked. */
  2907. int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len)
  2908. {
  2909. return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked, 0);
  2910. }
  2911. /**
  2912. * skb_store_bits - store bits from kernel buffer to skb
  2913. * @skb: destination buffer
  2914. * @offset: offset in destination
  2915. * @from: source buffer
  2916. * @len: number of bytes to copy
  2917. *
  2918. * Copy the specified number of bytes from the source buffer to the
  2919. * destination skb. This function handles all the messy bits of
  2920. * traversing fragment lists and such.
  2921. */
  2922. int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len)
  2923. {
  2924. int start = skb_headlen(skb);
  2925. struct sk_buff *frag_iter;
  2926. int i, copy;
  2927. if (offset > (int)skb->len - len)
  2928. goto fault;
  2929. if ((copy = start - offset) > 0) {
  2930. if (copy > len)
  2931. copy = len;
  2932. skb_copy_to_linear_data_offset(skb, offset, from, copy);
  2933. if ((len -= copy) == 0)
  2934. return 0;
  2935. offset += copy;
  2936. from += copy;
  2937. }
  2938. if (!skb_frags_readable(skb))
  2939. goto fault;
  2940. for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
  2941. skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
  2942. int end;
  2943. WARN_ON(start > offset + len);
  2944. end = start + skb_frag_size(frag);
  2945. if ((copy = end - offset) > 0) {
  2946. u32 p_off, p_len, copied;
  2947. struct page *p;
  2948. u8 *vaddr;
  2949. if (copy > len)
  2950. copy = len;
  2951. skb_frag_foreach_page(frag,
  2952. skb_frag_off(frag) + offset - start,
  2953. copy, p, p_off, p_len, copied) {
  2954. vaddr = kmap_atomic(p);
  2955. memcpy(vaddr + p_off, from + copied, p_len);
  2956. kunmap_atomic(vaddr);
  2957. }
  2958. if ((len -= copy) == 0)
  2959. return 0;
  2960. offset += copy;
  2961. from += copy;
  2962. }
  2963. start = end;
  2964. }
  2965. skb_walk_frags(skb, frag_iter) {
  2966. int end;
  2967. WARN_ON(start > offset + len);
  2968. end = start + frag_iter->len;
  2969. if ((copy = end - offset) > 0) {
  2970. if (copy > len)
  2971. copy = len;
  2972. if (skb_store_bits(frag_iter, offset - start,
  2973. from, copy))
  2974. goto fault;
  2975. if ((len -= copy) == 0)
  2976. return 0;
  2977. offset += copy;
  2978. from += copy;
  2979. }
  2980. start = end;
  2981. }
  2982. if (!len)
  2983. return 0;
  2984. fault:
  2985. return -EFAULT;
  2986. }
  2987. EXPORT_SYMBOL(skb_store_bits);
  2988. /* Checksum skb data. */
  2989. __wsum skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum)
  2990. {
  2991. int start = skb_headlen(skb);
  2992. int i, copy = start - offset;
  2993. struct sk_buff *frag_iter;
  2994. int pos = 0;
  2995. /* Checksum header. */
  2996. if (copy > 0) {
  2997. if (copy > len)
  2998. copy = len;
  2999. csum = csum_partial(skb->data + offset, copy, csum);
  3000. if ((len -= copy) == 0)
  3001. return csum;
  3002. offset += copy;
  3003. pos = copy;
  3004. }
  3005. if (WARN_ON_ONCE(!skb_frags_readable(skb)))
  3006. return 0;
  3007. for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
  3008. int end;
  3009. skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
  3010. WARN_ON(start > offset + len);
  3011. end = start + skb_frag_size(frag);
  3012. if ((copy = end - offset) > 0) {
  3013. u32 p_off, p_len, copied;
  3014. struct page *p;
  3015. __wsum csum2;
  3016. u8 *vaddr;
  3017. if (copy > len)
  3018. copy = len;
  3019. skb_frag_foreach_page(frag,
  3020. skb_frag_off(frag) + offset - start,
  3021. copy, p, p_off, p_len, copied) {
  3022. vaddr = kmap_atomic(p);
  3023. csum2 = csum_partial(vaddr + p_off, p_len, 0);
  3024. kunmap_atomic(vaddr);
  3025. csum = csum_block_add(csum, csum2, pos);
  3026. pos += p_len;
  3027. }
  3028. if (!(len -= copy))
  3029. return csum;
  3030. offset += copy;
  3031. }
  3032. start = end;
  3033. }
  3034. skb_walk_frags(skb, frag_iter) {
  3035. int end;
  3036. WARN_ON(start > offset + len);
  3037. end = start + frag_iter->len;
  3038. if ((copy = end - offset) > 0) {
  3039. __wsum csum2;
  3040. if (copy > len)
  3041. copy = len;
  3042. csum2 = skb_checksum(frag_iter, offset - start, copy,
  3043. 0);
  3044. csum = csum_block_add(csum, csum2, pos);
  3045. if ((len -= copy) == 0)
  3046. return csum;
  3047. offset += copy;
  3048. pos += copy;
  3049. }
  3050. start = end;
  3051. }
  3052. BUG_ON(len);
  3053. return csum;
  3054. }
  3055. EXPORT_SYMBOL(skb_checksum);
  3056. /* Both of above in one bottle. */
  3057. __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
  3058. u8 *to, int len)
  3059. {
  3060. int start = skb_headlen(skb);
  3061. int i, copy = start - offset;
  3062. struct sk_buff *frag_iter;
  3063. int pos = 0;
  3064. __wsum csum = 0;
  3065. /* Copy header. */
  3066. if (copy > 0) {
  3067. if (copy > len)
  3068. copy = len;
  3069. csum = csum_partial_copy_nocheck(skb->data + offset, to,
  3070. copy);
  3071. if ((len -= copy) == 0)
  3072. return csum;
  3073. offset += copy;
  3074. to += copy;
  3075. pos = copy;
  3076. }
  3077. if (!skb_frags_readable(skb))
  3078. return 0;
  3079. for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
  3080. int end;
  3081. WARN_ON(start > offset + len);
  3082. end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
  3083. if ((copy = end - offset) > 0) {
  3084. skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
  3085. u32 p_off, p_len, copied;
  3086. struct page *p;
  3087. __wsum csum2;
  3088. u8 *vaddr;
  3089. if (copy > len)
  3090. copy = len;
  3091. skb_frag_foreach_page(frag,
  3092. skb_frag_off(frag) + offset - start,
  3093. copy, p, p_off, p_len, copied) {
  3094. vaddr = kmap_atomic(p);
  3095. csum2 = csum_partial_copy_nocheck(vaddr + p_off,
  3096. to + copied,
  3097. p_len);
  3098. kunmap_atomic(vaddr);
  3099. csum = csum_block_add(csum, csum2, pos);
  3100. pos += p_len;
  3101. }
  3102. if (!(len -= copy))
  3103. return csum;
  3104. offset += copy;
  3105. to += copy;
  3106. }
  3107. start = end;
  3108. }
  3109. skb_walk_frags(skb, frag_iter) {
  3110. __wsum csum2;
  3111. int end;
  3112. WARN_ON(start > offset + len);
  3113. end = start + frag_iter->len;
  3114. if ((copy = end - offset) > 0) {
  3115. if (copy > len)
  3116. copy = len;
  3117. csum2 = skb_copy_and_csum_bits(frag_iter,
  3118. offset - start,
  3119. to, copy);
  3120. csum = csum_block_add(csum, csum2, pos);
  3121. if ((len -= copy) == 0)
  3122. return csum;
  3123. offset += copy;
  3124. to += copy;
  3125. pos += copy;
  3126. }
  3127. start = end;
  3128. }
  3129. BUG_ON(len);
  3130. return csum;
  3131. }
  3132. EXPORT_SYMBOL(skb_copy_and_csum_bits);
  3133. #ifdef CONFIG_NET_CRC32C
  3134. u32 skb_crc32c(const struct sk_buff *skb, int offset, int len, u32 crc)
  3135. {
  3136. int start = skb_headlen(skb);
  3137. int i, copy = start - offset;
  3138. struct sk_buff *frag_iter;
  3139. if (copy > 0) {
  3140. copy = min(copy, len);
  3141. crc = crc32c(crc, skb->data + offset, copy);
  3142. len -= copy;
  3143. if (len == 0)
  3144. return crc;
  3145. offset += copy;
  3146. }
  3147. if (WARN_ON_ONCE(!skb_frags_readable(skb)))
  3148. return 0;
  3149. for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
  3150. int end;
  3151. skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
  3152. WARN_ON(start > offset + len);
  3153. end = start + skb_frag_size(frag);
  3154. copy = end - offset;
  3155. if (copy > 0) {
  3156. u32 p_off, p_len, copied;
  3157. struct page *p;
  3158. u8 *vaddr;
  3159. copy = min(copy, len);
  3160. skb_frag_foreach_page(frag,
  3161. skb_frag_off(frag) + offset - start,
  3162. copy, p, p_off, p_len, copied) {
  3163. vaddr = kmap_atomic(p);
  3164. crc = crc32c(crc, vaddr + p_off, p_len);
  3165. kunmap_atomic(vaddr);
  3166. }
  3167. len -= copy;
  3168. if (len == 0)
  3169. return crc;
  3170. offset += copy;
  3171. }
  3172. start = end;
  3173. }
  3174. skb_walk_frags(skb, frag_iter) {
  3175. int end;
  3176. WARN_ON(start > offset + len);
  3177. end = start + frag_iter->len;
  3178. copy = end - offset;
  3179. if (copy > 0) {
  3180. copy = min(copy, len);
  3181. crc = skb_crc32c(frag_iter, offset - start, copy, crc);
  3182. len -= copy;
  3183. if (len == 0)
  3184. return crc;
  3185. offset += copy;
  3186. }
  3187. start = end;
  3188. }
  3189. BUG_ON(len);
  3190. return crc;
  3191. }
  3192. EXPORT_SYMBOL(skb_crc32c);
  3193. #endif /* CONFIG_NET_CRC32C */
  3194. __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len)
  3195. {
  3196. __sum16 sum;
  3197. sum = csum_fold(skb_checksum(skb, 0, len, skb->csum));
  3198. /* See comments in __skb_checksum_complete(). */
  3199. if (likely(!sum)) {
  3200. if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
  3201. !skb->csum_complete_sw)
  3202. netdev_rx_csum_fault(skb->dev, skb);
  3203. }
  3204. if (!skb_shared(skb))
  3205. skb->csum_valid = !sum;
  3206. return sum;
  3207. }
  3208. EXPORT_SYMBOL(__skb_checksum_complete_head);
  3209. /* This function assumes skb->csum already holds pseudo header's checksum,
  3210. * which has been changed from the hardware checksum, for example, by
  3211. * __skb_checksum_validate_complete(). And, the original skb->csum must
  3212. * have been validated unsuccessfully for CHECKSUM_COMPLETE case.
  3213. *
  3214. * It returns non-zero if the recomputed checksum is still invalid, otherwise
  3215. * zero. The new checksum is stored back into skb->csum unless the skb is
  3216. * shared.
  3217. */
  3218. __sum16 __skb_checksum_complete(struct sk_buff *skb)
  3219. {
  3220. __wsum csum;
  3221. __sum16 sum;
  3222. csum = skb_checksum(skb, 0, skb->len, 0);
  3223. sum = csum_fold(csum_add(skb->csum, csum));
  3224. /* This check is inverted, because we already knew the hardware
  3225. * checksum is invalid before calling this function. So, if the
  3226. * re-computed checksum is valid instead, then we have a mismatch
  3227. * between the original skb->csum and skb_checksum(). This means either
  3228. * the original hardware checksum is incorrect or we screw up skb->csum
  3229. * when moving skb->data around.
  3230. */
  3231. if (likely(!sum)) {
  3232. if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
  3233. !skb->csum_complete_sw)
  3234. netdev_rx_csum_fault(skb->dev, skb);
  3235. }
  3236. if (!skb_shared(skb)) {
  3237. /* Save full packet checksum */
  3238. skb->csum = csum;
  3239. skb->ip_summed = CHECKSUM_COMPLETE;
  3240. skb->csum_complete_sw = 1;
  3241. skb->csum_valid = !sum;
  3242. }
  3243. return sum;
  3244. }
  3245. EXPORT_SYMBOL(__skb_checksum_complete);
  3246. /**
  3247. * skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy()
  3248. * @from: source buffer
  3249. *
  3250. * Calculates the amount of linear headroom needed in the 'to' skb passed
  3251. * into skb_zerocopy().
  3252. */
  3253. unsigned int
  3254. skb_zerocopy_headlen(const struct sk_buff *from)
  3255. {
  3256. unsigned int hlen = 0;
  3257. if (!from->head_frag ||
  3258. skb_headlen(from) < L1_CACHE_BYTES ||
  3259. skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) {
  3260. hlen = skb_headlen(from);
  3261. if (!hlen)
  3262. hlen = from->len;
  3263. }
  3264. if (skb_has_frag_list(from))
  3265. hlen = from->len;
  3266. return hlen;
  3267. }
  3268. EXPORT_SYMBOL_GPL(skb_zerocopy_headlen);
  3269. /**
  3270. * skb_zerocopy - Zero copy skb to skb
  3271. * @to: destination buffer
  3272. * @from: source buffer
  3273. * @len: number of bytes to copy from source buffer
  3274. * @hlen: size of linear headroom in destination buffer
  3275. *
  3276. * Copies up to `len` bytes from `from` to `to` by creating references
  3277. * to the frags in the source buffer.
  3278. *
  3279. * The `hlen` as calculated by skb_zerocopy_headlen() specifies the
  3280. * headroom in the `to` buffer.
  3281. *
  3282. * Return value:
  3283. * 0: everything is OK
  3284. * -ENOMEM: couldn't orphan frags of @from due to lack of memory
  3285. * -EFAULT: skb_copy_bits() found some problem with skb geometry
  3286. */
  3287. int
  3288. skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen)
  3289. {
  3290. int i, j = 0;
  3291. int plen = 0; /* length of skb->head fragment */
  3292. int ret;
  3293. struct page *page;
  3294. unsigned int offset;
  3295. BUG_ON(!from->head_frag && !hlen);
  3296. /* dont bother with small payloads */
  3297. if (len <= skb_tailroom(to))
  3298. return skb_copy_bits(from, 0, skb_put(to, len), len);
  3299. if (hlen) {
  3300. ret = skb_copy_bits(from, 0, skb_put(to, hlen), hlen);
  3301. if (unlikely(ret))
  3302. return ret;
  3303. len -= hlen;
  3304. } else {
  3305. plen = min_t(int, skb_headlen(from), len);
  3306. if (plen) {
  3307. page = virt_to_head_page(from->head);
  3308. offset = from->data - (unsigned char *)page_address(page);
  3309. __skb_fill_netmem_desc(to, 0, page_to_netmem(page),
  3310. offset, plen);
  3311. get_page(page);
  3312. j = 1;
  3313. len -= plen;
  3314. }
  3315. }
  3316. skb_len_add(to, len + plen);
  3317. if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) {
  3318. skb_tx_error(from);
  3319. return -ENOMEM;
  3320. }
  3321. skb_zerocopy_clone(to, from, GFP_ATOMIC);
  3322. for (i = 0; i < skb_shinfo(from)->nr_frags; i++) {
  3323. int size;
  3324. if (!len)
  3325. break;
  3326. skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i];
  3327. size = min_t(int, skb_frag_size(&skb_shinfo(to)->frags[j]),
  3328. len);
  3329. skb_frag_size_set(&skb_shinfo(to)->frags[j], size);
  3330. len -= size;
  3331. skb_frag_ref(to, j);
  3332. j++;
  3333. }
  3334. skb_shinfo(to)->nr_frags = j;
  3335. return 0;
  3336. }
  3337. EXPORT_SYMBOL_GPL(skb_zerocopy);
  3338. void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to)
  3339. {
  3340. __wsum csum;
  3341. long csstart;
  3342. if (skb->ip_summed == CHECKSUM_PARTIAL)
  3343. csstart = skb_checksum_start_offset(skb);
  3344. else
  3345. csstart = skb_headlen(skb);
  3346. BUG_ON(csstart > skb_headlen(skb));
  3347. skb_copy_from_linear_data(skb, to, csstart);
  3348. csum = 0;
  3349. if (csstart != skb->len)
  3350. csum = skb_copy_and_csum_bits(skb, csstart, to + csstart,
  3351. skb->len - csstart);
  3352. if (skb->ip_summed == CHECKSUM_PARTIAL) {
  3353. long csstuff = csstart + skb->csum_offset;
  3354. *((__sum16 *)(to + csstuff)) = csum_fold(csum);
  3355. }
  3356. }
  3357. EXPORT_SYMBOL(skb_copy_and_csum_dev);
  3358. /**
  3359. * skb_dequeue - remove from the head of the queue
  3360. * @list: list to dequeue from
  3361. *
  3362. * Remove the head of the list. The list lock is taken so the function
  3363. * may be used safely with other locking list functions. The head item is
  3364. * returned or %NULL if the list is empty.
  3365. */
  3366. struct sk_buff *skb_dequeue(struct sk_buff_head *list)
  3367. {
  3368. unsigned long flags;
  3369. struct sk_buff *result;
  3370. spin_lock_irqsave(&list->lock, flags);
  3371. result = __skb_dequeue(list);
  3372. spin_unlock_irqrestore(&list->lock, flags);
  3373. return result;
  3374. }
  3375. EXPORT_SYMBOL(skb_dequeue);
  3376. /**
  3377. * skb_dequeue_tail - remove from the tail of the queue
  3378. * @list: list to dequeue from
  3379. *
  3380. * Remove the tail of the list. The list lock is taken so the function
  3381. * may be used safely with other locking list functions. The tail item is
  3382. * returned or %NULL if the list is empty.
  3383. */
  3384. struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list)
  3385. {
  3386. unsigned long flags;
  3387. struct sk_buff *result;
  3388. spin_lock_irqsave(&list->lock, flags);
  3389. result = __skb_dequeue_tail(list);
  3390. spin_unlock_irqrestore(&list->lock, flags);
  3391. return result;
  3392. }
  3393. EXPORT_SYMBOL(skb_dequeue_tail);
  3394. /**
  3395. * skb_queue_purge_reason - empty a list
  3396. * @list: list to empty
  3397. * @reason: drop reason
  3398. *
  3399. * Delete all buffers on an &sk_buff list. Each buffer is removed from
  3400. * the list and one reference dropped. This function takes the list
  3401. * lock and is atomic with respect to other list locking functions.
  3402. */
  3403. void skb_queue_purge_reason(struct sk_buff_head *list,
  3404. enum skb_drop_reason reason)
  3405. {
  3406. struct sk_buff_head tmp;
  3407. unsigned long flags;
  3408. if (skb_queue_empty_lockless(list))
  3409. return;
  3410. __skb_queue_head_init(&tmp);
  3411. spin_lock_irqsave(&list->lock, flags);
  3412. skb_queue_splice_init(list, &tmp);
  3413. spin_unlock_irqrestore(&list->lock, flags);
  3414. __skb_queue_purge_reason(&tmp, reason);
  3415. }
  3416. EXPORT_SYMBOL(skb_queue_purge_reason);
  3417. /**
  3418. * skb_rbtree_purge - empty a skb rbtree
  3419. * @root: root of the rbtree to empty
  3420. * Return value: the sum of truesizes of all purged skbs.
  3421. *
  3422. * Delete all buffers on an &sk_buff rbtree. Each buffer is removed from
  3423. * the list and one reference dropped. This function does not take
  3424. * any lock. Synchronization should be handled by the caller (e.g., TCP
  3425. * out-of-order queue is protected by the socket lock).
  3426. */
  3427. unsigned int skb_rbtree_purge(struct rb_root *root)
  3428. {
  3429. struct rb_node *p = rb_first(root);
  3430. unsigned int sum = 0;
  3431. while (p) {
  3432. struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);
  3433. p = rb_next(p);
  3434. rb_erase(&skb->rbnode, root);
  3435. sum += skb->truesize;
  3436. kfree_skb(skb);
  3437. }
  3438. return sum;
  3439. }
  3440. void skb_errqueue_purge(struct sk_buff_head *list)
  3441. {
  3442. struct sk_buff *skb, *next;
  3443. struct sk_buff_head kill;
  3444. unsigned long flags;
  3445. __skb_queue_head_init(&kill);
  3446. spin_lock_irqsave(&list->lock, flags);
  3447. skb_queue_walk_safe(list, skb, next) {
  3448. if (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ZEROCOPY ||
  3449. SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING)
  3450. continue;
  3451. __skb_unlink(skb, list);
  3452. __skb_queue_tail(&kill, skb);
  3453. }
  3454. spin_unlock_irqrestore(&list->lock, flags);
  3455. __skb_queue_purge(&kill);
  3456. }
  3457. EXPORT_SYMBOL(skb_errqueue_purge);
  3458. /**
  3459. * skb_queue_head - queue a buffer at the list head
  3460. * @list: list to use
  3461. * @newsk: buffer to queue
  3462. *
  3463. * Queue a buffer at the start of the list. This function takes the
  3464. * list lock and can be used safely with other locking &sk_buff functions
  3465. * safely.
  3466. *
  3467. * A buffer cannot be placed on two lists at the same time.
  3468. */
  3469. void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk)
  3470. {
  3471. unsigned long flags;
  3472. spin_lock_irqsave(&list->lock, flags);
  3473. __skb_queue_head(list, newsk);
  3474. spin_unlock_irqrestore(&list->lock, flags);
  3475. }
  3476. EXPORT_SYMBOL(skb_queue_head);
  3477. /**
  3478. * skb_queue_tail - queue a buffer at the list tail
  3479. * @list: list to use
  3480. * @newsk: buffer to queue
  3481. *
  3482. * Queue a buffer at the tail of the list. This function takes the
  3483. * list lock and can be used safely with other locking &sk_buff functions
  3484. * safely.
  3485. *
  3486. * A buffer cannot be placed on two lists at the same time.
  3487. */
  3488. void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk)
  3489. {
  3490. unsigned long flags;
  3491. spin_lock_irqsave(&list->lock, flags);
  3492. __skb_queue_tail(list, newsk);
  3493. spin_unlock_irqrestore(&list->lock, flags);
  3494. }
  3495. EXPORT_SYMBOL(skb_queue_tail);
  3496. /**
  3497. * skb_unlink - remove a buffer from a list
  3498. * @skb: buffer to remove
  3499. * @list: list to use
  3500. *
  3501. * Remove a packet from a list. The list locks are taken and this
  3502. * function is atomic with respect to other list locked calls
  3503. *
  3504. * You must know what list the SKB is on.
  3505. */
  3506. void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list)
  3507. {
  3508. unsigned long flags;
  3509. spin_lock_irqsave(&list->lock, flags);
  3510. __skb_unlink(skb, list);
  3511. spin_unlock_irqrestore(&list->lock, flags);
  3512. }
  3513. EXPORT_SYMBOL(skb_unlink);
  3514. /**
  3515. * skb_append - append a buffer
  3516. * @old: buffer to insert after
  3517. * @newsk: buffer to insert
  3518. * @list: list to use
  3519. *
  3520. * Place a packet after a given packet in a list. The list locks are taken
  3521. * and this function is atomic with respect to other list locked calls.
  3522. * A buffer cannot be placed on two lists at the same time.
  3523. */
  3524. void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
  3525. {
  3526. unsigned long flags;
  3527. spin_lock_irqsave(&list->lock, flags);
  3528. __skb_queue_after(list, old, newsk);
  3529. spin_unlock_irqrestore(&list->lock, flags);
  3530. }
  3531. EXPORT_SYMBOL(skb_append);
  3532. static inline void skb_split_inside_header(struct sk_buff *skb,
  3533. struct sk_buff* skb1,
  3534. const u32 len, const int pos)
  3535. {
  3536. int i;
  3537. skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, pos - len),
  3538. pos - len);
  3539. /* And move data appendix as is. */
  3540. for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
  3541. skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i];
  3542. skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;
  3543. skb1->unreadable = skb->unreadable;
  3544. skb_shinfo(skb)->nr_frags = 0;
  3545. skb1->data_len = skb->data_len;
  3546. skb1->len += skb1->data_len;
  3547. skb->data_len = 0;
  3548. skb->len = len;
  3549. skb_set_tail_pointer(skb, len);
  3550. }
  3551. static inline void skb_split_no_header(struct sk_buff *skb,
  3552. struct sk_buff* skb1,
  3553. const u32 len, int pos)
  3554. {
  3555. int i, k = 0;
  3556. const int nfrags = skb_shinfo(skb)->nr_frags;
  3557. skb_shinfo(skb)->nr_frags = 0;
  3558. skb1->len = skb1->data_len = skb->len - len;
  3559. skb->len = len;
  3560. skb->data_len = len - pos;
  3561. for (i = 0; i < nfrags; i++) {
  3562. int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
  3563. if (pos + size > len) {
  3564. skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i];
  3565. if (pos < len) {
  3566. /* Split frag.
  3567. * We have two variants in this case:
  3568. * 1. Move all the frag to the second
  3569. * part, if it is possible. F.e.
  3570. * this approach is mandatory for TUX,
  3571. * where splitting is expensive.
  3572. * 2. Split is accurately. We make this.
  3573. */
  3574. skb_frag_ref(skb, i);
  3575. skb_frag_off_add(&skb_shinfo(skb1)->frags[0], len - pos);
  3576. skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos);
  3577. skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos);
  3578. skb_shinfo(skb)->nr_frags++;
  3579. }
  3580. k++;
  3581. } else
  3582. skb_shinfo(skb)->nr_frags++;
  3583. pos += size;
  3584. }
  3585. skb_shinfo(skb1)->nr_frags = k;
  3586. skb1->unreadable = skb->unreadable;
  3587. }
  3588. /**
  3589. * skb_split - Split fragmented skb to two parts at length len.
  3590. * @skb: the buffer to split
  3591. * @skb1: the buffer to receive the second part
  3592. * @len: new length for skb
  3593. */
  3594. void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
  3595. {
  3596. int pos = skb_headlen(skb);
  3597. const int zc_flags = SKBFL_SHARED_FRAG | SKBFL_PURE_ZEROCOPY;
  3598. skb_zcopy_downgrade_managed(skb);
  3599. skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & zc_flags;
  3600. skb_zerocopy_clone(skb1, skb, 0);
  3601. if (len < pos) /* Split line is inside header. */
  3602. skb_split_inside_header(skb, skb1, len, pos);
  3603. else /* Second chunk has no header, nothing to copy. */
  3604. skb_split_no_header(skb, skb1, len, pos);
  3605. }
  3606. EXPORT_SYMBOL(skb_split);
  3607. /* Shifting from/to a cloned skb is a no-go.
  3608. *
  3609. * Caller cannot keep skb_shinfo related pointers past calling here!
  3610. */
  3611. static int skb_prepare_for_shift(struct sk_buff *skb)
  3612. {
  3613. return skb_unclone_keeptruesize(skb, GFP_ATOMIC);
  3614. }
/**
 * skb_shift - Shifts paged data partially from skb to another
 * @tgt: buffer into which tail data gets added
 * @skb: buffer from which the paged data comes from
 * @shiftlen: shift up to this many bytes
 *
 * Attempts to shift up to shiftlen worth of bytes, which may be less than
 * the length of the skb, from skb to tgt. Returns the number of bytes
 * shifted, which is either @shiftlen or 0 when nothing could be shifted.
 * It's up to caller to free skb if everything was shifted.
 *
 * If @tgt runs out of frags, the whole operation is aborted.
 *
 * Skb cannot include anything else but paged data while tgt is allowed
 * to have non-paged data as well.
 *
 * TODO: full sized shift could be optimized but that would need
 * specialized skb free'er to handle frags without up-to-date nr_frags.
 */
int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
{
	int from, to, merge, todo;
	skb_frag_t *fragfrom, *fragto;

	BUG_ON(shiftlen > skb->len);

	/* Only paged data is shifted; refuse a source with a linear part. */
	if (skb_headlen(skb))
		return 0;
	/* Nothing is shifted when either side is a zerocopy skb. */
	if (skb_zcopy(tgt) || skb_zcopy(skb))
		return 0;

	DEBUG_NET_WARN_ON_ONCE(tgt->pp_recycle != skb->pp_recycle);
	DEBUG_NET_WARN_ON_ONCE(skb_cmp_decrypted(tgt, skb));

	/* todo: bytes still to move; from/to: next frag index in skb/tgt. */
	todo = shiftlen;
	from = 0;
	to = skb_shinfo(tgt)->nr_frags;
	fragfrom = &skb_shinfo(skb)->frags[from];

	/* Actual merge is delayed until the point when we know we can
	 * commit all, so that we don't have to undo partial changes
	 */
	if (!skb_can_coalesce(tgt, to, skb_frag_page(fragfrom),
			      skb_frag_off(fragfrom))) {
		/* First source frag cannot be appended to tgt's last frag. */
		merge = -1;
	} else {
		/* tgt's last frag will absorb the first source frag. */
		merge = to - 1;

		todo -= skb_frag_size(fragfrom);
		if (todo < 0) {
			/* The shift ends inside the first source frag: only
			 * the sizes/offset of the two coalescing frags need
			 * adjusting, no frag slot moves.
			 */
			if (skb_prepare_for_shift(skb) ||
			    skb_prepare_for_shift(tgt))
				return 0;

			/* All previous frag pointers might be stale! */
			fragfrom = &skb_shinfo(skb)->frags[from];
			fragto = &skb_shinfo(tgt)->frags[merge];

			skb_frag_size_add(fragto, shiftlen);
			skb_frag_size_sub(fragfrom, shiftlen);
			skb_frag_off_add(fragfrom, shiftlen);

			goto onlymerged;
		}

		from++;
	}

	/* Skip full, not-fitting skb to avoid expensive operations */
	if ((shiftlen == skb->len) &&
	    (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to))
		return 0;

	if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt))
		return 0;

	while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) {
		/* tgt has no free frag slot left; tgt->nr_frags has not been
		 * updated yet, so nothing written so far is visible.
		 */
		if (to == MAX_SKB_FRAGS)
			return 0;

		fragfrom = &skb_shinfo(skb)->frags[from];
		fragto = &skb_shinfo(tgt)->frags[to];

		if (todo >= skb_frag_size(fragfrom)) {
			/* Whole frag moves: copy the descriptor as-is, no
			 * extra reference is taken here.
			 */
			*fragto = *fragfrom;
			todo -= skb_frag_size(fragfrom);
			from++;
			to++;

		} else {
			/* Frag is split between both skbs, so take an
			 * additional reference for tgt's descriptor.
			 */
			__skb_frag_ref(fragfrom);
			skb_frag_page_copy(fragto, fragfrom);
			skb_frag_off_copy(fragto, fragfrom);
			skb_frag_size_set(fragto, todo);

			skb_frag_off_add(fragfrom, todo);
			skb_frag_size_sub(fragfrom, todo);
			todo = 0;

			to++;
			break;
		}
	}

	/* Ready to "commit" this state change to tgt */
	skb_shinfo(tgt)->nr_frags = to;

	if (merge >= 0) {
		/* Perform the delayed merge of the first source frag into
		 * tgt's frag[merge], then drop the source's reference.
		 */
		fragfrom = &skb_shinfo(skb)->frags[0];
		fragto = &skb_shinfo(tgt)->frags[merge];

		skb_frag_size_add(fragto, skb_frag_size(fragfrom));
		__skb_frag_unref(fragfrom, skb->pp_recycle);
	}

	/* Reposition in the original skb */
	to = 0;
	while (from < skb_shinfo(skb)->nr_frags)
		skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++];
	skb_shinfo(skb)->nr_frags = to;

	BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags);

onlymerged:
	/* Most likely the tgt won't ever need its checksum anymore, skb on
	 * the other hand might need it if it needs to be resent
	 */
	tgt->ip_summed = CHECKSUM_PARTIAL;
	skb->ip_summed = CHECKSUM_PARTIAL;

	/* Account the moved bytes on both skbs. */
	skb_len_add(skb, -shiftlen);
	skb_len_add(tgt, shiftlen);

	return shiftlen;
}
  3723. /**
  3724. * skb_prepare_seq_read - Prepare a sequential read of skb data
  3725. * @skb: the buffer to read
  3726. * @from: lower offset of data to be read
  3727. * @to: upper offset of data to be read
  3728. * @st: state variable
  3729. *
  3730. * Initializes the specified state variable. Must be called before
  3731. * invoking skb_seq_read() for the first time.
  3732. */
  3733. void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
  3734. unsigned int to, struct skb_seq_state *st)
  3735. {
  3736. st->lower_offset = from;
  3737. st->upper_offset = to;
  3738. st->root_skb = st->cur_skb = skb;
  3739. st->frag_idx = st->stepped_offset = 0;
  3740. st->frag_data = NULL;
  3741. st->frag_off = 0;
  3742. }
  3743. EXPORT_SYMBOL(skb_prepare_seq_read);
/**
 * skb_seq_read - Sequentially read skb data
 * @consumed: number of bytes consumed by the caller so far
 * @data: destination pointer for data to be returned
 * @st: state variable
 *
 * Reads a block of skb data at @consumed relative to the
 * lower offset specified to skb_prepare_seq_read(). Assigns
 * the head of the data block to @data and returns the length
 * of the block or 0 if the end of the skb data or the upper
 * offset has been reached.
 *
 * The caller is not required to consume all of the data
 * returned, i.e. @consumed is typically set to the number
 * of bytes already consumed and the next call to
 * skb_seq_read() will return the remaining part of the block.
 *
 * Note 1: The size of each block of data returned can be arbitrary,
 *       this limitation is the cost for zerocopy sequential
 *       reads of potentially non linear data.
 *
 * Note 2: Fragment lists within fragments are not implemented
 *       at the moment, state->root_skb could be replaced with
 *       a stack for this purpose.
 *
 * Note 3: 0 is also returned when the current skb's fragments are
 *       reported as not readable by skb_frags_readable().
 */
unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
			  struct skb_seq_state *st)
{
	unsigned int block_limit, abs_offset = consumed + st->lower_offset;
	skb_frag_t *frag;

	/* Past the requested window: release any mapping and stop. */
	if (unlikely(abs_offset >= st->upper_offset)) {
		if (st->frag_data) {
			kunmap_atomic(st->frag_data);
			st->frag_data = NULL;
		}
		return 0;
	}

next_skb:
	/* End of the linear area of cur_skb, as an absolute offset. */
	block_limit = skb_headlen(st->cur_skb) + st->stepped_offset;

	/* Position is still inside the linear area (and no frag is mapped). */
	if (abs_offset < block_limit && !st->frag_data) {
		*data = st->cur_skb->data + (abs_offset - st->stepped_offset);
		return block_limit - abs_offset;
	}

	if (!skb_frags_readable(st->cur_skb))
		return 0;

	/* Step over the linear area once, before the first frag is visited. */
	if (st->frag_idx == 0 && !st->frag_data)
		st->stepped_offset += skb_headlen(st->cur_skb);

	while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) {
		unsigned int pg_idx, pg_off, pg_sz;

		frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx];

		/* Default: the whole frag is one block starting at its offset. */
		pg_idx = 0;
		pg_off = skb_frag_off(frag);
		pg_sz = skb_frag_size(frag);

		/* When the frag has to be walked page by page, narrow the
		 * block to the single page holding the current position
		 * (st->frag_off bytes into the frag).
		 */
		if (skb_frag_must_loop(skb_frag_page(frag))) {
			pg_idx = (pg_off + st->frag_off) >> PAGE_SHIFT;
			pg_off = offset_in_page(pg_off + st->frag_off);
			pg_sz = min_t(unsigned int, pg_sz - st->frag_off,
				      PAGE_SIZE - pg_off);
		}

		block_limit = pg_sz + st->stepped_offset;
		if (abs_offset < block_limit) {
			/* Map on first use; the mapping stays in st until the
			 * position moves past this block.
			 */
			if (!st->frag_data)
				st->frag_data = kmap_atomic(skb_frag_page(frag) + pg_idx);

			*data = (u8 *)st->frag_data + pg_off +
				(abs_offset - st->stepped_offset);

			return block_limit - abs_offset;
		}

		/* Block fully consumed: drop its mapping before moving on. */
		if (st->frag_data) {
			kunmap_atomic(st->frag_data);
			st->frag_data = NULL;
		}

		st->stepped_offset += pg_sz;
		st->frag_off += pg_sz;
		/* Advance to the next frag once this one is exhausted. */
		if (st->frag_off == skb_frag_size(frag)) {
			st->frag_off = 0;
			st->frag_idx++;
		}
	}

	if (st->frag_data) {
		kunmap_atomic(st->frag_data);
		st->frag_data = NULL;
	}

	/* Continue with the root's frag_list, then along ->next. */
	if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) {
		st->cur_skb = skb_shinfo(st->root_skb)->frag_list;
		st->frag_idx = 0;
		goto next_skb;
	} else if (st->cur_skb->next) {
		st->cur_skb = st->cur_skb->next;
		st->frag_idx = 0;
		goto next_skb;
	}

	return 0;
}
EXPORT_SYMBOL(skb_seq_read);
  3838. /**
  3839. * skb_abort_seq_read - Abort a sequential read of skb data
  3840. * @st: state variable
  3841. *
  3842. * Must be called if skb_seq_read() was not called until it
  3843. * returned 0.
  3844. */
  3845. void skb_abort_seq_read(struct skb_seq_state *st)
  3846. {
  3847. if (st->frag_data)
  3848. kunmap_atomic(st->frag_data);
  3849. }
  3850. EXPORT_SYMBOL(skb_abort_seq_read);
  3851. /**
  3852. * skb_copy_seq_read() - copy from a skb_seq_state to a buffer
  3853. * @st: source skb_seq_state
  3854. * @offset: offset in source
  3855. * @to: destination buffer
  3856. * @len: number of bytes to copy
  3857. *
  3858. * Copy @len bytes from @offset bytes into the source @st to the destination
  3859. * buffer @to. `offset` should increase (or be unchanged) with each subsequent
  3860. * call to this function. If offset needs to decrease from the previous use `st`
  3861. * should be reset first.
  3862. *
  3863. * Return: 0 on success or -EINVAL if the copy ended early
  3864. */
  3865. int skb_copy_seq_read(struct skb_seq_state *st, int offset, void *to, int len)
  3866. {
  3867. const u8 *data;
  3868. u32 sqlen;
  3869. for (;;) {
  3870. sqlen = skb_seq_read(offset, &data, st);
  3871. if (sqlen == 0)
  3872. return -EINVAL;
  3873. if (sqlen >= len) {
  3874. memcpy(to, data, len);
  3875. return 0;
  3876. }
  3877. memcpy(to, data, sqlen);
  3878. to += sqlen;
  3879. offset += sqlen;
  3880. len -= sqlen;
  3881. }
  3882. }
  3883. EXPORT_SYMBOL(skb_copy_seq_read);
  3884. #define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb))
  3885. static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text,
  3886. struct ts_config *conf,
  3887. struct ts_state *state)
  3888. {
  3889. return skb_seq_read(offset, text, TS_SKB_CB(state));
  3890. }
  3891. static void skb_ts_finish(struct ts_config *conf, struct ts_state *state)
  3892. {
  3893. skb_abort_seq_read(TS_SKB_CB(state));
  3894. }
  3895. /**
  3896. * skb_find_text - Find a text pattern in skb data
  3897. * @skb: the buffer to look in
  3898. * @from: search offset
  3899. * @to: search limit
  3900. * @config: textsearch configuration
  3901. *
  3902. * Finds a pattern in the skb data according to the specified
  3903. * textsearch configuration. Use textsearch_next() to retrieve
  3904. * subsequent occurrences of the pattern. Returns the offset
  3905. * to the first occurrence or UINT_MAX if no match was found.
  3906. */
  3907. unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
  3908. unsigned int to, struct ts_config *config)
  3909. {
  3910. unsigned int patlen = config->ops->get_pattern_len(config);
  3911. struct ts_state state;
  3912. unsigned int ret;
  3913. BUILD_BUG_ON(sizeof(struct skb_seq_state) > sizeof(state.cb));
  3914. config->get_next_block = skb_ts_get_next_block;
  3915. config->finish = skb_ts_finish;
  3916. skb_prepare_seq_read(skb, from, to, TS_SKB_CB(&state));
  3917. ret = textsearch_find(config, &state);
  3918. return (ret + patlen <= to - from ? ret : UINT_MAX);
  3919. }
  3920. EXPORT_SYMBOL(skb_find_text);
  3921. int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
  3922. int offset, size_t size, size_t max_frags)
  3923. {
  3924. int i = skb_shinfo(skb)->nr_frags;
  3925. if (skb_can_coalesce(skb, i, page, offset)) {
  3926. skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size);
  3927. } else if (i < max_frags) {
  3928. skb_zcopy_downgrade_managed(skb);
  3929. get_page(page);
  3930. skb_fill_page_desc_noacc(skb, i, page, offset, size);
  3931. } else {
  3932. return -EMSGSIZE;
  3933. }
  3934. return 0;
  3935. }
  3936. EXPORT_SYMBOL_GPL(skb_append_pagefrags);
  3937. /**
  3938. * skb_pull_rcsum - pull skb and update receive checksum
  3939. * @skb: buffer to update
  3940. * @len: length of data pulled
  3941. *
  3942. * This function performs an skb_pull on the packet and updates
  3943. * the CHECKSUM_COMPLETE checksum. It should be used on
  3944. * receive path processing instead of skb_pull unless you know
  3945. * that the checksum difference is zero (e.g., a valid IP header)
  3946. * or you are setting ip_summed to CHECKSUM_NONE.
  3947. */
  3948. void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len)
  3949. {
  3950. unsigned char *data = skb->data;
  3951. BUG_ON(len > skb->len);
  3952. __skb_pull(skb, len);
  3953. skb_postpull_rcsum(skb, data, len);
  3954. return skb->data;
  3955. }
  3956. EXPORT_SYMBOL_GPL(skb_pull_rcsum);
  3957. static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb)
  3958. {
  3959. skb_frag_t head_frag;
  3960. struct page *page;
  3961. page = virt_to_head_page(frag_skb->head);
  3962. skb_frag_fill_page_desc(&head_frag, page, frag_skb->data -
  3963. (unsigned char *)page_address(page),
  3964. skb_headlen(frag_skb));
  3965. return head_frag;
  3966. }
/**
 * skb_segment_list - turn the frag_list members of an skb into segments
 * @skb: head buffer whose frag_list is to be split up
 * @features: device features, used to decide whether to linearize
 * @offset: header length, relative to the network header, that is pushed
 *          on every segment and copied from @skb into it
 *
 * Detaches every skb on @skb's frag_list and chains it after @skb via
 * ->next, copying @skb's header state and leading header bytes into each.
 * @skb's len/data_len are reduced by the length of the detached members.
 *
 * Return: @skb (with an extra reference taken) as head of the segment
 * list, or ERR_PTR(-ENOMEM) on failure, in which case everything chained
 * after @skb has been freed.
 */
struct sk_buff *skb_segment_list(struct sk_buff *skb,
				 netdev_features_t features,
				 unsigned int offset)
{
	struct sk_buff *list_skb = skb_shinfo(skb)->frag_list;
	unsigned int tnl_hlen = skb_tnl_header_len(skb);
	unsigned int delta_len = 0;
	struct sk_buff *tail = NULL;
	struct sk_buff *nskb, *tmp;
	int len_diff, err;

	/* Only skb_gro_receive_list generated skbs arrive here */
	DEBUG_NET_WARN_ON_ONCE(!(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST));

	skb_push(skb, -skb_network_offset(skb) + offset);

	/* Ensure the head is writeable before touching the shared info */
	err = skb_unclone(skb, GFP_ATOMIC);
	if (err)
		goto err_linearize;

	skb_shinfo(skb)->frag_list = NULL;

	while (list_skb) {
		nskb = list_skb;
		list_skb = list_skb->next;

		DEBUG_NET_WARN_ON_ONCE(nskb->sk);

		err = 0;
		/* A shared member is replaced by a private clone; our
		 * reference on the shared one is dropped.
		 */
		if (skb_shared(nskb)) {
			tmp = skb_clone(nskb, GFP_ATOMIC);
			if (tmp) {
				consume_skb(nskb);
				nskb = tmp;
				err = skb_unclone(nskb, GFP_ATOMIC);
			} else {
				err = -ENOMEM;
			}
		}

		/* Chain the member after the head / previous segment. */
		if (!tail)
			skb->next = nskb;
		else
			tail->next = nskb;

		if (unlikely(err)) {
			/* Re-attach the unprocessed remainder so the error
			 * path frees the whole chain starting at skb->next.
			 */
			nskb->next = list_skb;
			goto err_linearize;
		}

		tail = nskb;

		delta_len += nskb->len;

		skb_push(nskb, -skb_network_offset(nskb) + offset);

		skb_release_head_state(nskb);
		/* Remember the difference in network header length, which is
		 * added to transport_header after the head's header state
		 * has been copied over.
		 */
		len_diff = skb_network_header_len(nskb) - skb_network_header_len(skb);
		__copy_skb_header(nskb, skb);

		skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb));
		nskb->transport_header += len_diff;
		/* Copy tnl_hlen bytes in front of ->data plus @offset bytes of
		 * headers from the head skb into this segment.
		 */
		skb_copy_from_linear_data_offset(skb, -tnl_hlen,
						 nskb->data - tnl_hlen,
						 offset + tnl_hlen);

		if (skb_needs_linearize(nskb, features) &&
		    __skb_linearize(nskb))
			goto err_linearize;
	}

	/* The detached members no longer count towards the head skb. */
	skb->data_len = skb->data_len - delta_len;
	skb->len = skb->len - delta_len;

	skb_gso_reset(skb);

	/* Last segment is recorded in ->prev of the head. */
	skb->prev = tail;

	if (skb_needs_linearize(skb, features) &&
	    __skb_linearize(skb))
		goto err_linearize;

	/* NOTE(review): the extra reference presumably balances the caller
	 * releasing the original skb after segmentation - confirm.
	 */
	skb_get(skb);

	return skb;

err_linearize:
	kfree_skb_list(skb->next);
	skb->next = NULL;
	return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL_GPL(skb_segment_list);
/**
 *	skb_segment - Perform protocol segmentation on skb.
 *	@head_skb: buffer to segment
 *	@features: features for the output path (see dev->features)
 *
 *	This function performs segmentation on the given skb.  It returns
 *	a pointer to the first in a list of new skbs for the segments.
 *	In case of error it returns ERR_PTR(err).
 *
 *	The last segment is stored in ->prev of the first one. On error all
 *	segments built so far are freed.
 */
struct sk_buff *skb_segment(struct sk_buff *head_skb,
			    netdev_features_t features)
{
	struct sk_buff *segs = NULL;
	struct sk_buff *tail = NULL;
	struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list;
	unsigned int mss = skb_shinfo(head_skb)->gso_size;
	/* Bytes between the mac header and ->data: headers copied to each segment. */
	unsigned int doffset = head_skb->data - skb_mac_header(head_skb);
	/* Start of the current segment's payload, measured after the push below. */
	unsigned int offset = doffset;
	unsigned int tnl_hlen = skb_tnl_header_len(head_skb);
	unsigned int partial_segs = 0;
	unsigned int headroom;
	unsigned int len = head_skb->len;
	struct sk_buff *frag_skb;
	skb_frag_t *frag;
	__be16 proto;
	bool csum, sg;
	int err = -ENOMEM;
	int i = 0;
	int nfrags, pos;

	if ((skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY) &&
	    mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb)) {
		struct sk_buff *check_skb;

		for (check_skb = list_skb; check_skb; check_skb = check_skb->next) {
			if (skb_headlen(check_skb) && !check_skb->head_frag) {
				/* gso_size is untrusted, and we have a frag_list with
				 * a linear non head_frag item.
				 *
				 * If head_skb's headlen does not fit requested gso_size,
				 * it means that the frag_list members do NOT terminate
				 * on exact gso_size boundaries. Hence we cannot perform
				 * skb_frag_t page sharing. Therefore we must fallback to
				 * copying the frag_list skbs; we do so by disabling SG.
				 */
				features &= ~NETIF_F_SG;
				break;
			}
		}
	}

	/* Expose the headers again so they are part of the linear data. */
	__skb_push(head_skb, doffset);
	proto = skb_network_protocol(head_skb, NULL);
	if (unlikely(!proto))
		return ERR_PTR(-EINVAL);

	sg = !!(features & NETIF_F_SG);
	csum = !!can_checksum_protocol(features, proto);

	if (sg && csum && (mss != GSO_BY_FRAGS))  {
		if (!(features & NETIF_F_GSO_PARTIAL)) {
			struct sk_buff *iter;
			unsigned int frag_len;

			if (!list_skb ||
			    !net_gso_ok(features, skb_shinfo(head_skb)->gso_type))
				goto normal;

			/* If we get here then all the required
			 * GSO features except frag_list are supported.
			 * Try to split the SKB to multiple GSO SKBs
			 * with no frag_list.
			 * Currently we can do that only when the buffers don't
			 * have a linear part and all the buffers except
			 * the last are of the same length.
			 */
			frag_len = list_skb->len;
			skb_walk_frags(head_skb, iter) {
				if (frag_len != iter->len && iter->next)
					goto normal;
				if (skb_headlen(iter) && !iter->head_frag)
					goto normal;

				len -= iter->len;
			}

			if (len != frag_len)
				goto normal;
		}

		/* GSO partial only requires that we trim off any excess that
		 * doesn't fit into an MSS sized block, so take care of that
		 * now.
		 * Cap len to not accidentally hit GSO_BY_FRAGS.
		 */
		partial_segs = min(len, GSO_BY_FRAGS - 1) / mss;
		if (partial_segs > 1)
			mss *= partial_segs;
		else
			partial_segs = 0;
	}

normal:
	headroom = skb_headroom(head_skb);
	/* pos: offset up to which data has been accounted for so far. */
	pos = skb_headlen(head_skb);

	if (skb_orphan_frags(head_skb, GFP_ATOMIC))
		return ERR_PTR(-ENOMEM);

	/* i/frag/nfrags iterate the frags of frag_skb, the skb currently
	 * supplying page frags (head_skb first, frag_list members later).
	 */
	nfrags = skb_shinfo(head_skb)->nr_frags;
	frag = skb_shinfo(head_skb)->frags;
	frag_skb = head_skb;

	/* One iteration builds one segment (nskb) of up to len payload bytes. */
	do {
		struct sk_buff *nskb;
		skb_frag_t *nskb_frag;
		int hsize;
		int size;

		if (unlikely(mss == GSO_BY_FRAGS)) {
			len = list_skb->len;
		} else {
			len = head_skb->len - offset;
			if (len > mss)
				len = mss;
		}

		/* Linear bytes of head_skb still available for this segment. */
		hsize = skb_headlen(head_skb) - offset;

		if (hsize <= 0 && i >= nfrags && skb_headlen(list_skb) &&
		    (skb_headlen(list_skb) == len || sg)) {
			/* head_skb's linear data and the current frags are
			 * used up and the next frag_list member has linear
			 * data: build the segment from a clone of it.
			 */
			BUG_ON(skb_headlen(list_skb) > len);

			nskb = skb_clone(list_skb, GFP_ATOMIC);
			if (unlikely(!nskb))
				goto err;

			i = 0;
			nfrags = skb_shinfo(list_skb)->nr_frags;
			frag = skb_shinfo(list_skb)->frags;
			frag_skb = list_skb;
			pos += skb_headlen(list_skb);

			/* Skip the frags that fit entirely in this segment. */
			while (pos < offset + len) {
				BUG_ON(i >= nfrags);

				size = skb_frag_size(frag);
				if (pos + size > offset + len)
					break;

				i++;
				pos += size;
				frag++;
			}

			list_skb = list_skb->next;

			if (unlikely(pskb_trim(nskb, len))) {
				kfree_skb(nskb);
				goto err;
			}

			/* Account any growth of the head caused by
			 * skb_cow_head() in truesize.
			 */
			hsize = skb_end_offset(nskb);
			if (skb_cow_head(nskb, doffset + headroom)) {
				kfree_skb(nskb);
				goto err;
			}

			nskb->truesize += skb_end_offset(nskb) - hsize;
			skb_release_head_state(nskb);
			__skb_push(nskb, doffset);
		} else {
			/* Allocate a fresh skb; its linear area holds the
			 * headers plus hsize payload bytes (the whole payload
			 * when SG is not available).
			 */
			if (hsize < 0)
				hsize = 0;
			if (hsize > len || !sg)
				hsize = len;

			nskb = __alloc_skb(hsize + doffset + headroom,
					   GFP_ATOMIC, skb_alloc_rx_flag(head_skb),
					   NUMA_NO_NODE);

			if (unlikely(!nskb))
				goto err;

			skb_reserve(nskb, headroom);
			__skb_put(nskb, doffset);
		}

		/* Append the new segment to the result list. */
		if (segs)
			tail->next = nskb;
		else
			segs = nskb;
		tail = nskb;

		__copy_skb_header(nskb, head_skb);

		skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom);
		skb_reset_mac_len(nskb);

		/* Copy the headers (and tnl_hlen bytes in front of them). */
		skb_copy_from_linear_data_offset(head_skb, -tnl_hlen,
						 nskb->data - tnl_hlen,
						 doffset + tnl_hlen);

		/* Segment already carries all of its payload. */
		if (nskb->len == len + doffset)
			goto perform_csum_check;

		if (!sg) {
			/* No scatter-gather: copy the payload into the linear
			 * area, checksumming on the fly when needed.
			 */
			if (!csum) {
				if (!nskb->remcsum_offload)
					nskb->ip_summed = CHECKSUM_NONE;
				SKB_GSO_CB(nskb)->csum =
					skb_copy_and_csum_bits(head_skb, offset,
							       skb_put(nskb,
								       len),
							       len);
				SKB_GSO_CB(nskb)->csum_start =
					skb_headroom(nskb) + doffset;
			} else {
				if (skb_copy_bits(head_skb, offset, skb_put(nskb, len), len))
					goto err;
			}
			continue;
		}

		nskb_frag = skb_shinfo(nskb)->frags;

		skb_copy_from_linear_data_offset(head_skb, offset,
						 skb_put(nskb, hsize), hsize);

		skb_shinfo(nskb)->flags |= skb_shinfo(head_skb)->flags &
					   SKBFL_SHARED_FRAG;

		if (skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC))
			goto err;

		/* Share page frags until the segment holds len payload bytes. */
		while (pos < offset + len) {
			if (i >= nfrags) {
				/* Current source is exhausted: continue with
				 * the next frag_list member.
				 */
				if (skb_orphan_frags(list_skb, GFP_ATOMIC) ||
				    skb_zerocopy_clone(nskb, list_skb,
						       GFP_ATOMIC))
					goto err;

				i = 0;
				nfrags = skb_shinfo(list_skb)->nr_frags;
				frag = skb_shinfo(list_skb)->frags;
				frag_skb = list_skb;
				if (!skb_headlen(list_skb)) {
					BUG_ON(!nfrags);
				} else {
					BUG_ON(!list_skb->head_frag);

					/* to make room for head_frag. */
					i--;
					frag--;
				}

				list_skb = list_skb->next;
			}

			if (unlikely(skb_shinfo(nskb)->nr_frags >=
				     MAX_SKB_FRAGS)) {
				net_warn_ratelimited(
					"skb_segment: too many frags: %u %u\n",
					pos, mss);
				err = -EINVAL;
				goto err;
			}

			/* i < 0 selects the linear head of frag_skb, turned
			 * into a page descriptor.
			 */
			*nskb_frag = (i < 0) ? skb_head_frag_to_page_desc(frag_skb) : *frag;
			__skb_frag_ref(nskb_frag);
			size = skb_frag_size(nskb_frag);

			/* Frag starts before this segment: skip the part that
			 * belongs to the previous one.
			 */
			if (pos < offset) {
				skb_frag_off_add(nskb_frag, offset - pos);
				skb_frag_size_sub(nskb_frag, offset - pos);
			}

			skb_shinfo(nskb)->nr_frags++;

			if (pos + size <= offset + len) {
				i++;
				frag++;
				pos += size;
			} else {
				/* Frag extends past the segment: trim the
				 * excess and stay on it for the next segment.
				 */
				skb_frag_size_sub(nskb_frag, pos + size - (offset + len));
				goto skip_fraglist;
			}

			nskb_frag++;
		}

skip_fraglist:
		nskb->data_len = len - hsize;
		nskb->len += nskb->data_len;
		nskb->truesize += nskb->data_len;

perform_csum_check:
		if (!csum) {
			/* Checksum is computed here in software; shared frags
			 * are linearized first.
			 */
			if (skb_has_shared_frag(nskb) &&
			    __skb_linearize(nskb))
				goto err;

			if (!nskb->remcsum_offload)
				nskb->ip_summed = CHECKSUM_NONE;
			SKB_GSO_CB(nskb)->csum =
				skb_checksum(nskb, doffset,
					     nskb->len - doffset, 0);
			SKB_GSO_CB(nskb)->csum_start =
				skb_headroom(nskb) + doffset;
		}
	} while ((offset += len) < head_skb->len);

	/* Some callers want to get the end of the list.
	 * Put it in segs->prev to avoid walking the list.
	 * (see validate_xmit_skb_list() for example)
	 */
	segs->prev = tail;

	if (partial_segs) {
		struct sk_buff *iter;
		int type = skb_shinfo(head_skb)->gso_type;
		unsigned short gso_size = skb_shinfo(head_skb)->gso_size;

		/* Update type to add partial and then remove dodgy if set */
		type |= (features & NETIF_F_GSO_PARTIAL) / NETIF_F_GSO_PARTIAL * SKB_GSO_PARTIAL;
		type &= ~SKB_GSO_DODGY;

		/* Update GSO info and prepare to start updating headers on
		 * our way back down the stack of protocols.
		 */
		for (iter = segs; iter; iter = iter->next) {
			skb_shinfo(iter)->gso_size = gso_size;
			skb_shinfo(iter)->gso_segs = partial_segs;
			skb_shinfo(iter)->gso_type = type;
			SKB_GSO_CB(iter)->data_offset = skb_headroom(iter) + doffset;
		}

		/* The tail may be shorter than the others: clear gso_size if
		 * it holds at most one gso_size worth of payload, otherwise
		 * recompute its segment count.
		 */
		if (tail->len - doffset <= gso_size)
			skb_shinfo(tail)->gso_size = 0;
		else if (tail != segs)
			skb_shinfo(tail)->gso_segs = DIV_ROUND_UP(tail->len - doffset, gso_size);
	}

	/* Following permits correct backpressure, for protocols
	 * using skb_set_owner_w().
	 * Idea is to transfer ownership from head_skb to last segment.
	 */
	if (head_skb->destructor == sock_wfree) {
		swap(tail->truesize, head_skb->truesize);
		swap(tail->destructor, head_skb->destructor);
		swap(tail->sk, head_skb->sk);
	}
	return segs;

err:
	kfree_skb_list(segs);
	return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(skb_segment);
#ifdef CONFIG_SKB_EXTENSIONS
/* Extension sizes are expressed in chunks of SKB_EXT_ALIGN_VALUE bytes. */
#define SKB_EXT_ALIGN_VALUE	8
/* Size of @x rounded up to, and expressed in, whole chunks. */
#define SKB_EXT_CHUNKSIZEOF(x)	(ALIGN((sizeof(x)), SKB_EXT_ALIGN_VALUE) / SKB_EXT_ALIGN_VALUE)

/* Size in chunks of each extension type, indexed by extension id. Only the
 * extensions enabled in the kernel configuration get an entry.
 */
static const u8 skb_ext_type_len[] = {
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
	[SKB_EXT_BRIDGE_NF] = SKB_EXT_CHUNKSIZEOF(struct nf_bridge_info),
#endif
#ifdef CONFIG_XFRM
	[SKB_EXT_SEC_PATH] = SKB_EXT_CHUNKSIZEOF(struct sec_path),
#endif
#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
	[TC_SKB_EXT] = SKB_EXT_CHUNKSIZEOF(struct tc_skb_ext),
#endif
#if IS_ENABLED(CONFIG_MPTCP)
	[SKB_EXT_MPTCP] = SKB_EXT_CHUNKSIZEOF(struct mptcp_ext),
#endif
#if IS_ENABLED(CONFIG_MCTP_FLOWS)
	[SKB_EXT_MCTP] = SKB_EXT_CHUNKSIZEOF(struct mctp_flow),
#endif
#if IS_ENABLED(CONFIG_INET_PSP)
	[SKB_EXT_PSP] = SKB_EXT_CHUNKSIZEOF(struct psp_skb_ext),
#endif
#if IS_ENABLED(CONFIG_CAN)
	[SKB_EXT_CAN] = SKB_EXT_CHUNKSIZEOF(struct can_skb_ext),
#endif
};

/* Total size, in chunks, of the struct skb_ext header plus every entry of
 * skb_ext_type_len[]. Used inside BUILD_BUG_ON() below, so it has to be
 * evaluated at compile time; keep the loop simple.
 */
static __always_inline unsigned int skb_ext_total_length(void)
{
	unsigned int l = SKB_EXT_CHUNKSIZEOF(struct skb_ext);
	int i;

	for (i = 0; i < ARRAY_SIZE(skb_ext_type_len); i++)
		l += skb_ext_type_len[i];

	return l;
}

/* Create the slab cache for skb extensions, sized for the header plus all
 * enabled extension types.
 */
static void skb_extensions_init(void)
{
	/* NOTE(review): presumably the set of active extensions is tracked
	 * in an 8-bit mask - confirm against struct sk_buff.
	 */
	BUILD_BUG_ON(SKB_EXT_NUM > 8);
#if !IS_ENABLED(CONFIG_KCOV_INSTRUMENT_ALL)
	/* NOTE(review): presumably chunk offsets/lengths are stored in u8
	 * fields of struct skb_ext - confirm. The check is compiled out when
	 * KCOV instruments everything.
	 */
	BUILD_BUG_ON(skb_ext_total_length() > 255);
#endif

	skbuff_ext_cache = kmem_cache_create("skbuff_ext_cache",
					     SKB_EXT_ALIGN_VALUE * skb_ext_total_length(),
					     0,
					     SLAB_HWCACHE_ALIGN|SLAB_PANIC,
					     NULL);
}
#else
/* Nothing to set up when skb extensions are not configured. */
static void skb_extensions_init(void) {}
#endif
/* The SKB kmem_cache slab is critical for network performance.  Never
 * merge/alias the slab with similar sized objects.  This avoids fragmentation
 * that hurts performance of kmem_cache_{alloc,free}_bulk APIs.
 */
#ifndef CONFIG_SLUB_TINY
#define FLAG_SKB_NO_MERGE	SLAB_NO_MERGE
#else /* CONFIG_SLUB_TINY - simple loop in kmem_cache_alloc_bulk */
#define FLAG_SKB_NO_MERGE	0
#endif

/* Boot-time setup of the skb slab caches: sk_buff heads, fclone pairs,
 * small data heads and (if configured) skb extensions. All caches are
 * created with SLAB_PANIC.
 */
void __init skb_init(void)
{
	/* sk_buff heads; only the cb[] area is whitelisted for usercopy. */
	net_hotdata.skbuff_cache = kmem_cache_create_usercopy("skbuff_head_cache",
					      sizeof(struct sk_buff),
					      0,
					      SLAB_HWCACHE_ALIGN|SLAB_PANIC|
						FLAG_SKB_NO_MERGE,
					      offsetof(struct sk_buff, cb),
					      sizeof_field(struct sk_buff, cb),
					      NULL);
	/* Remember the object size the slab layer actually reports. */
	skbuff_cache_size = kmem_cache_size(net_hotdata.skbuff_cache);

	/* struct sk_buff_fclones objects. */
	net_hotdata.skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
						sizeof(struct sk_buff_fclones),
						0,
						SLAB_HWCACHE_ALIGN|SLAB_PANIC,
						NULL);
	/* usercopy should only access first SKB_SMALL_HEAD_HEADROOM bytes.
	 * struct skb_shared_info is located at the end of skb->head,
	 * and should not be copied to/from user.
	 */
	net_hotdata.skb_small_head_cache = kmem_cache_create_usercopy("skbuff_small_head",
						SKB_SMALL_HEAD_CACHE_SIZE,
						0,
						SLAB_HWCACHE_ALIGN | SLAB_PANIC,
						0,
						SKB_SMALL_HEAD_HEADROOM,
						NULL);
	skb_extensions_init();
}
  4425. static int
  4426. __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len,
  4427. unsigned int recursion_level)
  4428. {
  4429. int start = skb_headlen(skb);
  4430. int i, copy = start - offset;
  4431. struct sk_buff *frag_iter;
  4432. int elt = 0;
  4433. if (unlikely(recursion_level >= 24))
  4434. return -EMSGSIZE;
  4435. if (copy > 0) {
  4436. if (copy > len)
  4437. copy = len;
  4438. sg_set_buf(sg, skb->data + offset, copy);
  4439. elt++;
  4440. if ((len -= copy) == 0)
  4441. return elt;
  4442. offset += copy;
  4443. }
  4444. for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
  4445. int end;
  4446. WARN_ON(start > offset + len);
  4447. end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
  4448. if ((copy = end - offset) > 0) {
  4449. skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
  4450. if (unlikely(elt && sg_is_last(&sg[elt - 1])))
  4451. return -EMSGSIZE;
  4452. if (copy > len)
  4453. copy = len;
  4454. sg_set_page(&sg[elt], skb_frag_page(frag), copy,
  4455. skb_frag_off(frag) + offset - start);
  4456. elt++;
  4457. if (!(len -= copy))
  4458. return elt;
  4459. offset += copy;
  4460. }
  4461. start = end;
  4462. }
  4463. skb_walk_frags(skb, frag_iter) {
  4464. int end, ret;
  4465. WARN_ON(start > offset + len);
  4466. end = start + frag_iter->len;
  4467. if ((copy = end - offset) > 0) {
  4468. if (unlikely(elt && sg_is_last(&sg[elt - 1])))
  4469. return -EMSGSIZE;
  4470. if (copy > len)
  4471. copy = len;
  4472. ret = __skb_to_sgvec(frag_iter, sg+elt, offset - start,
  4473. copy, recursion_level + 1);
  4474. if (unlikely(ret < 0))
  4475. return ret;
  4476. elt += ret;
  4477. if ((len -= copy) == 0)
  4478. return elt;
  4479. offset += copy;
  4480. }
  4481. start = end;
  4482. }
  4483. BUG_ON(len);
  4484. return elt;
  4485. }
  4486. /**
  4487. * skb_to_sgvec - Fill a scatter-gather list from a socket buffer
  4488. * @skb: Socket buffer containing the buffers to be mapped
  4489. * @sg: The scatter-gather list to map into
  4490. * @offset: The offset into the buffer's contents to start mapping
  4491. * @len: Length of buffer space to be mapped
  4492. *
  4493. * Fill the specified scatter-gather list with mappings/pointers into a
  4494. * region of the buffer space attached to a socket buffer. Returns either
  4495. * the number of scatterlist items used, or -EMSGSIZE if the contents
  4496. * could not fit.
  4497. */
  4498. int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
  4499. {
  4500. int nsg = __skb_to_sgvec(skb, sg, offset, len, 0);
  4501. if (nsg <= 0)
  4502. return nsg;
  4503. sg_mark_end(&sg[nsg - 1]);
  4504. return nsg;
  4505. }
  4506. EXPORT_SYMBOL_GPL(skb_to_sgvec);
  4507. /* As compared with skb_to_sgvec, skb_to_sgvec_nomark only map skb to given
  4508. * sglist without mark the sg which contain last skb data as the end.
  4509. * So the caller can mannipulate sg list as will when padding new data after
  4510. * the first call without calling sg_unmark_end to expend sg list.
  4511. *
  4512. * Scenario to use skb_to_sgvec_nomark:
  4513. * 1. sg_init_table
  4514. * 2. skb_to_sgvec_nomark(payload1)
  4515. * 3. skb_to_sgvec_nomark(payload2)
  4516. *
  4517. * This is equivalent to:
  4518. * 1. sg_init_table
  4519. * 2. skb_to_sgvec(payload1)
  4520. * 3. sg_unmark_end
  4521. * 4. skb_to_sgvec(payload2)
  4522. *
  4523. * When mapping multiple payload conditionally, skb_to_sgvec_nomark
  4524. * is more preferable.
  4525. */
  4526. int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg,
  4527. int offset, int len)
  4528. {
  4529. return __skb_to_sgvec(skb, sg, offset, len, 0);
  4530. }
  4531. EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark);
  4532. /**
  4533. * skb_cow_data - Check that a socket buffer's data buffers are writable
  4534. * @skb: The socket buffer to check.
  4535. * @tailbits: Amount of trailing space to be added
  4536. * @trailer: Returned pointer to the skb where the @tailbits space begins
  4537. *
  4538. * Make sure that the data buffers attached to a socket buffer are
  4539. * writable. If they are not, private copies are made of the data buffers
  4540. * and the socket buffer is set to use these instead.
  4541. *
  4542. * If @tailbits is given, make sure that there is space to write @tailbits
  4543. * bytes of data beyond current end of socket buffer. @trailer will be
  4544. * set to point to the skb in which this space begins.
  4545. *
  4546. * The number of scatterlist elements required to completely map the
  4547. * COW'd and extended socket buffer will be returned.
  4548. */
  4549. int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
  4550. {
  4551. int copyflag;
  4552. int elt;
  4553. struct sk_buff *skb1, **skb_p;
  4554. /* If skb is cloned or its head is paged, reallocate
  4555. * head pulling out all the pages (pages are considered not writable
  4556. * at the moment even if they are anonymous).
  4557. */
  4558. if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) &&
  4559. !__pskb_pull_tail(skb, __skb_pagelen(skb)))
  4560. return -ENOMEM;
  4561. /* Easy case. Most of packets will go this way. */
  4562. if (!skb_has_frag_list(skb)) {
  4563. /* A little of trouble, not enough of space for trailer.
  4564. * This should not happen, when stack is tuned to generate
  4565. * good frames. OK, on miss we reallocate and reserve even more
  4566. * space, 128 bytes is fair. */
  4567. if (skb_tailroom(skb) < tailbits &&
  4568. pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC))
  4569. return -ENOMEM;
  4570. /* Voila! */
  4571. *trailer = skb;
  4572. return 1;
  4573. }
  4574. /* Misery. We are in troubles, going to mincer fragments... */
  4575. elt = 1;
  4576. skb_p = &skb_shinfo(skb)->frag_list;
  4577. copyflag = 0;
  4578. while ((skb1 = *skb_p) != NULL) {
  4579. int ntail = 0;
  4580. /* The fragment is partially pulled by someone,
  4581. * this can happen on input. Copy it and everything
  4582. * after it. */
  4583. if (skb_shared(skb1))
  4584. copyflag = 1;
  4585. /* If the skb is the last, worry about trailer. */
  4586. if (skb1->next == NULL && tailbits) {
  4587. if (skb_shinfo(skb1)->nr_frags ||
  4588. skb_has_frag_list(skb1) ||
  4589. skb_tailroom(skb1) < tailbits)
  4590. ntail = tailbits + 128;
  4591. }
  4592. if (copyflag ||
  4593. skb_cloned(skb1) ||
  4594. ntail ||
  4595. skb_shinfo(skb1)->nr_frags ||
  4596. skb_has_frag_list(skb1)) {
  4597. struct sk_buff *skb2;
  4598. /* Fuck, we are miserable poor guys... */
  4599. if (ntail == 0)
  4600. skb2 = skb_copy(skb1, GFP_ATOMIC);
  4601. else
  4602. skb2 = skb_copy_expand(skb1,
  4603. skb_headroom(skb1),
  4604. ntail,
  4605. GFP_ATOMIC);
  4606. if (unlikely(skb2 == NULL))
  4607. return -ENOMEM;
  4608. if (skb1->sk)
  4609. skb_set_owner_w(skb2, skb1->sk);
  4610. /* Looking around. Are we still alive?
  4611. * OK, link new skb, drop old one */
  4612. skb2->next = skb1->next;
  4613. *skb_p = skb2;
  4614. kfree_skb(skb1);
  4615. skb1 = skb2;
  4616. }
  4617. elt++;
  4618. *trailer = skb1;
  4619. skb_p = &skb1->next;
  4620. }
  4621. return elt;
  4622. }
  4623. EXPORT_SYMBOL_GPL(skb_cow_data);
  4624. static void sock_rmem_free(struct sk_buff *skb)
  4625. {
  4626. struct sock *sk = skb->sk;
  4627. atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
  4628. }
  4629. static void skb_set_err_queue(struct sk_buff *skb)
  4630. {
  4631. /* pkt_type of skbs received on local sockets is never PACKET_OUTGOING.
  4632. * So, it is safe to (mis)use it to mark skbs on the error queue.
  4633. */
  4634. skb->pkt_type = PACKET_OUTGOING;
  4635. BUILD_BUG_ON(PACKET_OUTGOING == 0);
  4636. }
  4637. /*
  4638. * Note: We dont mem charge error packets (no sk_forward_alloc changes)
  4639. */
  4640. int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
  4641. {
  4642. if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
  4643. (unsigned int)READ_ONCE(sk->sk_rcvbuf))
  4644. return -ENOMEM;
  4645. skb_orphan(skb);
  4646. skb->sk = sk;
  4647. skb->destructor = sock_rmem_free;
  4648. atomic_add(skb->truesize, &sk->sk_rmem_alloc);
  4649. skb_set_err_queue(skb);
  4650. /* before exiting rcu section, make sure dst is refcounted */
  4651. skb_dst_force(skb);
  4652. skb_queue_tail(&sk->sk_error_queue, skb);
  4653. if (!sock_flag(sk, SOCK_DEAD))
  4654. sk_error_report(sk);
  4655. return 0;
  4656. }
  4657. EXPORT_SYMBOL(sock_queue_err_skb);
  4658. static bool is_icmp_err_skb(const struct sk_buff *skb)
  4659. {
  4660. return skb && (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP ||
  4661. SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP6);
  4662. }
  4663. struct sk_buff *sock_dequeue_err_skb(struct sock *sk)
  4664. {
  4665. struct sk_buff_head *q = &sk->sk_error_queue;
  4666. struct sk_buff *skb, *skb_next = NULL;
  4667. bool icmp_next = false;
  4668. unsigned long flags;
  4669. if (skb_queue_empty_lockless(q))
  4670. return NULL;
  4671. spin_lock_irqsave(&q->lock, flags);
  4672. skb = __skb_dequeue(q);
  4673. if (skb && (skb_next = skb_peek(q))) {
  4674. icmp_next = is_icmp_err_skb(skb_next);
  4675. if (icmp_next)
  4676. sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_errno;
  4677. }
  4678. spin_unlock_irqrestore(&q->lock, flags);
  4679. if (is_icmp_err_skb(skb) && !icmp_next)
  4680. sk->sk_err = 0;
  4681. if (skb_next)
  4682. sk_error_report(sk);
  4683. return skb;
  4684. }
  4685. EXPORT_SYMBOL(sock_dequeue_err_skb);
  4686. /**
  4687. * skb_clone_sk - create clone of skb, and take reference to socket
  4688. * @skb: the skb to clone
  4689. *
  4690. * This function creates a clone of a buffer that holds a reference on
  4691. * sk_refcnt. Buffers created via this function are meant to be
  4692. * returned using sock_queue_err_skb, or free via kfree_skb.
  4693. *
  4694. * When passing buffers allocated with this function to sock_queue_err_skb
  4695. * it is necessary to wrap the call with sock_hold/sock_put in order to
  4696. * prevent the socket from being released prior to being enqueued on
  4697. * the sk_error_queue.
  4698. */
  4699. struct sk_buff *skb_clone_sk(struct sk_buff *skb)
  4700. {
  4701. struct sock *sk = skb->sk;
  4702. struct sk_buff *clone;
  4703. if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt))
  4704. return NULL;
  4705. clone = skb_clone(skb, GFP_ATOMIC);
  4706. if (!clone) {
  4707. sock_put(sk);
  4708. return NULL;
  4709. }
  4710. clone->sk = sk;
  4711. clone->destructor = sock_efree;
  4712. return clone;
  4713. }
  4714. EXPORT_SYMBOL(skb_clone_sk);
  4715. static void __skb_complete_tx_timestamp(struct sk_buff *skb,
  4716. struct sock *sk,
  4717. int tstype,
  4718. bool opt_stats)
  4719. {
  4720. struct sock_exterr_skb *serr;
  4721. int err;
  4722. BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb));
  4723. serr = SKB_EXT_ERR(skb);
  4724. memset(serr, 0, sizeof(*serr));
  4725. serr->ee.ee_errno = ENOMSG;
  4726. serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
  4727. serr->ee.ee_info = tstype;
  4728. serr->opt_stats = opt_stats;
  4729. serr->header.h4.iif = skb->dev ? skb->dev->ifindex : 0;
  4730. if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) {
  4731. serr->ee.ee_data = skb_shinfo(skb)->tskey;
  4732. if (sk_is_tcp(sk))
  4733. serr->ee.ee_data -= atomic_read(&sk->sk_tskey);
  4734. }
  4735. err = sock_queue_err_skb(sk, skb);
  4736. if (err)
  4737. kfree_skb(skb);
  4738. }
  4739. static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly)
  4740. {
  4741. struct socket *sock;
  4742. struct file *file;
  4743. bool ret = false;
  4744. if (likely(tsonly || READ_ONCE(sock_net(sk)->core.sysctl_tstamp_allow_data)))
  4745. return true;
  4746. /* The sk pointer remains valid as long as the skb is. The sk_socket and
  4747. * file pointer may become NULL if the socket is closed. Both structures
  4748. * (including file->cred) are RCU freed which means they can be accessed
  4749. * within a RCU read section.
  4750. */
  4751. rcu_read_lock();
  4752. sock = READ_ONCE(sk->sk_socket);
  4753. if (!sock)
  4754. goto out;
  4755. file = READ_ONCE(sock->file);
  4756. if (!file)
  4757. goto out;
  4758. ret = file_ns_capable(file, &init_user_ns, CAP_NET_RAW);
  4759. out:
  4760. rcu_read_unlock();
  4761. return ret;
  4762. }
  4763. void skb_complete_tx_timestamp(struct sk_buff *skb,
  4764. struct skb_shared_hwtstamps *hwtstamps)
  4765. {
  4766. struct sock *sk = skb->sk;
  4767. if (!skb_may_tx_timestamp(sk, false))
  4768. goto err;
  4769. /* Take a reference to prevent skb_orphan() from freeing the socket,
  4770. * but only if the socket refcount is not zero.
  4771. */
  4772. if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) {
  4773. *skb_hwtstamps(skb) = *hwtstamps;
  4774. __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false);
  4775. sock_put(sk);
  4776. return;
  4777. }
  4778. err:
  4779. kfree_skb(skb);
  4780. }
  4781. EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
  4782. static bool skb_tstamp_tx_report_so_timestamping(struct sk_buff *skb,
  4783. struct skb_shared_hwtstamps *hwtstamps,
  4784. int tstype)
  4785. {
  4786. switch (tstype) {
  4787. case SCM_TSTAMP_SCHED:
  4788. return skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP;
  4789. case SCM_TSTAMP_SND:
  4790. return skb_shinfo(skb)->tx_flags & (hwtstamps ? SKBTX_HW_TSTAMP_NOBPF :
  4791. SKBTX_SW_TSTAMP);
  4792. case SCM_TSTAMP_ACK:
  4793. return TCP_SKB_CB(skb)->txstamp_ack & TSTAMP_ACK_SK;
  4794. case SCM_TSTAMP_COMPLETION:
  4795. return skb_shinfo(skb)->tx_flags & SKBTX_COMPLETION_TSTAMP;
  4796. }
  4797. return false;
  4798. }
  4799. static void skb_tstamp_tx_report_bpf_timestamping(struct sk_buff *skb,
  4800. struct skb_shared_hwtstamps *hwtstamps,
  4801. struct sock *sk,
  4802. int tstype)
  4803. {
  4804. int op;
  4805. switch (tstype) {
  4806. case SCM_TSTAMP_SCHED:
  4807. op = BPF_SOCK_OPS_TSTAMP_SCHED_CB;
  4808. break;
  4809. case SCM_TSTAMP_SND:
  4810. if (hwtstamps) {
  4811. op = BPF_SOCK_OPS_TSTAMP_SND_HW_CB;
  4812. *skb_hwtstamps(skb) = *hwtstamps;
  4813. } else {
  4814. op = BPF_SOCK_OPS_TSTAMP_SND_SW_CB;
  4815. }
  4816. break;
  4817. case SCM_TSTAMP_ACK:
  4818. op = BPF_SOCK_OPS_TSTAMP_ACK_CB;
  4819. break;
  4820. default:
  4821. return;
  4822. }
  4823. bpf_skops_tx_timestamping(sk, skb, op);
  4824. }
  4825. void __skb_tstamp_tx(struct sk_buff *orig_skb,
  4826. const struct sk_buff *ack_skb,
  4827. struct skb_shared_hwtstamps *hwtstamps,
  4828. struct sock *sk, int tstype)
  4829. {
  4830. struct sk_buff *skb;
  4831. bool tsonly, opt_stats = false;
  4832. u32 tsflags;
  4833. if (!sk)
  4834. return;
  4835. if (skb_shinfo(orig_skb)->tx_flags & SKBTX_BPF)
  4836. skb_tstamp_tx_report_bpf_timestamping(orig_skb, hwtstamps,
  4837. sk, tstype);
  4838. if (!skb_tstamp_tx_report_so_timestamping(orig_skb, hwtstamps, tstype))
  4839. return;
  4840. tsflags = READ_ONCE(sk->sk_tsflags);
  4841. if (!hwtstamps && !(tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) &&
  4842. skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS)
  4843. return;
  4844. tsonly = tsflags & SOF_TIMESTAMPING_OPT_TSONLY;
  4845. if (!skb_may_tx_timestamp(sk, tsonly))
  4846. return;
  4847. if (tsonly) {
  4848. #ifdef CONFIG_INET
  4849. if ((tsflags & SOF_TIMESTAMPING_OPT_STATS) &&
  4850. sk_is_tcp(sk)) {
  4851. skb = tcp_get_timestamping_opt_stats(sk, orig_skb,
  4852. ack_skb);
  4853. opt_stats = true;
  4854. } else
  4855. #endif
  4856. skb = alloc_skb(0, GFP_ATOMIC);
  4857. } else {
  4858. skb = skb_clone(orig_skb, GFP_ATOMIC);
  4859. if (skb_orphan_frags_rx(skb, GFP_ATOMIC)) {
  4860. kfree_skb(skb);
  4861. return;
  4862. }
  4863. }
  4864. if (!skb)
  4865. return;
  4866. if (tsonly) {
  4867. skb_shinfo(skb)->tx_flags |= skb_shinfo(orig_skb)->tx_flags &
  4868. SKBTX_ANY_TSTAMP;
  4869. skb_shinfo(skb)->tskey = skb_shinfo(orig_skb)->tskey;
  4870. }
  4871. if (hwtstamps)
  4872. *skb_hwtstamps(skb) = *hwtstamps;
  4873. else
  4874. __net_timestamp(skb);
  4875. __skb_complete_tx_timestamp(skb, sk, tstype, opt_stats);
  4876. }
  4877. EXPORT_SYMBOL_GPL(__skb_tstamp_tx);
  4878. void skb_tstamp_tx(struct sk_buff *orig_skb,
  4879. struct skb_shared_hwtstamps *hwtstamps)
  4880. {
  4881. return __skb_tstamp_tx(orig_skb, NULL, hwtstamps, orig_skb->sk,
  4882. SCM_TSTAMP_SND);
  4883. }
  4884. EXPORT_SYMBOL_GPL(skb_tstamp_tx);
  4885. #ifdef CONFIG_WIRELESS
  4886. void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
  4887. {
  4888. struct sock *sk = skb->sk;
  4889. struct sock_exterr_skb *serr;
  4890. int err = 1;
  4891. skb->wifi_acked_valid = 1;
  4892. skb->wifi_acked = acked;
  4893. serr = SKB_EXT_ERR(skb);
  4894. memset(serr, 0, sizeof(*serr));
  4895. serr->ee.ee_errno = ENOMSG;
  4896. serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS;
  4897. /* Take a reference to prevent skb_orphan() from freeing the socket,
  4898. * but only if the socket refcount is not zero.
  4899. */
  4900. if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) {
  4901. err = sock_queue_err_skb(sk, skb);
  4902. sock_put(sk);
  4903. }
  4904. if (err)
  4905. kfree_skb(skb);
  4906. }
  4907. EXPORT_SYMBOL_GPL(skb_complete_wifi_ack);
  4908. #endif /* CONFIG_WIRELESS */
  4909. /**
  4910. * skb_partial_csum_set - set up and verify partial csum values for packet
  4911. * @skb: the skb to set
  4912. * @start: the number of bytes after skb->data to start checksumming.
  4913. * @off: the offset from start to place the checksum.
  4914. *
  4915. * For untrusted partially-checksummed packets, we need to make sure the values
  4916. * for skb->csum_start and skb->csum_offset are valid so we don't oops.
  4917. *
  4918. * This function checks and sets those values and skb->ip_summed: if this
  4919. * returns false you should drop the packet.
  4920. */
  4921. bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off)
  4922. {
  4923. u32 csum_end = (u32)start + (u32)off + sizeof(__sum16);
  4924. u32 csum_start = skb_headroom(skb) + (u32)start;
  4925. if (unlikely(csum_start >= U16_MAX || csum_end > skb_headlen(skb))) {
  4926. net_warn_ratelimited("bad partial csum: csum=%u/%u headroom=%u headlen=%u\n",
  4927. start, off, skb_headroom(skb), skb_headlen(skb));
  4928. return false;
  4929. }
  4930. skb->ip_summed = CHECKSUM_PARTIAL;
  4931. skb->csum_start = csum_start;
  4932. skb->csum_offset = off;
  4933. skb->transport_header = csum_start;
  4934. return true;
  4935. }
  4936. EXPORT_SYMBOL_GPL(skb_partial_csum_set);
  4937. static int skb_maybe_pull_tail(struct sk_buff *skb, unsigned int len,
  4938. unsigned int max)
  4939. {
  4940. if (skb_headlen(skb) >= len)
  4941. return 0;
  4942. /* If we need to pullup then pullup to the max, so we
  4943. * won't need to do it again.
  4944. */
  4945. if (max > skb->len)
  4946. max = skb->len;
  4947. if (__pskb_pull_tail(skb, max - skb_headlen(skb)) == NULL)
  4948. return -ENOMEM;
  4949. if (skb_headlen(skb) < len)
  4950. return -EPROTO;
  4951. return 0;
  4952. }
  4953. #define MAX_TCP_HDR_LEN (15 * 4)
  4954. static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb,
  4955. typeof(IPPROTO_IP) proto,
  4956. unsigned int off)
  4957. {
  4958. int err;
  4959. switch (proto) {
  4960. case IPPROTO_TCP:
  4961. err = skb_maybe_pull_tail(skb, off + sizeof(struct tcphdr),
  4962. off + MAX_TCP_HDR_LEN);
  4963. if (!err && !skb_partial_csum_set(skb, off,
  4964. offsetof(struct tcphdr,
  4965. check)))
  4966. err = -EPROTO;
  4967. return err ? ERR_PTR(err) : &tcp_hdr(skb)->check;
  4968. case IPPROTO_UDP:
  4969. err = skb_maybe_pull_tail(skb, off + sizeof(struct udphdr),
  4970. off + sizeof(struct udphdr));
  4971. if (!err && !skb_partial_csum_set(skb, off,
  4972. offsetof(struct udphdr,
  4973. check)))
  4974. err = -EPROTO;
  4975. return err ? ERR_PTR(err) : &udp_hdr(skb)->check;
  4976. }
  4977. return ERR_PTR(-EPROTO);
  4978. }
  4979. /* This value should be large enough to cover a tagged ethernet header plus
  4980. * maximally sized IP and TCP or UDP headers.
  4981. */
  4982. #define MAX_IP_HDR_LEN 128
  4983. static int skb_checksum_setup_ipv4(struct sk_buff *skb, bool recalculate)
  4984. {
  4985. unsigned int off;
  4986. bool fragment;
  4987. __sum16 *csum;
  4988. int err;
  4989. fragment = false;
  4990. err = skb_maybe_pull_tail(skb,
  4991. sizeof(struct iphdr),
  4992. MAX_IP_HDR_LEN);
  4993. if (err < 0)
  4994. goto out;
  4995. if (ip_is_fragment(ip_hdr(skb)))
  4996. fragment = true;
  4997. off = ip_hdrlen(skb);
  4998. err = -EPROTO;
  4999. if (fragment)
  5000. goto out;
  5001. csum = skb_checksum_setup_ip(skb, ip_hdr(skb)->protocol, off);
  5002. if (IS_ERR(csum))
  5003. return PTR_ERR(csum);
  5004. if (recalculate)
  5005. *csum = ~csum_tcpudp_magic(ip_hdr(skb)->saddr,
  5006. ip_hdr(skb)->daddr,
  5007. skb->len - off,
  5008. ip_hdr(skb)->protocol, 0);
  5009. err = 0;
  5010. out:
  5011. return err;
  5012. }
  5013. /* This value should be large enough to cover a tagged ethernet header plus
  5014. * an IPv6 header, all options, and a maximal TCP or UDP header.
  5015. */
  5016. #define MAX_IPV6_HDR_LEN 256
  5017. #define OPT_HDR(type, skb, off) \
  5018. (type *)(skb_network_header(skb) + (off))
  5019. static int skb_checksum_setup_ipv6(struct sk_buff *skb, bool recalculate)
  5020. {
  5021. int err;
  5022. u8 nexthdr;
  5023. unsigned int off;
  5024. unsigned int len;
  5025. bool fragment;
  5026. bool done;
  5027. __sum16 *csum;
  5028. fragment = false;
  5029. done = false;
  5030. off = sizeof(struct ipv6hdr);
  5031. err = skb_maybe_pull_tail(skb, off, MAX_IPV6_HDR_LEN);
  5032. if (err < 0)
  5033. goto out;
  5034. nexthdr = ipv6_hdr(skb)->nexthdr;
  5035. len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len);
  5036. while (off <= len && !done) {
  5037. switch (nexthdr) {
  5038. case IPPROTO_DSTOPTS:
  5039. case IPPROTO_HOPOPTS:
  5040. case IPPROTO_ROUTING: {
  5041. struct ipv6_opt_hdr *hp;
  5042. err = skb_maybe_pull_tail(skb,
  5043. off +
  5044. sizeof(struct ipv6_opt_hdr),
  5045. MAX_IPV6_HDR_LEN);
  5046. if (err < 0)
  5047. goto out;
  5048. hp = OPT_HDR(struct ipv6_opt_hdr, skb, off);
  5049. nexthdr = hp->nexthdr;
  5050. off += ipv6_optlen(hp);
  5051. break;
  5052. }
  5053. case IPPROTO_AH: {
  5054. struct ip_auth_hdr *hp;
  5055. err = skb_maybe_pull_tail(skb,
  5056. off +
  5057. sizeof(struct ip_auth_hdr),
  5058. MAX_IPV6_HDR_LEN);
  5059. if (err < 0)
  5060. goto out;
  5061. hp = OPT_HDR(struct ip_auth_hdr, skb, off);
  5062. nexthdr = hp->nexthdr;
  5063. off += ipv6_authlen(hp);
  5064. break;
  5065. }
  5066. case IPPROTO_FRAGMENT: {
  5067. struct frag_hdr *hp;
  5068. err = skb_maybe_pull_tail(skb,
  5069. off +
  5070. sizeof(struct frag_hdr),
  5071. MAX_IPV6_HDR_LEN);
  5072. if (err < 0)
  5073. goto out;
  5074. hp = OPT_HDR(struct frag_hdr, skb, off);
  5075. if (hp->frag_off & htons(IP6_OFFSET | IP6_MF))
  5076. fragment = true;
  5077. nexthdr = hp->nexthdr;
  5078. off += sizeof(struct frag_hdr);
  5079. break;
  5080. }
  5081. default:
  5082. done = true;
  5083. break;
  5084. }
  5085. }
  5086. err = -EPROTO;
  5087. if (!done || fragment)
  5088. goto out;
  5089. csum = skb_checksum_setup_ip(skb, nexthdr, off);
  5090. if (IS_ERR(csum))
  5091. return PTR_ERR(csum);
  5092. if (recalculate)
  5093. *csum = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
  5094. &ipv6_hdr(skb)->daddr,
  5095. skb->len - off, nexthdr, 0);
  5096. err = 0;
  5097. out:
  5098. return err;
  5099. }
  5100. /**
  5101. * skb_checksum_setup - set up partial checksum offset
  5102. * @skb: the skb to set up
  5103. * @recalculate: if true the pseudo-header checksum will be recalculated
  5104. */
  5105. int skb_checksum_setup(struct sk_buff *skb, bool recalculate)
  5106. {
  5107. int err;
  5108. switch (skb->protocol) {
  5109. case htons(ETH_P_IP):
  5110. err = skb_checksum_setup_ipv4(skb, recalculate);
  5111. break;
  5112. case htons(ETH_P_IPV6):
  5113. err = skb_checksum_setup_ipv6(skb, recalculate);
  5114. break;
  5115. default:
  5116. err = -EPROTO;
  5117. break;
  5118. }
  5119. return err;
  5120. }
  5121. EXPORT_SYMBOL(skb_checksum_setup);
  5122. /**
  5123. * skb_checksum_maybe_trim - maybe trims the given skb
  5124. * @skb: the skb to check
  5125. * @transport_len: the data length beyond the network header
  5126. *
  5127. * Checks whether the given skb has data beyond the given transport length.
  5128. * If so, returns a cloned skb trimmed to this transport length.
  5129. * Otherwise returns the provided skb. Returns NULL in error cases
  5130. * (e.g. transport_len exceeds skb length or out-of-memory).
  5131. *
  5132. * Caller needs to set the skb transport header and free any returned skb if it
  5133. * differs from the provided skb.
  5134. */
  5135. static struct sk_buff *skb_checksum_maybe_trim(struct sk_buff *skb,
  5136. unsigned int transport_len)
  5137. {
  5138. struct sk_buff *skb_chk;
  5139. unsigned int len = skb_transport_offset(skb) + transport_len;
  5140. int ret;
  5141. if (skb->len < len)
  5142. return NULL;
  5143. else if (skb->len == len)
  5144. return skb;
  5145. skb_chk = skb_clone(skb, GFP_ATOMIC);
  5146. if (!skb_chk)
  5147. return NULL;
  5148. ret = pskb_trim_rcsum(skb_chk, len);
  5149. if (ret) {
  5150. kfree_skb(skb_chk);
  5151. return NULL;
  5152. }
  5153. return skb_chk;
  5154. }
  5155. /**
  5156. * skb_checksum_trimmed - validate checksum of an skb
  5157. * @skb: the skb to check
  5158. * @transport_len: the data length beyond the network header
  5159. * @skb_chkf: checksum function to use
  5160. *
  5161. * Applies the given checksum function skb_chkf to the provided skb.
  5162. * Returns a checked and maybe trimmed skb. Returns NULL on error.
  5163. *
  5164. * If the skb has data beyond the given transport length, then a
  5165. * trimmed & cloned skb is checked and returned.
  5166. *
  5167. * Caller needs to set the skb transport header and free any returned skb if it
  5168. * differs from the provided skb.
  5169. */
  5170. struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb,
  5171. unsigned int transport_len,
  5172. __sum16(*skb_chkf)(struct sk_buff *skb))
  5173. {
  5174. struct sk_buff *skb_chk;
  5175. unsigned int offset = skb_transport_offset(skb);
  5176. __sum16 ret;
  5177. skb_chk = skb_checksum_maybe_trim(skb, transport_len);
  5178. if (!skb_chk)
  5179. goto err;
  5180. if (!pskb_may_pull(skb_chk, offset))
  5181. goto err;
  5182. skb_pull_rcsum(skb_chk, offset);
  5183. ret = skb_chkf(skb_chk);
  5184. skb_push_rcsum(skb_chk, offset);
  5185. if (ret)
  5186. goto err;
  5187. return skb_chk;
  5188. err:
  5189. if (skb_chk && skb_chk != skb)
  5190. kfree_skb(skb_chk);
  5191. return NULL;
  5192. }
  5193. EXPORT_SYMBOL(skb_checksum_trimmed);
  5194. void __skb_warn_lro_forwarding(const struct sk_buff *skb)
  5195. {
  5196. net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n",
  5197. skb->dev->name);
  5198. }
  5199. EXPORT_SYMBOL(__skb_warn_lro_forwarding);
  5200. void kfree_skb_partial(struct sk_buff *skb, bool head_stolen)
  5201. {
  5202. if (head_stolen) {
  5203. skb_release_head_state(skb);
  5204. kmem_cache_free(net_hotdata.skbuff_cache, skb);
  5205. } else {
  5206. __kfree_skb(skb);
  5207. }
  5208. }
  5209. EXPORT_SYMBOL(kfree_skb_partial);
  5210. /**
  5211. * skb_try_coalesce - try to merge skb to prior one
  5212. * @to: prior buffer
  5213. * @from: buffer to add
  5214. * @fragstolen: pointer to boolean
  5215. * @delta_truesize: how much more was allocated than was requested
  5216. */
  5217. bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
  5218. bool *fragstolen, int *delta_truesize)
  5219. {
  5220. struct skb_shared_info *to_shinfo, *from_shinfo;
  5221. int i, delta, len = from->len;
  5222. *fragstolen = false;
  5223. if (skb_cloned(to))
  5224. return false;
  5225. /* In general, avoid mixing page_pool and non-page_pool allocated
  5226. * pages within the same SKB. In theory we could take full
  5227. * references if @from is cloned and !@to->pp_recycle but its
  5228. * tricky (due to potential race with the clone disappearing) and
  5229. * rare, so not worth dealing with.
  5230. */
  5231. if (to->pp_recycle != from->pp_recycle)
  5232. return false;
  5233. if (skb_frags_readable(from) != skb_frags_readable(to))
  5234. return false;
  5235. if (len <= skb_tailroom(to) && skb_frags_readable(from)) {
  5236. if (len)
  5237. BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));
  5238. *delta_truesize = 0;
  5239. return true;
  5240. }
  5241. to_shinfo = skb_shinfo(to);
  5242. from_shinfo = skb_shinfo(from);
  5243. if (to_shinfo->frag_list || from_shinfo->frag_list)
  5244. return false;
  5245. if (skb_zcopy(to) || skb_zcopy(from))
  5246. return false;
  5247. if (skb_headlen(from) != 0) {
  5248. struct page *page;
  5249. unsigned int offset;
  5250. if (to_shinfo->nr_frags +
  5251. from_shinfo->nr_frags >= MAX_SKB_FRAGS)
  5252. return false;
  5253. if (skb_head_is_locked(from))
  5254. return false;
  5255. delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));
  5256. page = virt_to_head_page(from->head);
  5257. offset = from->data - (unsigned char *)page_address(page);
  5258. skb_fill_page_desc(to, to_shinfo->nr_frags,
  5259. page, offset, skb_headlen(from));
  5260. *fragstolen = true;
  5261. } else {
  5262. if (to_shinfo->nr_frags +
  5263. from_shinfo->nr_frags > MAX_SKB_FRAGS)
  5264. return false;
  5265. delta = from->truesize - SKB_TRUESIZE(skb_end_offset(from));
  5266. }
  5267. WARN_ON_ONCE(delta < len);
  5268. memcpy(to_shinfo->frags + to_shinfo->nr_frags,
  5269. from_shinfo->frags,
  5270. from_shinfo->nr_frags * sizeof(skb_frag_t));
  5271. to_shinfo->nr_frags += from_shinfo->nr_frags;
  5272. if (!skb_cloned(from))
  5273. from_shinfo->nr_frags = 0;
  5274. /* if the skb is not cloned this does nothing
  5275. * since we set nr_frags to 0.
  5276. */
  5277. if (skb_pp_frag_ref(from)) {
  5278. for (i = 0; i < from_shinfo->nr_frags; i++)
  5279. __skb_frag_ref(&from_shinfo->frags[i]);
  5280. }
  5281. to->truesize += delta;
  5282. to->len += len;
  5283. to->data_len += len;
  5284. *delta_truesize = delta;
  5285. return true;
  5286. }
  5287. EXPORT_SYMBOL(skb_try_coalesce);
  5288. /**
  5289. * skb_scrub_packet - scrub an skb
  5290. *
  5291. * @skb: buffer to clean
  5292. * @xnet: packet is crossing netns
  5293. *
  5294. * skb_scrub_packet can be used after encapsulating or decapsulating a packet
  5295. * into/from a tunnel. Some information have to be cleared during these
  5296. * operations.
  5297. * skb_scrub_packet can also be used to clean a skb before injecting it in
  5298. * another namespace (@xnet == true). We have to clear all information in the
  5299. * skb that could impact namespace isolation.
  5300. */
  5301. void skb_scrub_packet(struct sk_buff *skb, bool xnet)
  5302. {
  5303. skb->pkt_type = PACKET_HOST;
  5304. skb->skb_iif = 0;
  5305. skb->ignore_df = 0;
  5306. skb_dst_drop(skb);
  5307. skb_ext_reset(skb);
  5308. nf_reset_ct(skb);
  5309. nf_reset_trace(skb);
  5310. #ifdef CONFIG_NET_SWITCHDEV
  5311. skb->offload_fwd_mark = 0;
  5312. skb->offload_l3_fwd_mark = 0;
  5313. #endif
  5314. ipvs_reset(skb);
  5315. if (!xnet)
  5316. return;
  5317. skb->mark = 0;
  5318. skb_clear_tstamp(skb);
  5319. }
  5320. EXPORT_SYMBOL_GPL(skb_scrub_packet);
  5321. static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb)
  5322. {
  5323. int mac_len, meta_len;
  5324. void *meta;
  5325. if (skb_cow(skb, skb_headroom(skb)) < 0) {
  5326. kfree_skb(skb);
  5327. return NULL;
  5328. }
  5329. mac_len = skb->data - skb_mac_header(skb);
  5330. if (likely(mac_len > VLAN_HLEN + ETH_TLEN)) {
  5331. memmove(skb_mac_header(skb) + VLAN_HLEN, skb_mac_header(skb),
  5332. mac_len - VLAN_HLEN - ETH_TLEN);
  5333. }
  5334. meta_len = skb_metadata_len(skb);
  5335. if (meta_len) {
  5336. meta = skb_metadata_end(skb) - meta_len;
  5337. memmove(meta + VLAN_HLEN, meta, meta_len);
  5338. }
  5339. skb->mac_header += VLAN_HLEN;
  5340. return skb;
  5341. }
  5342. struct sk_buff *skb_vlan_untag(struct sk_buff *skb)
  5343. {
  5344. struct vlan_hdr *vhdr;
  5345. u16 vlan_tci;
  5346. if (unlikely(skb_vlan_tag_present(skb))) {
  5347. /* vlan_tci is already set-up so leave this for another time */
  5348. return skb;
  5349. }
  5350. skb = skb_share_check(skb, GFP_ATOMIC);
  5351. if (unlikely(!skb))
  5352. goto err_free;
  5353. /* We may access the two bytes after vlan_hdr in vlan_set_encap_proto(). */
  5354. if (unlikely(!pskb_may_pull(skb, VLAN_HLEN + sizeof(unsigned short))))
  5355. goto err_free;
  5356. vhdr = (struct vlan_hdr *)skb->data;
  5357. vlan_tci = ntohs(vhdr->h_vlan_TCI);
  5358. __vlan_hwaccel_put_tag(skb, skb->protocol, vlan_tci);
  5359. skb_pull_rcsum(skb, VLAN_HLEN);
  5360. vlan_set_encap_proto(skb, vhdr);
  5361. skb = skb_reorder_vlan_header(skb);
  5362. if (unlikely(!skb))
  5363. goto err_free;
  5364. skb_reset_network_header(skb);
  5365. if (!skb_transport_header_was_set(skb))
  5366. skb_reset_transport_header(skb);
  5367. skb_reset_mac_len(skb);
  5368. return skb;
  5369. err_free:
  5370. kfree_skb(skb);
  5371. return NULL;
  5372. }
  5373. EXPORT_SYMBOL(skb_vlan_untag);
  5374. int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len)
  5375. {
  5376. if (!pskb_may_pull(skb, write_len))
  5377. return -ENOMEM;
  5378. if (!skb_cloned(skb) || skb_clone_writable(skb, write_len))
  5379. return 0;
  5380. return pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
  5381. }
  5382. EXPORT_SYMBOL(skb_ensure_writable);
  5383. int skb_ensure_writable_head_tail(struct sk_buff *skb, struct net_device *dev)
  5384. {
  5385. int needed_headroom = dev->needed_headroom;
  5386. int needed_tailroom = dev->needed_tailroom;
  5387. /* For tail taggers, we need to pad short frames ourselves, to ensure
  5388. * that the tail tag does not fail at its role of being at the end of
  5389. * the packet, once the conduit interface pads the frame. Account for
  5390. * that pad length here, and pad later.
  5391. */
  5392. if (unlikely(needed_tailroom && skb->len < ETH_ZLEN))
  5393. needed_tailroom += ETH_ZLEN - skb->len;
  5394. /* skb_headroom() returns unsigned int... */
  5395. needed_headroom = max_t(int, needed_headroom - skb_headroom(skb), 0);
  5396. needed_tailroom = max_t(int, needed_tailroom - skb_tailroom(skb), 0);
  5397. if (likely(!needed_headroom && !needed_tailroom && !skb_cloned(skb)))
  5398. /* No reallocation needed, yay! */
  5399. return 0;
  5400. return pskb_expand_head(skb, needed_headroom, needed_tailroom,
  5401. GFP_ATOMIC);
  5402. }
  5403. EXPORT_SYMBOL(skb_ensure_writable_head_tail);
  5404. /* remove VLAN header from packet and update csum accordingly.
  5405. * expects a non skb_vlan_tag_present skb with a vlan tag payload
  5406. */
  5407. int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci)
  5408. {
  5409. int offset = skb->data - skb_mac_header(skb);
  5410. int err;
  5411. if (WARN_ONCE(offset,
  5412. "__skb_vlan_pop got skb with skb->data not at mac header (offset %d)\n",
  5413. offset)) {
  5414. return -EINVAL;
  5415. }
  5416. err = skb_ensure_writable(skb, VLAN_ETH_HLEN);
  5417. if (unlikely(err))
  5418. return err;
  5419. skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
  5420. vlan_remove_tag(skb, vlan_tci);
  5421. skb->mac_header += VLAN_HLEN;
  5422. if (skb_network_offset(skb) < ETH_HLEN)
  5423. skb_set_network_header(skb, ETH_HLEN);
  5424. skb_reset_mac_len(skb);
  5425. return err;
  5426. }
  5427. EXPORT_SYMBOL(__skb_vlan_pop);
  5428. /* Pop a vlan tag either from hwaccel or from payload.
  5429. * Expects skb->data at mac header.
  5430. */
  5431. int skb_vlan_pop(struct sk_buff *skb)
  5432. {
  5433. u16 vlan_tci;
  5434. __be16 vlan_proto;
  5435. int err;
  5436. if (likely(skb_vlan_tag_present(skb))) {
  5437. __vlan_hwaccel_clear_tag(skb);
  5438. } else {
  5439. if (unlikely(!eth_type_vlan(skb->protocol)))
  5440. return 0;
  5441. err = __skb_vlan_pop(skb, &vlan_tci);
  5442. if (err)
  5443. return err;
  5444. }
  5445. /* move next vlan tag to hw accel tag */
  5446. if (likely(!eth_type_vlan(skb->protocol)))
  5447. return 0;
  5448. vlan_proto = skb->protocol;
  5449. err = __skb_vlan_pop(skb, &vlan_tci);
  5450. if (unlikely(err))
  5451. return err;
  5452. __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
  5453. return 0;
  5454. }
  5455. EXPORT_SYMBOL(skb_vlan_pop);
  5456. /* Push a vlan tag either into hwaccel or into payload (if hwaccel tag present).
  5457. * Expects skb->data at mac header.
  5458. */
  5459. int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
  5460. {
  5461. if (skb_vlan_tag_present(skb)) {
  5462. int offset = skb->data - skb_mac_header(skb);
  5463. int err;
  5464. if (WARN_ONCE(offset,
  5465. "skb_vlan_push got skb with skb->data not at mac header (offset %d)\n",
  5466. offset)) {
  5467. return -EINVAL;
  5468. }
  5469. err = __vlan_insert_tag(skb, skb->vlan_proto,
  5470. skb_vlan_tag_get(skb));
  5471. if (err)
  5472. return err;
  5473. skb->protocol = skb->vlan_proto;
  5474. skb->network_header -= VLAN_HLEN;
  5475. skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
  5476. }
  5477. __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
  5478. return 0;
  5479. }
  5480. EXPORT_SYMBOL(skb_vlan_push);
  5481. /**
  5482. * skb_eth_pop() - Drop the Ethernet header at the head of a packet
  5483. *
  5484. * @skb: Socket buffer to modify
  5485. *
  5486. * Drop the Ethernet header of @skb.
  5487. *
  5488. * Expects that skb->data points to the mac header and that no VLAN tags are
  5489. * present.
  5490. *
  5491. * Returns 0 on success, -errno otherwise.
  5492. */
  5493. int skb_eth_pop(struct sk_buff *skb)
  5494. {
  5495. if (!pskb_may_pull(skb, ETH_HLEN) || skb_vlan_tagged(skb) ||
  5496. skb_network_offset(skb) < ETH_HLEN)
  5497. return -EPROTO;
  5498. skb_pull_rcsum(skb, ETH_HLEN);
  5499. skb_reset_mac_header(skb);
  5500. skb_reset_mac_len(skb);
  5501. return 0;
  5502. }
  5503. EXPORT_SYMBOL(skb_eth_pop);
  5504. /**
  5505. * skb_eth_push() - Add a new Ethernet header at the head of a packet
  5506. *
  5507. * @skb: Socket buffer to modify
  5508. * @dst: Destination MAC address of the new header
  5509. * @src: Source MAC address of the new header
  5510. *
  5511. * Prepend @skb with a new Ethernet header.
  5512. *
  5513. * Expects that skb->data points to the mac header, which must be empty.
  5514. *
  5515. * Returns 0 on success, -errno otherwise.
  5516. */
  5517. int skb_eth_push(struct sk_buff *skb, const unsigned char *dst,
  5518. const unsigned char *src)
  5519. {
  5520. struct ethhdr *eth;
  5521. int err;
  5522. if (skb_network_offset(skb) || skb_vlan_tag_present(skb))
  5523. return -EPROTO;
  5524. err = skb_cow_head(skb, sizeof(*eth));
  5525. if (err < 0)
  5526. return err;
  5527. skb_push(skb, sizeof(*eth));
  5528. skb_reset_mac_header(skb);
  5529. skb_reset_mac_len(skb);
  5530. eth = eth_hdr(skb);
  5531. ether_addr_copy(eth->h_dest, dst);
  5532. ether_addr_copy(eth->h_source, src);
  5533. eth->h_proto = skb->protocol;
  5534. skb_postpush_rcsum(skb, eth, sizeof(*eth));
  5535. return 0;
  5536. }
  5537. EXPORT_SYMBOL(skb_eth_push);
  5538. /* Update the ethertype of hdr and the skb csum value if required. */
  5539. static void skb_mod_eth_type(struct sk_buff *skb, struct ethhdr *hdr,
  5540. __be16 ethertype)
  5541. {
  5542. if (skb->ip_summed == CHECKSUM_COMPLETE) {
  5543. __be16 diff[] = { ~hdr->h_proto, ethertype };
  5544. skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum);
  5545. }
  5546. hdr->h_proto = ethertype;
  5547. }
  5548. /**
  5549. * skb_mpls_push() - push a new MPLS header after mac_len bytes from start of
  5550. * the packet
  5551. *
  5552. * @skb: buffer
  5553. * @mpls_lse: MPLS label stack entry to push
  5554. * @mpls_proto: ethertype of the new MPLS header (expects 0x8847 or 0x8848)
  5555. * @mac_len: length of the MAC header
  5556. * @ethernet: flag to indicate if the resulting packet after skb_mpls_push is
  5557. * ethernet
  5558. *
  5559. * Expects skb->data at mac header.
  5560. *
  5561. * Returns 0 on success, -errno otherwise.
  5562. */
  5563. int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto,
  5564. int mac_len, bool ethernet)
  5565. {
  5566. struct mpls_shim_hdr *lse;
  5567. int err;
  5568. if (unlikely(!eth_p_mpls(mpls_proto)))
  5569. return -EINVAL;
  5570. /* Networking stack does not allow simultaneous Tunnel and MPLS GSO. */
  5571. if (skb->encapsulation)
  5572. return -EINVAL;
  5573. err = skb_cow_head(skb, MPLS_HLEN);
  5574. if (unlikely(err))
  5575. return err;
  5576. if (!skb->inner_protocol) {
  5577. skb_set_inner_network_header(skb, skb_network_offset(skb));
  5578. skb_set_inner_protocol(skb, skb->protocol);
  5579. }
  5580. skb_push(skb, MPLS_HLEN);
  5581. memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb),
  5582. mac_len);
  5583. skb_reset_mac_header(skb);
  5584. skb_set_network_header(skb, mac_len);
  5585. skb_reset_mac_len(skb);
  5586. lse = mpls_hdr(skb);
  5587. lse->label_stack_entry = mpls_lse;
  5588. skb_postpush_rcsum(skb, lse, MPLS_HLEN);
  5589. if (ethernet && mac_len >= ETH_HLEN)
  5590. skb_mod_eth_type(skb, eth_hdr(skb), mpls_proto);
  5591. skb->protocol = mpls_proto;
  5592. return 0;
  5593. }
  5594. EXPORT_SYMBOL_GPL(skb_mpls_push);
  5595. /**
  5596. * skb_mpls_pop() - pop the outermost MPLS header
  5597. *
  5598. * @skb: buffer
  5599. * @next_proto: ethertype of header after popped MPLS header
  5600. * @mac_len: length of the MAC header
  5601. * @ethernet: flag to indicate if the packet is ethernet
  5602. *
  5603. * Expects skb->data at mac header.
  5604. *
  5605. * Returns 0 on success, -errno otherwise.
  5606. */
  5607. int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len,
  5608. bool ethernet)
  5609. {
  5610. int err;
  5611. if (unlikely(!eth_p_mpls(skb->protocol)))
  5612. return 0;
  5613. err = skb_ensure_writable(skb, mac_len + MPLS_HLEN);
  5614. if (unlikely(err))
  5615. return err;
  5616. skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN);
  5617. memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb),
  5618. mac_len);
  5619. __skb_pull(skb, MPLS_HLEN);
  5620. skb_reset_mac_header(skb);
  5621. skb_set_network_header(skb, mac_len);
  5622. if (ethernet && mac_len >= ETH_HLEN) {
  5623. struct ethhdr *hdr;
  5624. /* use mpls_hdr() to get ethertype to account for VLANs. */
  5625. hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN);
  5626. skb_mod_eth_type(skb, hdr, next_proto);
  5627. }
  5628. skb->protocol = next_proto;
  5629. return 0;
  5630. }
  5631. EXPORT_SYMBOL_GPL(skb_mpls_pop);
  5632. /**
  5633. * skb_mpls_update_lse() - modify outermost MPLS header and update csum
  5634. *
  5635. * @skb: buffer
  5636. * @mpls_lse: new MPLS label stack entry to update to
  5637. *
  5638. * Expects skb->data at mac header.
  5639. *
  5640. * Returns 0 on success, -errno otherwise.
  5641. */
  5642. int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse)
  5643. {
  5644. int err;
  5645. if (unlikely(!eth_p_mpls(skb->protocol)))
  5646. return -EINVAL;
  5647. err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN);
  5648. if (unlikely(err))
  5649. return err;
  5650. if (skb->ip_summed == CHECKSUM_COMPLETE) {
  5651. __be32 diff[] = { ~mpls_hdr(skb)->label_stack_entry, mpls_lse };
  5652. skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum);
  5653. }
  5654. mpls_hdr(skb)->label_stack_entry = mpls_lse;
  5655. return 0;
  5656. }
  5657. EXPORT_SYMBOL_GPL(skb_mpls_update_lse);
  5658. /**
  5659. * skb_mpls_dec_ttl() - decrement the TTL of the outermost MPLS header
  5660. *
  5661. * @skb: buffer
  5662. *
  5663. * Expects skb->data at mac header.
  5664. *
  5665. * Returns 0 on success, -errno otherwise.
  5666. */
  5667. int skb_mpls_dec_ttl(struct sk_buff *skb)
  5668. {
  5669. u32 lse;
  5670. u8 ttl;
  5671. if (unlikely(!eth_p_mpls(skb->protocol)))
  5672. return -EINVAL;
  5673. if (!pskb_may_pull(skb, skb_network_offset(skb) + MPLS_HLEN))
  5674. return -ENOMEM;
  5675. lse = be32_to_cpu(mpls_hdr(skb)->label_stack_entry);
  5676. ttl = (lse & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT;
  5677. if (!--ttl)
  5678. return -EINVAL;
  5679. lse &= ~MPLS_LS_TTL_MASK;
  5680. lse |= ttl << MPLS_LS_TTL_SHIFT;
  5681. return skb_mpls_update_lse(skb, cpu_to_be32(lse));
  5682. }
  5683. EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl);
  5684. /**
  5685. * alloc_skb_with_frags - allocate skb with page frags
  5686. *
  5687. * @header_len: size of linear part
  5688. * @data_len: needed length in frags
  5689. * @order: max page order desired.
  5690. * @errcode: pointer to error code if any
  5691. * @gfp_mask: allocation mask
  5692. *
  5693. * This can be used to allocate a paged skb, given a maximal order for frags.
  5694. */
  5695. struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
  5696. unsigned long data_len,
  5697. int order,
  5698. int *errcode,
  5699. gfp_t gfp_mask)
  5700. {
  5701. unsigned long chunk;
  5702. struct sk_buff *skb;
  5703. struct page *page;
  5704. int nr_frags = 0;
  5705. *errcode = -EMSGSIZE;
  5706. if (unlikely(data_len > MAX_SKB_FRAGS * (PAGE_SIZE << order)))
  5707. return NULL;
  5708. *errcode = -ENOBUFS;
  5709. skb = alloc_skb(header_len, gfp_mask);
  5710. if (!skb)
  5711. return NULL;
  5712. while (data_len) {
  5713. if (nr_frags == MAX_SKB_FRAGS)
  5714. goto failure;
  5715. while (order && PAGE_ALIGN(data_len) < (PAGE_SIZE << order))
  5716. order--;
  5717. if (order) {
  5718. page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) |
  5719. __GFP_COMP |
  5720. __GFP_NOWARN,
  5721. order);
  5722. if (!page) {
  5723. order--;
  5724. continue;
  5725. }
  5726. } else {
  5727. page = alloc_page(gfp_mask);
  5728. if (!page)
  5729. goto failure;
  5730. }
  5731. chunk = min_t(unsigned long, data_len,
  5732. PAGE_SIZE << order);
  5733. skb_fill_page_desc(skb, nr_frags, page, 0, chunk);
  5734. nr_frags++;
  5735. skb->truesize += (PAGE_SIZE << order);
  5736. data_len -= chunk;
  5737. }
  5738. return skb;
  5739. failure:
  5740. kfree_skb(skb);
  5741. return NULL;
  5742. }
  5743. EXPORT_SYMBOL(alloc_skb_with_frags);
  5744. /* carve out the first off bytes from skb when off < headlen */
  5745. static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
  5746. const int headlen, gfp_t gfp_mask)
  5747. {
  5748. int i;
  5749. unsigned int size = skb_end_offset(skb);
  5750. int new_hlen = headlen - off;
  5751. u8 *data;
  5752. if (skb_pfmemalloc(skb))
  5753. gfp_mask |= __GFP_MEMALLOC;
  5754. data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL);
  5755. if (!data)
  5756. return -ENOMEM;
  5757. size = SKB_WITH_OVERHEAD(size);
  5758. /* Copy real data, and all frags */
  5759. skb_copy_from_linear_data_offset(skb, off, data, new_hlen);
  5760. skb->len -= off;
  5761. memcpy((struct skb_shared_info *)(data + size),
  5762. skb_shinfo(skb),
  5763. offsetof(struct skb_shared_info,
  5764. frags[skb_shinfo(skb)->nr_frags]));
  5765. if (skb_cloned(skb)) {
  5766. /* drop the old head gracefully */
  5767. if (skb_orphan_frags(skb, gfp_mask)) {
  5768. skb_kfree_head(data, size);
  5769. return -ENOMEM;
  5770. }
  5771. for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
  5772. skb_frag_ref(skb, i);
  5773. if (skb_has_frag_list(skb))
  5774. skb_clone_fraglist(skb);
  5775. skb_release_data(skb, SKB_CONSUMED);
  5776. } else {
  5777. /* we can reuse existing recount- all we did was
  5778. * relocate values
  5779. */
  5780. skb_free_head(skb);
  5781. }
  5782. skb->head = data;
  5783. skb->data = data;
  5784. skb->head_frag = 0;
  5785. skb_set_end_offset(skb, size);
  5786. skb_set_tail_pointer(skb, skb_headlen(skb));
  5787. skb_headers_offset_update(skb, 0);
  5788. skb->cloned = 0;
  5789. skb->hdr_len = 0;
  5790. skb->nohdr = 0;
  5791. atomic_set(&skb_shinfo(skb)->dataref, 1);
  5792. return 0;
  5793. }
  5794. static int pskb_carve(struct sk_buff *skb, const u32 off, gfp_t gfp);
  5795. /* carve out the first eat bytes from skb's frag_list. May recurse into
  5796. * pskb_carve()
  5797. */
  5798. static int pskb_carve_frag_list(struct skb_shared_info *shinfo, int eat,
  5799. gfp_t gfp_mask)
  5800. {
  5801. struct sk_buff *list = shinfo->frag_list;
  5802. struct sk_buff *clone = NULL;
  5803. struct sk_buff *insp = NULL;
  5804. do {
  5805. if (!list) {
  5806. pr_err("Not enough bytes to eat. Want %d\n", eat);
  5807. return -EFAULT;
  5808. }
  5809. if (list->len <= eat) {
  5810. /* Eaten as whole. */
  5811. eat -= list->len;
  5812. list = list->next;
  5813. insp = list;
  5814. } else {
  5815. /* Eaten partially. */
  5816. if (skb_shared(list)) {
  5817. clone = skb_clone(list, gfp_mask);
  5818. if (!clone)
  5819. return -ENOMEM;
  5820. insp = list->next;
  5821. list = clone;
  5822. } else {
  5823. /* This may be pulled without problems. */
  5824. insp = list;
  5825. }
  5826. if (pskb_carve(list, eat, gfp_mask) < 0) {
  5827. kfree_skb(clone);
  5828. return -ENOMEM;
  5829. }
  5830. break;
  5831. }
  5832. } while (eat);
  5833. /* Free pulled out fragments. */
  5834. while ((list = shinfo->frag_list) != insp) {
  5835. shinfo->frag_list = list->next;
  5836. consume_skb(list);
  5837. }
  5838. /* And insert new clone at head. */
  5839. if (clone) {
  5840. clone->next = list;
  5841. shinfo->frag_list = clone;
  5842. }
  5843. return 0;
  5844. }
  5845. /* carve off first len bytes from skb. Split line (off) is in the
  5846. * non-linear part of skb
  5847. */
  5848. static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
  5849. int pos, gfp_t gfp_mask)
  5850. {
  5851. int i, k = 0;
  5852. unsigned int size = skb_end_offset(skb);
  5853. u8 *data;
  5854. const int nfrags = skb_shinfo(skb)->nr_frags;
  5855. struct skb_shared_info *shinfo;
  5856. if (skb_pfmemalloc(skb))
  5857. gfp_mask |= __GFP_MEMALLOC;
  5858. data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL);
  5859. if (!data)
  5860. return -ENOMEM;
  5861. size = SKB_WITH_OVERHEAD(size);
  5862. memcpy((struct skb_shared_info *)(data + size),
  5863. skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0]));
  5864. if (skb_orphan_frags(skb, gfp_mask)) {
  5865. skb_kfree_head(data, size);
  5866. return -ENOMEM;
  5867. }
  5868. shinfo = (struct skb_shared_info *)(data + size);
  5869. for (i = 0; i < nfrags; i++) {
  5870. int fsize = skb_frag_size(&skb_shinfo(skb)->frags[i]);
  5871. if (pos + fsize > off) {
  5872. shinfo->frags[k] = skb_shinfo(skb)->frags[i];
  5873. if (pos < off) {
  5874. /* Split frag.
  5875. * We have two variants in this case:
  5876. * 1. Move all the frag to the second
  5877. * part, if it is possible. F.e.
  5878. * this approach is mandatory for TUX,
  5879. * where splitting is expensive.
  5880. * 2. Split is accurately. We make this.
  5881. */
  5882. skb_frag_off_add(&shinfo->frags[0], off - pos);
  5883. skb_frag_size_sub(&shinfo->frags[0], off - pos);
  5884. }
  5885. skb_frag_ref(skb, i);
  5886. k++;
  5887. }
  5888. pos += fsize;
  5889. }
  5890. shinfo->nr_frags = k;
  5891. if (skb_has_frag_list(skb))
  5892. skb_clone_fraglist(skb);
  5893. /* split line is in frag list */
  5894. if (k == 0 && pskb_carve_frag_list(shinfo, off - pos, gfp_mask)) {
  5895. /* skb_frag_unref() is not needed here as shinfo->nr_frags = 0. */
  5896. if (skb_has_frag_list(skb))
  5897. kfree_skb_list(skb_shinfo(skb)->frag_list);
  5898. skb_kfree_head(data, size);
  5899. return -ENOMEM;
  5900. }
  5901. skb_release_data(skb, SKB_CONSUMED);
  5902. skb->head = data;
  5903. skb->head_frag = 0;
  5904. skb->data = data;
  5905. skb_set_end_offset(skb, size);
  5906. skb_reset_tail_pointer(skb);
  5907. skb_headers_offset_update(skb, 0);
  5908. skb->cloned = 0;
  5909. skb->hdr_len = 0;
  5910. skb->nohdr = 0;
  5911. skb->len -= off;
  5912. skb->data_len = skb->len;
  5913. atomic_set(&skb_shinfo(skb)->dataref, 1);
  5914. return 0;
  5915. }
  5916. /* remove len bytes from the beginning of the skb */
  5917. static int pskb_carve(struct sk_buff *skb, const u32 len, gfp_t gfp)
  5918. {
  5919. int headlen = skb_headlen(skb);
  5920. if (len < headlen)
  5921. return pskb_carve_inside_header(skb, len, headlen, gfp);
  5922. else
  5923. return pskb_carve_inside_nonlinear(skb, len, headlen, gfp);
  5924. }
  5925. /* Extract to_copy bytes starting at off from skb, and return this in
  5926. * a new skb
  5927. */
  5928. struct sk_buff *pskb_extract(struct sk_buff *skb, int off,
  5929. int to_copy, gfp_t gfp)
  5930. {
  5931. struct sk_buff *clone = skb_clone(skb, gfp);
  5932. if (!clone)
  5933. return NULL;
  5934. if (pskb_carve(clone, off, gfp) < 0 ||
  5935. pskb_trim(clone, to_copy)) {
  5936. kfree_skb(clone);
  5937. return NULL;
  5938. }
  5939. return clone;
  5940. }
  5941. EXPORT_SYMBOL(pskb_extract);
  5942. /**
  5943. * skb_condense - try to get rid of fragments/frag_list if possible
  5944. * @skb: buffer
  5945. *
  5946. * Can be used to save memory before skb is added to a busy queue.
  5947. * If packet has bytes in frags and enough tail room in skb->head,
  5948. * pull all of them, so that we can free the frags right now and adjust
  5949. * truesize.
  5950. * Notes:
  5951. * We do not reallocate skb->head thus can not fail.
  5952. * Caller must re-evaluate skb->truesize if needed.
  5953. */
  5954. void skb_condense(struct sk_buff *skb)
  5955. {
  5956. if (skb->data_len) {
  5957. if (skb->data_len > skb->end - skb->tail ||
  5958. skb_cloned(skb) || !skb_frags_readable(skb))
  5959. return;
  5960. /* Nice, we can free page frag(s) right now */
  5961. __pskb_pull_tail(skb, skb->data_len);
  5962. }
  5963. /* At this point, skb->truesize might be over estimated,
  5964. * because skb had a fragment, and fragments do not tell
  5965. * their truesize.
  5966. * When we pulled its content into skb->head, fragment
  5967. * was freed, but __pskb_pull_tail() could not possibly
  5968. * adjust skb->truesize, not knowing the frag truesize.
  5969. */
  5970. skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
  5971. }
  5972. EXPORT_SYMBOL(skb_condense);
  5973. #ifdef CONFIG_SKB_EXTENSIONS
  5974. static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id)
  5975. {
  5976. return (void *)ext + (ext->offset[id] * SKB_EXT_ALIGN_VALUE);
  5977. }
  5978. /**
  5979. * __skb_ext_alloc - allocate a new skb extensions storage
  5980. *
  5981. * @flags: See kmalloc().
  5982. *
  5983. * Returns the newly allocated pointer. The pointer can later attached to a
  5984. * skb via __skb_ext_set().
  5985. * Note: caller must handle the skb_ext as an opaque data.
  5986. */
  5987. struct skb_ext *__skb_ext_alloc(gfp_t flags)
  5988. {
  5989. struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, flags);
  5990. if (new) {
  5991. memset(new->offset, 0, sizeof(new->offset));
  5992. refcount_set(&new->refcnt, 1);
  5993. }
  5994. return new;
  5995. }
  5996. static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old,
  5997. unsigned int old_active)
  5998. {
  5999. struct skb_ext *new;
  6000. if (refcount_read(&old->refcnt) == 1)
  6001. return old;
  6002. new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC);
  6003. if (!new)
  6004. return NULL;
  6005. memcpy(new, old, old->chunks * SKB_EXT_ALIGN_VALUE);
  6006. refcount_set(&new->refcnt, 1);
  6007. #ifdef CONFIG_XFRM
  6008. if (old_active & (1 << SKB_EXT_SEC_PATH)) {
  6009. struct sec_path *sp = skb_ext_get_ptr(old, SKB_EXT_SEC_PATH);
  6010. unsigned int i;
  6011. for (i = 0; i < sp->len; i++)
  6012. xfrm_state_hold(sp->xvec[i]);
  6013. }
  6014. #endif
  6015. #ifdef CONFIG_MCTP_FLOWS
  6016. if (old_active & (1 << SKB_EXT_MCTP)) {
  6017. struct mctp_flow *flow = skb_ext_get_ptr(old, SKB_EXT_MCTP);
  6018. if (flow->key)
  6019. refcount_inc(&flow->key->refs);
  6020. }
  6021. #endif
  6022. __skb_ext_put(old);
  6023. return new;
  6024. }
  6025. /**
  6026. * __skb_ext_set - attach the specified extension storage to this skb
  6027. * @skb: buffer
  6028. * @id: extension id
  6029. * @ext: extension storage previously allocated via __skb_ext_alloc()
  6030. *
  6031. * Existing extensions, if any, are cleared.
  6032. *
  6033. * Returns the pointer to the extension.
  6034. */
  6035. void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id,
  6036. struct skb_ext *ext)
  6037. {
  6038. unsigned int newlen, newoff = SKB_EXT_CHUNKSIZEOF(*ext);
  6039. skb_ext_put(skb);
  6040. newlen = newoff + skb_ext_type_len[id];
  6041. ext->chunks = newlen;
  6042. ext->offset[id] = newoff;
  6043. skb->extensions = ext;
  6044. skb->active_extensions = 1 << id;
  6045. return skb_ext_get_ptr(ext, id);
  6046. }
  6047. EXPORT_SYMBOL_NS_GPL(__skb_ext_set, "NETDEV_INTERNAL");
  6048. /**
  6049. * skb_ext_add - allocate space for given extension, COW if needed
  6050. * @skb: buffer
  6051. * @id: extension to allocate space for
  6052. *
  6053. * Allocates enough space for the given extension.
  6054. * If the extension is already present, a pointer to that extension
  6055. * is returned.
  6056. *
  6057. * If the skb was cloned, COW applies and the returned memory can be
  6058. * modified without changing the extension space of clones buffers.
  6059. *
  6060. * Returns pointer to the extension or NULL on allocation failure.
  6061. */
  6062. void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id)
  6063. {
  6064. struct skb_ext *new, *old = NULL;
  6065. unsigned int newlen, newoff;
  6066. if (skb->active_extensions) {
  6067. old = skb->extensions;
  6068. new = skb_ext_maybe_cow(old, skb->active_extensions);
  6069. if (!new)
  6070. return NULL;
  6071. if (__skb_ext_exist(new, id))
  6072. goto set_active;
  6073. newoff = new->chunks;
  6074. } else {
  6075. newoff = SKB_EXT_CHUNKSIZEOF(*new);
  6076. new = __skb_ext_alloc(GFP_ATOMIC);
  6077. if (!new)
  6078. return NULL;
  6079. }
  6080. newlen = newoff + skb_ext_type_len[id];
  6081. new->chunks = newlen;
  6082. new->offset[id] = newoff;
  6083. set_active:
  6084. skb->slow_gro = 1;
  6085. skb->extensions = new;
  6086. skb->active_extensions |= 1 << id;
  6087. return skb_ext_get_ptr(new, id);
  6088. }
  6089. EXPORT_SYMBOL(skb_ext_add);
  6090. #ifdef CONFIG_XFRM
  6091. static void skb_ext_put_sp(struct sec_path *sp)
  6092. {
  6093. unsigned int i;
  6094. for (i = 0; i < sp->len; i++)
  6095. xfrm_state_put(sp->xvec[i]);
  6096. }
  6097. #endif
  6098. #ifdef CONFIG_MCTP_FLOWS
  6099. static void skb_ext_put_mctp(struct mctp_flow *flow)
  6100. {
  6101. if (flow->key)
  6102. mctp_key_unref(flow->key);
  6103. }
  6104. #endif
  6105. void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id)
  6106. {
  6107. struct skb_ext *ext = skb->extensions;
  6108. skb->active_extensions &= ~(1 << id);
  6109. if (skb->active_extensions == 0) {
  6110. skb->extensions = NULL;
  6111. __skb_ext_put(ext);
  6112. #ifdef CONFIG_XFRM
  6113. } else if (id == SKB_EXT_SEC_PATH &&
  6114. refcount_read(&ext->refcnt) == 1) {
  6115. struct sec_path *sp = skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH);
  6116. skb_ext_put_sp(sp);
  6117. sp->len = 0;
  6118. #endif
  6119. }
  6120. }
  6121. EXPORT_SYMBOL(__skb_ext_del);
  6122. void __skb_ext_put(struct skb_ext *ext)
  6123. {
  6124. /* If this is last clone, nothing can increment
  6125. * it after check passes. Avoids one atomic op.
  6126. */
  6127. if (refcount_read(&ext->refcnt) == 1)
  6128. goto free_now;
  6129. if (!refcount_dec_and_test(&ext->refcnt))
  6130. return;
  6131. free_now:
  6132. #ifdef CONFIG_XFRM
  6133. if (__skb_ext_exist(ext, SKB_EXT_SEC_PATH))
  6134. skb_ext_put_sp(skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH));
  6135. #endif
  6136. #ifdef CONFIG_MCTP_FLOWS
  6137. if (__skb_ext_exist(ext, SKB_EXT_MCTP))
  6138. skb_ext_put_mctp(skb_ext_get_ptr(ext, SKB_EXT_MCTP));
  6139. #endif
  6140. kmem_cache_free(skbuff_ext_cache, ext);
  6141. }
  6142. EXPORT_SYMBOL(__skb_ext_put);
  6143. #endif /* CONFIG_SKB_EXTENSIONS */
  6144. static void kfree_skb_napi_cache(struct sk_buff *skb)
  6145. {
  6146. /* if SKB is a clone, don't handle this case */
  6147. if (skb->fclone != SKB_FCLONE_UNAVAILABLE) {
  6148. __kfree_skb(skb);
  6149. return;
  6150. }
  6151. local_bh_disable();
  6152. __napi_kfree_skb(skb, SKB_CONSUMED);
  6153. local_bh_enable();
  6154. }
  6155. /**
  6156. * skb_attempt_defer_free - queue skb for remote freeing
  6157. * @skb: buffer
  6158. *
  6159. * Put @skb in a per-cpu list, using the cpu which
  6160. * allocated the skb/pages to reduce false sharing
  6161. * and memory zone spinlock contention.
  6162. */
  6163. void skb_attempt_defer_free(struct sk_buff *skb)
  6164. {
  6165. struct skb_defer_node *sdn;
  6166. unsigned long defer_count;
  6167. unsigned int defer_max;
  6168. bool kick;
  6169. int cpu;
  6170. /* zero copy notifications should not be delayed. */
  6171. if (skb_zcopy(skb))
  6172. goto nodefer;
  6173. cpu = skb->alloc_cpu;
  6174. if (cpu == raw_smp_processor_id() ||
  6175. WARN_ON_ONCE(cpu >= nr_cpu_ids) ||
  6176. !cpu_online(cpu)) {
  6177. nodefer: kfree_skb_napi_cache(skb);
  6178. return;
  6179. }
  6180. DEBUG_NET_WARN_ON_ONCE(skb_dst(skb));
  6181. DEBUG_NET_WARN_ON_ONCE(skb->destructor);
  6182. DEBUG_NET_WARN_ON_ONCE(skb_nfct(skb));
  6183. sdn = per_cpu_ptr(net_hotdata.skb_defer_nodes, cpu) + numa_node_id();
  6184. defer_max = READ_ONCE(net_hotdata.sysctl_skb_defer_max);
  6185. defer_count = atomic_long_inc_return(&sdn->defer_count);
  6186. if (defer_count >= defer_max)
  6187. goto nodefer;
  6188. llist_add(&skb->ll_node, &sdn->defer_list);
  6189. /* Send an IPI every time queue reaches half capacity. */
  6190. kick = (defer_count - 1) == (defer_max >> 1);
  6191. /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU
  6192. * if we are unlucky enough (this seems very unlikely).
  6193. */
  6194. if (unlikely(kick))
  6195. kick_defer_list_purge(cpu);
  6196. }
  6197. static void skb_splice_csum_page(struct sk_buff *skb, struct page *page,
  6198. size_t offset, size_t len)
  6199. {
  6200. const char *kaddr;
  6201. __wsum csum;
  6202. kaddr = kmap_local_page(page);
  6203. csum = csum_partial(kaddr + offset, len, 0);
  6204. kunmap_local(kaddr);
  6205. skb->csum = csum_block_add(skb->csum, csum, skb->len);
  6206. }
  6207. /**
  6208. * skb_splice_from_iter - Splice (or copy) pages to skbuff
  6209. * @skb: The buffer to add pages to
  6210. * @iter: Iterator representing the pages to be added
  6211. * @maxsize: Maximum amount of pages to be added
  6212. *
  6213. * This is a common helper function for supporting MSG_SPLICE_PAGES. It
  6214. * extracts pages from an iterator and adds them to the socket buffer if
  6215. * possible, copying them to fragments if not possible (such as if they're slab
  6216. * pages).
  6217. *
  6218. * Returns the amount of data spliced/copied or -EMSGSIZE if there's
  6219. * insufficient space in the buffer to transfer anything.
  6220. */
  6221. ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter,
  6222. ssize_t maxsize)
  6223. {
  6224. size_t frag_limit = READ_ONCE(net_hotdata.sysctl_max_skb_frags);
  6225. struct page *pages[8], **ppages = pages;
  6226. ssize_t spliced = 0, ret = 0;
  6227. unsigned int i;
  6228. while (iter->count > 0) {
  6229. ssize_t space, nr, len;
  6230. size_t off;
  6231. ret = -EMSGSIZE;
  6232. space = frag_limit - skb_shinfo(skb)->nr_frags;
  6233. if (space < 0)
  6234. break;
  6235. /* We might be able to coalesce without increasing nr_frags */
  6236. nr = clamp_t(size_t, space, 1, ARRAY_SIZE(pages));
  6237. len = iov_iter_extract_pages(iter, &ppages, maxsize, nr, 0, &off);
  6238. if (len <= 0) {
  6239. ret = len ?: -EIO;
  6240. break;
  6241. }
  6242. i = 0;
  6243. do {
  6244. struct page *page = pages[i++];
  6245. size_t part = min_t(size_t, PAGE_SIZE - off, len);
  6246. ret = -EIO;
  6247. if (WARN_ON_ONCE(!sendpage_ok(page)))
  6248. goto out;
  6249. ret = skb_append_pagefrags(skb, page, off, part,
  6250. frag_limit);
  6251. if (ret < 0) {
  6252. iov_iter_revert(iter, len);
  6253. goto out;
  6254. }
  6255. if (skb->ip_summed == CHECKSUM_NONE)
  6256. skb_splice_csum_page(skb, page, off, part);
  6257. off = 0;
  6258. spliced += part;
  6259. maxsize -= part;
  6260. len -= part;
  6261. } while (len > 0);
  6262. if (maxsize <= 0)
  6263. break;
  6264. }
  6265. out:
  6266. skb_len_add(skb, spliced);
  6267. return spliced ?: ret;
  6268. }
  6269. EXPORT_SYMBOL(skb_splice_from_iter);
  6270. static __always_inline
  6271. size_t memcpy_from_iter_csum(void *iter_from, size_t progress,
  6272. size_t len, void *to, void *priv2)
  6273. {
  6274. __wsum *csum = priv2;
  6275. __wsum next = csum_partial_copy_nocheck(iter_from, to + progress, len);
  6276. *csum = csum_block_add(*csum, next, progress);
  6277. return 0;
  6278. }
  6279. static __always_inline
  6280. size_t copy_from_user_iter_csum(void __user *iter_from, size_t progress,
  6281. size_t len, void *to, void *priv2)
  6282. {
  6283. __wsum next, *csum = priv2;
  6284. next = csum_and_copy_from_user(iter_from, to + progress, len);
  6285. *csum = csum_block_add(*csum, next, progress);
  6286. return next ? 0 : len;
  6287. }
  6288. bool csum_and_copy_from_iter_full(void *addr, size_t bytes,
  6289. __wsum *csum, struct iov_iter *i)
  6290. {
  6291. size_t copied;
  6292. if (WARN_ON_ONCE(!i->data_source))
  6293. return false;
  6294. copied = iterate_and_advance2(i, bytes, addr, csum,
  6295. copy_from_user_iter_csum,
  6296. memcpy_from_iter_csum);
  6297. if (likely(copied == bytes))
  6298. return true;
  6299. iov_iter_revert(i, copied);
  6300. return false;
  6301. }
  6302. EXPORT_SYMBOL(csum_and_copy_from_iter_full);
  6303. void __get_netmem(netmem_ref netmem)
  6304. {
  6305. struct net_iov *niov = netmem_to_net_iov(netmem);
  6306. if (net_is_devmem_iov(niov))
  6307. net_devmem_get_net_iov(netmem_to_net_iov(netmem));
  6308. }
  6309. EXPORT_SYMBOL(__get_netmem);
  6310. void __put_netmem(netmem_ref netmem)
  6311. {
  6312. struct net_iov *niov = netmem_to_net_iov(netmem);
  6313. if (net_is_devmem_iov(niov))
  6314. net_devmem_put_net_iov(netmem_to_net_iov(netmem));
  6315. }
  6316. EXPORT_SYMBOL(__put_netmem);
  6317. struct vlan_type_depth __vlan_get_protocol_offset(const struct sk_buff *skb,
  6318. __be16 type,
  6319. int mac_offset)
  6320. {
  6321. unsigned int vlan_depth = skb->mac_len, parse_depth = VLAN_MAX_DEPTH;
  6322. /* if type is 802.1Q/AD then the header should already be
  6323. * present at mac_len - VLAN_HLEN (if mac_len > 0), or at
  6324. * ETH_HLEN otherwise
  6325. */
  6326. if (vlan_depth) {
  6327. if (WARN_ON_ONCE(vlan_depth < VLAN_HLEN))
  6328. return (struct vlan_type_depth) { 0 };
  6329. vlan_depth -= VLAN_HLEN;
  6330. } else {
  6331. vlan_depth = ETH_HLEN;
  6332. }
  6333. do {
  6334. struct vlan_hdr vhdr, *vh;
  6335. vh = skb_header_pointer(skb, mac_offset + vlan_depth,
  6336. sizeof(vhdr), &vhdr);
  6337. if (unlikely(!vh || !--parse_depth))
  6338. return (struct vlan_type_depth) { 0 };
  6339. type = vh->h_vlan_encapsulated_proto;
  6340. vlan_depth += VLAN_HLEN;
  6341. } while (eth_type_vlan(type));
  6342. return (struct vlan_type_depth) {
  6343. .type = type,
  6344. .depth = vlan_depth
  6345. };
  6346. }
  6347. EXPORT_SYMBOL(__vlan_get_protocol_offset);