inode.c 89 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697269826992700270127022703270427052706270727082709271027112712271327142715271627172718271927202721272227232724272527262727272827292730273127322733273427352736273727382739274027412742274327442745274627472748274927502751275227532754275527562757275827592760276127622763276427652766276727682769277027712772277327742775277627772778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216
  1. // SPDX-License-Identifier: GPL-2.0
  2. #include <linux/ceph/ceph_debug.h>
  3. #include <linux/module.h>
  4. #include <linux/fs.h>
  5. #include <linux/slab.h>
  6. #include <linux/string.h>
  7. #include <linux/uaccess.h>
  8. #include <linux/kernel.h>
  9. #include <linux/writeback.h>
  10. #include <linux/vmalloc.h>
  11. #include <linux/xattr.h>
  12. #include <linux/posix_acl.h>
  13. #include <linux/random.h>
  14. #include <linux/sort.h>
  15. #include <linux/iversion.h>
  16. #include <linux/fscrypt.h>
  17. #include "super.h"
  18. #include "mds_client.h"
  19. #include "cache.h"
  20. #include "crypto.h"
  21. #include <linux/ceph/decode.h>
  22. /*
  23. * Ceph inode operations
  24. *
  25. * Implement basic inode helpers (get, alloc) and inode ops (getattr,
  26. * setattr, etc.), xattr helpers, and helpers for assimilating
  27. * metadata returned by the MDS into our cache.
  28. *
  29. * Also define helpers for doing asynchronous writeback, invalidation,
  30. * and truncation for the benefit of those who can't afford to block
  31. * (typically because they are in the message handler path).
  32. */
  33. static const struct inode_operations ceph_symlink_iops;
  34. static const struct inode_operations ceph_encrypted_symlink_iops;
  35. static void ceph_inode_work(struct work_struct *work);
  36. /*
  37. * find or create an inode, given the ceph ino number
  38. */
  39. static int ceph_set_ino_cb(struct inode *inode, void *data)
  40. {
  41. struct ceph_inode_info *ci = ceph_inode(inode);
  42. struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
  43. ci->i_vino = *(struct ceph_vino *)data;
  44. inode->i_ino = ceph_vino_to_ino_t(ci->i_vino);
  45. inode_set_iversion_raw(inode, 0);
  46. percpu_counter_inc(&mdsc->metric.total_inodes);
  47. return 0;
  48. }
  49. /*
  50. * Check if the parent inode matches the vino from directory reply info
  51. */
  52. static inline bool ceph_vino_matches_parent(struct inode *parent,
  53. struct ceph_vino vino)
  54. {
  55. return ceph_ino(parent) == vino.ino && ceph_snap(parent) == vino.snap;
  56. }
  57. /*
  58. * Validate that the directory inode referenced by @req->r_parent matches the
  59. * inode number and snapshot id contained in the reply's directory record. If
  60. * they do not match – which can theoretically happen if the parent dentry was
  61. * moved between the time the request was issued and the reply arrived – fall
  62. * back to looking up the correct inode in the inode cache.
  63. *
  64. * A reference is *always* returned. Callers that receive a different inode
  65. * than the original @parent are responsible for dropping the extra reference
  66. * once the reply has been processed.
  67. */
  68. static struct inode *ceph_get_reply_dir(struct super_block *sb,
  69. struct inode *parent,
  70. struct ceph_mds_reply_info_parsed *rinfo)
  71. {
  72. struct ceph_vino vino;
  73. if (unlikely(!rinfo->diri.in))
  74. return parent; /* nothing to compare against */
  75. /* If we didn't have a cached parent inode to begin with, just bail out. */
  76. if (!parent)
  77. return NULL;
  78. vino.ino = le64_to_cpu(rinfo->diri.in->ino);
  79. vino.snap = le64_to_cpu(rinfo->diri.in->snapid);
  80. if (likely(ceph_vino_matches_parent(parent, vino)))
  81. return parent; /* matches – use the original reference */
  82. /* Mismatch – this should be rare. Emit a WARN and obtain the correct inode. */
  83. WARN_ONCE(1, "ceph: reply dir mismatch (parent valid %llx.%llx reply %llx.%llx)\n",
  84. ceph_ino(parent), ceph_snap(parent), vino.ino, vino.snap);
  85. return ceph_get_inode(sb, vino, NULL);
  86. }
  87. /**
  88. * ceph_new_inode - allocate a new inode in advance of an expected create
  89. * @dir: parent directory for new inode
  90. * @dentry: dentry that may eventually point to new inode
  91. * @mode: mode of new inode
  92. * @as_ctx: pointer to inherited security context
  93. *
  94. * Allocate a new inode in advance of an operation to create a new inode.
  95. * This allocates the inode and sets up the acl_sec_ctx with appropriate
  96. * info for the new inode.
  97. *
  98. * Returns a pointer to the new inode or an ERR_PTR.
  99. */
  100. struct inode *ceph_new_inode(struct inode *dir, struct dentry *dentry,
  101. umode_t *mode, struct ceph_acl_sec_ctx *as_ctx)
  102. {
  103. int err;
  104. struct inode *inode;
  105. inode = new_inode(dir->i_sb);
  106. if (!inode)
  107. return ERR_PTR(-ENOMEM);
  108. inode->i_blkbits = CEPH_FSCRYPT_BLOCK_SHIFT;
  109. if (!S_ISLNK(*mode)) {
  110. err = ceph_pre_init_acls(dir, mode, as_ctx);
  111. if (err < 0)
  112. goto out_err;
  113. }
  114. inode_state_assign_raw(inode, 0);
  115. inode->i_mode = *mode;
  116. err = ceph_security_init_secctx(dentry, *mode, as_ctx);
  117. if (err < 0)
  118. goto out_err;
  119. /*
  120. * We'll skip setting fscrypt context for snapshots, leaving that for
  121. * the handle_reply().
  122. */
  123. if (ceph_snap(dir) != CEPH_SNAPDIR) {
  124. err = ceph_fscrypt_prepare_context(dir, inode, as_ctx);
  125. if (err)
  126. goto out_err;
  127. }
  128. return inode;
  129. out_err:
  130. iput(inode);
  131. return ERR_PTR(err);
  132. }
  133. void ceph_as_ctx_to_req(struct ceph_mds_request *req,
  134. struct ceph_acl_sec_ctx *as_ctx)
  135. {
  136. if (as_ctx->pagelist) {
  137. req->r_pagelist = as_ctx->pagelist;
  138. as_ctx->pagelist = NULL;
  139. }
  140. ceph_fscrypt_as_ctx_to_req(req, as_ctx);
  141. }
  142. /**
  143. * ceph_get_inode - find or create/hash a new inode
  144. * @sb: superblock to search and allocate in
  145. * @vino: vino to search for
  146. * @newino: optional new inode to insert if one isn't found (may be NULL)
  147. *
  148. * Search for or insert a new inode into the hash for the given vino, and
  149. * return a reference to it. If new is non-NULL, its reference is consumed.
  150. */
  151. struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino,
  152. struct inode *newino)
  153. {
  154. struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb);
  155. struct ceph_client *cl = mdsc->fsc->client;
  156. struct inode *inode;
  157. if (ceph_vino_is_reserved(vino))
  158. return ERR_PTR(-EREMOTEIO);
  159. if (newino) {
  160. inode = inode_insert5(newino, (unsigned long)vino.ino,
  161. ceph_ino_compare, ceph_set_ino_cb, &vino);
  162. if (inode != newino)
  163. iput(newino);
  164. } else {
  165. inode = iget5_locked(sb, (unsigned long)vino.ino,
  166. ceph_ino_compare, ceph_set_ino_cb, &vino);
  167. }
  168. if (!inode) {
  169. doutc(cl, "no inode found for %llx.%llx\n", vino.ino, vino.snap);
  170. return ERR_PTR(-ENOMEM);
  171. }
  172. doutc(cl, "on %llx=%llx.%llx got %p new %d\n",
  173. ceph_present_inode(inode), ceph_vinop(inode), inode,
  174. !!(inode_state_read_once(inode) & I_NEW));
  175. return inode;
  176. }
  177. /*
  178. * get/construct snapdir inode for a given directory
  179. */
  180. struct inode *ceph_get_snapdir(struct inode *parent)
  181. {
  182. struct ceph_client *cl = ceph_inode_to_client(parent);
  183. struct ceph_vino vino = {
  184. .ino = ceph_ino(parent),
  185. .snap = CEPH_SNAPDIR,
  186. };
  187. struct inode *inode = ceph_get_inode(parent->i_sb, vino, NULL);
  188. struct ceph_inode_info *ci = ceph_inode(inode);
  189. int ret = -ENOTDIR;
  190. if (IS_ERR(inode))
  191. return inode;
  192. if (!S_ISDIR(parent->i_mode)) {
  193. pr_warn_once_client(cl, "bad snapdir parent type (mode=0%o)\n",
  194. parent->i_mode);
  195. goto err;
  196. }
  197. if (!(inode_state_read_once(inode) & I_NEW) && !S_ISDIR(inode->i_mode)) {
  198. pr_warn_once_client(cl, "bad snapdir inode type (mode=0%o)\n",
  199. inode->i_mode);
  200. goto err;
  201. }
  202. inode->i_mode = parent->i_mode;
  203. inode->i_uid = parent->i_uid;
  204. inode->i_gid = parent->i_gid;
  205. inode_set_mtime_to_ts(inode, inode_get_mtime(parent));
  206. inode_set_ctime_to_ts(inode, inode_get_ctime(parent));
  207. inode_set_atime_to_ts(inode, inode_get_atime(parent));
  208. ci->i_rbytes = 0;
  209. ci->i_btime = ceph_inode(parent)->i_btime;
  210. #ifdef CONFIG_FS_ENCRYPTION
  211. /* if encrypted, just borrow fscrypt_auth from parent */
  212. if (IS_ENCRYPTED(parent)) {
  213. struct ceph_inode_info *pci = ceph_inode(parent);
  214. ci->fscrypt_auth = kmemdup(pci->fscrypt_auth,
  215. pci->fscrypt_auth_len,
  216. GFP_KERNEL);
  217. if (ci->fscrypt_auth) {
  218. inode->i_flags |= S_ENCRYPTED;
  219. ci->fscrypt_auth_len = pci->fscrypt_auth_len;
  220. } else {
  221. doutc(cl, "Failed to alloc snapdir fscrypt_auth\n");
  222. ret = -ENOMEM;
  223. goto err;
  224. }
  225. }
  226. #endif
  227. if (inode_state_read_once(inode) & I_NEW) {
  228. inode->i_op = &ceph_snapdir_iops;
  229. inode->i_fop = &ceph_snapdir_fops;
  230. ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
  231. unlock_new_inode(inode);
  232. }
  233. return inode;
  234. err:
  235. if ((inode_state_read_once(inode) & I_NEW))
  236. discard_new_inode(inode);
  237. else
  238. iput(inode);
  239. return ERR_PTR(ret);
  240. }
  241. const struct inode_operations ceph_file_iops = {
  242. .permission = ceph_permission,
  243. .setattr = ceph_setattr,
  244. .getattr = ceph_getattr,
  245. .listxattr = ceph_listxattr,
  246. .get_inode_acl = ceph_get_acl,
  247. .set_acl = ceph_set_acl,
  248. };
  249. /*
  250. * We use a 'frag tree' to keep track of the MDS's directory fragments
  251. * for a given inode (usually there is just a single fragment). We
  252. * need to know when a child frag is delegated to a new MDS, or when
  253. * it is flagged as replicated, so we can direct our requests
  254. * accordingly.
  255. */
  256. /*
  257. * find/create a frag in the tree
  258. */
  259. static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci,
  260. u32 f)
  261. {
  262. struct inode *inode = &ci->netfs.inode;
  263. struct ceph_client *cl = ceph_inode_to_client(inode);
  264. struct rb_node **p;
  265. struct rb_node *parent = NULL;
  266. struct ceph_inode_frag *frag;
  267. int c;
  268. p = &ci->i_fragtree.rb_node;
  269. while (*p) {
  270. parent = *p;
  271. frag = rb_entry(parent, struct ceph_inode_frag, node);
  272. c = ceph_frag_compare(f, frag->frag);
  273. if (c < 0)
  274. p = &(*p)->rb_left;
  275. else if (c > 0)
  276. p = &(*p)->rb_right;
  277. else
  278. return frag;
  279. }
  280. frag = kmalloc_obj(*frag, GFP_NOFS);
  281. if (!frag)
  282. return ERR_PTR(-ENOMEM);
  283. frag->frag = f;
  284. frag->split_by = 0;
  285. frag->mds = -1;
  286. frag->ndist = 0;
  287. rb_link_node(&frag->node, parent, p);
  288. rb_insert_color(&frag->node, &ci->i_fragtree);
  289. doutc(cl, "added %p %llx.%llx frag %x\n", inode, ceph_vinop(inode), f);
  290. return frag;
  291. }
  292. /*
  293. * find a specific frag @f
  294. */
  295. struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f)
  296. {
  297. struct rb_node *n = ci->i_fragtree.rb_node;
  298. while (n) {
  299. struct ceph_inode_frag *frag =
  300. rb_entry(n, struct ceph_inode_frag, node);
  301. int c = ceph_frag_compare(f, frag->frag);
  302. if (c < 0)
  303. n = n->rb_left;
  304. else if (c > 0)
  305. n = n->rb_right;
  306. else
  307. return frag;
  308. }
  309. return NULL;
  310. }
  311. /*
  312. * Choose frag containing the given value @v. If @pfrag is
  313. * specified, copy the frag delegation info to the caller if
  314. * it is present.
  315. */
  316. static u32 __ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
  317. struct ceph_inode_frag *pfrag, int *found)
  318. {
  319. struct ceph_client *cl = ceph_inode_to_client(&ci->netfs.inode);
  320. u32 t = ceph_frag_make(0, 0);
  321. struct ceph_inode_frag *frag;
  322. unsigned nway, i;
  323. u32 n;
  324. if (found)
  325. *found = 0;
  326. while (1) {
  327. WARN_ON(!ceph_frag_contains_value(t, v));
  328. frag = __ceph_find_frag(ci, t);
  329. if (!frag)
  330. break; /* t is a leaf */
  331. if (frag->split_by == 0) {
  332. if (pfrag)
  333. memcpy(pfrag, frag, sizeof(*pfrag));
  334. if (found)
  335. *found = 1;
  336. break;
  337. }
  338. /* choose child */
  339. nway = 1 << frag->split_by;
  340. doutc(cl, "frag(%x) %x splits by %d (%d ways)\n", v, t,
  341. frag->split_by, nway);
  342. for (i = 0; i < nway; i++) {
  343. n = ceph_frag_make_child(t, frag->split_by, i);
  344. if (ceph_frag_contains_value(n, v)) {
  345. t = n;
  346. break;
  347. }
  348. }
  349. BUG_ON(i == nway);
  350. }
  351. doutc(cl, "frag(%x) = %x\n", v, t);
  352. return t;
  353. }
  354. u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
  355. struct ceph_inode_frag *pfrag, int *found)
  356. {
  357. u32 ret;
  358. mutex_lock(&ci->i_fragtree_mutex);
  359. ret = __ceph_choose_frag(ci, v, pfrag, found);
  360. mutex_unlock(&ci->i_fragtree_mutex);
  361. return ret;
  362. }
  363. /*
  364. * Process dirfrag (delegation) info from the mds. Include leaf
  365. * fragment in tree ONLY if ndist > 0. Otherwise, only
  366. * branches/splits are included in i_fragtree)
  367. */
  368. static int ceph_fill_dirfrag(struct inode *inode,
  369. struct ceph_mds_reply_dirfrag *dirinfo)
  370. {
  371. struct ceph_inode_info *ci = ceph_inode(inode);
  372. struct ceph_client *cl = ceph_inode_to_client(inode);
  373. struct ceph_inode_frag *frag;
  374. u32 id = le32_to_cpu(dirinfo->frag);
  375. int mds = le32_to_cpu(dirinfo->auth);
  376. int ndist = le32_to_cpu(dirinfo->ndist);
  377. int diri_auth = -1;
  378. int i;
  379. int err = 0;
  380. spin_lock(&ci->i_ceph_lock);
  381. if (ci->i_auth_cap)
  382. diri_auth = ci->i_auth_cap->mds;
  383. spin_unlock(&ci->i_ceph_lock);
  384. if (mds == -1) /* CDIR_AUTH_PARENT */
  385. mds = diri_auth;
  386. mutex_lock(&ci->i_fragtree_mutex);
  387. if (ndist == 0 && mds == diri_auth) {
  388. /* no delegation info needed. */
  389. frag = __ceph_find_frag(ci, id);
  390. if (!frag)
  391. goto out;
  392. if (frag->split_by == 0) {
  393. /* tree leaf, remove */
  394. doutc(cl, "removed %p %llx.%llx frag %x (no ref)\n",
  395. inode, ceph_vinop(inode), id);
  396. rb_erase(&frag->node, &ci->i_fragtree);
  397. kfree(frag);
  398. } else {
  399. /* tree branch, keep and clear */
  400. doutc(cl, "cleared %p %llx.%llx frag %x referral\n",
  401. inode, ceph_vinop(inode), id);
  402. frag->mds = -1;
  403. frag->ndist = 0;
  404. }
  405. goto out;
  406. }
  407. /* find/add this frag to store mds delegation info */
  408. frag = __get_or_create_frag(ci, id);
  409. if (IS_ERR(frag)) {
  410. /* this is not the end of the world; we can continue
  411. with bad/inaccurate delegation info */
  412. pr_err_client(cl, "ENOMEM on mds ref %p %llx.%llx fg %x\n",
  413. inode, ceph_vinop(inode),
  414. le32_to_cpu(dirinfo->frag));
  415. err = -ENOMEM;
  416. goto out;
  417. }
  418. frag->mds = mds;
  419. frag->ndist = min_t(u32, ndist, CEPH_MAX_DIRFRAG_REP);
  420. for (i = 0; i < frag->ndist; i++)
  421. frag->dist[i] = le32_to_cpu(dirinfo->dist[i]);
  422. doutc(cl, "%p %llx.%llx frag %x ndist=%d\n", inode,
  423. ceph_vinop(inode), frag->frag, frag->ndist);
  424. out:
  425. mutex_unlock(&ci->i_fragtree_mutex);
  426. return err;
  427. }
  428. static int frag_tree_split_cmp(const void *l, const void *r)
  429. {
  430. struct ceph_frag_tree_split *ls = (struct ceph_frag_tree_split*)l;
  431. struct ceph_frag_tree_split *rs = (struct ceph_frag_tree_split*)r;
  432. return ceph_frag_compare(le32_to_cpu(ls->frag),
  433. le32_to_cpu(rs->frag));
  434. }
  435. static bool is_frag_child(u32 f, struct ceph_inode_frag *frag)
  436. {
  437. if (!frag)
  438. return f == ceph_frag_make(0, 0);
  439. if (ceph_frag_bits(f) != ceph_frag_bits(frag->frag) + frag->split_by)
  440. return false;
  441. return ceph_frag_contains_value(frag->frag, ceph_frag_value(f));
  442. }
  443. static int ceph_fill_fragtree(struct inode *inode,
  444. struct ceph_frag_tree_head *fragtree,
  445. struct ceph_mds_reply_dirfrag *dirinfo)
  446. {
  447. struct ceph_client *cl = ceph_inode_to_client(inode);
  448. struct ceph_inode_info *ci = ceph_inode(inode);
  449. struct ceph_inode_frag *frag, *prev_frag = NULL;
  450. struct rb_node *rb_node;
  451. unsigned i, split_by, nsplits;
  452. u32 id;
  453. bool update = false;
  454. mutex_lock(&ci->i_fragtree_mutex);
  455. nsplits = le32_to_cpu(fragtree->nsplits);
  456. if (nsplits != ci->i_fragtree_nsplits) {
  457. update = true;
  458. } else if (nsplits) {
  459. i = get_random_u32_below(nsplits);
  460. id = le32_to_cpu(fragtree->splits[i].frag);
  461. if (!__ceph_find_frag(ci, id))
  462. update = true;
  463. } else if (!RB_EMPTY_ROOT(&ci->i_fragtree)) {
  464. rb_node = rb_first(&ci->i_fragtree);
  465. frag = rb_entry(rb_node, struct ceph_inode_frag, node);
  466. if (frag->frag != ceph_frag_make(0, 0) || rb_next(rb_node))
  467. update = true;
  468. }
  469. if (!update && dirinfo) {
  470. id = le32_to_cpu(dirinfo->frag);
  471. if (id != __ceph_choose_frag(ci, id, NULL, NULL))
  472. update = true;
  473. }
  474. if (!update)
  475. goto out_unlock;
  476. if (nsplits > 1) {
  477. sort(fragtree->splits, nsplits, sizeof(fragtree->splits[0]),
  478. frag_tree_split_cmp, NULL);
  479. }
  480. doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode));
  481. rb_node = rb_first(&ci->i_fragtree);
  482. for (i = 0; i < nsplits; i++) {
  483. id = le32_to_cpu(fragtree->splits[i].frag);
  484. split_by = le32_to_cpu(fragtree->splits[i].by);
  485. if (split_by == 0 || ceph_frag_bits(id) + split_by > 24) {
  486. pr_err_client(cl, "%p %llx.%llx invalid split %d/%u, "
  487. "frag %x split by %d\n", inode,
  488. ceph_vinop(inode), i, nsplits, id, split_by);
  489. continue;
  490. }
  491. frag = NULL;
  492. while (rb_node) {
  493. frag = rb_entry(rb_node, struct ceph_inode_frag, node);
  494. if (ceph_frag_compare(frag->frag, id) >= 0) {
  495. if (frag->frag != id)
  496. frag = NULL;
  497. else
  498. rb_node = rb_next(rb_node);
  499. break;
  500. }
  501. rb_node = rb_next(rb_node);
  502. /* delete stale split/leaf node */
  503. if (frag->split_by > 0 ||
  504. !is_frag_child(frag->frag, prev_frag)) {
  505. rb_erase(&frag->node, &ci->i_fragtree);
  506. if (frag->split_by > 0)
  507. ci->i_fragtree_nsplits--;
  508. kfree(frag);
  509. }
  510. frag = NULL;
  511. }
  512. if (!frag) {
  513. frag = __get_or_create_frag(ci, id);
  514. if (IS_ERR(frag))
  515. continue;
  516. }
  517. if (frag->split_by == 0)
  518. ci->i_fragtree_nsplits++;
  519. frag->split_by = split_by;
  520. doutc(cl, " frag %x split by %d\n", frag->frag, frag->split_by);
  521. prev_frag = frag;
  522. }
  523. while (rb_node) {
  524. frag = rb_entry(rb_node, struct ceph_inode_frag, node);
  525. rb_node = rb_next(rb_node);
  526. /* delete stale split/leaf node */
  527. if (frag->split_by > 0 ||
  528. !is_frag_child(frag->frag, prev_frag)) {
  529. rb_erase(&frag->node, &ci->i_fragtree);
  530. if (frag->split_by > 0)
  531. ci->i_fragtree_nsplits--;
  532. kfree(frag);
  533. }
  534. }
  535. out_unlock:
  536. mutex_unlock(&ci->i_fragtree_mutex);
  537. return 0;
  538. }
  539. /*
  540. * initialize a newly allocated inode.
  541. */
  542. struct inode *ceph_alloc_inode(struct super_block *sb)
  543. {
  544. struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
  545. struct ceph_inode_info *ci;
  546. int i;
  547. ci = alloc_inode_sb(sb, ceph_inode_cachep, GFP_NOFS);
  548. if (!ci)
  549. return NULL;
  550. doutc(fsc->client, "%p\n", &ci->netfs.inode);
  551. /* Set parameters for the netfs library */
  552. netfs_inode_init(&ci->netfs, &ceph_netfs_ops, false);
  553. spin_lock_init(&ci->i_ceph_lock);
  554. ci->i_version = 0;
  555. ci->i_inline_version = 0;
  556. ci->i_time_warp_seq = 0;
  557. ci->i_ceph_flags = 0;
  558. atomic64_set(&ci->i_ordered_count, 1);
  559. atomic64_set(&ci->i_release_count, 1);
  560. atomic64_set(&ci->i_complete_seq[0], 0);
  561. atomic64_set(&ci->i_complete_seq[1], 0);
  562. ci->i_symlink = NULL;
  563. ci->i_max_bytes = 0;
  564. ci->i_max_files = 0;
  565. memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
  566. memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));
  567. RCU_INIT_POINTER(ci->i_layout.pool_ns, NULL);
  568. ci->i_fragtree = RB_ROOT;
  569. mutex_init(&ci->i_fragtree_mutex);
  570. ci->i_xattrs.blob = NULL;
  571. ci->i_xattrs.prealloc_blob = NULL;
  572. ci->i_xattrs.dirty = false;
  573. ci->i_xattrs.index = RB_ROOT;
  574. ci->i_xattrs.count = 0;
  575. ci->i_xattrs.names_size = 0;
  576. ci->i_xattrs.vals_size = 0;
  577. ci->i_xattrs.version = 0;
  578. ci->i_xattrs.index_version = 0;
  579. ci->i_caps = RB_ROOT;
  580. ci->i_auth_cap = NULL;
  581. ci->i_dirty_caps = 0;
  582. ci->i_flushing_caps = 0;
  583. INIT_LIST_HEAD(&ci->i_dirty_item);
  584. INIT_LIST_HEAD(&ci->i_flushing_item);
  585. ci->i_prealloc_cap_flush = NULL;
  586. INIT_LIST_HEAD(&ci->i_cap_flush_list);
  587. init_waitqueue_head(&ci->i_cap_wq);
  588. ci->i_hold_caps_max = 0;
  589. INIT_LIST_HEAD(&ci->i_cap_delay_list);
  590. INIT_LIST_HEAD(&ci->i_cap_snaps);
  591. ci->i_head_snapc = NULL;
  592. ci->i_snap_caps = 0;
  593. ci->i_last_rd = ci->i_last_wr = jiffies - 3600 * HZ;
  594. for (i = 0; i < CEPH_FILE_MODE_BITS; i++)
  595. ci->i_nr_by_mode[i] = 0;
  596. mutex_init(&ci->i_truncate_mutex);
  597. ci->i_truncate_seq = 0;
  598. ci->i_truncate_size = 0;
  599. ci->i_truncate_pending = 0;
  600. ci->i_truncate_pagecache_size = 0;
  601. ci->i_max_size = 0;
  602. ci->i_reported_size = 0;
  603. ci->i_wanted_max_size = 0;
  604. ci->i_requested_max_size = 0;
  605. ci->i_pin_ref = 0;
  606. ci->i_rd_ref = 0;
  607. ci->i_rdcache_ref = 0;
  608. ci->i_wr_ref = 0;
  609. ci->i_wb_ref = 0;
  610. ci->i_fx_ref = 0;
  611. ci->i_wrbuffer_ref = 0;
  612. ci->i_wrbuffer_ref_head = 0;
  613. atomic_set(&ci->i_filelock_ref, 0);
  614. atomic_set(&ci->i_shared_gen, 1);
  615. ci->i_rdcache_gen = 0;
  616. ci->i_rdcache_revoking = 0;
  617. INIT_LIST_HEAD(&ci->i_unsafe_dirops);
  618. INIT_LIST_HEAD(&ci->i_unsafe_iops);
  619. spin_lock_init(&ci->i_unsafe_lock);
  620. ci->i_snap_realm = NULL;
  621. INIT_LIST_HEAD(&ci->i_snap_realm_item);
  622. INIT_LIST_HEAD(&ci->i_snap_flush_item);
  623. INIT_WORK(&ci->i_work, ceph_inode_work);
  624. ci->i_work_mask = 0;
  625. memset(&ci->i_btime, '\0', sizeof(ci->i_btime));
  626. #ifdef CONFIG_FS_ENCRYPTION
  627. ci->i_crypt_info = NULL;
  628. ci->fscrypt_auth = NULL;
  629. ci->fscrypt_auth_len = 0;
  630. #endif
  631. return &ci->netfs.inode;
  632. }
  633. void ceph_free_inode(struct inode *inode)
  634. {
  635. struct ceph_inode_info *ci = ceph_inode(inode);
  636. kfree(ci->i_symlink);
  637. #ifdef CONFIG_FS_ENCRYPTION
  638. kfree(ci->fscrypt_auth);
  639. #endif
  640. fscrypt_free_inode(inode);
  641. kmem_cache_free(ceph_inode_cachep, ci);
  642. }
  643. void ceph_evict_inode(struct inode *inode)
  644. {
  645. struct ceph_inode_info *ci = ceph_inode(inode);
  646. struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
  647. struct ceph_client *cl = ceph_inode_to_client(inode);
  648. struct ceph_inode_frag *frag;
  649. struct rb_node *n;
  650. doutc(cl, "%p ino %llx.%llx\n", inode, ceph_vinop(inode));
  651. percpu_counter_dec(&mdsc->metric.total_inodes);
  652. netfs_wait_for_outstanding_io(inode);
  653. truncate_inode_pages_final(&inode->i_data);
  654. if (inode_state_read_once(inode) & I_PINNING_NETFS_WB)
  655. ceph_fscache_unuse_cookie(inode, true);
  656. clear_inode(inode);
  657. ceph_fscache_unregister_inode_cookie(ci);
  658. fscrypt_put_encryption_info(inode);
  659. __ceph_remove_caps(ci);
  660. if (__ceph_has_quota(ci, QUOTA_GET_ANY))
  661. ceph_adjust_quota_realms_count(inode, false);
  662. /*
  663. * we may still have a snap_realm reference if there are stray
  664. * caps in i_snap_caps.
  665. */
  666. if (ci->i_snap_realm) {
  667. if (ceph_snap(inode) == CEPH_NOSNAP) {
  668. doutc(cl, " dropping residual ref to snap realm %p\n",
  669. ci->i_snap_realm);
  670. ceph_change_snap_realm(inode, NULL);
  671. } else {
  672. ceph_put_snapid_map(mdsc, ci->i_snapid_map);
  673. ci->i_snap_realm = NULL;
  674. }
  675. }
  676. while ((n = rb_first(&ci->i_fragtree)) != NULL) {
  677. frag = rb_entry(n, struct ceph_inode_frag, node);
  678. rb_erase(n, &ci->i_fragtree);
  679. kfree(frag);
  680. }
  681. ci->i_fragtree_nsplits = 0;
  682. __ceph_destroy_xattrs(ci);
  683. if (ci->i_xattrs.blob)
  684. ceph_buffer_put(ci->i_xattrs.blob);
  685. if (ci->i_xattrs.prealloc_blob)
  686. ceph_buffer_put(ci->i_xattrs.prealloc_blob);
  687. ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns));
  688. ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
  689. }
  690. static inline blkcnt_t calc_inode_blocks(u64 size)
  691. {
  692. return (size + (1<<9) - 1) >> 9;
  693. }
  694. /*
  695. * Helpers to fill in size, ctime, mtime, and atime. We have to be
  696. * careful because either the client or MDS may have more up to date
  697. * info, depending on which capabilities are held, and whether
  698. * time_warp_seq or truncate_seq have increased. (Ordinarily, mtime
  699. * and size are monotonically increasing, except when utimes() or
  700. * truncate() increments the corresponding _seq values.)
  701. */
  702. int ceph_fill_file_size(struct inode *inode, int issued,
  703. u32 truncate_seq, u64 truncate_size, u64 size)
  704. {
  705. struct ceph_client *cl = ceph_inode_to_client(inode);
  706. struct ceph_inode_info *ci = ceph_inode(inode);
  707. int queue_trunc = 0;
  708. loff_t isize = i_size_read(inode);
  709. if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 ||
  710. (truncate_seq == ci->i_truncate_seq && size > isize)) {
  711. doutc(cl, "size %lld -> %llu\n", isize, size);
  712. if (size > 0 && S_ISDIR(inode->i_mode)) {
  713. pr_err_client(cl, "non-zero size for directory\n");
  714. size = 0;
  715. }
  716. i_size_write(inode, size);
  717. inode->i_blocks = calc_inode_blocks(size);
  718. /*
  719. * If we're expanding, then we should be able to just update
  720. * the existing cookie.
  721. */
  722. if (size > isize)
  723. ceph_fscache_update(inode);
  724. ci->i_reported_size = size;
  725. if (truncate_seq != ci->i_truncate_seq) {
  726. doutc(cl, "truncate_seq %u -> %u\n",
  727. ci->i_truncate_seq, truncate_seq);
  728. ci->i_truncate_seq = truncate_seq;
  729. /* the MDS should have revoked these caps */
  730. WARN_ON_ONCE(issued & (CEPH_CAP_FILE_RD |
  731. CEPH_CAP_FILE_LAZYIO));
  732. /*
  733. * If we hold relevant caps, or in the case where we're
  734. * not the only client referencing this file and we
  735. * don't hold those caps, then we need to check whether
  736. * the file is either opened or mmaped
  737. */
  738. if ((issued & (CEPH_CAP_FILE_CACHE|
  739. CEPH_CAP_FILE_BUFFER)) ||
  740. mapping_mapped(inode->i_mapping) ||
  741. __ceph_is_file_opened(ci)) {
  742. ci->i_truncate_pending++;
  743. queue_trunc = 1;
  744. }
  745. }
  746. }
  747. /*
  748. * It's possible that the new sizes of the two consecutive
  749. * size truncations will be in the same fscrypt last block,
  750. * and we need to truncate the corresponding page caches
  751. * anyway.
  752. */
  753. if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0) {
  754. doutc(cl, "truncate_size %lld -> %llu, encrypted %d\n",
  755. ci->i_truncate_size, truncate_size,
  756. !!IS_ENCRYPTED(inode));
  757. ci->i_truncate_size = truncate_size;
  758. if (IS_ENCRYPTED(inode)) {
  759. doutc(cl, "truncate_pagecache_size %lld -> %llu\n",
  760. ci->i_truncate_pagecache_size, size);
  761. ci->i_truncate_pagecache_size = size;
  762. } else {
  763. ci->i_truncate_pagecache_size = truncate_size;
  764. }
  765. }
  766. return queue_trunc;
  767. }
  768. void ceph_fill_file_time(struct inode *inode, int issued,
  769. u64 time_warp_seq, struct timespec64 *ctime,
  770. struct timespec64 *mtime, struct timespec64 *atime)
  771. {
  772. struct ceph_client *cl = ceph_inode_to_client(inode);
  773. struct ceph_inode_info *ci = ceph_inode(inode);
  774. struct timespec64 iatime = inode_get_atime(inode);
  775. struct timespec64 ictime = inode_get_ctime(inode);
  776. struct timespec64 imtime = inode_get_mtime(inode);
  777. int warn = 0;
  778. if (issued & (CEPH_CAP_FILE_EXCL|
  779. CEPH_CAP_FILE_WR|
  780. CEPH_CAP_FILE_BUFFER|
  781. CEPH_CAP_AUTH_EXCL|
  782. CEPH_CAP_XATTR_EXCL)) {
  783. if (ci->i_version == 0 ||
  784. timespec64_compare(ctime, &ictime) > 0) {
  785. doutc(cl, "ctime %ptSp -> %ptSp inc w/ cap\n", &ictime, ctime);
  786. inode_set_ctime_to_ts(inode, *ctime);
  787. }
  788. if (ci->i_version == 0 ||
  789. ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) {
  790. /* the MDS did a utimes() */
  791. doutc(cl, "mtime %ptSp -> %ptSp tw %d -> %d\n", &imtime, mtime,
  792. ci->i_time_warp_seq, (int)time_warp_seq);
  793. inode_set_mtime_to_ts(inode, *mtime);
  794. inode_set_atime_to_ts(inode, *atime);
  795. ci->i_time_warp_seq = time_warp_seq;
  796. } else if (time_warp_seq == ci->i_time_warp_seq) {
  797. /* nobody did utimes(); take the max */
  798. if (timespec64_compare(mtime, &imtime) > 0) {
  799. doutc(cl, "mtime %ptSp -> %ptSp inc\n", &imtime, mtime);
  800. inode_set_mtime_to_ts(inode, *mtime);
  801. }
  802. if (timespec64_compare(atime, &iatime) > 0) {
  803. doutc(cl, "atime %ptSp -> %ptSp inc\n", &iatime, atime);
  804. inode_set_atime_to_ts(inode, *atime);
  805. }
  806. } else if (issued & CEPH_CAP_FILE_EXCL) {
  807. /* we did a utimes(); ignore mds values */
  808. } else {
  809. warn = 1;
  810. }
  811. } else {
  812. /* we have no write|excl caps; whatever the MDS says is true */
  813. if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) {
  814. inode_set_ctime_to_ts(inode, *ctime);
  815. inode_set_mtime_to_ts(inode, *mtime);
  816. inode_set_atime_to_ts(inode, *atime);
  817. ci->i_time_warp_seq = time_warp_seq;
  818. } else {
  819. warn = 1;
  820. }
  821. }
  822. if (warn) /* time_warp_seq shouldn't go backwards */
  823. doutc(cl, "%p mds time_warp_seq %llu < %u\n", inode,
  824. time_warp_seq, ci->i_time_warp_seq);
  825. }
  826. #if IS_ENABLED(CONFIG_FS_ENCRYPTION)
  827. static int decode_encrypted_symlink(struct ceph_mds_client *mdsc,
  828. const char *encsym,
  829. int enclen, u8 **decsym)
  830. {
  831. struct ceph_client *cl = mdsc->fsc->client;
  832. int declen;
  833. u8 *sym;
  834. sym = kmalloc(enclen + 1, GFP_NOFS);
  835. if (!sym)
  836. return -ENOMEM;
  837. declen = base64_decode(encsym, enclen, sym, false, BASE64_IMAP);
  838. if (declen < 0) {
  839. pr_err_client(cl,
  840. "can't decode symlink (%d). Content: %.*s\n",
  841. declen, enclen, encsym);
  842. kfree(sym);
  843. return -EIO;
  844. }
  845. sym[declen + 1] = '\0';
  846. *decsym = sym;
  847. return declen;
  848. }
  849. #else
  850. static int decode_encrypted_symlink(struct ceph_mds_client *mdsc,
  851. const char *encsym,
  852. int symlen, u8 **decsym)
  853. {
  854. return -EOPNOTSUPP;
  855. }
  856. #endif
  857. /*
  858. * Populate an inode based on info from mds. May be called on new or
  859. * existing inodes.
  860. */
  861. int ceph_fill_inode(struct inode *inode, struct page *locked_page,
  862. struct ceph_mds_reply_info_in *iinfo,
  863. struct ceph_mds_reply_dirfrag *dirinfo,
  864. struct ceph_mds_session *session, int cap_fmode,
  865. struct ceph_cap_reservation *caps_reservation)
  866. {
  867. struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
  868. struct ceph_client *cl = mdsc->fsc->client;
  869. struct ceph_mds_reply_inode *info = iinfo->in;
  870. struct ceph_inode_info *ci = ceph_inode(inode);
  871. int issued, new_issued, info_caps;
  872. struct timespec64 mtime, atime, ctime;
  873. struct ceph_buffer *xattr_blob = NULL;
  874. struct ceph_buffer *old_blob = NULL;
  875. struct ceph_string *pool_ns = NULL;
  876. struct ceph_cap *new_cap = NULL;
  877. int err = 0;
  878. bool wake = false;
  879. bool queue_trunc = false;
  880. bool new_version = false;
  881. bool fill_inline = false;
  882. umode_t mode = le32_to_cpu(info->mode);
  883. dev_t rdev = le32_to_cpu(info->rdev);
  884. lockdep_assert_held(&mdsc->snap_rwsem);
  885. doutc(cl, "%p ino %llx.%llx v %llu had %llu\n", inode, ceph_vinop(inode),
  886. le64_to_cpu(info->version), ci->i_version);
  887. /* Once I_NEW is cleared, we can't change type or dev numbers */
  888. if (inode_state_read_once(inode) & I_NEW) {
  889. inode->i_mode = mode;
  890. } else {
  891. if (inode_wrong_type(inode, mode)) {
  892. pr_warn_once_client(cl,
  893. "inode type changed! (ino %llx.%llx is 0%o, mds says 0%o)\n",
  894. ceph_vinop(inode), inode->i_mode, mode);
  895. return -ESTALE;
  896. }
  897. if ((S_ISCHR(mode) || S_ISBLK(mode)) && inode->i_rdev != rdev) {
  898. pr_warn_once_client(cl,
  899. "dev inode rdev changed! (ino %llx.%llx is %u:%u, mds says %u:%u)\n",
  900. ceph_vinop(inode), MAJOR(inode->i_rdev),
  901. MINOR(inode->i_rdev), MAJOR(rdev),
  902. MINOR(rdev));
  903. return -ESTALE;
  904. }
  905. }
  906. info_caps = le32_to_cpu(info->cap.caps);
  907. /* prealloc new cap struct */
  908. if (info_caps && ceph_snap(inode) == CEPH_NOSNAP) {
  909. new_cap = ceph_get_cap(mdsc, caps_reservation);
  910. if (!new_cap)
  911. return -ENOMEM;
  912. }
  913. /*
  914. * prealloc xattr data, if it looks like we'll need it. only
  915. * if len > 4 (meaning there are actually xattrs; the first 4
  916. * bytes are the xattr count).
  917. */
  918. if (iinfo->xattr_len > 4) {
  919. xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS);
  920. if (!xattr_blob)
  921. pr_err_client(cl, "ENOMEM xattr blob %d bytes\n",
  922. iinfo->xattr_len);
  923. }
  924. if (iinfo->pool_ns_len > 0)
  925. pool_ns = ceph_find_or_create_string(iinfo->pool_ns_data,
  926. iinfo->pool_ns_len);
  927. if (ceph_snap(inode) != CEPH_NOSNAP && !ci->i_snapid_map)
  928. ci->i_snapid_map = ceph_get_snapid_map(mdsc, ceph_snap(inode));
  929. spin_lock(&ci->i_ceph_lock);
  930. /*
  931. * provided version will be odd if inode value is projected,
  932. * even if stable. skip the update if we have newer stable
  933. * info (ours>=theirs, e.g. due to racing mds replies), unless
  934. * we are getting projected (unstable) info (in which case the
  935. * version is odd, and we want ours>theirs).
  936. * us them
  937. * 2 2 skip
  938. * 3 2 skip
  939. * 3 3 update
  940. */
  941. if (ci->i_version == 0 ||
  942. ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
  943. le64_to_cpu(info->version) > (ci->i_version & ~1)))
  944. new_version = true;
  945. /* Update change_attribute */
  946. inode_set_max_iversion_raw(inode, iinfo->change_attr);
  947. __ceph_caps_issued(ci, &issued);
  948. issued |= __ceph_caps_dirty(ci);
  949. new_issued = ~issued & info_caps;
  950. __ceph_update_quota(ci, iinfo->max_bytes, iinfo->max_files);
  951. #ifdef CONFIG_FS_ENCRYPTION
  952. if (iinfo->fscrypt_auth_len &&
  953. ((inode_state_read_once(inode) & I_NEW) || (ci->fscrypt_auth_len == 0))) {
  954. kfree(ci->fscrypt_auth);
  955. ci->fscrypt_auth_len = iinfo->fscrypt_auth_len;
  956. ci->fscrypt_auth = iinfo->fscrypt_auth;
  957. iinfo->fscrypt_auth = NULL;
  958. iinfo->fscrypt_auth_len = 0;
  959. inode_set_flags(inode, S_ENCRYPTED, S_ENCRYPTED);
  960. }
  961. #endif
  962. if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
  963. (issued & CEPH_CAP_AUTH_EXCL) == 0) {
  964. inode->i_mode = mode;
  965. inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(info->uid));
  966. inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(info->gid));
  967. doutc(cl, "%p %llx.%llx mode 0%o uid.gid %d.%d\n", inode,
  968. ceph_vinop(inode), inode->i_mode,
  969. from_kuid(&init_user_ns, inode->i_uid),
  970. from_kgid(&init_user_ns, inode->i_gid));
  971. ceph_decode_timespec64(&ci->i_btime, &iinfo->btime);
  972. ceph_decode_timespec64(&ci->i_snap_btime, &iinfo->snap_btime);
  973. }
  974. /* directories have fl_stripe_unit set to zero */
  975. if (IS_ENCRYPTED(inode))
  976. inode->i_blkbits = CEPH_FSCRYPT_BLOCK_SHIFT;
  977. else if (le32_to_cpu(info->layout.fl_stripe_unit))
  978. inode->i_blkbits =
  979. fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
  980. else
  981. inode->i_blkbits = CEPH_BLOCK_SHIFT;
  982. if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
  983. (issued & CEPH_CAP_LINK_EXCL) == 0)
  984. set_nlink(inode, le32_to_cpu(info->nlink));
  985. if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
  986. /* be careful with mtime, atime, size */
  987. ceph_decode_timespec64(&atime, &info->atime);
  988. ceph_decode_timespec64(&mtime, &info->mtime);
  989. ceph_decode_timespec64(&ctime, &info->ctime);
  990. ceph_fill_file_time(inode, issued,
  991. le32_to_cpu(info->time_warp_seq),
  992. &ctime, &mtime, &atime);
  993. }
  994. if (new_version || (info_caps & CEPH_CAP_FILE_SHARED)) {
  995. ci->i_files = le64_to_cpu(info->files);
  996. ci->i_subdirs = le64_to_cpu(info->subdirs);
  997. }
  998. if (new_version ||
  999. (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
  1000. u64 size = le64_to_cpu(info->size);
  1001. s64 old_pool = ci->i_layout.pool_id;
  1002. struct ceph_string *old_ns;
  1003. ceph_file_layout_from_legacy(&ci->i_layout, &info->layout);
  1004. old_ns = rcu_dereference_protected(ci->i_layout.pool_ns,
  1005. lockdep_is_held(&ci->i_ceph_lock));
  1006. rcu_assign_pointer(ci->i_layout.pool_ns, pool_ns);
  1007. if (ci->i_layout.pool_id != old_pool || pool_ns != old_ns)
  1008. ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;
  1009. pool_ns = old_ns;
  1010. if (IS_ENCRYPTED(inode) && size &&
  1011. iinfo->fscrypt_file_len == sizeof(__le64)) {
  1012. u64 fsize = __le64_to_cpu(*(__le64 *)iinfo->fscrypt_file);
  1013. if (size == round_up(fsize, CEPH_FSCRYPT_BLOCK_SIZE)) {
  1014. size = fsize;
  1015. } else {
  1016. pr_warn_client(cl,
  1017. "fscrypt size mismatch: size=%llu fscrypt_file=%llu, discarding fscrypt_file size.\n",
  1018. info->size, size);
  1019. }
  1020. }
  1021. queue_trunc = ceph_fill_file_size(inode, issued,
  1022. le32_to_cpu(info->truncate_seq),
  1023. le64_to_cpu(info->truncate_size),
  1024. size);
  1025. /* only update max_size on auth cap */
  1026. if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
  1027. ci->i_max_size != le64_to_cpu(info->max_size)) {
  1028. doutc(cl, "max_size %lld -> %llu\n",
  1029. ci->i_max_size, le64_to_cpu(info->max_size));
  1030. ci->i_max_size = le64_to_cpu(info->max_size);
  1031. }
  1032. }
  1033. /* layout and rstat are not tracked by capability, update them if
  1034. * the inode info is from auth mds */
  1035. if (new_version || (info->cap.flags & CEPH_CAP_FLAG_AUTH)) {
  1036. if (S_ISDIR(inode->i_mode)) {
  1037. ci->i_dir_layout = iinfo->dir_layout;
  1038. ci->i_rbytes = le64_to_cpu(info->rbytes);
  1039. ci->i_rfiles = le64_to_cpu(info->rfiles);
  1040. ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
  1041. ci->i_dir_pin = iinfo->dir_pin;
  1042. ci->i_rsnaps = iinfo->rsnaps;
  1043. ceph_decode_timespec64(&ci->i_rctime, &info->rctime);
  1044. }
  1045. }
  1046. /* xattrs */
  1047. /* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */
  1048. if ((ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
  1049. le64_to_cpu(info->xattr_version) > ci->i_xattrs.version) {
  1050. if (ci->i_xattrs.blob)
  1051. old_blob = ci->i_xattrs.blob;
  1052. ci->i_xattrs.blob = xattr_blob;
  1053. if (xattr_blob)
  1054. memcpy(ci->i_xattrs.blob->vec.iov_base,
  1055. iinfo->xattr_data, iinfo->xattr_len);
  1056. ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
  1057. ceph_forget_all_cached_acls(inode);
  1058. ceph_security_invalidate_secctx(inode);
  1059. xattr_blob = NULL;
  1060. }
  1061. /* finally update i_version */
  1062. if (le64_to_cpu(info->version) > ci->i_version)
  1063. ci->i_version = le64_to_cpu(info->version);
  1064. inode->i_mapping->a_ops = &ceph_aops;
  1065. switch (inode->i_mode & S_IFMT) {
  1066. case S_IFIFO:
  1067. case S_IFBLK:
  1068. case S_IFCHR:
  1069. case S_IFSOCK:
  1070. inode->i_blkbits = PAGE_SHIFT;
  1071. init_special_inode(inode, inode->i_mode, rdev);
  1072. inode->i_op = &ceph_file_iops;
  1073. break;
  1074. case S_IFREG:
  1075. inode->i_op = &ceph_file_iops;
  1076. inode->i_fop = &ceph_file_fops;
  1077. break;
  1078. case S_IFLNK:
  1079. if (!ci->i_symlink) {
  1080. u32 symlen = iinfo->symlink_len;
  1081. char *sym;
  1082. spin_unlock(&ci->i_ceph_lock);
  1083. if (IS_ENCRYPTED(inode)) {
  1084. if (symlen != i_size_read(inode))
  1085. pr_err_client(cl,
  1086. "%p %llx.%llx BAD symlink size %lld\n",
  1087. inode, ceph_vinop(inode),
  1088. i_size_read(inode));
  1089. err = decode_encrypted_symlink(mdsc, iinfo->symlink,
  1090. symlen, (u8 **)&sym);
  1091. if (err < 0) {
  1092. pr_err_client(cl,
  1093. "decoding encrypted symlink failed: %d\n",
  1094. err);
  1095. goto out;
  1096. }
  1097. symlen = err;
  1098. i_size_write(inode, symlen);
  1099. inode->i_blocks = calc_inode_blocks(symlen);
  1100. } else {
  1101. if (symlen != i_size_read(inode)) {
  1102. pr_err_client(cl,
  1103. "%p %llx.%llx BAD symlink size %lld\n",
  1104. inode, ceph_vinop(inode),
  1105. i_size_read(inode));
  1106. i_size_write(inode, symlen);
  1107. inode->i_blocks = calc_inode_blocks(symlen);
  1108. }
  1109. err = -ENOMEM;
  1110. sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS);
  1111. if (!sym)
  1112. goto out;
  1113. }
  1114. spin_lock(&ci->i_ceph_lock);
  1115. if (!ci->i_symlink)
  1116. ci->i_symlink = sym;
  1117. else
  1118. kfree(sym); /* lost a race */
  1119. }
  1120. if (IS_ENCRYPTED(inode)) {
  1121. /*
  1122. * Encrypted symlinks need to be decrypted before we can
  1123. * cache their targets in i_link. Don't touch it here.
  1124. */
  1125. inode->i_op = &ceph_encrypted_symlink_iops;
  1126. } else {
  1127. inode->i_link = ci->i_symlink;
  1128. inode->i_op = &ceph_symlink_iops;
  1129. }
  1130. break;
  1131. case S_IFDIR:
  1132. inode->i_op = &ceph_dir_iops;
  1133. inode->i_fop = &ceph_dir_fops;
  1134. break;
  1135. default:
  1136. pr_err_client(cl, "%p %llx.%llx BAD mode 0%o\n", inode,
  1137. ceph_vinop(inode), inode->i_mode);
  1138. }
  1139. /* were we issued a capability? */
  1140. if (info_caps) {
  1141. if (ceph_snap(inode) == CEPH_NOSNAP) {
  1142. ceph_add_cap(inode, session,
  1143. le64_to_cpu(info->cap.cap_id),
  1144. info_caps,
  1145. le32_to_cpu(info->cap.wanted),
  1146. le32_to_cpu(info->cap.seq),
  1147. le32_to_cpu(info->cap.mseq),
  1148. le64_to_cpu(info->cap.realm),
  1149. info->cap.flags, &new_cap);
  1150. /* set dir completion flag? */
  1151. if (S_ISDIR(inode->i_mode) &&
  1152. ci->i_files == 0 && ci->i_subdirs == 0 &&
  1153. (info_caps & CEPH_CAP_FILE_SHARED) &&
  1154. (issued & CEPH_CAP_FILE_EXCL) == 0 &&
  1155. !__ceph_dir_is_complete(ci)) {
  1156. doutc(cl, " marking %p complete (empty)\n",
  1157. inode);
  1158. i_size_write(inode, 0);
  1159. __ceph_dir_set_complete(ci,
  1160. atomic64_read(&ci->i_release_count),
  1161. atomic64_read(&ci->i_ordered_count));
  1162. }
  1163. wake = true;
  1164. } else {
  1165. doutc(cl, " %p got snap_caps %s\n", inode,
  1166. ceph_cap_string(info_caps));
  1167. ci->i_snap_caps |= info_caps;
  1168. }
  1169. }
  1170. if (iinfo->inline_version > 0 &&
  1171. iinfo->inline_version >= ci->i_inline_version) {
  1172. int cache_caps = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
  1173. ci->i_inline_version = iinfo->inline_version;
  1174. if (ceph_has_inline_data(ci) &&
  1175. (locked_page || (info_caps & cache_caps)))
  1176. fill_inline = true;
  1177. }
  1178. if (cap_fmode >= 0) {
  1179. if (!info_caps)
  1180. pr_warn_client(cl, "mds issued no caps on %llx.%llx\n",
  1181. ceph_vinop(inode));
  1182. __ceph_touch_fmode(ci, mdsc, cap_fmode);
  1183. }
  1184. spin_unlock(&ci->i_ceph_lock);
  1185. ceph_fscache_register_inode_cookie(inode);
  1186. if (fill_inline)
  1187. ceph_fill_inline_data(inode, locked_page,
  1188. iinfo->inline_data, iinfo->inline_len);
  1189. if (wake)
  1190. wake_up_all(&ci->i_cap_wq);
  1191. /* queue truncate if we saw i_size decrease */
  1192. if (queue_trunc)
  1193. ceph_queue_vmtruncate(inode);
  1194. /* populate frag tree */
  1195. if (S_ISDIR(inode->i_mode))
  1196. ceph_fill_fragtree(inode, &info->fragtree, dirinfo);
  1197. /* update delegation info? */
  1198. if (dirinfo)
  1199. ceph_fill_dirfrag(inode, dirinfo);
  1200. err = 0;
  1201. out:
  1202. if (new_cap)
  1203. ceph_put_cap(mdsc, new_cap);
  1204. ceph_buffer_put(old_blob);
  1205. ceph_buffer_put(xattr_blob);
  1206. ceph_put_string(pool_ns);
  1207. return err;
  1208. }
  1209. /*
  1210. * caller should hold session s_mutex and dentry->d_lock.
  1211. */
  1212. static void __update_dentry_lease(struct inode *dir, struct dentry *dentry,
  1213. struct ceph_mds_reply_lease *lease,
  1214. struct ceph_mds_session *session,
  1215. unsigned long from_time,
  1216. struct ceph_mds_session **old_lease_session)
  1217. {
  1218. struct ceph_client *cl = ceph_inode_to_client(dir);
  1219. struct ceph_dentry_info *di = ceph_dentry(dentry);
  1220. unsigned mask = le16_to_cpu(lease->mask);
  1221. long unsigned duration = le32_to_cpu(lease->duration_ms);
  1222. long unsigned ttl = from_time + (duration * HZ) / 1000;
  1223. long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
  1224. doutc(cl, "%p duration %lu ms ttl %lu\n", dentry, duration, ttl);
  1225. /* only track leases on regular dentries */
  1226. if (ceph_snap(dir) != CEPH_NOSNAP)
  1227. return;
  1228. if (mask & CEPH_LEASE_PRIMARY_LINK)
  1229. di->flags |= CEPH_DENTRY_PRIMARY_LINK;
  1230. else
  1231. di->flags &= ~CEPH_DENTRY_PRIMARY_LINK;
  1232. di->lease_shared_gen = atomic_read(&ceph_inode(dir)->i_shared_gen);
  1233. if (!(mask & CEPH_LEASE_VALID)) {
  1234. __ceph_dentry_dir_lease_touch(di);
  1235. return;
  1236. }
  1237. if (di->lease_gen == atomic_read(&session->s_cap_gen) &&
  1238. time_before(ttl, di->time))
  1239. return; /* we already have a newer lease. */
  1240. if (di->lease_session && di->lease_session != session) {
  1241. *old_lease_session = di->lease_session;
  1242. di->lease_session = NULL;
  1243. }
  1244. if (!di->lease_session)
  1245. di->lease_session = ceph_get_mds_session(session);
  1246. di->lease_gen = atomic_read(&session->s_cap_gen);
  1247. di->lease_seq = le32_to_cpu(lease->seq);
  1248. di->lease_renew_after = half_ttl;
  1249. di->lease_renew_from = 0;
  1250. di->time = ttl;
  1251. __ceph_dentry_lease_touch(di);
  1252. }
  1253. static inline void update_dentry_lease(struct inode *dir, struct dentry *dentry,
  1254. struct ceph_mds_reply_lease *lease,
  1255. struct ceph_mds_session *session,
  1256. unsigned long from_time)
  1257. {
  1258. struct ceph_mds_session *old_lease_session = NULL;
  1259. spin_lock(&dentry->d_lock);
  1260. __update_dentry_lease(dir, dentry, lease, session, from_time,
  1261. &old_lease_session);
  1262. spin_unlock(&dentry->d_lock);
  1263. ceph_put_mds_session(old_lease_session);
  1264. }
  1265. /*
  1266. * update dentry lease without having parent inode locked
  1267. */
  1268. static void update_dentry_lease_careful(struct dentry *dentry,
  1269. struct ceph_mds_reply_lease *lease,
  1270. struct ceph_mds_session *session,
  1271. unsigned long from_time,
  1272. char *dname, u32 dname_len,
  1273. struct ceph_vino *pdvino,
  1274. struct ceph_vino *ptvino)
  1275. {
  1276. struct inode *dir;
  1277. struct ceph_mds_session *old_lease_session = NULL;
  1278. spin_lock(&dentry->d_lock);
  1279. /* make sure dentry's name matches target */
  1280. if (dentry->d_name.len != dname_len ||
  1281. memcmp(dentry->d_name.name, dname, dname_len))
  1282. goto out_unlock;
  1283. dir = d_inode(dentry->d_parent);
  1284. /* make sure parent matches dvino */
  1285. if (!ceph_ino_compare(dir, pdvino))
  1286. goto out_unlock;
  1287. /* make sure dentry's inode matches target. NULL ptvino means that
  1288. * we expect a negative dentry */
  1289. if (ptvino) {
  1290. if (d_really_is_negative(dentry))
  1291. goto out_unlock;
  1292. if (!ceph_ino_compare(d_inode(dentry), ptvino))
  1293. goto out_unlock;
  1294. } else {
  1295. if (d_really_is_positive(dentry))
  1296. goto out_unlock;
  1297. }
  1298. __update_dentry_lease(dir, dentry, lease, session,
  1299. from_time, &old_lease_session);
  1300. out_unlock:
  1301. spin_unlock(&dentry->d_lock);
  1302. ceph_put_mds_session(old_lease_session);
  1303. }
  1304. /*
  1305. * splice a dentry to an inode.
  1306. * caller must hold directory i_rwsem for this to be safe.
  1307. */
  1308. static int splice_dentry(struct dentry **pdn, struct inode *in)
  1309. {
  1310. struct ceph_client *cl = ceph_inode_to_client(in);
  1311. struct dentry *dn = *pdn;
  1312. struct dentry *realdn;
  1313. BUG_ON(d_inode(dn));
  1314. if (S_ISDIR(in->i_mode)) {
  1315. /* If inode is directory, d_splice_alias() below will remove
  1316. * 'realdn' from its origin parent. We need to ensure that
  1317. * origin parent's readdir cache will not reference 'realdn'
  1318. */
  1319. realdn = d_find_any_alias(in);
  1320. if (realdn) {
  1321. struct ceph_dentry_info *di = ceph_dentry(realdn);
  1322. spin_lock(&realdn->d_lock);
  1323. realdn->d_op->d_prune(realdn);
  1324. di->time = jiffies;
  1325. di->lease_shared_gen = 0;
  1326. di->offset = 0;
  1327. spin_unlock(&realdn->d_lock);
  1328. dput(realdn);
  1329. }
  1330. }
  1331. /* dn must be unhashed */
  1332. if (!d_unhashed(dn))
  1333. d_drop(dn);
  1334. realdn = d_splice_alias(in, dn);
  1335. if (IS_ERR(realdn)) {
  1336. pr_err_client(cl, "error %ld %p inode %p ino %llx.%llx\n",
  1337. PTR_ERR(realdn), dn, in, ceph_vinop(in));
  1338. return PTR_ERR(realdn);
  1339. }
  1340. if (realdn) {
  1341. doutc(cl, "dn %p (%d) spliced with %p (%d) inode %p ino %llx.%llx\n",
  1342. dn, d_count(dn), realdn, d_count(realdn),
  1343. d_inode(realdn), ceph_vinop(d_inode(realdn)));
  1344. dput(dn);
  1345. *pdn = realdn;
  1346. } else {
  1347. BUG_ON(!ceph_dentry(dn));
  1348. doutc(cl, "dn %p attached to %p ino %llx.%llx\n", dn,
  1349. d_inode(dn), ceph_vinop(d_inode(dn)));
  1350. }
  1351. return 0;
  1352. }
  1353. /*
  1354. * Incorporate results into the local cache. This is either just
  1355. * one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
  1356. * after a lookup).
  1357. *
  1358. * A reply may contain
  1359. * a directory inode along with a dentry.
  1360. * and/or a target inode
  1361. *
  1362. * Called with snap_rwsem (read).
  1363. */
  1364. int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
  1365. {
  1366. struct ceph_mds_session *session = req->r_session;
  1367. struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
  1368. struct inode *in = NULL;
  1369. struct ceph_vino tvino, dvino;
  1370. struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
  1371. struct ceph_client *cl = fsc->client;
  1372. struct inode *parent_dir = NULL;
  1373. int err = 0;
  1374. doutc(cl, "%p is_dentry %d is_target %d\n", req,
  1375. rinfo->head->is_dentry, rinfo->head->is_target);
  1376. if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
  1377. doutc(cl, "reply is empty!\n");
  1378. if (rinfo->head->result == 0 && req->r_parent)
  1379. ceph_invalidate_dir_request(req);
  1380. return 0;
  1381. }
  1382. if (rinfo->head->is_dentry) {
  1383. /*
  1384. * r_parent may be stale, in cases when R_PARENT_LOCKED is not set,
  1385. * so we need to get the correct inode
  1386. */
  1387. parent_dir = ceph_get_reply_dir(sb, req->r_parent, rinfo);
  1388. if (unlikely(IS_ERR(parent_dir))) {
  1389. err = PTR_ERR(parent_dir);
  1390. goto done;
  1391. }
  1392. if (parent_dir) {
  1393. err = ceph_fill_inode(parent_dir, NULL, &rinfo->diri,
  1394. rinfo->dirfrag, session, -1,
  1395. &req->r_caps_reservation);
  1396. if (err < 0)
  1397. goto done;
  1398. } else {
  1399. WARN_ON_ONCE(1);
  1400. }
  1401. if (parent_dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME &&
  1402. test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
  1403. !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
  1404. bool is_nokey = false;
  1405. struct qstr dname;
  1406. struct dentry *dn, *parent;
  1407. struct fscrypt_str oname = FSTR_INIT(NULL, 0);
  1408. struct ceph_fname fname = { .dir = parent_dir,
  1409. .name = rinfo->dname,
  1410. .ctext = rinfo->altname,
  1411. .name_len = rinfo->dname_len,
  1412. .ctext_len = rinfo->altname_len };
  1413. BUG_ON(!rinfo->head->is_target);
  1414. BUG_ON(req->r_dentry);
  1415. parent = d_find_any_alias(parent_dir);
  1416. BUG_ON(!parent);
  1417. err = ceph_fname_alloc_buffer(parent_dir, &oname);
  1418. if (err < 0) {
  1419. dput(parent);
  1420. goto done;
  1421. }
  1422. err = ceph_fname_to_usr(&fname, NULL, &oname, &is_nokey);
  1423. if (err < 0) {
  1424. dput(parent);
  1425. ceph_fname_free_buffer(parent_dir, &oname);
  1426. goto done;
  1427. }
  1428. dname.name = oname.name;
  1429. dname.len = oname.len;
  1430. dname.hash = full_name_hash(parent, dname.name, dname.len);
  1431. tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
  1432. tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
  1433. retry_lookup:
  1434. dn = d_lookup(parent, &dname);
  1435. doutc(cl, "d_lookup on parent=%p name=%.*s got %p\n",
  1436. parent, dname.len, dname.name, dn);
  1437. if (!dn) {
  1438. dn = d_alloc(parent, &dname);
  1439. doutc(cl, "d_alloc %p '%.*s' = %p\n", parent,
  1440. dname.len, dname.name, dn);
  1441. if (!dn) {
  1442. dput(parent);
  1443. ceph_fname_free_buffer(parent_dir, &oname);
  1444. err = -ENOMEM;
  1445. goto done;
  1446. }
  1447. if (is_nokey) {
  1448. spin_lock(&dn->d_lock);
  1449. dn->d_flags |= DCACHE_NOKEY_NAME;
  1450. spin_unlock(&dn->d_lock);
  1451. }
  1452. err = 0;
  1453. } else if (d_really_is_positive(dn) &&
  1454. (ceph_ino(d_inode(dn)) != tvino.ino ||
  1455. ceph_snap(d_inode(dn)) != tvino.snap)) {
  1456. doutc(cl, " dn %p points to wrong inode %p\n",
  1457. dn, d_inode(dn));
  1458. ceph_dir_clear_ordered(parent_dir);
  1459. d_delete(dn);
  1460. dput(dn);
  1461. goto retry_lookup;
  1462. }
  1463. ceph_fname_free_buffer(parent_dir, &oname);
  1464. req->r_dentry = dn;
  1465. dput(parent);
  1466. }
  1467. }
  1468. if (rinfo->head->is_target) {
  1469. /* Should be filled in by handle_reply */
  1470. BUG_ON(!req->r_target_inode);
  1471. in = req->r_target_inode;
  1472. err = ceph_fill_inode(in, req->r_locked_page, &rinfo->targeti,
  1473. NULL, session,
  1474. (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) &&
  1475. !test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags) &&
  1476. rinfo->head->result == 0) ? req->r_fmode : -1,
  1477. &req->r_caps_reservation);
  1478. if (err < 0) {
  1479. pr_err_client(cl, "badness %p %llx.%llx\n", in,
  1480. ceph_vinop(in));
  1481. req->r_target_inode = NULL;
  1482. if (inode_state_read_once(in) & I_NEW)
  1483. discard_new_inode(in);
  1484. else
  1485. iput(in);
  1486. goto done;
  1487. }
  1488. if (inode_state_read_once(in) & I_NEW)
  1489. unlock_new_inode(in);
  1490. }
  1491. /*
  1492. * ignore null lease/binding on snapdir ENOENT, or else we
  1493. * will have trouble splicing in the virtual snapdir later
  1494. */
  1495. if (rinfo->head->is_dentry &&
  1496. !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) &&
  1497. test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
  1498. (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
  1499. fsc->mount_options->snapdir_name,
  1500. req->r_dentry->d_name.len))) {
  1501. /*
  1502. * lookup link rename : null -> possibly existing inode
  1503. * mknod symlink mkdir : null -> new inode
  1504. * unlink : linked -> null
  1505. */
  1506. struct inode *dir = req->r_parent;
  1507. struct dentry *dn = req->r_dentry;
  1508. bool have_dir_cap, have_lease;
  1509. BUG_ON(!dn);
  1510. BUG_ON(!dir);
  1511. BUG_ON(d_inode(dn->d_parent) != dir);
  1512. dvino.ino = le64_to_cpu(rinfo->diri.in->ino);
  1513. dvino.snap = le64_to_cpu(rinfo->diri.in->snapid);
  1514. BUG_ON(ceph_ino(dir) != dvino.ino);
  1515. BUG_ON(ceph_snap(dir) != dvino.snap);
  1516. /* do we have a lease on the whole dir? */
  1517. have_dir_cap =
  1518. (le32_to_cpu(rinfo->diri.in->cap.caps) &
  1519. CEPH_CAP_FILE_SHARED);
  1520. /* do we have a dn lease? */
  1521. have_lease = have_dir_cap ||
  1522. le32_to_cpu(rinfo->dlease->duration_ms);
  1523. if (!have_lease)
  1524. doutc(cl, "no dentry lease or dir cap\n");
  1525. /* rename? */
  1526. if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) {
  1527. struct inode *olddir = req->r_old_dentry_dir;
  1528. BUG_ON(!olddir);
  1529. doutc(cl, " src %p '%pd' dst %p '%pd'\n",
  1530. req->r_old_dentry, req->r_old_dentry, dn, dn);
  1531. doutc(cl, "doing d_move %p -> %p\n", req->r_old_dentry, dn);
  1532. /* d_move screws up sibling dentries' offsets */
  1533. ceph_dir_clear_ordered(dir);
  1534. ceph_dir_clear_ordered(olddir);
  1535. d_move(req->r_old_dentry, dn);
  1536. doutc(cl, " src %p '%pd' dst %p '%pd'\n",
  1537. req->r_old_dentry, req->r_old_dentry, dn, dn);
  1538. /* ensure target dentry is invalidated, despite
  1539. rehashing bug in vfs_rename_dir */
  1540. ceph_invalidate_dentry_lease(dn);
  1541. doutc(cl, "dn %p gets new offset %lld\n",
  1542. req->r_old_dentry,
  1543. ceph_dentry(req->r_old_dentry)->offset);
  1544. /* swap r_dentry and r_old_dentry in case that
  1545. * splice_dentry() gets called later. This is safe
  1546. * because no other place will use them */
  1547. req->r_dentry = req->r_old_dentry;
  1548. req->r_old_dentry = dn;
  1549. dn = req->r_dentry;
  1550. }
  1551. /* null dentry? */
  1552. if (!rinfo->head->is_target) {
  1553. doutc(cl, "null dentry\n");
  1554. if (d_really_is_positive(dn)) {
  1555. doutc(cl, "d_delete %p\n", dn);
  1556. ceph_dir_clear_ordered(dir);
  1557. d_delete(dn);
  1558. } else if (have_lease) {
  1559. if (d_unhashed(dn))
  1560. d_add(dn, NULL);
  1561. }
  1562. if (!d_unhashed(dn) && have_lease)
  1563. update_dentry_lease(dir, dn,
  1564. rinfo->dlease, session,
  1565. req->r_request_started);
  1566. goto done;
  1567. }
  1568. if (unlikely(!in)) {
  1569. err = -EINVAL;
  1570. goto done;
  1571. }
  1572. /* attach proper inode */
  1573. if (d_really_is_negative(dn)) {
  1574. ceph_dir_clear_ordered(dir);
  1575. ihold(in);
  1576. err = splice_dentry(&req->r_dentry, in);
  1577. if (err < 0)
  1578. goto done;
  1579. dn = req->r_dentry; /* may have spliced */
  1580. } else if (d_really_is_positive(dn) && d_inode(dn) != in) {
  1581. doutc(cl, " %p links to %p %llx.%llx, not %llx.%llx\n",
  1582. dn, d_inode(dn), ceph_vinop(d_inode(dn)),
  1583. ceph_vinop(in));
  1584. d_invalidate(dn);
  1585. have_lease = false;
  1586. }
  1587. if (have_lease) {
  1588. update_dentry_lease(dir, dn,
  1589. rinfo->dlease, session,
  1590. req->r_request_started);
  1591. }
  1592. doutc(cl, " final dn %p\n", dn);
  1593. } else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
  1594. req->r_op == CEPH_MDS_OP_MKSNAP) &&
  1595. test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
  1596. !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
  1597. struct inode *dir = req->r_parent;
  1598. /* fill out a snapdir LOOKUPSNAP dentry */
  1599. BUG_ON(!dir);
  1600. BUG_ON(ceph_snap(dir) != CEPH_SNAPDIR);
  1601. BUG_ON(!req->r_dentry);
  1602. doutc(cl, " linking snapped dir %p to dn %p\n", in,
  1603. req->r_dentry);
  1604. ceph_dir_clear_ordered(dir);
  1605. if (unlikely(!in)) {
  1606. err = -EINVAL;
  1607. goto done;
  1608. }
  1609. ihold(in);
  1610. err = splice_dentry(&req->r_dentry, in);
  1611. if (err < 0)
  1612. goto done;
  1613. } else if (rinfo->head->is_dentry && req->r_dentry) {
  1614. /* parent inode is not locked, be careful */
  1615. struct ceph_vino *ptvino = NULL;
  1616. dvino.ino = le64_to_cpu(rinfo->diri.in->ino);
  1617. dvino.snap = le64_to_cpu(rinfo->diri.in->snapid);
  1618. if (rinfo->head->is_target) {
  1619. tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
  1620. tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
  1621. ptvino = &tvino;
  1622. }
  1623. update_dentry_lease_careful(req->r_dentry, rinfo->dlease,
  1624. session, req->r_request_started,
  1625. rinfo->dname, rinfo->dname_len,
  1626. &dvino, ptvino);
  1627. }
  1628. done:
  1629. /* Drop extra ref from ceph_get_reply_dir() if it returned a new inode */
  1630. if (unlikely(!IS_ERR_OR_NULL(parent_dir) && parent_dir != req->r_parent))
  1631. iput(parent_dir);
  1632. doutc(cl, "done err=%d\n", err);
  1633. return err;
  1634. }
  1635. /*
  1636. * Prepopulate our cache with readdir results, leases, etc.
  1637. */
  1638. static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
  1639. struct ceph_mds_session *session)
  1640. {
  1641. struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
  1642. struct ceph_client *cl = session->s_mdsc->fsc->client;
  1643. int i, err = 0;
  1644. for (i = 0; i < rinfo->dir_nr; i++) {
  1645. struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
  1646. struct ceph_vino vino;
  1647. struct inode *in;
  1648. int rc;
  1649. vino.ino = le64_to_cpu(rde->inode.in->ino);
  1650. vino.snap = le64_to_cpu(rde->inode.in->snapid);
  1651. in = ceph_get_inode(req->r_dentry->d_sb, vino, NULL);
  1652. if (IS_ERR(in)) {
  1653. err = PTR_ERR(in);
  1654. doutc(cl, "badness got %d\n", err);
  1655. continue;
  1656. }
  1657. rc = ceph_fill_inode(in, NULL, &rde->inode, NULL, session,
  1658. -1, &req->r_caps_reservation);
  1659. if (rc < 0) {
  1660. pr_err_client(cl, "inode badness on %p got %d\n", in,
  1661. rc);
  1662. err = rc;
  1663. if (inode_state_read_once(in) & I_NEW) {
  1664. ihold(in);
  1665. discard_new_inode(in);
  1666. }
  1667. } else if (inode_state_read_once(in) & I_NEW) {
  1668. unlock_new_inode(in);
  1669. }
  1670. iput(in);
  1671. }
  1672. return err;
  1673. }
  1674. void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl)
  1675. {
  1676. if (ctl->folio) {
  1677. folio_release_kmap(ctl->folio, ctl->dentries);
  1678. ctl->folio = NULL;
  1679. }
  1680. }
  1681. static int fill_readdir_cache(struct inode *dir, struct dentry *dn,
  1682. struct ceph_readdir_cache_control *ctl,
  1683. struct ceph_mds_request *req)
  1684. {
  1685. struct ceph_client *cl = ceph_inode_to_client(dir);
  1686. struct ceph_inode_info *ci = ceph_inode(dir);
  1687. unsigned nsize = PAGE_SIZE / sizeof(struct dentry*);
  1688. unsigned idx = ctl->index % nsize;
  1689. pgoff_t pgoff = ctl->index / nsize;
  1690. if (!ctl->folio || pgoff != ctl->folio->index) {
  1691. ceph_readdir_cache_release(ctl);
  1692. fgf_t fgf = FGP_LOCK;
  1693. if (idx == 0)
  1694. fgf |= FGP_ACCESSED | FGP_CREAT;
  1695. ctl->folio = __filemap_get_folio(&dir->i_data, pgoff,
  1696. fgf, mapping_gfp_mask(&dir->i_data));
  1697. if (IS_ERR(ctl->folio)) {
  1698. int err = PTR_ERR(ctl->folio);
  1699. ctl->folio = NULL;
  1700. ctl->index = -1;
  1701. return idx == 0 ? err : 0;
  1702. }
  1703. /* reading/filling the cache are serialized by
  1704. * i_rwsem, no need to use folio lock */
  1705. folio_unlock(ctl->folio);
  1706. ctl->dentries = kmap_local_folio(ctl->folio, 0);
  1707. if (idx == 0)
  1708. memset(ctl->dentries, 0, PAGE_SIZE);
  1709. }
  1710. if (req->r_dir_release_cnt == atomic64_read(&ci->i_release_count) &&
  1711. req->r_dir_ordered_cnt == atomic64_read(&ci->i_ordered_count)) {
  1712. doutc(cl, "dn %p idx %d\n", dn, ctl->index);
  1713. ctl->dentries[idx] = dn;
  1714. ctl->index++;
  1715. } else {
  1716. doutc(cl, "disable readdir cache\n");
  1717. ctl->index = -1;
  1718. }
  1719. return 0;
  1720. }
  1721. int ceph_readdir_prepopulate(struct ceph_mds_request *req,
  1722. struct ceph_mds_session *session)
  1723. {
  1724. struct dentry *parent = req->r_dentry;
  1725. struct inode *inode = d_inode(parent);
  1726. struct ceph_inode_info *ci = ceph_inode(inode);
  1727. struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
  1728. struct ceph_client *cl = session->s_mdsc->fsc->client;
  1729. struct qstr dname;
  1730. struct dentry *dn;
  1731. struct inode *in;
  1732. int err = 0, skipped = 0, ret, i;
  1733. u32 frag = le32_to_cpu(req->r_args.readdir.frag);
  1734. u32 last_hash = 0;
  1735. u32 fpos_offset;
  1736. struct ceph_readdir_cache_control cache_ctl = {};
  1737. if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags))
  1738. return readdir_prepopulate_inodes_only(req, session);
  1739. if (rinfo->hash_order) {
  1740. if (req->r_path2) {
  1741. last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
  1742. req->r_path2,
  1743. strlen(req->r_path2));
  1744. last_hash = ceph_frag_value(last_hash);
  1745. } else if (rinfo->offset_hash) {
  1746. /* mds understands offset_hash */
  1747. WARN_ON_ONCE(req->r_readdir_offset != 2);
  1748. last_hash = le32_to_cpu(req->r_args.readdir.offset_hash);
  1749. }
  1750. }
  1751. if (rinfo->dir_dir &&
  1752. le32_to_cpu(rinfo->dir_dir->frag) != frag) {
  1753. doutc(cl, "got new frag %x -> %x\n", frag,
  1754. le32_to_cpu(rinfo->dir_dir->frag));
  1755. frag = le32_to_cpu(rinfo->dir_dir->frag);
  1756. if (!rinfo->hash_order)
  1757. req->r_readdir_offset = 2;
  1758. }
  1759. if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
  1760. doutc(cl, "%d items under SNAPDIR dn %p\n",
  1761. rinfo->dir_nr, parent);
  1762. } else {
  1763. doutc(cl, "%d items under dn %p\n", rinfo->dir_nr, parent);
  1764. if (rinfo->dir_dir)
  1765. ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir);
  1766. if (ceph_frag_is_leftmost(frag) &&
  1767. req->r_readdir_offset == 2 &&
  1768. !(rinfo->hash_order && last_hash)) {
  1769. /* note dir version at start of readdir so we can
  1770. * tell if any dentries get dropped */
  1771. req->r_dir_release_cnt =
  1772. atomic64_read(&ci->i_release_count);
  1773. req->r_dir_ordered_cnt =
  1774. atomic64_read(&ci->i_ordered_count);
  1775. req->r_readdir_cache_idx = 0;
  1776. }
  1777. }
  1778. cache_ctl.index = req->r_readdir_cache_idx;
  1779. fpos_offset = req->r_readdir_offset;
  1780. /* FIXME: release caps/leases if error occurs */
  1781. for (i = 0; i < rinfo->dir_nr; i++) {
  1782. struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
  1783. struct ceph_vino tvino;
  1784. dname.name = rde->name;
  1785. dname.len = rde->name_len;
  1786. dname.hash = full_name_hash(parent, dname.name, dname.len);
  1787. tvino.ino = le64_to_cpu(rde->inode.in->ino);
  1788. tvino.snap = le64_to_cpu(rde->inode.in->snapid);
  1789. if (rinfo->hash_order) {
  1790. u32 hash = ceph_frag_value(rde->raw_hash);
  1791. if (hash != last_hash)
  1792. fpos_offset = 2;
  1793. last_hash = hash;
  1794. rde->offset = ceph_make_fpos(hash, fpos_offset++, true);
  1795. } else {
  1796. rde->offset = ceph_make_fpos(frag, fpos_offset++, false);
  1797. }
  1798. retry_lookup:
  1799. dn = d_lookup(parent, &dname);
  1800. doutc(cl, "d_lookup on parent=%p name=%.*s got %p\n",
  1801. parent, dname.len, dname.name, dn);
  1802. if (!dn) {
  1803. dn = d_alloc(parent, &dname);
  1804. doutc(cl, "d_alloc %p '%.*s' = %p\n", parent,
  1805. dname.len, dname.name, dn);
  1806. if (!dn) {
  1807. doutc(cl, "d_alloc badness\n");
  1808. err = -ENOMEM;
  1809. goto out;
  1810. }
  1811. if (rde->is_nokey) {
  1812. spin_lock(&dn->d_lock);
  1813. dn->d_flags |= DCACHE_NOKEY_NAME;
  1814. spin_unlock(&dn->d_lock);
  1815. }
  1816. } else if (d_really_is_positive(dn) &&
  1817. (ceph_ino(d_inode(dn)) != tvino.ino ||
  1818. ceph_snap(d_inode(dn)) != tvino.snap)) {
  1819. struct ceph_dentry_info *di = ceph_dentry(dn);
  1820. doutc(cl, " dn %p points to wrong inode %p\n",
  1821. dn, d_inode(dn));
  1822. spin_lock(&dn->d_lock);
  1823. if (di->offset > 0 &&
  1824. di->lease_shared_gen ==
  1825. atomic_read(&ci->i_shared_gen)) {
  1826. __ceph_dir_clear_ordered(ci);
  1827. di->offset = 0;
  1828. }
  1829. spin_unlock(&dn->d_lock);
  1830. d_delete(dn);
  1831. dput(dn);
  1832. goto retry_lookup;
  1833. }
  1834. /* inode */
  1835. if (d_really_is_positive(dn)) {
  1836. in = d_inode(dn);
  1837. } else {
  1838. in = ceph_get_inode(parent->d_sb, tvino, NULL);
  1839. if (IS_ERR(in)) {
  1840. doutc(cl, "new_inode badness\n");
  1841. d_drop(dn);
  1842. dput(dn);
  1843. err = PTR_ERR(in);
  1844. goto out;
  1845. }
  1846. }
  1847. ret = ceph_fill_inode(in, NULL, &rde->inode, NULL, session,
  1848. -1, &req->r_caps_reservation);
  1849. if (ret < 0) {
  1850. pr_err_client(cl, "badness on %p %llx.%llx\n", in,
  1851. ceph_vinop(in));
  1852. if (d_really_is_negative(dn)) {
  1853. if (inode_state_read_once(in) & I_NEW) {
  1854. ihold(in);
  1855. discard_new_inode(in);
  1856. }
  1857. iput(in);
  1858. }
  1859. d_drop(dn);
  1860. err = ret;
  1861. goto next_item;
  1862. }
  1863. if (inode_state_read_once(in) & I_NEW)
  1864. unlock_new_inode(in);
  1865. if (d_really_is_negative(dn)) {
  1866. if (ceph_security_xattr_deadlock(in)) {
  1867. doutc(cl, " skip splicing dn %p to inode %p"
  1868. " (security xattr deadlock)\n", dn, in);
  1869. iput(in);
  1870. skipped++;
  1871. goto next_item;
  1872. }
  1873. err = splice_dentry(&dn, in);
  1874. if (err < 0)
  1875. goto next_item;
  1876. }
  1877. ceph_dentry(dn)->offset = rde->offset;
  1878. update_dentry_lease(d_inode(parent), dn,
  1879. rde->lease, req->r_session,
  1880. req->r_request_started);
  1881. if (err == 0 && skipped == 0 && cache_ctl.index >= 0) {
  1882. ret = fill_readdir_cache(d_inode(parent), dn,
  1883. &cache_ctl, req);
  1884. if (ret < 0)
  1885. err = ret;
  1886. }
  1887. next_item:
  1888. dput(dn);
  1889. }
  1890. out:
  1891. if (err == 0 && skipped == 0) {
  1892. set_bit(CEPH_MDS_R_DID_PREPOPULATE, &req->r_req_flags);
  1893. req->r_readdir_cache_idx = cache_ctl.index;
  1894. }
  1895. ceph_readdir_cache_release(&cache_ctl);
  1896. doutc(cl, "done\n");
  1897. return err;
  1898. }
  1899. bool ceph_inode_set_size(struct inode *inode, loff_t size)
  1900. {
  1901. struct ceph_client *cl = ceph_inode_to_client(inode);
  1902. struct ceph_inode_info *ci = ceph_inode(inode);
  1903. bool ret;
  1904. spin_lock(&ci->i_ceph_lock);
  1905. doutc(cl, "set_size %p %llu -> %llu\n", inode, i_size_read(inode), size);
  1906. i_size_write(inode, size);
  1907. ceph_fscache_update(inode);
  1908. inode->i_blocks = calc_inode_blocks(size);
  1909. ret = __ceph_should_report_size(ci);
  1910. spin_unlock(&ci->i_ceph_lock);
  1911. return ret;
  1912. }
  1913. void ceph_queue_inode_work(struct inode *inode, int work_bit)
  1914. {
  1915. struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
  1916. struct ceph_client *cl = fsc->client;
  1917. struct ceph_inode_info *ci = ceph_inode(inode);
  1918. set_bit(work_bit, &ci->i_work_mask);
  1919. ihold(inode);
  1920. if (queue_work(fsc->inode_wq, &ci->i_work)) {
  1921. doutc(cl, "%p %llx.%llx mask=%lx\n", inode,
  1922. ceph_vinop(inode), ci->i_work_mask);
  1923. } else {
  1924. doutc(cl, "%p %llx.%llx already queued, mask=%lx\n",
  1925. inode, ceph_vinop(inode), ci->i_work_mask);
  1926. iput(inode);
  1927. }
  1928. }
  1929. static void ceph_do_invalidate_pages(struct inode *inode)
  1930. {
  1931. struct ceph_client *cl = ceph_inode_to_client(inode);
  1932. struct ceph_inode_info *ci = ceph_inode(inode);
  1933. u32 orig_gen;
  1934. int check = 0;
  1935. ceph_fscache_invalidate(inode, false);
  1936. mutex_lock(&ci->i_truncate_mutex);
  1937. if (ceph_inode_is_shutdown(inode)) {
  1938. pr_warn_ratelimited_client(cl,
  1939. "%p %llx.%llx is shut down\n", inode,
  1940. ceph_vinop(inode));
  1941. mapping_set_error(inode->i_mapping, -EIO);
  1942. truncate_pagecache(inode, 0);
  1943. mutex_unlock(&ci->i_truncate_mutex);
  1944. goto out;
  1945. }
  1946. spin_lock(&ci->i_ceph_lock);
  1947. doutc(cl, "%p %llx.%llx gen %d revoking %d\n", inode,
  1948. ceph_vinop(inode), ci->i_rdcache_gen, ci->i_rdcache_revoking);
  1949. if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
  1950. if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE))
  1951. check = 1;
  1952. spin_unlock(&ci->i_ceph_lock);
  1953. mutex_unlock(&ci->i_truncate_mutex);
  1954. goto out;
  1955. }
  1956. orig_gen = ci->i_rdcache_gen;
  1957. spin_unlock(&ci->i_ceph_lock);
  1958. if (invalidate_inode_pages2(inode->i_mapping) < 0) {
  1959. pr_err_client(cl, "invalidate_inode_pages2 %llx.%llx failed\n",
  1960. ceph_vinop(inode));
  1961. }
  1962. spin_lock(&ci->i_ceph_lock);
  1963. if (orig_gen == ci->i_rdcache_gen &&
  1964. orig_gen == ci->i_rdcache_revoking) {
  1965. doutc(cl, "%p %llx.%llx gen %d successful\n", inode,
  1966. ceph_vinop(inode), ci->i_rdcache_gen);
  1967. ci->i_rdcache_revoking--;
  1968. check = 1;
  1969. } else {
  1970. doutc(cl, "%p %llx.%llx gen %d raced, now %d revoking %d\n",
  1971. inode, ceph_vinop(inode), orig_gen, ci->i_rdcache_gen,
  1972. ci->i_rdcache_revoking);
  1973. if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE))
  1974. check = 1;
  1975. }
  1976. spin_unlock(&ci->i_ceph_lock);
  1977. mutex_unlock(&ci->i_truncate_mutex);
  1978. out:
  1979. if (check)
  1980. ceph_check_caps(ci, 0);
  1981. }
  1982. /*
  1983. * Make sure any pending truncation is applied before doing anything
  1984. * that may depend on it.
  1985. */
  1986. void __ceph_do_pending_vmtruncate(struct inode *inode)
  1987. {
  1988. struct ceph_client *cl = ceph_inode_to_client(inode);
  1989. struct ceph_inode_info *ci = ceph_inode(inode);
  1990. u64 to;
  1991. int wrbuffer_refs, finish = 0;
  1992. mutex_lock(&ci->i_truncate_mutex);
  1993. retry:
  1994. spin_lock(&ci->i_ceph_lock);
  1995. if (ci->i_truncate_pending == 0) {
  1996. doutc(cl, "%p %llx.%llx none pending\n", inode,
  1997. ceph_vinop(inode));
  1998. spin_unlock(&ci->i_ceph_lock);
  1999. mutex_unlock(&ci->i_truncate_mutex);
  2000. return;
  2001. }
  2002. /*
  2003. * make sure any dirty snapped pages are flushed before we
  2004. * possibly truncate them.. so write AND block!
  2005. */
  2006. if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) {
  2007. spin_unlock(&ci->i_ceph_lock);
  2008. doutc(cl, "%p %llx.%llx flushing snaps first\n", inode,
  2009. ceph_vinop(inode));
  2010. filemap_write_and_wait_range(&inode->i_data, 0,
  2011. inode->i_sb->s_maxbytes);
  2012. goto retry;
  2013. }
  2014. /* there should be no reader or writer */
  2015. WARN_ON_ONCE(ci->i_rd_ref || ci->i_wr_ref);
  2016. to = ci->i_truncate_pagecache_size;
  2017. wrbuffer_refs = ci->i_wrbuffer_ref;
  2018. doutc(cl, "%p %llx.%llx (%d) to %lld\n", inode, ceph_vinop(inode),
  2019. ci->i_truncate_pending, to);
  2020. spin_unlock(&ci->i_ceph_lock);
  2021. ceph_fscache_resize(inode, to);
  2022. truncate_pagecache(inode, to);
  2023. spin_lock(&ci->i_ceph_lock);
  2024. if (to == ci->i_truncate_pagecache_size) {
  2025. ci->i_truncate_pending = 0;
  2026. finish = 1;
  2027. }
  2028. spin_unlock(&ci->i_ceph_lock);
  2029. if (!finish)
  2030. goto retry;
  2031. mutex_unlock(&ci->i_truncate_mutex);
  2032. if (wrbuffer_refs == 0)
  2033. ceph_check_caps(ci, 0);
  2034. wake_up_all(&ci->i_cap_wq);
  2035. }
  2036. static void ceph_inode_work(struct work_struct *work)
  2037. {
  2038. struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
  2039. i_work);
  2040. struct inode *inode = &ci->netfs.inode;
  2041. struct ceph_client *cl = ceph_inode_to_client(inode);
  2042. if (test_and_clear_bit(CEPH_I_WORK_WRITEBACK, &ci->i_work_mask)) {
  2043. doutc(cl, "writeback %p %llx.%llx\n", inode, ceph_vinop(inode));
  2044. filemap_fdatawrite(&inode->i_data);
  2045. }
  2046. if (test_and_clear_bit(CEPH_I_WORK_INVALIDATE_PAGES, &ci->i_work_mask))
  2047. ceph_do_invalidate_pages(inode);
  2048. if (test_and_clear_bit(CEPH_I_WORK_VMTRUNCATE, &ci->i_work_mask))
  2049. __ceph_do_pending_vmtruncate(inode);
  2050. if (test_and_clear_bit(CEPH_I_WORK_CHECK_CAPS, &ci->i_work_mask))
  2051. ceph_check_caps(ci, 0);
  2052. if (test_and_clear_bit(CEPH_I_WORK_FLUSH_SNAPS, &ci->i_work_mask))
  2053. ceph_flush_snaps(ci, NULL);
  2054. iput(inode);
  2055. }
  2056. static const char *ceph_encrypted_get_link(struct dentry *dentry,
  2057. struct inode *inode,
  2058. struct delayed_call *done)
  2059. {
  2060. struct ceph_inode_info *ci = ceph_inode(inode);
  2061. if (!dentry)
  2062. return ERR_PTR(-ECHILD);
  2063. return fscrypt_get_symlink(inode, ci->i_symlink, i_size_read(inode),
  2064. done);
  2065. }
  2066. static int ceph_encrypted_symlink_getattr(struct mnt_idmap *idmap,
  2067. const struct path *path,
  2068. struct kstat *stat, u32 request_mask,
  2069. unsigned int query_flags)
  2070. {
  2071. int ret;
  2072. ret = ceph_getattr(idmap, path, stat, request_mask, query_flags);
  2073. if (ret)
  2074. return ret;
  2075. return fscrypt_symlink_getattr(path, stat);
  2076. }
  2077. /*
  2078. * symlinks
  2079. */
  2080. static const struct inode_operations ceph_symlink_iops = {
  2081. .get_link = simple_get_link,
  2082. .setattr = ceph_setattr,
  2083. .getattr = ceph_getattr,
  2084. .listxattr = ceph_listxattr,
  2085. };
  2086. static const struct inode_operations ceph_encrypted_symlink_iops = {
  2087. .get_link = ceph_encrypted_get_link,
  2088. .setattr = ceph_setattr,
  2089. .getattr = ceph_encrypted_symlink_getattr,
  2090. .listxattr = ceph_listxattr,
  2091. };
  2092. /*
  2093. * Transfer the encrypted last block to the MDS and the MDS
  2094. * will help update it when truncating a smaller size.
  2095. *
  2096. * We don't support a PAGE_SIZE that is smaller than the
  2097. * CEPH_FSCRYPT_BLOCK_SIZE.
  2098. */
  2099. static int fill_fscrypt_truncate(struct inode *inode,
  2100. struct ceph_mds_request *req,
  2101. struct iattr *attr)
  2102. {
  2103. struct ceph_client *cl = ceph_inode_to_client(inode);
  2104. struct ceph_inode_info *ci = ceph_inode(inode);
  2105. int boff = attr->ia_size % CEPH_FSCRYPT_BLOCK_SIZE;
  2106. loff_t pos, orig_pos = round_down(attr->ia_size,
  2107. CEPH_FSCRYPT_BLOCK_SIZE);
  2108. u64 block = orig_pos >> CEPH_FSCRYPT_BLOCK_SHIFT;
  2109. struct ceph_pagelist *pagelist = NULL;
  2110. struct kvec iov = {0};
  2111. struct iov_iter iter;
  2112. struct page *page = NULL;
  2113. struct ceph_fscrypt_truncate_size_header header;
  2114. int retry_op = 0;
  2115. int len = CEPH_FSCRYPT_BLOCK_SIZE;
  2116. loff_t i_size = i_size_read(inode);
  2117. int got, ret, issued;
  2118. u64 objver;
  2119. ret = __ceph_get_caps(inode, NULL, CEPH_CAP_FILE_RD, 0, -1, &got);
  2120. if (ret < 0)
  2121. return ret;
  2122. issued = __ceph_caps_issued(ci, NULL);
  2123. doutc(cl, "size %lld -> %lld got cap refs on %s, issued %s\n",
  2124. i_size, attr->ia_size, ceph_cap_string(got),
  2125. ceph_cap_string(issued));
  2126. /* Try to writeback the dirty pagecaches */
  2127. if (issued & (CEPH_CAP_FILE_BUFFER)) {
  2128. loff_t lend = orig_pos + CEPH_FSCRYPT_BLOCK_SIZE - 1;
  2129. ret = filemap_write_and_wait_range(inode->i_mapping,
  2130. orig_pos, lend);
  2131. if (ret < 0)
  2132. goto out;
  2133. }
  2134. page = __page_cache_alloc(GFP_KERNEL);
  2135. if (page == NULL) {
  2136. ret = -ENOMEM;
  2137. goto out;
  2138. }
  2139. pagelist = ceph_pagelist_alloc(GFP_KERNEL);
  2140. if (!pagelist) {
  2141. ret = -ENOMEM;
  2142. goto out;
  2143. }
  2144. iov.iov_base = kmap_local_page(page);
  2145. iov.iov_len = len;
  2146. iov_iter_kvec(&iter, READ, &iov, 1, len);
  2147. pos = orig_pos;
  2148. ret = __ceph_sync_read(inode, &pos, &iter, &retry_op, &objver);
  2149. if (ret < 0)
  2150. goto out;
  2151. /* Insert the header first */
  2152. header.ver = 1;
  2153. header.compat = 1;
  2154. header.change_attr = cpu_to_le64(inode_peek_iversion_raw(inode));
  2155. /*
  2156. * Always set the block_size to CEPH_FSCRYPT_BLOCK_SIZE,
  2157. * because in MDS it may need this to do the truncate.
  2158. */
  2159. header.block_size = cpu_to_le32(CEPH_FSCRYPT_BLOCK_SIZE);
  2160. /*
  2161. * If we hit a hole here, we should just skip filling
  2162. * the fscrypt for the request, because once the fscrypt
  2163. * is enabled, the file will be split into many blocks
  2164. * with the size of CEPH_FSCRYPT_BLOCK_SIZE, if there
  2165. * has a hole, the hole size should be multiple of block
  2166. * size.
  2167. *
  2168. * If the Rados object doesn't exist, it will be set to 0.
  2169. */
  2170. if (!objver) {
  2171. doutc(cl, "hit hole, ppos %lld < size %lld\n", pos, i_size);
  2172. header.data_len = cpu_to_le32(8 + 8 + 4);
  2173. header.file_offset = 0;
  2174. ret = 0;
  2175. } else {
  2176. header.data_len = cpu_to_le32(8 + 8 + 4 + CEPH_FSCRYPT_BLOCK_SIZE);
  2177. header.file_offset = cpu_to_le64(orig_pos);
  2178. doutc(cl, "encrypt block boff/bsize %d/%lu\n", boff,
  2179. CEPH_FSCRYPT_BLOCK_SIZE);
  2180. /* truncate and zero out the extra contents for the last block */
  2181. memset(iov.iov_base + boff, 0, PAGE_SIZE - boff);
  2182. /* encrypt the last block */
  2183. ret = ceph_fscrypt_encrypt_block_inplace(inode, page,
  2184. CEPH_FSCRYPT_BLOCK_SIZE,
  2185. 0, block);
  2186. if (ret)
  2187. goto out;
  2188. }
  2189. /* Insert the header */
  2190. ret = ceph_pagelist_append(pagelist, &header, sizeof(header));
  2191. if (ret)
  2192. goto out;
  2193. if (header.block_size) {
  2194. /* Append the last block contents to pagelist */
  2195. ret = ceph_pagelist_append(pagelist, iov.iov_base,
  2196. CEPH_FSCRYPT_BLOCK_SIZE);
  2197. if (ret)
  2198. goto out;
  2199. }
  2200. req->r_pagelist = pagelist;
  2201. out:
  2202. doutc(cl, "%p %llx.%llx size dropping cap refs on %s\n", inode,
  2203. ceph_vinop(inode), ceph_cap_string(got));
  2204. ceph_put_cap_refs(ci, got);
  2205. if (iov.iov_base)
  2206. kunmap_local(iov.iov_base);
  2207. if (page)
  2208. __free_pages(page, 0);
  2209. if (ret && pagelist)
  2210. ceph_pagelist_release(pagelist);
  2211. return ret;
  2212. }
  2213. int __ceph_setattr(struct mnt_idmap *idmap, struct inode *inode,
  2214. struct iattr *attr, struct ceph_iattr *cia)
  2215. {
  2216. struct ceph_inode_info *ci = ceph_inode(inode);
  2217. unsigned int ia_valid = attr->ia_valid;
  2218. struct ceph_mds_request *req;
  2219. struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc;
  2220. struct ceph_client *cl = ceph_inode_to_client(inode);
  2221. struct ceph_cap_flush *prealloc_cf;
  2222. loff_t isize = i_size_read(inode);
  2223. int issued;
  2224. int release = 0, dirtied = 0;
  2225. int mask = 0;
  2226. int err = 0;
  2227. int inode_dirty_flags = 0;
  2228. bool lock_snap_rwsem = false;
  2229. bool fill_fscrypt;
  2230. int truncate_retry = 20; /* The RMW will take around 50ms */
  2231. struct dentry *dentry;
  2232. char *path;
  2233. bool do_sync = false;
  2234. dentry = d_find_alias(inode);
  2235. if (!dentry) {
  2236. do_sync = true;
  2237. } else {
  2238. struct ceph_path_info path_info = {0};
  2239. path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0);
  2240. if (IS_ERR(path)) {
  2241. do_sync = true;
  2242. err = 0;
  2243. } else {
  2244. err = ceph_mds_check_access(mdsc, path, MAY_WRITE);
  2245. }
  2246. ceph_mdsc_free_path_info(&path_info);
  2247. dput(dentry);
  2248. /* For none EACCES cases will let the MDS do the mds auth check */
  2249. if (err == -EACCES) {
  2250. return err;
  2251. } else if (err < 0) {
  2252. do_sync = true;
  2253. err = 0;
  2254. }
  2255. }
  2256. retry:
  2257. prealloc_cf = ceph_alloc_cap_flush();
  2258. if (!prealloc_cf)
  2259. return -ENOMEM;
  2260. req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR,
  2261. USE_AUTH_MDS);
  2262. if (IS_ERR(req)) {
  2263. ceph_free_cap_flush(prealloc_cf);
  2264. return PTR_ERR(req);
  2265. }
  2266. fill_fscrypt = false;
  2267. spin_lock(&ci->i_ceph_lock);
  2268. issued = __ceph_caps_issued(ci, NULL);
  2269. if (!ci->i_head_snapc &&
  2270. (issued & (CEPH_CAP_ANY_EXCL | CEPH_CAP_FILE_WR))) {
  2271. lock_snap_rwsem = true;
  2272. if (!down_read_trylock(&mdsc->snap_rwsem)) {
  2273. spin_unlock(&ci->i_ceph_lock);
  2274. down_read(&mdsc->snap_rwsem);
  2275. spin_lock(&ci->i_ceph_lock);
  2276. issued = __ceph_caps_issued(ci, NULL);
  2277. }
  2278. }
  2279. doutc(cl, "%p %llx.%llx issued %s\n", inode, ceph_vinop(inode),
  2280. ceph_cap_string(issued));
  2281. #if IS_ENABLED(CONFIG_FS_ENCRYPTION)
  2282. if (cia && cia->fscrypt_auth) {
  2283. u32 len = ceph_fscrypt_auth_len(cia->fscrypt_auth);
  2284. if (len > sizeof(*cia->fscrypt_auth)) {
  2285. err = -EINVAL;
  2286. spin_unlock(&ci->i_ceph_lock);
  2287. goto out;
  2288. }
  2289. doutc(cl, "%p %llx.%llx fscrypt_auth len %u to %u)\n", inode,
  2290. ceph_vinop(inode), ci->fscrypt_auth_len, len);
  2291. /* It should never be re-set once set */
  2292. WARN_ON_ONCE(ci->fscrypt_auth);
  2293. if (!do_sync && (issued & CEPH_CAP_AUTH_EXCL)) {
  2294. dirtied |= CEPH_CAP_AUTH_EXCL;
  2295. kfree(ci->fscrypt_auth);
  2296. ci->fscrypt_auth = (u8 *)cia->fscrypt_auth;
  2297. ci->fscrypt_auth_len = len;
  2298. } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
  2299. ci->fscrypt_auth_len != len ||
  2300. memcmp(ci->fscrypt_auth, cia->fscrypt_auth, len)) {
  2301. req->r_fscrypt_auth = cia->fscrypt_auth;
  2302. mask |= CEPH_SETATTR_FSCRYPT_AUTH;
  2303. release |= CEPH_CAP_AUTH_SHARED;
  2304. }
  2305. cia->fscrypt_auth = NULL;
  2306. }
  2307. #else
  2308. if (cia && cia->fscrypt_auth) {
  2309. err = -EINVAL;
  2310. spin_unlock(&ci->i_ceph_lock);
  2311. goto out;
  2312. }
  2313. #endif /* CONFIG_FS_ENCRYPTION */
  2314. if (ia_valid & ATTR_UID) {
  2315. kuid_t fsuid = from_vfsuid(idmap, i_user_ns(inode), attr->ia_vfsuid);
  2316. doutc(cl, "%p %llx.%llx uid %d -> %d\n", inode,
  2317. ceph_vinop(inode),
  2318. from_kuid(&init_user_ns, inode->i_uid),
  2319. from_kuid(&init_user_ns, attr->ia_uid));
  2320. if (!do_sync && (issued & CEPH_CAP_AUTH_EXCL)) {
  2321. inode->i_uid = fsuid;
  2322. dirtied |= CEPH_CAP_AUTH_EXCL;
  2323. } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
  2324. !uid_eq(fsuid, inode->i_uid)) {
  2325. req->r_args.setattr.uid = cpu_to_le32(
  2326. from_kuid(&init_user_ns, fsuid));
  2327. mask |= CEPH_SETATTR_UID;
  2328. release |= CEPH_CAP_AUTH_SHARED;
  2329. }
  2330. }
  2331. if (ia_valid & ATTR_GID) {
  2332. kgid_t fsgid = from_vfsgid(idmap, i_user_ns(inode), attr->ia_vfsgid);
  2333. doutc(cl, "%p %llx.%llx gid %d -> %d\n", inode,
  2334. ceph_vinop(inode),
  2335. from_kgid(&init_user_ns, inode->i_gid),
  2336. from_kgid(&init_user_ns, attr->ia_gid));
  2337. if (!do_sync && (issued & CEPH_CAP_AUTH_EXCL)) {
  2338. inode->i_gid = fsgid;
  2339. dirtied |= CEPH_CAP_AUTH_EXCL;
  2340. } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
  2341. !gid_eq(fsgid, inode->i_gid)) {
  2342. req->r_args.setattr.gid = cpu_to_le32(
  2343. from_kgid(&init_user_ns, fsgid));
  2344. mask |= CEPH_SETATTR_GID;
  2345. release |= CEPH_CAP_AUTH_SHARED;
  2346. }
  2347. }
  2348. if (ia_valid & ATTR_MODE) {
  2349. doutc(cl, "%p %llx.%llx mode 0%o -> 0%o\n", inode,
  2350. ceph_vinop(inode), inode->i_mode, attr->ia_mode);
  2351. if (!do_sync && (issued & CEPH_CAP_AUTH_EXCL)) {
  2352. inode->i_mode = attr->ia_mode;
  2353. dirtied |= CEPH_CAP_AUTH_EXCL;
  2354. } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
  2355. attr->ia_mode != inode->i_mode) {
  2356. inode->i_mode = attr->ia_mode;
  2357. req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode);
  2358. mask |= CEPH_SETATTR_MODE;
  2359. release |= CEPH_CAP_AUTH_SHARED;
  2360. }
  2361. }
  2362. if (ia_valid & ATTR_ATIME) {
  2363. struct timespec64 atime = inode_get_atime(inode);
  2364. doutc(cl, "%p %llx.%llx atime %ptSp -> %ptSp\n",
  2365. inode, ceph_vinop(inode), &atime, &attr->ia_atime);
  2366. if (!do_sync && (issued & CEPH_CAP_FILE_EXCL)) {
  2367. ci->i_time_warp_seq++;
  2368. inode_set_atime_to_ts(inode, attr->ia_atime);
  2369. dirtied |= CEPH_CAP_FILE_EXCL;
  2370. } else if (!do_sync && (issued & CEPH_CAP_FILE_WR) &&
  2371. timespec64_compare(&atime,
  2372. &attr->ia_atime) < 0) {
  2373. inode_set_atime_to_ts(inode, attr->ia_atime);
  2374. dirtied |= CEPH_CAP_FILE_WR;
  2375. } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
  2376. !timespec64_equal(&atime, &attr->ia_atime)) {
  2377. ceph_encode_timespec64(&req->r_args.setattr.atime,
  2378. &attr->ia_atime);
  2379. mask |= CEPH_SETATTR_ATIME;
  2380. release |= CEPH_CAP_FILE_SHARED |
  2381. CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
  2382. }
  2383. }
  2384. if (ia_valid & ATTR_SIZE) {
  2385. doutc(cl, "%p %llx.%llx size %lld -> %lld\n", inode,
  2386. ceph_vinop(inode), isize, attr->ia_size);
  2387. /*
  2388. * Only when the new size is smaller and not aligned to
  2389. * CEPH_FSCRYPT_BLOCK_SIZE will the RMW is needed.
  2390. */
  2391. if (IS_ENCRYPTED(inode) && attr->ia_size < isize &&
  2392. (attr->ia_size % CEPH_FSCRYPT_BLOCK_SIZE)) {
  2393. mask |= CEPH_SETATTR_SIZE;
  2394. release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
  2395. CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
  2396. set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags);
  2397. mask |= CEPH_SETATTR_FSCRYPT_FILE;
  2398. req->r_args.setattr.size =
  2399. cpu_to_le64(round_up(attr->ia_size,
  2400. CEPH_FSCRYPT_BLOCK_SIZE));
  2401. req->r_args.setattr.old_size =
  2402. cpu_to_le64(round_up(isize,
  2403. CEPH_FSCRYPT_BLOCK_SIZE));
  2404. req->r_fscrypt_file = attr->ia_size;
  2405. fill_fscrypt = true;
  2406. } else if (!do_sync && (issued & CEPH_CAP_FILE_EXCL) && attr->ia_size >= isize) {
  2407. if (attr->ia_size > isize) {
  2408. i_size_write(inode, attr->ia_size);
  2409. inode->i_blocks = calc_inode_blocks(attr->ia_size);
  2410. ci->i_reported_size = attr->ia_size;
  2411. dirtied |= CEPH_CAP_FILE_EXCL;
  2412. ia_valid |= ATTR_MTIME;
  2413. }
  2414. } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
  2415. attr->ia_size != isize) {
  2416. mask |= CEPH_SETATTR_SIZE;
  2417. release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
  2418. CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
  2419. if (IS_ENCRYPTED(inode) && attr->ia_size) {
  2420. set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags);
  2421. mask |= CEPH_SETATTR_FSCRYPT_FILE;
  2422. req->r_args.setattr.size =
  2423. cpu_to_le64(round_up(attr->ia_size,
  2424. CEPH_FSCRYPT_BLOCK_SIZE));
  2425. req->r_args.setattr.old_size =
  2426. cpu_to_le64(round_up(isize,
  2427. CEPH_FSCRYPT_BLOCK_SIZE));
  2428. req->r_fscrypt_file = attr->ia_size;
  2429. } else {
  2430. req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
  2431. req->r_args.setattr.old_size = cpu_to_le64(isize);
  2432. req->r_fscrypt_file = 0;
  2433. }
  2434. }
  2435. }
  2436. if (ia_valid & ATTR_MTIME) {
  2437. struct timespec64 mtime = inode_get_mtime(inode);
  2438. doutc(cl, "%p %llx.%llx mtime %ptSp -> %ptSp\n",
  2439. inode, ceph_vinop(inode), &mtime, &attr->ia_mtime);
  2440. if (!do_sync && (issued & CEPH_CAP_FILE_EXCL)) {
  2441. ci->i_time_warp_seq++;
  2442. inode_set_mtime_to_ts(inode, attr->ia_mtime);
  2443. dirtied |= CEPH_CAP_FILE_EXCL;
  2444. } else if (!do_sync && (issued & CEPH_CAP_FILE_WR) &&
  2445. timespec64_compare(&mtime, &attr->ia_mtime) < 0) {
  2446. inode_set_mtime_to_ts(inode, attr->ia_mtime);
  2447. dirtied |= CEPH_CAP_FILE_WR;
  2448. } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
  2449. !timespec64_equal(&mtime, &attr->ia_mtime)) {
  2450. ceph_encode_timespec64(&req->r_args.setattr.mtime,
  2451. &attr->ia_mtime);
  2452. mask |= CEPH_SETATTR_MTIME;
  2453. release |= CEPH_CAP_FILE_SHARED |
  2454. CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
  2455. }
  2456. }
  2457. /* these do nothing */
  2458. if (ia_valid & ATTR_CTIME) {
  2459. struct timespec64 ictime = inode_get_ctime(inode);
  2460. bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME|
  2461. ATTR_MODE|ATTR_UID|ATTR_GID)) == 0;
  2462. doutc(cl, "%p %llx.%llx ctime %ptSp -> %ptSp (%s)\n",
  2463. inode, ceph_vinop(inode), &ictime, &attr->ia_ctime,
  2464. only ? "ctime only" : "ignored");
  2465. if (only) {
  2466. /*
  2467. * if kernel wants to dirty ctime but nothing else,
  2468. * we need to choose a cap to dirty under, or do
  2469. * a almost-no-op setattr
  2470. */
  2471. if (issued & CEPH_CAP_AUTH_EXCL)
  2472. dirtied |= CEPH_CAP_AUTH_EXCL;
  2473. else if (issued & CEPH_CAP_FILE_EXCL)
  2474. dirtied |= CEPH_CAP_FILE_EXCL;
  2475. else if (issued & CEPH_CAP_XATTR_EXCL)
  2476. dirtied |= CEPH_CAP_XATTR_EXCL;
  2477. else
  2478. mask |= CEPH_SETATTR_CTIME;
  2479. }
  2480. }
  2481. if (ia_valid & ATTR_FILE)
  2482. doutc(cl, "%p %llx.%llx ATTR_FILE ... hrm!\n", inode,
  2483. ceph_vinop(inode));
  2484. if (dirtied) {
  2485. inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied,
  2486. &prealloc_cf);
  2487. inode_set_ctime_to_ts(inode, attr->ia_ctime);
  2488. inode_inc_iversion_raw(inode);
  2489. }
  2490. release &= issued;
  2491. spin_unlock(&ci->i_ceph_lock);
  2492. if (lock_snap_rwsem) {
  2493. up_read(&mdsc->snap_rwsem);
  2494. lock_snap_rwsem = false;
  2495. }
  2496. if (inode_dirty_flags)
  2497. __mark_inode_dirty(inode, inode_dirty_flags);
  2498. if (mask) {
  2499. req->r_inode = inode;
  2500. ihold(inode);
  2501. req->r_inode_drop = release;
  2502. req->r_args.setattr.mask = cpu_to_le32(mask);
  2503. req->r_num_caps = 1;
  2504. req->r_stamp = attr->ia_ctime;
  2505. if (fill_fscrypt) {
  2506. err = fill_fscrypt_truncate(inode, req, attr);
  2507. if (err)
  2508. goto out;
  2509. }
  2510. /*
  2511. * The truncate request will return -EAGAIN when the
  2512. * last block has been updated just before the MDS
  2513. * successfully gets the xlock for the FILE lock. To
  2514. * avoid corrupting the file contents we need to retry
  2515. * it.
  2516. */
  2517. err = ceph_mdsc_do_request(mdsc, NULL, req);
  2518. if (err == -EAGAIN && truncate_retry--) {
  2519. doutc(cl, "%p %llx.%llx result=%d (%s locally, %d remote), retry it!\n",
  2520. inode, ceph_vinop(inode), err,
  2521. ceph_cap_string(dirtied), mask);
  2522. ceph_mdsc_put_request(req);
  2523. ceph_free_cap_flush(prealloc_cf);
  2524. goto retry;
  2525. }
  2526. }
  2527. out:
  2528. doutc(cl, "%p %llx.%llx result=%d (%s locally, %d remote)\n", inode,
  2529. ceph_vinop(inode), err, ceph_cap_string(dirtied), mask);
  2530. ceph_mdsc_put_request(req);
  2531. ceph_free_cap_flush(prealloc_cf);
  2532. if (err >= 0 && (mask & CEPH_SETATTR_SIZE))
  2533. __ceph_do_pending_vmtruncate(inode);
  2534. return err;
  2535. }
  2536. /*
  2537. * setattr
  2538. */
  2539. int ceph_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
  2540. struct iattr *attr)
  2541. {
  2542. struct inode *inode = d_inode(dentry);
  2543. struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
  2544. int err;
  2545. if (ceph_snap(inode) != CEPH_NOSNAP)
  2546. return -EROFS;
  2547. if (ceph_inode_is_shutdown(inode))
  2548. return -ESTALE;
  2549. err = fscrypt_prepare_setattr(dentry, attr);
  2550. if (err)
  2551. return err;
  2552. err = setattr_prepare(idmap, dentry, attr);
  2553. if (err != 0)
  2554. return err;
  2555. if ((attr->ia_valid & ATTR_SIZE) &&
  2556. attr->ia_size > max(i_size_read(inode), fsc->max_file_size))
  2557. return -EFBIG;
  2558. if ((attr->ia_valid & ATTR_SIZE) &&
  2559. ceph_quota_is_max_bytes_exceeded(inode, attr->ia_size))
  2560. return -EDQUOT;
  2561. err = __ceph_setattr(idmap, inode, attr, NULL);
  2562. if (err >= 0 && (attr->ia_valid & ATTR_MODE))
  2563. err = posix_acl_chmod(idmap, dentry, attr->ia_mode);
  2564. return err;
  2565. }
  2566. int ceph_try_to_choose_auth_mds(struct inode *inode, int mask)
  2567. {
  2568. int issued = ceph_caps_issued(ceph_inode(inode));
  2569. /*
  2570. * If any 'x' caps is issued we can just choose the auth MDS
  2571. * instead of the random replica MDSes. Because only when the
  2572. * Locker is in LOCK_EXEC state will the loner client could
  2573. * get the 'x' caps. And if we send the getattr requests to
  2574. * any replica MDS it must auth pin and tries to rdlock from
  2575. * the auth MDS, and then the auth MDS need to do the Locker
  2576. * state transition to LOCK_SYNC. And after that the lock state
  2577. * will change back.
  2578. *
  2579. * This cost much when doing the Locker state transition and
  2580. * usually will need to revoke caps from clients.
  2581. *
  2582. * And for the 'Xs' caps for getxattr we will also choose the
  2583. * auth MDS, because the MDS side code is buggy due to setxattr
  2584. * won't notify the replica MDSes when the values changed and
  2585. * the replica MDS will return the old values. Though we will
  2586. * fix it in MDS code, but this still makes sense for old ceph.
  2587. */
  2588. if (((mask & CEPH_CAP_ANY_SHARED) && (issued & CEPH_CAP_ANY_EXCL))
  2589. || (mask & (CEPH_STAT_RSTAT | CEPH_STAT_CAP_XATTR)))
  2590. return USE_AUTH_MDS;
  2591. else
  2592. return USE_ANY_MDS;
  2593. }
  2594. /*
  2595. * Verify that we have a lease on the given mask. If not,
  2596. * do a getattr against an mds.
  2597. */
  2598. int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
  2599. int mask, bool force)
  2600. {
  2601. struct ceph_fs_client *fsc = ceph_sb_to_fs_client(inode->i_sb);
  2602. struct ceph_client *cl = fsc->client;
  2603. struct ceph_mds_client *mdsc = fsc->mdsc;
  2604. struct ceph_mds_request *req;
  2605. int mode;
  2606. int err;
  2607. if (ceph_snap(inode) == CEPH_SNAPDIR) {
  2608. doutc(cl, "inode %p %llx.%llx SNAPDIR\n", inode,
  2609. ceph_vinop(inode));
  2610. return 0;
  2611. }
  2612. doutc(cl, "inode %p %llx.%llx mask %s mode 0%o\n", inode,
  2613. ceph_vinop(inode), ceph_cap_string(mask), inode->i_mode);
  2614. if (!force && ceph_caps_issued_mask_metric(ceph_inode(inode), mask, 1))
  2615. return 0;
  2616. mode = ceph_try_to_choose_auth_mds(inode, mask);
  2617. req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
  2618. if (IS_ERR(req))
  2619. return PTR_ERR(req);
  2620. req->r_inode = inode;
  2621. ihold(inode);
  2622. req->r_num_caps = 1;
  2623. req->r_args.getattr.mask = cpu_to_le32(mask);
  2624. req->r_locked_page = locked_page;
  2625. err = ceph_mdsc_do_request(mdsc, NULL, req);
  2626. if (locked_page && err == 0) {
  2627. u64 inline_version = req->r_reply_info.targeti.inline_version;
  2628. if (inline_version == 0) {
  2629. /* the reply is supposed to contain inline data */
  2630. err = -EINVAL;
  2631. } else if (inline_version == CEPH_INLINE_NONE ||
  2632. inline_version == 1) {
  2633. err = -ENODATA;
  2634. } else {
  2635. err = req->r_reply_info.targeti.inline_len;
  2636. }
  2637. }
  2638. ceph_mdsc_put_request(req);
  2639. doutc(cl, "result=%d\n", err);
  2640. return err;
  2641. }
  2642. int ceph_do_getvxattr(struct inode *inode, const char *name, void *value,
  2643. size_t size)
  2644. {
  2645. struct ceph_fs_client *fsc = ceph_sb_to_fs_client(inode->i_sb);
  2646. struct ceph_client *cl = fsc->client;
  2647. struct ceph_mds_client *mdsc = fsc->mdsc;
  2648. struct ceph_mds_request *req;
  2649. int mode = USE_AUTH_MDS;
  2650. int err;
  2651. char *xattr_value;
  2652. size_t xattr_value_len;
  2653. req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETVXATTR, mode);
  2654. if (IS_ERR(req)) {
  2655. err = -ENOMEM;
  2656. goto out;
  2657. }
  2658. req->r_feature_needed = CEPHFS_FEATURE_OP_GETVXATTR;
  2659. req->r_path2 = kstrdup(name, GFP_NOFS);
  2660. if (!req->r_path2) {
  2661. err = -ENOMEM;
  2662. goto put;
  2663. }
  2664. ihold(inode);
  2665. req->r_inode = inode;
  2666. err = ceph_mdsc_do_request(mdsc, NULL, req);
  2667. if (err < 0)
  2668. goto put;
  2669. xattr_value = req->r_reply_info.xattr_info.xattr_value;
  2670. xattr_value_len = req->r_reply_info.xattr_info.xattr_value_len;
  2671. doutc(cl, "xattr_value_len:%zu, size:%zu\n", xattr_value_len, size);
  2672. err = (int)xattr_value_len;
  2673. if (size == 0)
  2674. goto put;
  2675. if (xattr_value_len > size) {
  2676. err = -ERANGE;
  2677. goto put;
  2678. }
  2679. memcpy(value, xattr_value, xattr_value_len);
  2680. put:
  2681. ceph_mdsc_put_request(req);
  2682. out:
  2683. doutc(cl, "result=%d\n", err);
  2684. return err;
  2685. }
  2686. /*
  2687. * Check inode permissions. We verify we have a valid value for
  2688. * the AUTH cap, then call the generic handler.
  2689. */
  2690. int ceph_permission(struct mnt_idmap *idmap, struct inode *inode,
  2691. int mask)
  2692. {
  2693. int err;
  2694. if (mask & MAY_NOT_BLOCK)
  2695. return -ECHILD;
  2696. err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED, false);
  2697. if (!err)
  2698. err = generic_permission(idmap, inode, mask);
  2699. return err;
  2700. }
  2701. /* Craft a mask of needed caps given a set of requested statx attrs. */
  2702. static int statx_to_caps(u32 want, umode_t mode)
  2703. {
  2704. int mask = 0;
  2705. if (want & (STATX_MODE|STATX_UID|STATX_GID|STATX_CTIME|STATX_BTIME|STATX_CHANGE_COOKIE))
  2706. mask |= CEPH_CAP_AUTH_SHARED;
  2707. if (want & (STATX_NLINK|STATX_CTIME|STATX_CHANGE_COOKIE)) {
  2708. /*
  2709. * The link count for directories depends on inode->i_subdirs,
  2710. * and that is only updated when Fs caps are held.
  2711. */
  2712. if (S_ISDIR(mode))
  2713. mask |= CEPH_CAP_FILE_SHARED;
  2714. else
  2715. mask |= CEPH_CAP_LINK_SHARED;
  2716. }
  2717. if (want & (STATX_ATIME|STATX_MTIME|STATX_CTIME|STATX_SIZE|STATX_BLOCKS|STATX_CHANGE_COOKIE))
  2718. mask |= CEPH_CAP_FILE_SHARED;
  2719. if (want & (STATX_CTIME|STATX_CHANGE_COOKIE))
  2720. mask |= CEPH_CAP_XATTR_SHARED;
  2721. return mask;
  2722. }
  2723. /*
  2724. * Get all the attributes. If we have sufficient caps for the requested attrs,
  2725. * then we can avoid talking to the MDS at all.
  2726. */
  2727. int ceph_getattr(struct mnt_idmap *idmap, const struct path *path,
  2728. struct kstat *stat, u32 request_mask, unsigned int flags)
  2729. {
  2730. struct inode *inode = d_inode(path->dentry);
  2731. struct super_block *sb = inode->i_sb;
  2732. struct ceph_inode_info *ci = ceph_inode(inode);
  2733. u32 valid_mask = STATX_BASIC_STATS;
  2734. int err = 0;
  2735. if (ceph_inode_is_shutdown(inode))
  2736. return -ESTALE;
  2737. /* Skip the getattr altogether if we're asked not to sync */
  2738. if ((flags & AT_STATX_SYNC_TYPE) != AT_STATX_DONT_SYNC) {
  2739. err = ceph_do_getattr(inode,
  2740. statx_to_caps(request_mask, inode->i_mode),
  2741. flags & AT_STATX_FORCE_SYNC);
  2742. if (err)
  2743. return err;
  2744. }
  2745. generic_fillattr(idmap, request_mask, inode, stat);
  2746. stat->ino = ceph_present_inode(inode);
  2747. /*
  2748. * btime on newly-allocated inodes is 0, so if this is still set to
  2749. * that, then assume that it's not valid.
  2750. */
  2751. if (ci->i_btime.tv_sec || ci->i_btime.tv_nsec) {
  2752. stat->btime = ci->i_btime;
  2753. valid_mask |= STATX_BTIME;
  2754. }
  2755. if (request_mask & STATX_CHANGE_COOKIE) {
  2756. stat->change_cookie = inode_peek_iversion_raw(inode);
  2757. valid_mask |= STATX_CHANGE_COOKIE;
  2758. }
  2759. if (ceph_snap(inode) == CEPH_NOSNAP)
  2760. stat->dev = sb->s_dev;
  2761. else
  2762. stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0;
  2763. if (S_ISDIR(inode->i_mode)) {
  2764. if (ceph_test_mount_opt(ceph_sb_to_fs_client(sb), RBYTES)) {
  2765. stat->size = ci->i_rbytes;
  2766. } else if (ceph_snap(inode) == CEPH_SNAPDIR) {
  2767. struct ceph_inode_info *pci;
  2768. struct ceph_snap_realm *realm;
  2769. struct inode *parent;
  2770. parent = ceph_lookup_inode(sb, ceph_ino(inode));
  2771. if (IS_ERR(parent))
  2772. return PTR_ERR(parent);
  2773. pci = ceph_inode(parent);
  2774. spin_lock(&pci->i_ceph_lock);
  2775. realm = pci->i_snap_realm;
  2776. if (realm)
  2777. stat->size = realm->num_snaps;
  2778. else
  2779. stat->size = 0;
  2780. spin_unlock(&pci->i_ceph_lock);
  2781. iput(parent);
  2782. } else {
  2783. stat->size = ci->i_files + ci->i_subdirs;
  2784. }
  2785. stat->blocks = 0;
  2786. stat->blksize = 65536;
  2787. /*
  2788. * Some applications rely on the number of st_nlink
  2789. * value on directories to be either 0 (if unlinked)
  2790. * or 2 + number of subdirectories.
  2791. */
  2792. if (stat->nlink == 1)
  2793. /* '.' + '..' + subdirs */
  2794. stat->nlink = 1 + 1 + ci->i_subdirs;
  2795. }
  2796. stat->attributes |= STATX_ATTR_CHANGE_MONOTONIC;
  2797. if (IS_ENCRYPTED(inode))
  2798. stat->attributes |= STATX_ATTR_ENCRYPTED;
  2799. stat->attributes_mask |= (STATX_ATTR_CHANGE_MONOTONIC |
  2800. STATX_ATTR_ENCRYPTED);
  2801. stat->result_mask = request_mask & valid_mask;
  2802. return err;
  2803. }
  2804. void ceph_inode_shutdown(struct inode *inode)
  2805. {
  2806. struct ceph_inode_info *ci = ceph_inode(inode);
  2807. struct rb_node *p;
  2808. int iputs = 0;
  2809. bool invalidate = false;
  2810. spin_lock(&ci->i_ceph_lock);
  2811. ci->i_ceph_flags |= CEPH_I_SHUTDOWN;
  2812. p = rb_first(&ci->i_caps);
  2813. while (p) {
  2814. struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
  2815. p = rb_next(p);
  2816. iputs += ceph_purge_inode_cap(inode, cap, &invalidate);
  2817. }
  2818. spin_unlock(&ci->i_ceph_lock);
  2819. if (invalidate)
  2820. ceph_queue_invalidate(inode);
  2821. while (iputs--)
  2822. iput(inode);
  2823. }