addr.c 70 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594259525962597259825992600260126022603260426052606260726082609261026112612261326142615
  1. // SPDX-License-Identifier: GPL-2.0
  2. #include <linux/ceph/ceph_debug.h>
  3. #include <linux/backing-dev.h>
  4. #include <linux/fs.h>
  5. #include <linux/mm.h>
  6. #include <linux/swap.h>
  7. #include <linux/pagemap.h>
  8. #include <linux/slab.h>
  9. #include <linux/pagevec.h>
  10. #include <linux/task_io_accounting_ops.h>
  11. #include <linux/signal.h>
  12. #include <linux/iversion.h>
  13. #include <linux/ktime.h>
  14. #include <linux/netfs.h>
  15. #include <trace/events/netfs.h>
  16. #include "super.h"
  17. #include "mds_client.h"
  18. #include "cache.h"
  19. #include "metric.h"
  20. #include "crypto.h"
  21. #include <linux/ceph/osd_client.h>
  22. #include <linux/ceph/striper.h>
  23. /*
  24. * Ceph address space ops.
  25. *
  26. * There are a few funny things going on here.
  27. *
  28. * The page->private field is used to reference a struct
  29. * ceph_snap_context for _every_ dirty page. This indicates which
  30. * snapshot the page was logically dirtied in, and thus which snap
  31. * context needs to be associated with the osd write during writeback.
  32. *
  33. * Similarly, struct ceph_inode_info maintains a set of counters to
  34. * count dirty pages on the inode. In the absence of snapshots,
  35. * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count.
  36. *
  37. * When a snapshot is taken (that is, when the client receives
  38. * notification that a snapshot was taken), each inode with caps and
  39. * with dirty pages (dirty pages implies there is a cap) gets a new
  40. * ceph_cap_snap in the i_cap_snaps list (which is sorted in ascending
  41. * order, new snaps go to the tail). The i_wrbuffer_ref_head count is
  42. * moved to capsnap->dirty. (Unless a sync write is currently in
  43. * progress. In that case, the capsnap is said to be "pending", new
  44. * writes cannot start, and the capsnap isn't "finalized" until the
  45. * write completes (or fails) and a final size/mtime for the inode for
  46. * that snap can be settled upon.) i_wrbuffer_ref_head is reset to 0.
  47. *
  48. * On writeback, we must submit writes to the osd IN SNAP ORDER. So,
  49. * we look for the first capsnap in i_cap_snaps and write out pages in
  50. * that snap context _only_. Then we move on to the next capsnap,
  51. * eventually reaching the "live" or "head" context (i.e., pages that
  52. * are not yet snapped) and are writing the most recently dirtied
  53. * pages.
  54. *
  55. * Invalidate and so forth must take care to ensure the dirty page
  56. * accounting is preserved.
  57. */
  58. #define CONGESTION_ON_THRESH(congestion_kb) (congestion_kb >> (PAGE_SHIFT-10))
  59. #define CONGESTION_OFF_THRESH(congestion_kb) \
  60. (CONGESTION_ON_THRESH(congestion_kb) - \
  61. (CONGESTION_ON_THRESH(congestion_kb) >> 2))
  62. static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len,
  63. struct folio **foliop, void **_fsdata);
  64. static inline struct ceph_snap_context *page_snap_context(struct page *page)
  65. {
  66. if (PagePrivate(page))
  67. return (void *)page->private;
  68. return NULL;
  69. }
  70. /*
  71. * Dirty a page. Optimistically adjust accounting, on the assumption
  72. * that we won't race with invalidate. If we do, readjust.
  73. */
  74. static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio)
  75. {
  76. struct inode *inode = mapping->host;
  77. struct ceph_client *cl = ceph_inode_to_client(inode);
  78. struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
  79. struct ceph_inode_info *ci;
  80. struct ceph_snap_context *snapc;
  81. if (folio_test_dirty(folio)) {
  82. doutc(cl, "%llx.%llx %p idx %lu -- already dirty\n",
  83. ceph_vinop(inode), folio, folio->index);
  84. VM_BUG_ON_FOLIO(!folio_test_private(folio), folio);
  85. return false;
  86. }
  87. atomic64_inc(&mdsc->dirty_folios);
  88. ci = ceph_inode(inode);
  89. /* dirty the head */
  90. spin_lock(&ci->i_ceph_lock);
  91. if (__ceph_have_pending_cap_snap(ci)) {
  92. struct ceph_cap_snap *capsnap =
  93. list_last_entry(&ci->i_cap_snaps,
  94. struct ceph_cap_snap,
  95. ci_item);
  96. snapc = ceph_get_snap_context(capsnap->context);
  97. capsnap->dirty_pages++;
  98. } else {
  99. BUG_ON(!ci->i_head_snapc);
  100. snapc = ceph_get_snap_context(ci->i_head_snapc);
  101. ++ci->i_wrbuffer_ref_head;
  102. }
  103. if (ci->i_wrbuffer_ref == 0)
  104. ihold(inode);
  105. ++ci->i_wrbuffer_ref;
  106. doutc(cl, "%llx.%llx %p idx %lu head %d/%d -> %d/%d "
  107. "snapc %p seq %lld (%d snaps)\n",
  108. ceph_vinop(inode), folio, folio->index,
  109. ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
  110. ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
  111. snapc, snapc->seq, snapc->num_snaps);
  112. spin_unlock(&ci->i_ceph_lock);
  113. /*
  114. * Reference snap context in folio->private. Also set
  115. * PagePrivate so that we get invalidate_folio callback.
  116. */
  117. VM_WARN_ON_FOLIO(folio->private, folio);
  118. folio_attach_private(folio, snapc);
  119. return ceph_fscache_dirty_folio(mapping, folio);
  120. }
  121. /*
  122. * If we are truncating the full folio (i.e. offset == 0), adjust the
  123. * dirty folio counters appropriately. Only called if there is private
  124. * data on the folio.
  125. */
  126. static void ceph_invalidate_folio(struct folio *folio, size_t offset,
  127. size_t length)
  128. {
  129. struct inode *inode = folio->mapping->host;
  130. struct ceph_client *cl = ceph_inode_to_client(inode);
  131. struct ceph_inode_info *ci = ceph_inode(inode);
  132. struct ceph_snap_context *snapc;
  133. if (offset != 0 || length != folio_size(folio)) {
  134. doutc(cl, "%llx.%llx idx %lu partial dirty page %zu~%zu\n",
  135. ceph_vinop(inode), folio->index, offset, length);
  136. return;
  137. }
  138. WARN_ON(!folio_test_locked(folio));
  139. if (folio_test_private(folio)) {
  140. doutc(cl, "%llx.%llx idx %lu full dirty page\n",
  141. ceph_vinop(inode), folio->index);
  142. snapc = folio_detach_private(folio);
  143. ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
  144. ceph_put_snap_context(snapc);
  145. }
  146. netfs_invalidate_folio(folio, offset, length);
  147. }
  148. static void ceph_netfs_expand_readahead(struct netfs_io_request *rreq)
  149. {
  150. struct inode *inode = rreq->inode;
  151. struct ceph_inode_info *ci = ceph_inode(inode);
  152. struct ceph_file_layout *lo = &ci->i_layout;
  153. unsigned long max_pages = inode->i_sb->s_bdi->ra_pages;
  154. loff_t end = rreq->start + rreq->len, new_end;
  155. struct ceph_netfs_request_data *priv = rreq->netfs_priv;
  156. unsigned long max_len;
  157. u32 blockoff;
  158. if (priv) {
  159. /* Readahead is disabled by posix_fadvise POSIX_FADV_RANDOM */
  160. if (priv->file_ra_disabled)
  161. max_pages = 0;
  162. else
  163. max_pages = priv->file_ra_pages;
  164. }
  165. /* Readahead is disabled */
  166. if (!max_pages)
  167. return;
  168. max_len = max_pages << PAGE_SHIFT;
  169. /*
  170. * Try to expand the length forward by rounding up it to the next
  171. * block, but do not exceed the file size, unless the original
  172. * request already exceeds it.
  173. */
  174. new_end = umin(round_up(end, lo->stripe_unit), rreq->i_size);
  175. if (new_end > end && new_end <= rreq->start + max_len)
  176. rreq->len = new_end - rreq->start;
  177. /* Try to expand the start downward */
  178. div_u64_rem(rreq->start, lo->stripe_unit, &blockoff);
  179. if (rreq->len + blockoff <= max_len) {
  180. rreq->start -= blockoff;
  181. rreq->len += blockoff;
  182. }
  183. }
  184. static void finish_netfs_read(struct ceph_osd_request *req)
  185. {
  186. struct inode *inode = req->r_inode;
  187. struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
  188. struct ceph_client *cl = fsc->client;
  189. struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
  190. struct netfs_io_subrequest *subreq = req->r_priv;
  191. struct ceph_osd_req_op *op = &req->r_ops[0];
  192. int err = req->r_result;
  193. bool sparse = (op->op == CEPH_OSD_OP_SPARSE_READ);
  194. ceph_update_read_metrics(&fsc->mdsc->metric, req->r_start_latency,
  195. req->r_end_latency, osd_data->length, err);
  196. doutc(cl, "result %d subreq->len=%zu i_size=%lld\n", req->r_result,
  197. subreq->len, i_size_read(req->r_inode));
  198. /* no object means success but no data */
  199. if (err == -ENOENT) {
  200. __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
  201. __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
  202. err = 0;
  203. } else if (err == -EBLOCKLISTED) {
  204. fsc->blocklisted = true;
  205. }
  206. if (err >= 0) {
  207. if (sparse && err > 0)
  208. err = ceph_sparse_ext_map_end(op);
  209. if (err < subreq->len &&
  210. subreq->rreq->origin != NETFS_UNBUFFERED_READ &&
  211. subreq->rreq->origin != NETFS_DIO_READ)
  212. __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
  213. if (IS_ENCRYPTED(inode) && err > 0) {
  214. err = ceph_fscrypt_decrypt_extents(inode,
  215. osd_data->pages, subreq->start,
  216. op->extent.sparse_ext,
  217. op->extent.sparse_ext_cnt);
  218. if (err > subreq->len)
  219. err = subreq->len;
  220. }
  221. if (err > 0)
  222. __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
  223. }
  224. if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) {
  225. ceph_put_page_vector(osd_data->pages,
  226. calc_pages_for(osd_data->alignment,
  227. osd_data->length), false);
  228. }
  229. if (err > 0) {
  230. subreq->transferred = err;
  231. err = 0;
  232. }
  233. subreq->error = err;
  234. trace_netfs_sreq(subreq, netfs_sreq_trace_io_progress);
  235. netfs_read_subreq_terminated(subreq);
  236. iput(req->r_inode);
  237. ceph_dec_osd_stopping_blocker(fsc->mdsc);
  238. }
  239. static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq)
  240. {
  241. struct netfs_io_request *rreq = subreq->rreq;
  242. struct inode *inode = rreq->inode;
  243. struct ceph_mds_reply_info_parsed *rinfo;
  244. struct ceph_mds_reply_info_in *iinfo;
  245. struct ceph_mds_request *req;
  246. struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
  247. struct ceph_inode_info *ci = ceph_inode(inode);
  248. ssize_t err = 0;
  249. size_t len;
  250. int mode;
  251. if (rreq->origin != NETFS_UNBUFFERED_READ &&
  252. rreq->origin != NETFS_DIO_READ)
  253. __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
  254. __clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
  255. if (subreq->start >= inode->i_size)
  256. goto out;
  257. /* We need to fetch the inline data. */
  258. mode = ceph_try_to_choose_auth_mds(inode, CEPH_STAT_CAP_INLINE_DATA);
  259. req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
  260. if (IS_ERR(req)) {
  261. err = PTR_ERR(req);
  262. goto out;
  263. }
  264. req->r_ino1 = ci->i_vino;
  265. req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INLINE_DATA);
  266. req->r_num_caps = 2;
  267. trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
  268. err = ceph_mdsc_do_request(mdsc, NULL, req);
  269. if (err < 0)
  270. goto out;
  271. rinfo = &req->r_reply_info;
  272. iinfo = &rinfo->targeti;
  273. if (iinfo->inline_version == CEPH_INLINE_NONE) {
  274. /* The data got uninlined */
  275. ceph_mdsc_put_request(req);
  276. return false;
  277. }
  278. len = min_t(size_t, iinfo->inline_len - subreq->start, subreq->len);
  279. err = copy_to_iter(iinfo->inline_data + subreq->start, len, &subreq->io_iter);
  280. if (err == 0) {
  281. err = -EFAULT;
  282. } else {
  283. subreq->transferred += err;
  284. err = 0;
  285. }
  286. ceph_mdsc_put_request(req);
  287. out:
  288. subreq->error = err;
  289. trace_netfs_sreq(subreq, netfs_sreq_trace_io_progress);
  290. netfs_read_subreq_terminated(subreq);
  291. return true;
  292. }
  293. static int ceph_netfs_prepare_read(struct netfs_io_subrequest *subreq)
  294. {
  295. struct netfs_io_request *rreq = subreq->rreq;
  296. struct inode *inode = rreq->inode;
  297. struct ceph_inode_info *ci = ceph_inode(inode);
  298. struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
  299. u64 objno, objoff;
  300. u32 xlen;
  301. /* Truncate the extent at the end of the current block */
  302. ceph_calc_file_object_mapping(&ci->i_layout, subreq->start, subreq->len,
  303. &objno, &objoff, &xlen);
  304. rreq->io_streams[0].sreq_max_len = umin(xlen, fsc->mount_options->rsize);
  305. return 0;
  306. }
  307. static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
  308. {
  309. struct netfs_io_request *rreq = subreq->rreq;
  310. struct inode *inode = rreq->inode;
  311. struct ceph_inode_info *ci = ceph_inode(inode);
  312. struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
  313. struct ceph_client *cl = fsc->client;
  314. struct ceph_osd_request *req = NULL;
  315. struct ceph_vino vino = ceph_vino(inode);
  316. int err;
  317. u64 len;
  318. bool sparse = IS_ENCRYPTED(inode) || ceph_test_mount_opt(fsc, SPARSEREAD);
  319. u64 off = subreq->start;
  320. int extent_cnt;
  321. if (ceph_inode_is_shutdown(inode)) {
  322. err = -EIO;
  323. goto out;
  324. }
  325. if (ceph_has_inline_data(ci) && ceph_netfs_issue_op_inline(subreq))
  326. return;
  327. // TODO: This rounding here is slightly dodgy. It *should* work, for
  328. // now, as the cache only deals in blocks that are a multiple of
  329. // PAGE_SIZE and fscrypt blocks are at most PAGE_SIZE. What needs to
  330. // happen is for the fscrypt driving to be moved into netfslib and the
  331. // data in the cache also to be stored encrypted.
  332. len = subreq->len;
  333. ceph_fscrypt_adjust_off_and_len(inode, &off, &len);
  334. req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino,
  335. off, &len, 0, 1, sparse ? CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ,
  336. CEPH_OSD_FLAG_READ, NULL, ci->i_truncate_seq,
  337. ci->i_truncate_size, false);
  338. if (IS_ERR(req)) {
  339. err = PTR_ERR(req);
  340. req = NULL;
  341. goto out;
  342. }
  343. if (sparse) {
  344. extent_cnt = __ceph_sparse_read_ext_count(inode, len);
  345. err = ceph_alloc_sparse_ext_map(&req->r_ops[0], extent_cnt);
  346. if (err)
  347. goto out;
  348. }
  349. doutc(cl, "%llx.%llx pos=%llu orig_len=%zu len=%llu\n",
  350. ceph_vinop(inode), subreq->start, subreq->len, len);
  351. /*
  352. * FIXME: For now, use CEPH_OSD_DATA_TYPE_PAGES instead of _ITER for
  353. * encrypted inodes. We'd need infrastructure that handles an iov_iter
  354. * instead of page arrays, and we don't have that as of yet. Once the
  355. * dust settles on the write helpers and encrypt/decrypt routines for
  356. * netfs, we should be able to rework this.
  357. */
  358. if (IS_ENCRYPTED(inode)) {
  359. struct page **pages;
  360. size_t page_off;
  361. /*
  362. * FIXME: io_iter.count needs to be corrected to aligned
  363. * length. Otherwise, iov_iter_get_pages_alloc2() operates
  364. * with the initial unaligned length value. As a result,
  365. * ceph_msg_data_cursor_init() triggers BUG_ON() in the case
  366. * if msg->sparse_read_total > msg->data_length.
  367. */
  368. subreq->io_iter.count = len;
  369. err = iov_iter_get_pages_alloc2(&subreq->io_iter, &pages, len, &page_off);
  370. if (err < 0) {
  371. doutc(cl, "%llx.%llx failed to allocate pages, %d\n",
  372. ceph_vinop(inode), err);
  373. goto out;
  374. }
  375. /* should always give us a page-aligned read */
  376. WARN_ON_ONCE(page_off);
  377. len = err;
  378. err = 0;
  379. osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false,
  380. false);
  381. } else {
  382. osd_req_op_extent_osd_iter(req, 0, &subreq->io_iter);
  383. }
  384. if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) {
  385. err = -EIO;
  386. goto out;
  387. }
  388. req->r_callback = finish_netfs_read;
  389. req->r_priv = subreq;
  390. req->r_inode = inode;
  391. ihold(inode);
  392. trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
  393. ceph_osdc_start_request(req->r_osdc, req);
  394. out:
  395. ceph_osdc_put_request(req);
  396. if (err) {
  397. subreq->error = err;
  398. netfs_read_subreq_terminated(subreq);
  399. }
  400. doutc(cl, "%llx.%llx result %d\n", ceph_vinop(inode), err);
  401. }
  402. static int ceph_init_request(struct netfs_io_request *rreq, struct file *file)
  403. {
  404. struct inode *inode = rreq->inode;
  405. struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
  406. struct ceph_client *cl = ceph_inode_to_client(inode);
  407. int got = 0, want = CEPH_CAP_FILE_CACHE;
  408. struct ceph_netfs_request_data *priv;
  409. int ret = 0;
  410. /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */
  411. __set_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags);
  412. if (rreq->origin != NETFS_READAHEAD)
  413. return 0;
  414. priv = kzalloc_obj(*priv, GFP_NOFS);
  415. if (!priv)
  416. return -ENOMEM;
  417. if (file) {
  418. struct ceph_rw_context *rw_ctx;
  419. struct ceph_file_info *fi = file->private_data;
  420. priv->file_ra_pages = file->f_ra.ra_pages;
  421. priv->file_ra_disabled = file->f_mode & FMODE_RANDOM;
  422. rw_ctx = ceph_find_rw_context(fi);
  423. if (rw_ctx) {
  424. rreq->netfs_priv = priv;
  425. return 0;
  426. }
  427. }
  428. /*
  429. * readahead callers do not necessarily hold Fcb caps
  430. * (e.g. fadvise, madvise).
  431. */
  432. ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, true, &got);
  433. if (ret < 0) {
  434. doutc(cl, "%llx.%llx, error getting cap\n", ceph_vinop(inode));
  435. goto out;
  436. }
  437. if (!(got & want)) {
  438. doutc(cl, "%llx.%llx, no cache cap\n", ceph_vinop(inode));
  439. ret = -EACCES;
  440. goto out;
  441. }
  442. if (ret == 0) {
  443. ret = -EACCES;
  444. goto out;
  445. }
  446. priv->caps = got;
  447. rreq->netfs_priv = priv;
  448. rreq->io_streams[0].sreq_max_len = fsc->mount_options->rsize;
  449. out:
  450. if (ret < 0) {
  451. if (got)
  452. ceph_put_cap_refs(ceph_inode(inode), got);
  453. kfree(priv);
  454. }
  455. return ret;
  456. }
  457. static void ceph_netfs_free_request(struct netfs_io_request *rreq)
  458. {
  459. struct ceph_netfs_request_data *priv = rreq->netfs_priv;
  460. if (!priv)
  461. return;
  462. if (priv->caps)
  463. ceph_put_cap_refs(ceph_inode(rreq->inode), priv->caps);
  464. kfree(priv);
  465. rreq->netfs_priv = NULL;
  466. }
  467. const struct netfs_request_ops ceph_netfs_ops = {
  468. .init_request = ceph_init_request,
  469. .free_request = ceph_netfs_free_request,
  470. .prepare_read = ceph_netfs_prepare_read,
  471. .issue_read = ceph_netfs_issue_read,
  472. .expand_readahead = ceph_netfs_expand_readahead,
  473. .check_write_begin = ceph_netfs_check_write_begin,
  474. };
  475. #ifdef CONFIG_CEPH_FSCACHE
  476. static void ceph_set_page_fscache(struct page *page)
  477. {
  478. folio_start_private_2(page_folio(page)); /* [DEPRECATED] */
  479. }
  480. static void ceph_fscache_write_terminated(void *priv, ssize_t error)
  481. {
  482. struct inode *inode = priv;
  483. if (IS_ERR_VALUE(error) && error != -ENOBUFS)
  484. ceph_fscache_invalidate(inode, false);
  485. }
  486. static void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, bool caching)
  487. {
  488. struct ceph_inode_info *ci = ceph_inode(inode);
  489. struct fscache_cookie *cookie = ceph_fscache_cookie(ci);
  490. fscache_write_to_cache(cookie, inode->i_mapping, off, len, i_size_read(inode),
  491. ceph_fscache_write_terminated, inode, true, caching);
  492. }
  493. #else
  494. static inline void ceph_set_page_fscache(struct page *page)
  495. {
  496. }
  497. static inline void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, bool caching)
  498. {
  499. }
  500. #endif /* CONFIG_CEPH_FSCACHE */
  501. struct ceph_writeback_ctl
  502. {
  503. loff_t i_size;
  504. u64 truncate_size;
  505. u32 truncate_seq;
  506. bool size_stable;
  507. bool head_snapc;
  508. struct ceph_snap_context *snapc;
  509. struct ceph_snap_context *last_snapc;
  510. bool done;
  511. bool should_loop;
  512. bool range_whole;
  513. pgoff_t start_index;
  514. pgoff_t index;
  515. pgoff_t end;
  516. xa_mark_t tag;
  517. pgoff_t strip_unit_end;
  518. unsigned int wsize;
  519. unsigned int nr_folios;
  520. unsigned int max_pages;
  521. unsigned int locked_pages;
  522. int op_idx;
  523. int num_ops;
  524. u64 offset;
  525. u64 len;
  526. struct folio_batch fbatch;
  527. unsigned int processed_in_fbatch;
  528. bool from_pool;
  529. struct page **pages;
  530. struct page **data_pages;
  531. };
  532. /*
  533. * Get ref for the oldest snapc for an inode with dirty data... that is, the
  534. * only snap context we are allowed to write back.
  535. */
  536. static struct ceph_snap_context *
  537. get_oldest_context(struct inode *inode, struct ceph_writeback_ctl *ctl,
  538. struct ceph_snap_context *page_snapc)
  539. {
  540. struct ceph_inode_info *ci = ceph_inode(inode);
  541. struct ceph_client *cl = ceph_inode_to_client(inode);
  542. struct ceph_snap_context *snapc = NULL;
  543. struct ceph_cap_snap *capsnap = NULL;
  544. spin_lock(&ci->i_ceph_lock);
  545. list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
  546. doutc(cl, " capsnap %p snapc %p has %d dirty pages\n",
  547. capsnap, capsnap->context, capsnap->dirty_pages);
  548. if (!capsnap->dirty_pages)
  549. continue;
  550. /* get i_size, truncate_{seq,size} for page_snapc? */
  551. if (snapc && capsnap->context != page_snapc)
  552. continue;
  553. if (ctl) {
  554. if (capsnap->writing) {
  555. ctl->i_size = i_size_read(inode);
  556. ctl->size_stable = false;
  557. } else {
  558. ctl->i_size = capsnap->size;
  559. ctl->size_stable = true;
  560. }
  561. ctl->truncate_size = capsnap->truncate_size;
  562. ctl->truncate_seq = capsnap->truncate_seq;
  563. ctl->head_snapc = false;
  564. }
  565. if (snapc)
  566. break;
  567. snapc = ceph_get_snap_context(capsnap->context);
  568. if (!page_snapc ||
  569. page_snapc == snapc ||
  570. page_snapc->seq > snapc->seq)
  571. break;
  572. }
  573. if (!snapc && ci->i_wrbuffer_ref_head) {
  574. snapc = ceph_get_snap_context(ci->i_head_snapc);
  575. doutc(cl, " head snapc %p has %d dirty pages\n", snapc,
  576. ci->i_wrbuffer_ref_head);
  577. if (ctl) {
  578. ctl->i_size = i_size_read(inode);
  579. ctl->truncate_size = ci->i_truncate_size;
  580. ctl->truncate_seq = ci->i_truncate_seq;
  581. ctl->size_stable = false;
  582. ctl->head_snapc = true;
  583. }
  584. }
  585. spin_unlock(&ci->i_ceph_lock);
  586. return snapc;
  587. }
  588. static u64 get_writepages_data_length(struct inode *inode,
  589. struct page *page, u64 start)
  590. {
  591. struct ceph_inode_info *ci = ceph_inode(inode);
  592. struct ceph_snap_context *snapc;
  593. struct ceph_cap_snap *capsnap = NULL;
  594. u64 end = i_size_read(inode);
  595. u64 ret;
  596. snapc = page_snap_context(ceph_fscrypt_pagecache_page(page));
  597. if (snapc != ci->i_head_snapc) {
  598. bool found = false;
  599. spin_lock(&ci->i_ceph_lock);
  600. list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
  601. if (capsnap->context == snapc) {
  602. if (!capsnap->writing)
  603. end = capsnap->size;
  604. found = true;
  605. break;
  606. }
  607. }
  608. spin_unlock(&ci->i_ceph_lock);
  609. WARN_ON(!found);
  610. }
  611. if (end > ceph_fscrypt_page_offset(page) + thp_size(page))
  612. end = ceph_fscrypt_page_offset(page) + thp_size(page);
  613. ret = end > start ? end - start : 0;
  614. if (ret && fscrypt_is_bounce_page(page))
  615. ret = round_up(ret, CEPH_FSCRYPT_BLOCK_SIZE);
  616. return ret;
  617. }
  618. /*
  619. * Write a folio, but leave it locked.
  620. *
  621. * If we get a write error, mark the mapping for error, but still adjust the
  622. * dirty page accounting (i.e., folio is no longer dirty).
  623. */
  624. static int write_folio_nounlock(struct folio *folio,
  625. struct writeback_control *wbc)
  626. {
  627. struct page *page = &folio->page;
  628. struct inode *inode = folio->mapping->host;
  629. struct ceph_inode_info *ci = ceph_inode(inode);
  630. struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
  631. struct ceph_client *cl = fsc->client;
  632. struct ceph_snap_context *snapc, *oldest;
  633. loff_t page_off = folio_pos(folio);
  634. int err;
  635. loff_t len = folio_size(folio);
  636. loff_t wlen;
  637. struct ceph_writeback_ctl ceph_wbc;
  638. struct ceph_osd_client *osdc = &fsc->client->osdc;
  639. struct ceph_osd_request *req;
  640. bool caching = ceph_is_cache_enabled(inode);
  641. struct page *bounce_page = NULL;
  642. doutc(cl, "%llx.%llx folio %p idx %lu\n", ceph_vinop(inode), folio,
  643. folio->index);
  644. if (ceph_inode_is_shutdown(inode))
  645. return -EIO;
  646. /* verify this is a writeable snap context */
  647. snapc = page_snap_context(&folio->page);
  648. if (!snapc) {
  649. doutc(cl, "%llx.%llx folio %p not dirty?\n", ceph_vinop(inode),
  650. folio);
  651. return 0;
  652. }
  653. oldest = get_oldest_context(inode, &ceph_wbc, snapc);
  654. if (snapc->seq > oldest->seq) {
  655. doutc(cl, "%llx.%llx folio %p snapc %p not writeable - noop\n",
  656. ceph_vinop(inode), folio, snapc);
  657. /* we should only noop if called by kswapd */
  658. WARN_ON(!(current->flags & PF_MEMALLOC));
  659. ceph_put_snap_context(oldest);
  660. folio_redirty_for_writepage(wbc, folio);
  661. return 0;
  662. }
  663. ceph_put_snap_context(oldest);
  664. /* is this a partial page at end of file? */
  665. if (page_off >= ceph_wbc.i_size) {
  666. doutc(cl, "%llx.%llx folio at %lu beyond eof %llu\n",
  667. ceph_vinop(inode), folio->index, ceph_wbc.i_size);
  668. folio_invalidate(folio, 0, folio_size(folio));
  669. return 0;
  670. }
  671. if (ceph_wbc.i_size < page_off + len)
  672. len = ceph_wbc.i_size - page_off;
  673. wlen = IS_ENCRYPTED(inode) ? round_up(len, CEPH_FSCRYPT_BLOCK_SIZE) : len;
  674. doutc(cl, "%llx.%llx folio %p index %lu on %llu~%llu snapc %p seq %lld\n",
  675. ceph_vinop(inode), folio, folio->index, page_off, wlen, snapc,
  676. snapc->seq);
  677. if (atomic_long_inc_return(&fsc->writeback_count) >
  678. CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
  679. fsc->write_congested = true;
  680. req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode),
  681. page_off, &wlen, 0, 1, CEPH_OSD_OP_WRITE,
  682. CEPH_OSD_FLAG_WRITE, snapc,
  683. ceph_wbc.truncate_seq,
  684. ceph_wbc.truncate_size, true);
  685. if (IS_ERR(req)) {
  686. folio_redirty_for_writepage(wbc, folio);
  687. return PTR_ERR(req);
  688. }
  689. if (wlen < len)
  690. len = wlen;
  691. folio_start_writeback(folio);
  692. if (caching)
  693. ceph_set_page_fscache(&folio->page);
  694. ceph_fscache_write_to_cache(inode, page_off, len, caching);
  695. if (IS_ENCRYPTED(inode)) {
  696. bounce_page = fscrypt_encrypt_pagecache_blocks(folio,
  697. CEPH_FSCRYPT_BLOCK_SIZE, 0,
  698. GFP_NOFS);
  699. if (IS_ERR(bounce_page)) {
  700. folio_redirty_for_writepage(wbc, folio);
  701. folio_end_writeback(folio);
  702. ceph_osdc_put_request(req);
  703. return PTR_ERR(bounce_page);
  704. }
  705. }
  706. /* it may be a short write due to an object boundary */
  707. WARN_ON_ONCE(len > folio_size(folio));
  708. osd_req_op_extent_osd_data_pages(req, 0,
  709. bounce_page ? &bounce_page : &page, wlen, 0,
  710. false, false);
  711. doutc(cl, "%llx.%llx %llu~%llu (%llu bytes, %sencrypted)\n",
  712. ceph_vinop(inode), page_off, len, wlen,
  713. IS_ENCRYPTED(inode) ? "" : "not ");
  714. req->r_mtime = inode_get_mtime(inode);
  715. ceph_osdc_start_request(osdc, req);
  716. err = ceph_osdc_wait_request(osdc, req);
  717. ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
  718. req->r_end_latency, len, err);
  719. fscrypt_free_bounce_page(bounce_page);
  720. ceph_osdc_put_request(req);
  721. if (err == 0)
  722. err = len;
  723. if (err < 0) {
  724. struct writeback_control tmp_wbc;
  725. if (!wbc)
  726. wbc = &tmp_wbc;
  727. if (err == -ERESTARTSYS) {
  728. /* killed by SIGKILL */
  729. doutc(cl, "%llx.%llx interrupted page %p\n",
  730. ceph_vinop(inode), folio);
  731. folio_redirty_for_writepage(wbc, folio);
  732. folio_end_writeback(folio);
  733. return err;
  734. }
  735. if (err == -EBLOCKLISTED)
  736. fsc->blocklisted = true;
  737. doutc(cl, "%llx.%llx setting mapping error %d %p\n",
  738. ceph_vinop(inode), err, folio);
  739. mapping_set_error(&inode->i_data, err);
  740. wbc->pages_skipped++;
  741. } else {
  742. doutc(cl, "%llx.%llx cleaned page %p\n",
  743. ceph_vinop(inode), folio);
  744. err = 0; /* vfs expects us to return 0 */
  745. }
  746. oldest = folio_detach_private(folio);
  747. WARN_ON_ONCE(oldest != snapc);
  748. folio_end_writeback(folio);
  749. ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
  750. ceph_put_snap_context(snapc); /* page's reference */
  751. if (atomic_long_dec_return(&fsc->writeback_count) <
  752. CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
  753. fsc->write_congested = false;
  754. return err;
  755. }
  756. /*
  757. * async writeback completion handler.
  758. *
  759. * If we get an error, set the mapping error bit, but not the individual
  760. * page error bits.
  761. */
  762. static void writepages_finish(struct ceph_osd_request *req)
  763. {
  764. struct inode *inode = req->r_inode;
  765. struct ceph_inode_info *ci = ceph_inode(inode);
  766. struct ceph_client *cl = ceph_inode_to_client(inode);
  767. struct ceph_osd_data *osd_data;
  768. struct page *page;
  769. int num_pages, total_pages = 0;
  770. int i, j;
  771. int rc = req->r_result;
  772. struct ceph_snap_context *snapc = req->r_snapc;
  773. struct address_space *mapping = inode->i_mapping;
  774. struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
  775. struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
  776. unsigned int len = 0;
  777. bool remove_page;
  778. doutc(cl, "%llx.%llx rc %d\n", ceph_vinop(inode), rc);
  779. if (rc < 0) {
  780. mapping_set_error(mapping, rc);
  781. ceph_set_error_write(ci);
  782. if (rc == -EBLOCKLISTED)
  783. fsc->blocklisted = true;
  784. } else {
  785. ceph_clear_error_write(ci);
  786. }
  787. /*
  788. * We lost the cache cap, need to truncate the page before
  789. * it is unlocked, otherwise we'd truncate it later in the
  790. * page truncation thread, possibly losing some data that
  791. * raced its way in
  792. */
  793. remove_page = !(ceph_caps_issued(ci) &
  794. (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO));
  795. /* clean all pages */
  796. for (i = 0; i < req->r_num_ops; i++) {
  797. if (req->r_ops[i].op != CEPH_OSD_OP_WRITE) {
  798. pr_warn_client(cl,
  799. "%llx.%llx incorrect op %d req %p index %d tid %llu\n",
  800. ceph_vinop(inode), req->r_ops[i].op, req, i,
  801. req->r_tid);
  802. break;
  803. }
  804. osd_data = osd_req_op_extent_osd_data(req, i);
  805. BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
  806. len += osd_data->length;
  807. num_pages = calc_pages_for((u64)osd_data->alignment,
  808. (u64)osd_data->length);
  809. total_pages += num_pages;
  810. for (j = 0; j < num_pages; j++) {
  811. page = osd_data->pages[j];
  812. if (fscrypt_is_bounce_page(page)) {
  813. page = fscrypt_pagecache_page(page);
  814. fscrypt_free_bounce_page(osd_data->pages[j]);
  815. osd_data->pages[j] = page;
  816. }
  817. BUG_ON(!page);
  818. WARN_ON(!PageUptodate(page));
  819. if (atomic_long_dec_return(&fsc->writeback_count) <
  820. CONGESTION_OFF_THRESH(
  821. fsc->mount_options->congestion_kb))
  822. fsc->write_congested = false;
  823. ceph_put_snap_context(detach_page_private(page));
  824. end_page_writeback(page);
  825. if (atomic64_dec_return(&mdsc->dirty_folios) <= 0) {
  826. wake_up_all(&mdsc->flush_end_wq);
  827. WARN_ON(atomic64_read(&mdsc->dirty_folios) < 0);
  828. }
  829. doutc(cl, "unlocking %p\n", page);
  830. if (remove_page)
  831. generic_error_remove_folio(inode->i_mapping,
  832. page_folio(page));
  833. unlock_page(page);
  834. }
  835. doutc(cl, "%llx.%llx wrote %llu bytes cleaned %d pages\n",
  836. ceph_vinop(inode), osd_data->length,
  837. rc >= 0 ? num_pages : 0);
  838. release_pages(osd_data->pages, num_pages);
  839. }
  840. ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
  841. req->r_end_latency, len, rc);
  842. ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc);
  843. osd_data = osd_req_op_extent_osd_data(req, 0);
  844. if (osd_data->pages_from_pool)
  845. mempool_free(osd_data->pages, ceph_wb_pagevec_pool);
  846. else
  847. kfree(osd_data->pages);
  848. ceph_osdc_put_request(req);
  849. ceph_dec_osd_stopping_blocker(fsc->mdsc);
  850. }
  851. static inline
  852. bool is_forced_umount(struct address_space *mapping)
  853. {
  854. struct inode *inode = mapping->host;
  855. struct ceph_inode_info *ci = ceph_inode(inode);
  856. struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
  857. struct ceph_client *cl = fsc->client;
  858. if (ceph_inode_is_shutdown(inode)) {
  859. if (ci->i_wrbuffer_ref > 0) {
  860. pr_warn_ratelimited_client(cl,
  861. "%llx.%llx %lld forced umount\n",
  862. ceph_vinop(inode), ceph_ino(inode));
  863. }
  864. mapping_set_error(mapping, -EIO);
  865. return true;
  866. }
  867. return false;
  868. }
  869. static inline
  870. unsigned int ceph_define_write_size(struct address_space *mapping)
  871. {
  872. struct inode *inode = mapping->host;
  873. struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
  874. struct ceph_inode_info *ci = ceph_inode(inode);
  875. unsigned int wsize = ci->i_layout.stripe_unit;
  876. if (fsc->mount_options->wsize < wsize)
  877. wsize = fsc->mount_options->wsize;
  878. return wsize;
  879. }
  880. static inline
  881. void ceph_folio_batch_init(struct ceph_writeback_ctl *ceph_wbc)
  882. {
  883. folio_batch_init(&ceph_wbc->fbatch);
  884. ceph_wbc->processed_in_fbatch = 0;
  885. }
  886. static inline
  887. void ceph_folio_batch_reinit(struct ceph_writeback_ctl *ceph_wbc)
  888. {
  889. folio_batch_release(&ceph_wbc->fbatch);
  890. ceph_folio_batch_init(ceph_wbc);
  891. }
  892. static inline
  893. void ceph_init_writeback_ctl(struct address_space *mapping,
  894. struct writeback_control *wbc,
  895. struct ceph_writeback_ctl *ceph_wbc)
  896. {
  897. ceph_wbc->snapc = NULL;
  898. ceph_wbc->last_snapc = NULL;
  899. ceph_wbc->strip_unit_end = 0;
  900. ceph_wbc->wsize = ceph_define_write_size(mapping);
  901. ceph_wbc->nr_folios = 0;
  902. ceph_wbc->max_pages = 0;
  903. ceph_wbc->locked_pages = 0;
  904. ceph_wbc->done = false;
  905. ceph_wbc->should_loop = false;
  906. ceph_wbc->range_whole = false;
  907. ceph_wbc->start_index = wbc->range_cyclic ? mapping->writeback_index : 0;
  908. ceph_wbc->index = ceph_wbc->start_index;
  909. ceph_wbc->end = -1;
  910. ceph_wbc->tag = wbc_to_tag(wbc);
  911. ceph_wbc->op_idx = -1;
  912. ceph_wbc->num_ops = 0;
  913. ceph_wbc->offset = 0;
  914. ceph_wbc->len = 0;
  915. ceph_wbc->from_pool = false;
  916. ceph_folio_batch_init(ceph_wbc);
  917. ceph_wbc->pages = NULL;
  918. ceph_wbc->data_pages = NULL;
  919. }
  920. static inline
  921. int ceph_define_writeback_range(struct address_space *mapping,
  922. struct writeback_control *wbc,
  923. struct ceph_writeback_ctl *ceph_wbc)
  924. {
  925. struct inode *inode = mapping->host;
  926. struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
  927. struct ceph_client *cl = fsc->client;
  928. /* find oldest snap context with dirty data */
  929. ceph_wbc->snapc = get_oldest_context(inode, ceph_wbc, NULL);
  930. if (!ceph_wbc->snapc) {
  931. /* hmm, why does writepages get called when there
  932. is no dirty data? */
  933. doutc(cl, " no snap context with dirty data?\n");
  934. return -ENODATA;
  935. }
  936. doutc(cl, " oldest snapc is %p seq %lld (%d snaps)\n",
  937. ceph_wbc->snapc, ceph_wbc->snapc->seq,
  938. ceph_wbc->snapc->num_snaps);
  939. ceph_wbc->should_loop = false;
  940. if (ceph_wbc->head_snapc && ceph_wbc->snapc != ceph_wbc->last_snapc) {
  941. /* where to start/end? */
  942. if (wbc->range_cyclic) {
  943. ceph_wbc->index = ceph_wbc->start_index;
  944. ceph_wbc->end = -1;
  945. if (ceph_wbc->index > 0)
  946. ceph_wbc->should_loop = true;
  947. doutc(cl, " cyclic, start at %lu\n", ceph_wbc->index);
  948. } else {
  949. ceph_wbc->index = wbc->range_start >> PAGE_SHIFT;
  950. ceph_wbc->end = wbc->range_end >> PAGE_SHIFT;
  951. if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
  952. ceph_wbc->range_whole = true;
  953. doutc(cl, " not cyclic, %lu to %lu\n",
  954. ceph_wbc->index, ceph_wbc->end);
  955. }
  956. } else if (!ceph_wbc->head_snapc) {
  957. /* Do not respect wbc->range_{start,end}. Dirty pages
  958. * in that range can be associated with newer snapc.
  959. * They are not writeable until we write all dirty pages
  960. * associated with 'snapc' get written */
  961. if (ceph_wbc->index > 0)
  962. ceph_wbc->should_loop = true;
  963. doutc(cl, " non-head snapc, range whole\n");
  964. }
  965. ceph_put_snap_context(ceph_wbc->last_snapc);
  966. ceph_wbc->last_snapc = ceph_wbc->snapc;
  967. return 0;
  968. }
  969. static inline
  970. bool has_writeback_done(struct ceph_writeback_ctl *ceph_wbc)
  971. {
  972. return ceph_wbc->done && ceph_wbc->index > ceph_wbc->end;
  973. }
  974. static inline
  975. bool can_next_page_be_processed(struct ceph_writeback_ctl *ceph_wbc,
  976. unsigned index)
  977. {
  978. return index < ceph_wbc->nr_folios &&
  979. ceph_wbc->locked_pages < ceph_wbc->max_pages;
  980. }
  981. static
  982. int ceph_check_page_before_write(struct address_space *mapping,
  983. struct writeback_control *wbc,
  984. struct ceph_writeback_ctl *ceph_wbc,
  985. struct folio *folio)
  986. {
  987. struct inode *inode = mapping->host;
  988. struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
  989. struct ceph_client *cl = fsc->client;
  990. struct ceph_snap_context *pgsnapc;
  991. /* only dirty folios, or our accounting breaks */
  992. if (unlikely(!folio_test_dirty(folio) || folio->mapping != mapping)) {
  993. doutc(cl, "!dirty or !mapping %p\n", folio);
  994. return -ENODATA;
  995. }
  996. /* only if matching snap context */
  997. pgsnapc = page_snap_context(&folio->page);
  998. if (pgsnapc != ceph_wbc->snapc) {
  999. doutc(cl, "folio snapc %p %lld != oldest %p %lld\n",
  1000. pgsnapc, pgsnapc->seq,
  1001. ceph_wbc->snapc, ceph_wbc->snapc->seq);
  1002. if (!ceph_wbc->should_loop && !ceph_wbc->head_snapc &&
  1003. wbc->sync_mode != WB_SYNC_NONE)
  1004. ceph_wbc->should_loop = true;
  1005. return -ENODATA;
  1006. }
  1007. if (folio_pos(folio) >= ceph_wbc->i_size) {
  1008. doutc(cl, "folio at %lu beyond eof %llu\n",
  1009. folio->index, ceph_wbc->i_size);
  1010. if ((ceph_wbc->size_stable ||
  1011. folio_pos(folio) >= i_size_read(inode)) &&
  1012. folio_clear_dirty_for_io(folio))
  1013. folio_invalidate(folio, 0, folio_size(folio));
  1014. return -ENODATA;
  1015. }
  1016. if (ceph_wbc->strip_unit_end &&
  1017. (folio->index > ceph_wbc->strip_unit_end)) {
  1018. doutc(cl, "end of strip unit %p\n", folio);
  1019. return -E2BIG;
  1020. }
  1021. return 0;
  1022. }
  1023. static inline
  1024. void __ceph_allocate_page_array(struct ceph_writeback_ctl *ceph_wbc,
  1025. unsigned int max_pages)
  1026. {
  1027. ceph_wbc->pages = kmalloc_objs(*ceph_wbc->pages, max_pages, GFP_NOFS);
  1028. if (!ceph_wbc->pages) {
  1029. ceph_wbc->from_pool = true;
  1030. ceph_wbc->pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS);
  1031. BUG_ON(!ceph_wbc->pages);
  1032. }
  1033. }
  1034. static inline
  1035. void ceph_allocate_page_array(struct address_space *mapping,
  1036. struct ceph_writeback_ctl *ceph_wbc,
  1037. struct folio *folio)
  1038. {
  1039. struct inode *inode = mapping->host;
  1040. struct ceph_inode_info *ci = ceph_inode(inode);
  1041. u64 objnum;
  1042. u64 objoff;
  1043. u32 xlen;
  1044. /* prepare async write request */
  1045. ceph_wbc->offset = (u64)folio_pos(folio);
  1046. ceph_calc_file_object_mapping(&ci->i_layout,
  1047. ceph_wbc->offset, ceph_wbc->wsize,
  1048. &objnum, &objoff, &xlen);
  1049. ceph_wbc->num_ops = 1;
  1050. ceph_wbc->strip_unit_end = folio->index + ((xlen - 1) >> PAGE_SHIFT);
  1051. BUG_ON(ceph_wbc->pages);
  1052. ceph_wbc->max_pages = calc_pages_for(0, (u64)xlen);
  1053. __ceph_allocate_page_array(ceph_wbc, ceph_wbc->max_pages);
  1054. ceph_wbc->len = 0;
  1055. }
  1056. static inline
  1057. bool is_folio_index_contiguous(const struct ceph_writeback_ctl *ceph_wbc,
  1058. const struct folio *folio)
  1059. {
  1060. return folio->index == (ceph_wbc->offset + ceph_wbc->len) >> PAGE_SHIFT;
  1061. }
  1062. static inline
  1063. bool is_num_ops_too_big(struct ceph_writeback_ctl *ceph_wbc)
  1064. {
  1065. return ceph_wbc->num_ops >=
  1066. (ceph_wbc->from_pool ? CEPH_OSD_SLAB_OPS : CEPH_OSD_MAX_OPS);
  1067. }
  1068. static inline
  1069. bool is_write_congestion_happened(struct ceph_fs_client *fsc)
  1070. {
  1071. return atomic_long_inc_return(&fsc->writeback_count) >
  1072. CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb);
  1073. }
  1074. static inline int move_dirty_folio_in_page_array(struct address_space *mapping,
  1075. struct writeback_control *wbc,
  1076. struct ceph_writeback_ctl *ceph_wbc, struct folio *folio)
  1077. {
  1078. struct inode *inode = mapping->host;
  1079. struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
  1080. struct ceph_client *cl = fsc->client;
  1081. struct page **pages = ceph_wbc->pages;
  1082. unsigned int index = ceph_wbc->locked_pages;
  1083. gfp_t gfp_flags = ceph_wbc->locked_pages ? GFP_NOWAIT : GFP_NOFS;
  1084. if (IS_ENCRYPTED(inode)) {
  1085. pages[index] = fscrypt_encrypt_pagecache_blocks(folio,
  1086. PAGE_SIZE,
  1087. 0,
  1088. gfp_flags);
  1089. if (IS_ERR(pages[index])) {
  1090. int err = PTR_ERR(pages[index]);
  1091. if (err == -EINVAL) {
  1092. pr_err_client(cl, "inode->i_blkbits=%hhu\n",
  1093. inode->i_blkbits);
  1094. }
  1095. /* better not fail on first page! */
  1096. BUG_ON(ceph_wbc->locked_pages == 0);
  1097. pages[index] = NULL;
  1098. return err;
  1099. }
  1100. } else {
  1101. pages[index] = &folio->page;
  1102. }
  1103. ceph_wbc->locked_pages++;
  1104. return 0;
  1105. }
  1106. static
  1107. void ceph_process_folio_batch(struct address_space *mapping,
  1108. struct writeback_control *wbc,
  1109. struct ceph_writeback_ctl *ceph_wbc)
  1110. {
  1111. struct inode *inode = mapping->host;
  1112. struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
  1113. struct ceph_client *cl = fsc->client;
  1114. struct folio *folio = NULL;
  1115. unsigned i;
  1116. int rc;
  1117. for (i = 0; can_next_page_be_processed(ceph_wbc, i); i++) {
  1118. folio = ceph_wbc->fbatch.folios[i];
  1119. if (!folio)
  1120. continue;
  1121. doutc(cl, "? %p idx %lu, folio_test_writeback %#x, "
  1122. "folio_test_dirty %#x, folio_test_locked %#x\n",
  1123. folio, folio->index, folio_test_writeback(folio),
  1124. folio_test_dirty(folio),
  1125. folio_test_locked(folio));
  1126. if (folio_test_writeback(folio) ||
  1127. folio_test_private_2(folio) /* [DEPRECATED] */) {
  1128. doutc(cl, "waiting on writeback %p\n", folio);
  1129. folio_wait_writeback(folio);
  1130. folio_wait_private_2(folio); /* [DEPRECATED] */
  1131. continue;
  1132. }
  1133. if (ceph_wbc->locked_pages == 0)
  1134. folio_lock(folio);
  1135. else if (!folio_trylock(folio))
  1136. break;
  1137. rc = ceph_check_page_before_write(mapping, wbc,
  1138. ceph_wbc, folio);
  1139. if (rc == -ENODATA) {
  1140. folio_unlock(folio);
  1141. ceph_wbc->fbatch.folios[i] = NULL;
  1142. continue;
  1143. } else if (rc == -E2BIG) {
  1144. folio_unlock(folio);
  1145. break;
  1146. }
  1147. if (!folio_clear_dirty_for_io(folio)) {
  1148. doutc(cl, "%p !folio_clear_dirty_for_io\n", folio);
  1149. folio_unlock(folio);
  1150. ceph_wbc->fbatch.folios[i] = NULL;
  1151. continue;
  1152. }
  1153. /*
  1154. * We have something to write. If this is
  1155. * the first locked page this time through,
  1156. * calculate max possible write size and
  1157. * allocate a page array
  1158. */
  1159. if (ceph_wbc->locked_pages == 0) {
  1160. ceph_allocate_page_array(mapping, ceph_wbc, folio);
  1161. } else if (!is_folio_index_contiguous(ceph_wbc, folio)) {
  1162. if (is_num_ops_too_big(ceph_wbc)) {
  1163. folio_redirty_for_writepage(wbc, folio);
  1164. folio_unlock(folio);
  1165. break;
  1166. }
  1167. ceph_wbc->num_ops++;
  1168. ceph_wbc->offset = (u64)folio_pos(folio);
  1169. ceph_wbc->len = 0;
  1170. }
  1171. /* note position of first page in fbatch */
  1172. doutc(cl, "%llx.%llx will write folio %p idx %lu\n",
  1173. ceph_vinop(inode), folio, folio->index);
  1174. fsc->write_congested = is_write_congestion_happened(fsc);
  1175. rc = move_dirty_folio_in_page_array(mapping, wbc, ceph_wbc,
  1176. folio);
  1177. if (rc) {
  1178. folio_redirty_for_writepage(wbc, folio);
  1179. folio_unlock(folio);
  1180. break;
  1181. }
  1182. ceph_wbc->fbatch.folios[i] = NULL;
  1183. ceph_wbc->len += folio_size(folio);
  1184. }
  1185. ceph_wbc->processed_in_fbatch = i;
  1186. }
  1187. static inline
  1188. void ceph_shift_unused_folios_left(struct folio_batch *fbatch)
  1189. {
  1190. unsigned j, n = 0;
  1191. /* shift unused page to beginning of fbatch */
  1192. for (j = 0; j < folio_batch_count(fbatch); j++) {
  1193. if (!fbatch->folios[j])
  1194. continue;
  1195. if (n < j) {
  1196. fbatch->folios[n] = fbatch->folios[j];
  1197. }
  1198. n++;
  1199. }
  1200. fbatch->nr = n;
  1201. }
  1202. static
  1203. int ceph_submit_write(struct address_space *mapping,
  1204. struct writeback_control *wbc,
  1205. struct ceph_writeback_ctl *ceph_wbc)
  1206. {
  1207. struct inode *inode = mapping->host;
  1208. struct ceph_inode_info *ci = ceph_inode(inode);
  1209. struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
  1210. struct ceph_client *cl = fsc->client;
  1211. struct ceph_vino vino = ceph_vino(inode);
  1212. struct ceph_osd_request *req = NULL;
  1213. struct page *page = NULL;
  1214. bool caching = ceph_is_cache_enabled(inode);
  1215. u64 offset;
  1216. u64 len;
  1217. unsigned i;
  1218. new_request:
  1219. offset = ceph_fscrypt_page_offset(ceph_wbc->pages[0]);
  1220. len = ceph_wbc->wsize;
  1221. req = ceph_osdc_new_request(&fsc->client->osdc,
  1222. &ci->i_layout, vino,
  1223. offset, &len, 0, ceph_wbc->num_ops,
  1224. CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
  1225. ceph_wbc->snapc, ceph_wbc->truncate_seq,
  1226. ceph_wbc->truncate_size, false);
  1227. if (IS_ERR(req)) {
  1228. req = ceph_osdc_new_request(&fsc->client->osdc,
  1229. &ci->i_layout, vino,
  1230. offset, &len, 0,
  1231. min(ceph_wbc->num_ops,
  1232. CEPH_OSD_SLAB_OPS),
  1233. CEPH_OSD_OP_WRITE,
  1234. CEPH_OSD_FLAG_WRITE,
  1235. ceph_wbc->snapc,
  1236. ceph_wbc->truncate_seq,
  1237. ceph_wbc->truncate_size,
  1238. true);
  1239. BUG_ON(IS_ERR(req));
  1240. }
  1241. page = ceph_wbc->pages[ceph_wbc->locked_pages - 1];
  1242. BUG_ON(len < ceph_fscrypt_page_offset(page) + thp_size(page) - offset);
  1243. if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) {
  1244. for (i = 0; i < folio_batch_count(&ceph_wbc->fbatch); i++) {
  1245. struct folio *folio = ceph_wbc->fbatch.folios[i];
  1246. if (!folio)
  1247. continue;
  1248. page = &folio->page;
  1249. redirty_page_for_writepage(wbc, page);
  1250. unlock_page(page);
  1251. }
  1252. for (i = 0; i < ceph_wbc->locked_pages; i++) {
  1253. page = ceph_fscrypt_pagecache_page(ceph_wbc->pages[i]);
  1254. if (!page)
  1255. continue;
  1256. redirty_page_for_writepage(wbc, page);
  1257. unlock_page(page);
  1258. }
  1259. ceph_osdc_put_request(req);
  1260. return -EIO;
  1261. }
  1262. req->r_callback = writepages_finish;
  1263. req->r_inode = inode;
  1264. /* Format the osd request message and submit the write */
  1265. len = 0;
  1266. ceph_wbc->data_pages = ceph_wbc->pages;
  1267. ceph_wbc->op_idx = 0;
  1268. for (i = 0; i < ceph_wbc->locked_pages; i++) {
  1269. u64 cur_offset;
  1270. page = ceph_fscrypt_pagecache_page(ceph_wbc->pages[i]);
  1271. cur_offset = page_offset(page);
  1272. /*
  1273. * Discontinuity in page range? Ceph can handle that by just passing
  1274. * multiple extents in the write op.
  1275. */
  1276. if (offset + len != cur_offset) {
  1277. /* If it's full, stop here */
  1278. if (ceph_wbc->op_idx + 1 == req->r_num_ops)
  1279. break;
  1280. /* Kick off an fscache write with what we have so far. */
  1281. ceph_fscache_write_to_cache(inode, offset, len, caching);
  1282. /* Start a new extent */
  1283. osd_req_op_extent_dup_last(req, ceph_wbc->op_idx,
  1284. cur_offset - offset);
  1285. doutc(cl, "got pages at %llu~%llu\n", offset, len);
  1286. osd_req_op_extent_osd_data_pages(req, ceph_wbc->op_idx,
  1287. ceph_wbc->data_pages,
  1288. len, 0,
  1289. ceph_wbc->from_pool,
  1290. false);
  1291. osd_req_op_extent_update(req, ceph_wbc->op_idx, len);
  1292. len = 0;
  1293. offset = cur_offset;
  1294. ceph_wbc->data_pages = ceph_wbc->pages + i;
  1295. ceph_wbc->op_idx++;
  1296. }
  1297. set_page_writeback(page);
  1298. if (caching)
  1299. ceph_set_page_fscache(page);
  1300. len += thp_size(page);
  1301. }
  1302. ceph_fscache_write_to_cache(inode, offset, len, caching);
  1303. if (ceph_wbc->size_stable) {
  1304. len = min(len, ceph_wbc->i_size - offset);
  1305. } else if (i == ceph_wbc->locked_pages) {
  1306. /* writepages_finish() clears writeback pages
  1307. * according to the data length, so make sure
  1308. * data length covers all locked pages */
  1309. u64 min_len = len + 1 - thp_size(page);
  1310. len = get_writepages_data_length(inode,
  1311. ceph_wbc->pages[i - 1],
  1312. offset);
  1313. len = max(len, min_len);
  1314. }
  1315. if (IS_ENCRYPTED(inode))
  1316. len = round_up(len, CEPH_FSCRYPT_BLOCK_SIZE);
  1317. doutc(cl, "got pages at %llu~%llu\n", offset, len);
  1318. if (IS_ENCRYPTED(inode) &&
  1319. ((offset | len) & ~CEPH_FSCRYPT_BLOCK_MASK)) {
  1320. pr_warn_client(cl,
  1321. "bad encrypted write offset=%lld len=%llu\n",
  1322. offset, len);
  1323. }
  1324. osd_req_op_extent_osd_data_pages(req, ceph_wbc->op_idx,
  1325. ceph_wbc->data_pages, len,
  1326. 0, ceph_wbc->from_pool, false);
  1327. osd_req_op_extent_update(req, ceph_wbc->op_idx, len);
  1328. BUG_ON(ceph_wbc->op_idx + 1 != req->r_num_ops);
  1329. ceph_wbc->from_pool = false;
  1330. if (i < ceph_wbc->locked_pages) {
  1331. BUG_ON(ceph_wbc->num_ops <= req->r_num_ops);
  1332. ceph_wbc->num_ops -= req->r_num_ops;
  1333. ceph_wbc->locked_pages -= i;
  1334. /* allocate new pages array for next request */
  1335. ceph_wbc->data_pages = ceph_wbc->pages;
  1336. __ceph_allocate_page_array(ceph_wbc, ceph_wbc->locked_pages);
  1337. memcpy(ceph_wbc->pages, ceph_wbc->data_pages + i,
  1338. ceph_wbc->locked_pages * sizeof(*ceph_wbc->pages));
  1339. memset(ceph_wbc->data_pages + i, 0,
  1340. ceph_wbc->locked_pages * sizeof(*ceph_wbc->pages));
  1341. } else {
  1342. BUG_ON(ceph_wbc->num_ops != req->r_num_ops);
  1343. /* request message now owns the pages array */
  1344. ceph_wbc->pages = NULL;
  1345. }
  1346. req->r_mtime = inode_get_mtime(inode);
  1347. ceph_osdc_start_request(&fsc->client->osdc, req);
  1348. req = NULL;
  1349. wbc->nr_to_write -= i;
  1350. if (ceph_wbc->pages)
  1351. goto new_request;
  1352. return 0;
  1353. }
  1354. static
  1355. void ceph_wait_until_current_writes_complete(struct address_space *mapping,
  1356. struct writeback_control *wbc,
  1357. struct ceph_writeback_ctl *ceph_wbc)
  1358. {
  1359. struct page *page;
  1360. unsigned i, nr;
  1361. if (wbc->sync_mode != WB_SYNC_NONE &&
  1362. ceph_wbc->start_index == 0 && /* all dirty pages were checked */
  1363. !ceph_wbc->head_snapc) {
  1364. ceph_wbc->index = 0;
  1365. while ((ceph_wbc->index <= ceph_wbc->end) &&
  1366. (nr = filemap_get_folios_tag(mapping,
  1367. &ceph_wbc->index,
  1368. (pgoff_t)-1,
  1369. PAGECACHE_TAG_WRITEBACK,
  1370. &ceph_wbc->fbatch))) {
  1371. for (i = 0; i < nr; i++) {
  1372. page = &ceph_wbc->fbatch.folios[i]->page;
  1373. if (page_snap_context(page) != ceph_wbc->snapc)
  1374. continue;
  1375. wait_on_page_writeback(page);
  1376. }
  1377. folio_batch_release(&ceph_wbc->fbatch);
  1378. cond_resched();
  1379. }
  1380. }
  1381. }
  1382. /*
  1383. * initiate async writeback
  1384. */
  1385. static int ceph_writepages_start(struct address_space *mapping,
  1386. struct writeback_control *wbc)
  1387. {
  1388. struct inode *inode = mapping->host;
  1389. struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
  1390. struct ceph_client *cl = fsc->client;
  1391. struct ceph_writeback_ctl ceph_wbc;
  1392. int rc = 0;
  1393. if (wbc->sync_mode == WB_SYNC_NONE && fsc->write_congested)
  1394. return 0;
  1395. doutc(cl, "%llx.%llx (mode=%s)\n", ceph_vinop(inode),
  1396. wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
  1397. (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
  1398. if (is_forced_umount(mapping)) {
  1399. /* we're in a forced umount, don't write! */
  1400. return -EIO;
  1401. }
  1402. ceph_init_writeback_ctl(mapping, wbc, &ceph_wbc);
  1403. if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) {
  1404. rc = -EIO;
  1405. goto out;
  1406. }
  1407. retry:
  1408. rc = ceph_define_writeback_range(mapping, wbc, &ceph_wbc);
  1409. if (rc == -ENODATA) {
  1410. /* hmm, why does writepages get called when there
  1411. is no dirty data? */
  1412. rc = 0;
  1413. goto dec_osd_stopping_blocker;
  1414. }
  1415. if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
  1416. tag_pages_for_writeback(mapping, ceph_wbc.index, ceph_wbc.end);
  1417. while (!has_writeback_done(&ceph_wbc)) {
  1418. BUG_ON(ceph_wbc.locked_pages);
  1419. BUG_ON(ceph_wbc.pages);
  1420. ceph_wbc.max_pages = ceph_wbc.wsize >> PAGE_SHIFT;
  1421. get_more_pages:
  1422. ceph_folio_batch_reinit(&ceph_wbc);
  1423. ceph_wbc.nr_folios = filemap_get_folios_tag(mapping,
  1424. &ceph_wbc.index,
  1425. ceph_wbc.end,
  1426. ceph_wbc.tag,
  1427. &ceph_wbc.fbatch);
  1428. doutc(cl, "pagevec_lookup_range_tag for tag %#x got %d\n",
  1429. ceph_wbc.tag, ceph_wbc.nr_folios);
  1430. if (!ceph_wbc.nr_folios && !ceph_wbc.locked_pages)
  1431. break;
  1432. process_folio_batch:
  1433. ceph_process_folio_batch(mapping, wbc, &ceph_wbc);
  1434. ceph_shift_unused_folios_left(&ceph_wbc.fbatch);
  1435. /* did we get anything? */
  1436. if (!ceph_wbc.locked_pages)
  1437. goto release_folios;
  1438. if (ceph_wbc.processed_in_fbatch) {
  1439. if (folio_batch_count(&ceph_wbc.fbatch) == 0 &&
  1440. ceph_wbc.locked_pages < ceph_wbc.max_pages) {
  1441. doutc(cl, "reached end fbatch, trying for more\n");
  1442. goto get_more_pages;
  1443. }
  1444. }
  1445. rc = ceph_submit_write(mapping, wbc, &ceph_wbc);
  1446. if (rc)
  1447. goto release_folios;
  1448. ceph_wbc.locked_pages = 0;
  1449. ceph_wbc.strip_unit_end = 0;
  1450. if (folio_batch_count(&ceph_wbc.fbatch) > 0) {
  1451. ceph_wbc.nr_folios =
  1452. folio_batch_count(&ceph_wbc.fbatch);
  1453. goto process_folio_batch;
  1454. }
  1455. /*
  1456. * We stop writing back only if we are not doing
  1457. * integrity sync. In case of integrity sync we have to
  1458. * keep going until we have written all the pages
  1459. * we tagged for writeback prior to entering this loop.
  1460. */
  1461. if (wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE)
  1462. ceph_wbc.done = true;
  1463. release_folios:
  1464. doutc(cl, "folio_batch release on %d folios (%p)\n",
  1465. (int)ceph_wbc.fbatch.nr,
  1466. ceph_wbc.fbatch.nr ? ceph_wbc.fbatch.folios[0] : NULL);
  1467. folio_batch_release(&ceph_wbc.fbatch);
  1468. }
  1469. if (ceph_wbc.should_loop && !ceph_wbc.done) {
  1470. /* more to do; loop back to beginning of file */
  1471. doutc(cl, "looping back to beginning of file\n");
  1472. /* OK even when start_index == 0 */
  1473. ceph_wbc.end = ceph_wbc.start_index - 1;
  1474. /* to write dirty pages associated with next snapc,
  1475. * we need to wait until current writes complete */
  1476. ceph_wait_until_current_writes_complete(mapping, wbc, &ceph_wbc);
  1477. ceph_wbc.start_index = 0;
  1478. ceph_wbc.index = 0;
  1479. goto retry;
  1480. }
  1481. if (wbc->range_cyclic || (ceph_wbc.range_whole && wbc->nr_to_write > 0))
  1482. mapping->writeback_index = ceph_wbc.index;
  1483. dec_osd_stopping_blocker:
  1484. ceph_dec_osd_stopping_blocker(fsc->mdsc);
  1485. out:
  1486. ceph_put_snap_context(ceph_wbc.last_snapc);
  1487. doutc(cl, "%llx.%llx dend - startone, rc = %d\n", ceph_vinop(inode),
  1488. rc);
  1489. return rc;
  1490. }
  1491. /*
  1492. * See if a given @snapc is either writeable, or already written.
  1493. */
  1494. static int context_is_writeable_or_written(struct inode *inode,
  1495. struct ceph_snap_context *snapc)
  1496. {
  1497. struct ceph_snap_context *oldest = get_oldest_context(inode, NULL, NULL);
  1498. int ret = !oldest || snapc->seq <= oldest->seq;
  1499. ceph_put_snap_context(oldest);
  1500. return ret;
  1501. }
  1502. /**
  1503. * ceph_find_incompatible - find an incompatible context and return it
  1504. * @folio: folio being dirtied
  1505. *
  1506. * We are only allowed to write into/dirty a folio if the folio is
  1507. * clean, or already dirty within the same snap context. Returns a
  1508. * conflicting context if there is one, NULL if there isn't, or a
  1509. * negative error code on other errors.
  1510. *
  1511. * Must be called with folio lock held.
  1512. */
  1513. static struct ceph_snap_context *
  1514. ceph_find_incompatible(struct folio *folio)
  1515. {
  1516. struct inode *inode = folio->mapping->host;
  1517. struct ceph_client *cl = ceph_inode_to_client(inode);
  1518. struct ceph_inode_info *ci = ceph_inode(inode);
  1519. if (ceph_inode_is_shutdown(inode)) {
  1520. doutc(cl, " %llx.%llx folio %p is shutdown\n",
  1521. ceph_vinop(inode), folio);
  1522. return ERR_PTR(-ESTALE);
  1523. }
  1524. for (;;) {
  1525. struct ceph_snap_context *snapc, *oldest;
  1526. folio_wait_writeback(folio);
  1527. snapc = page_snap_context(&folio->page);
  1528. if (!snapc || snapc == ci->i_head_snapc)
  1529. break;
  1530. /*
  1531. * this folio is already dirty in another (older) snap
  1532. * context! is it writeable now?
  1533. */
  1534. oldest = get_oldest_context(inode, NULL, NULL);
  1535. if (snapc->seq > oldest->seq) {
  1536. /* not writeable -- return it for the caller to deal with */
  1537. ceph_put_snap_context(oldest);
  1538. doutc(cl, " %llx.%llx folio %p snapc %p not current or oldest\n",
  1539. ceph_vinop(inode), folio, snapc);
  1540. return ceph_get_snap_context(snapc);
  1541. }
  1542. ceph_put_snap_context(oldest);
  1543. /* yay, writeable, do it now (without dropping folio lock) */
  1544. doutc(cl, " %llx.%llx folio %p snapc %p not current, but oldest\n",
  1545. ceph_vinop(inode), folio, snapc);
  1546. if (folio_clear_dirty_for_io(folio)) {
  1547. int r = write_folio_nounlock(folio, NULL);
  1548. if (r < 0)
  1549. return ERR_PTR(r);
  1550. }
  1551. }
  1552. return NULL;
  1553. }
  1554. static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len,
  1555. struct folio **foliop, void **_fsdata)
  1556. {
  1557. struct inode *inode = file_inode(file);
  1558. struct ceph_inode_info *ci = ceph_inode(inode);
  1559. struct ceph_snap_context *snapc;
  1560. snapc = ceph_find_incompatible(*foliop);
  1561. if (snapc) {
  1562. int r;
  1563. folio_unlock(*foliop);
  1564. folio_put(*foliop);
  1565. *foliop = NULL;
  1566. if (IS_ERR(snapc))
  1567. return PTR_ERR(snapc);
  1568. ceph_queue_writeback(inode);
  1569. r = wait_event_killable(ci->i_cap_wq,
  1570. context_is_writeable_or_written(inode, snapc));
  1571. ceph_put_snap_context(snapc);
  1572. return r == 0 ? -EAGAIN : r;
  1573. }
  1574. return 0;
  1575. }
  1576. /*
  1577. * We are only allowed to write into/dirty the page if the page is
  1578. * clean, or already dirty within the same snap context.
  1579. */
  1580. static int ceph_write_begin(const struct kiocb *iocb,
  1581. struct address_space *mapping,
  1582. loff_t pos, unsigned len,
  1583. struct folio **foliop, void **fsdata)
  1584. {
  1585. struct file *file = iocb->ki_filp;
  1586. struct inode *inode = file_inode(file);
  1587. struct ceph_inode_info *ci = ceph_inode(inode);
  1588. int r;
  1589. r = netfs_write_begin(&ci->netfs, file, inode->i_mapping, pos, len, foliop, NULL);
  1590. if (r < 0)
  1591. return r;
  1592. folio_wait_private_2(*foliop); /* [DEPRECATED] */
  1593. WARN_ON_ONCE(!folio_test_locked(*foliop));
  1594. return 0;
  1595. }
  1596. /*
  1597. * we don't do anything in here that simple_write_end doesn't do
  1598. * except adjust dirty page accounting
  1599. */
  1600. static int ceph_write_end(const struct kiocb *iocb,
  1601. struct address_space *mapping, loff_t pos,
  1602. unsigned len, unsigned copied,
  1603. struct folio *folio, void *fsdata)
  1604. {
  1605. struct file *file = iocb->ki_filp;
  1606. struct inode *inode = file_inode(file);
  1607. struct ceph_client *cl = ceph_inode_to_client(inode);
  1608. bool check_cap = false;
  1609. doutc(cl, "%llx.%llx file %p folio %p %d~%d (%d)\n", ceph_vinop(inode),
  1610. file, folio, (int)pos, (int)copied, (int)len);
  1611. if (!folio_test_uptodate(folio)) {
  1612. /* just return that nothing was copied on a short copy */
  1613. if (copied < len) {
  1614. copied = 0;
  1615. goto out;
  1616. }
  1617. folio_mark_uptodate(folio);
  1618. }
  1619. /* did file size increase? */
  1620. if (pos+copied > i_size_read(inode))
  1621. check_cap = ceph_inode_set_size(inode, pos+copied);
  1622. folio_mark_dirty(folio);
  1623. out:
  1624. folio_unlock(folio);
  1625. folio_put(folio);
  1626. if (check_cap)
  1627. ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY);
  1628. return copied;
  1629. }
  1630. const struct address_space_operations ceph_aops = {
  1631. .read_folio = netfs_read_folio,
  1632. .readahead = netfs_readahead,
  1633. .writepages = ceph_writepages_start,
  1634. .write_begin = ceph_write_begin,
  1635. .write_end = ceph_write_end,
  1636. .dirty_folio = ceph_dirty_folio,
  1637. .invalidate_folio = ceph_invalidate_folio,
  1638. .release_folio = netfs_release_folio,
  1639. .direct_IO = noop_direct_IO,
  1640. .migrate_folio = filemap_migrate_folio,
  1641. };
  1642. static void ceph_block_sigs(sigset_t *oldset)
  1643. {
  1644. sigset_t mask;
  1645. siginitsetinv(&mask, sigmask(SIGKILL));
  1646. sigprocmask(SIG_BLOCK, &mask, oldset);
  1647. }
  1648. static void ceph_restore_sigs(sigset_t *oldset)
  1649. {
  1650. sigprocmask(SIG_SETMASK, oldset, NULL);
  1651. }
  1652. /*
  1653. * vm ops
  1654. */
  1655. static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
  1656. {
  1657. struct vm_area_struct *vma = vmf->vma;
  1658. struct inode *inode = file_inode(vma->vm_file);
  1659. struct ceph_inode_info *ci = ceph_inode(inode);
  1660. struct ceph_client *cl = ceph_inode_to_client(inode);
  1661. struct ceph_file_info *fi = vma->vm_file->private_data;
  1662. loff_t off = (loff_t)vmf->pgoff << PAGE_SHIFT;
  1663. int want, got, err;
  1664. sigset_t oldset;
  1665. vm_fault_t ret = VM_FAULT_SIGBUS;
  1666. if (ceph_inode_is_shutdown(inode))
  1667. return ret;
  1668. ceph_block_sigs(&oldset);
  1669. doutc(cl, "%llx.%llx %llu trying to get caps\n",
  1670. ceph_vinop(inode), off);
  1671. if (fi->fmode & CEPH_FILE_MODE_LAZY)
  1672. want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
  1673. else
  1674. want = CEPH_CAP_FILE_CACHE;
  1675. got = 0;
  1676. err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_RD, want, -1, &got);
  1677. if (err < 0)
  1678. goto out_restore;
  1679. doutc(cl, "%llx.%llx %llu got cap refs on %s\n", ceph_vinop(inode),
  1680. off, ceph_cap_string(got));
  1681. if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) ||
  1682. !ceph_has_inline_data(ci)) {
  1683. CEPH_DEFINE_RW_CONTEXT(rw_ctx, got);
  1684. ceph_add_rw_context(fi, &rw_ctx);
  1685. ret = filemap_fault(vmf);
  1686. ceph_del_rw_context(fi, &rw_ctx);
  1687. doutc(cl, "%llx.%llx %llu drop cap refs %s ret %x\n",
  1688. ceph_vinop(inode), off, ceph_cap_string(got), ret);
  1689. } else
  1690. err = -EAGAIN;
  1691. ceph_put_cap_refs(ci, got);
  1692. if (err != -EAGAIN)
  1693. goto out_restore;
  1694. /* read inline data */
  1695. if (off >= PAGE_SIZE) {
  1696. /* does not support inline data > PAGE_SIZE */
  1697. ret = VM_FAULT_SIGBUS;
  1698. } else {
  1699. struct address_space *mapping = inode->i_mapping;
  1700. struct page *page;
  1701. filemap_invalidate_lock_shared(mapping);
  1702. page = find_or_create_page(mapping, 0,
  1703. mapping_gfp_constraint(mapping, ~__GFP_FS));
  1704. if (!page) {
  1705. ret = VM_FAULT_OOM;
  1706. goto out_inline;
  1707. }
  1708. err = __ceph_do_getattr(inode, page,
  1709. CEPH_STAT_CAP_INLINE_DATA, true);
  1710. if (err < 0 || off >= i_size_read(inode)) {
  1711. unlock_page(page);
  1712. put_page(page);
  1713. ret = vmf_error(err);
  1714. goto out_inline;
  1715. }
  1716. if (err < PAGE_SIZE)
  1717. zero_user_segment(page, err, PAGE_SIZE);
  1718. else
  1719. flush_dcache_page(page);
  1720. SetPageUptodate(page);
  1721. vmf->page = page;
  1722. ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED;
  1723. out_inline:
  1724. filemap_invalidate_unlock_shared(mapping);
  1725. doutc(cl, "%llx.%llx %llu read inline data ret %x\n",
  1726. ceph_vinop(inode), off, ret);
  1727. }
  1728. out_restore:
  1729. ceph_restore_sigs(&oldset);
  1730. if (err < 0)
  1731. ret = vmf_error(err);
  1732. return ret;
  1733. }
  1734. static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
  1735. {
  1736. struct vm_area_struct *vma = vmf->vma;
  1737. struct inode *inode = file_inode(vma->vm_file);
  1738. struct ceph_client *cl = ceph_inode_to_client(inode);
  1739. struct ceph_inode_info *ci = ceph_inode(inode);
  1740. struct ceph_file_info *fi = vma->vm_file->private_data;
  1741. struct ceph_cap_flush *prealloc_cf;
  1742. struct folio *folio = page_folio(vmf->page);
  1743. loff_t off = folio_pos(folio);
  1744. loff_t size = i_size_read(inode);
  1745. size_t len;
  1746. int want, got, err;
  1747. sigset_t oldset;
  1748. vm_fault_t ret = VM_FAULT_SIGBUS;
  1749. if (ceph_inode_is_shutdown(inode))
  1750. return ret;
  1751. prealloc_cf = ceph_alloc_cap_flush();
  1752. if (!prealloc_cf)
  1753. return VM_FAULT_OOM;
  1754. sb_start_pagefault(inode->i_sb);
  1755. ceph_block_sigs(&oldset);
  1756. if (off + folio_size(folio) <= size)
  1757. len = folio_size(folio);
  1758. else
  1759. len = offset_in_folio(folio, size);
  1760. doutc(cl, "%llx.%llx %llu~%zd getting caps i_size %llu\n",
  1761. ceph_vinop(inode), off, len, size);
  1762. if (fi->fmode & CEPH_FILE_MODE_LAZY)
  1763. want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
  1764. else
  1765. want = CEPH_CAP_FILE_BUFFER;
  1766. got = 0;
  1767. err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_WR, want, off + len, &got);
  1768. if (err < 0)
  1769. goto out_free;
  1770. doutc(cl, "%llx.%llx %llu~%zd got cap refs on %s\n", ceph_vinop(inode),
  1771. off, len, ceph_cap_string(got));
  1772. /* Update time before taking folio lock */
  1773. file_update_time(vma->vm_file);
  1774. inode_inc_iversion_raw(inode);
  1775. do {
  1776. struct ceph_snap_context *snapc;
  1777. folio_lock(folio);
  1778. if (folio_mkwrite_check_truncate(folio, inode) < 0) {
  1779. folio_unlock(folio);
  1780. ret = VM_FAULT_NOPAGE;
  1781. break;
  1782. }
  1783. snapc = ceph_find_incompatible(folio);
  1784. if (!snapc) {
  1785. /* success. we'll keep the folio locked. */
  1786. folio_mark_dirty(folio);
  1787. ret = VM_FAULT_LOCKED;
  1788. break;
  1789. }
  1790. folio_unlock(folio);
  1791. if (IS_ERR(snapc)) {
  1792. ret = VM_FAULT_SIGBUS;
  1793. break;
  1794. }
  1795. ceph_queue_writeback(inode);
  1796. err = wait_event_killable(ci->i_cap_wq,
  1797. context_is_writeable_or_written(inode, snapc));
  1798. ceph_put_snap_context(snapc);
  1799. } while (err == 0);
  1800. if (ret == VM_FAULT_LOCKED) {
  1801. int dirty;
  1802. spin_lock(&ci->i_ceph_lock);
  1803. dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
  1804. &prealloc_cf);
  1805. spin_unlock(&ci->i_ceph_lock);
  1806. if (dirty)
  1807. __mark_inode_dirty(inode, dirty);
  1808. }
  1809. doutc(cl, "%llx.%llx %llu~%zd dropping cap refs on %s ret %x\n",
  1810. ceph_vinop(inode), off, len, ceph_cap_string(got), ret);
  1811. ceph_put_cap_refs_async(ci, got);
  1812. out_free:
  1813. ceph_restore_sigs(&oldset);
  1814. sb_end_pagefault(inode->i_sb);
  1815. ceph_free_cap_flush(prealloc_cf);
  1816. if (err < 0)
  1817. ret = vmf_error(err);
  1818. return ret;
  1819. }
  1820. void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
  1821. char *data, size_t len)
  1822. {
  1823. struct ceph_client *cl = ceph_inode_to_client(inode);
  1824. struct address_space *mapping = inode->i_mapping;
  1825. struct page *page;
  1826. if (locked_page) {
  1827. page = locked_page;
  1828. } else {
  1829. if (i_size_read(inode) == 0)
  1830. return;
  1831. page = find_or_create_page(mapping, 0,
  1832. mapping_gfp_constraint(mapping,
  1833. ~__GFP_FS));
  1834. if (!page)
  1835. return;
  1836. if (PageUptodate(page)) {
  1837. unlock_page(page);
  1838. put_page(page);
  1839. return;
  1840. }
  1841. }
  1842. doutc(cl, "%p %llx.%llx len %zu locked_page %p\n", inode,
  1843. ceph_vinop(inode), len, locked_page);
  1844. if (len > 0) {
  1845. void *kaddr = kmap_atomic(page);
  1846. memcpy(kaddr, data, len);
  1847. kunmap_atomic(kaddr);
  1848. }
  1849. if (page != locked_page) {
  1850. if (len < PAGE_SIZE)
  1851. zero_user_segment(page, len, PAGE_SIZE);
  1852. else
  1853. flush_dcache_page(page);
  1854. SetPageUptodate(page);
  1855. unlock_page(page);
  1856. put_page(page);
  1857. }
  1858. }
  1859. int ceph_uninline_data(struct file *file)
  1860. {
  1861. struct inode *inode = file_inode(file);
  1862. struct ceph_inode_info *ci = ceph_inode(inode);
  1863. struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
  1864. struct ceph_client *cl = fsc->client;
  1865. struct ceph_osd_request *req = NULL;
  1866. struct ceph_cap_flush *prealloc_cf = NULL;
  1867. struct folio *folio = NULL;
  1868. struct ceph_snap_context *snapc = NULL;
  1869. u64 inline_version = CEPH_INLINE_NONE;
  1870. struct page *pages[1];
  1871. int err = 0;
  1872. u64 len;
  1873. spin_lock(&ci->i_ceph_lock);
  1874. inline_version = ci->i_inline_version;
  1875. spin_unlock(&ci->i_ceph_lock);
  1876. doutc(cl, "%llx.%llx inline_version %llu\n", ceph_vinop(inode),
  1877. inline_version);
  1878. if (ceph_inode_is_shutdown(inode)) {
  1879. err = -EIO;
  1880. goto out;
  1881. }
  1882. if (inline_version == CEPH_INLINE_NONE)
  1883. return 0;
  1884. prealloc_cf = ceph_alloc_cap_flush();
  1885. if (!prealloc_cf)
  1886. return -ENOMEM;
  1887. if (inline_version == 1) /* initial version, no data */
  1888. goto out_uninline;
  1889. down_read(&fsc->mdsc->snap_rwsem);
  1890. spin_lock(&ci->i_ceph_lock);
  1891. if (__ceph_have_pending_cap_snap(ci)) {
  1892. struct ceph_cap_snap *capsnap =
  1893. list_last_entry(&ci->i_cap_snaps,
  1894. struct ceph_cap_snap,
  1895. ci_item);
  1896. snapc = ceph_get_snap_context(capsnap->context);
  1897. } else {
  1898. if (!ci->i_head_snapc) {
  1899. ci->i_head_snapc = ceph_get_snap_context(
  1900. ci->i_snap_realm->cached_context);
  1901. }
  1902. snapc = ceph_get_snap_context(ci->i_head_snapc);
  1903. }
  1904. spin_unlock(&ci->i_ceph_lock);
  1905. up_read(&fsc->mdsc->snap_rwsem);
  1906. folio = read_mapping_folio(inode->i_mapping, 0, file);
  1907. if (IS_ERR(folio)) {
  1908. err = PTR_ERR(folio);
  1909. goto out;
  1910. }
  1911. folio_lock(folio);
  1912. len = i_size_read(inode);
  1913. if (len > folio_size(folio))
  1914. len = folio_size(folio);
  1915. req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
  1916. ceph_vino(inode), 0, &len, 0, 1,
  1917. CEPH_OSD_OP_CREATE, CEPH_OSD_FLAG_WRITE,
  1918. snapc, 0, 0, false);
  1919. if (IS_ERR(req)) {
  1920. err = PTR_ERR(req);
  1921. goto out_unlock;
  1922. }
  1923. req->r_mtime = inode_get_mtime(inode);
  1924. ceph_osdc_start_request(&fsc->client->osdc, req);
  1925. err = ceph_osdc_wait_request(&fsc->client->osdc, req);
  1926. ceph_osdc_put_request(req);
  1927. if (err < 0)
  1928. goto out_unlock;
  1929. req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
  1930. ceph_vino(inode), 0, &len, 1, 3,
  1931. CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
  1932. snapc, ci->i_truncate_seq,
  1933. ci->i_truncate_size, false);
  1934. if (IS_ERR(req)) {
  1935. err = PTR_ERR(req);
  1936. goto out_unlock;
  1937. }
  1938. pages[0] = folio_page(folio, 0);
  1939. osd_req_op_extent_osd_data_pages(req, 1, pages, len, 0, false, false);
  1940. {
  1941. __le64 xattr_buf = cpu_to_le64(inline_version);
  1942. err = osd_req_op_xattr_init(req, 0, CEPH_OSD_OP_CMPXATTR,
  1943. "inline_version", &xattr_buf,
  1944. sizeof(xattr_buf),
  1945. CEPH_OSD_CMPXATTR_OP_GT,
  1946. CEPH_OSD_CMPXATTR_MODE_U64);
  1947. if (err)
  1948. goto out_put_req;
  1949. }
  1950. {
  1951. char xattr_buf[32];
  1952. int xattr_len = snprintf(xattr_buf, sizeof(xattr_buf),
  1953. "%llu", inline_version);
  1954. err = osd_req_op_xattr_init(req, 2, CEPH_OSD_OP_SETXATTR,
  1955. "inline_version",
  1956. xattr_buf, xattr_len, 0, 0);
  1957. if (err)
  1958. goto out_put_req;
  1959. }
  1960. req->r_mtime = inode_get_mtime(inode);
  1961. ceph_osdc_start_request(&fsc->client->osdc, req);
  1962. err = ceph_osdc_wait_request(&fsc->client->osdc, req);
  1963. ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
  1964. req->r_end_latency, len, err);
  1965. out_uninline:
  1966. if (!err) {
  1967. int dirty;
  1968. /* Set to CAP_INLINE_NONE and dirty the caps */
  1969. down_read(&fsc->mdsc->snap_rwsem);
  1970. spin_lock(&ci->i_ceph_lock);
  1971. ci->i_inline_version = CEPH_INLINE_NONE;
  1972. dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, &prealloc_cf);
  1973. spin_unlock(&ci->i_ceph_lock);
  1974. up_read(&fsc->mdsc->snap_rwsem);
  1975. if (dirty)
  1976. __mark_inode_dirty(inode, dirty);
  1977. }
  1978. out_put_req:
  1979. ceph_osdc_put_request(req);
  1980. if (err == -ECANCELED)
  1981. err = 0;
  1982. out_unlock:
  1983. if (folio) {
  1984. folio_unlock(folio);
  1985. folio_put(folio);
  1986. }
  1987. out:
  1988. ceph_put_snap_context(snapc);
  1989. ceph_free_cap_flush(prealloc_cf);
  1990. doutc(cl, "%llx.%llx inline_version %llu = %d\n",
  1991. ceph_vinop(inode), inline_version, err);
  1992. return err;
  1993. }
  1994. static const struct vm_operations_struct ceph_vmops = {
  1995. .fault = ceph_filemap_fault,
  1996. .page_mkwrite = ceph_page_mkwrite,
  1997. };
  1998. int ceph_mmap_prepare(struct vm_area_desc *desc)
  1999. {
  2000. struct address_space *mapping = desc->file->f_mapping;
  2001. if (!mapping->a_ops->read_folio)
  2002. return -ENOEXEC;
  2003. desc->vm_ops = &ceph_vmops;
  2004. return 0;
  2005. }
  /*
   * Pool permission bits as probed by __ceph_pool_perm_get(); they are
   * or-ed together in its return value.
   */
  enum {
	POOL_READ	= 1 << 0,
	POOL_WRITE	= 1 << 1,
  };
  2010. static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
  2011. s64 pool, struct ceph_string *pool_ns)
  2012. {
  2013. struct ceph_fs_client *fsc = ceph_inode_to_fs_client(&ci->netfs.inode);
  2014. struct ceph_mds_client *mdsc = fsc->mdsc;
  2015. struct ceph_client *cl = fsc->client;
  2016. struct ceph_osd_request *rd_req = NULL, *wr_req = NULL;
  2017. struct rb_node **p, *parent;
  2018. struct ceph_pool_perm *perm;
  2019. struct page **pages;
  2020. size_t pool_ns_len;
  2021. int err = 0, err2 = 0, have = 0;
  2022. down_read(&mdsc->pool_perm_rwsem);
  2023. p = &mdsc->pool_perm_tree.rb_node;
  2024. while (*p) {
  2025. perm = rb_entry(*p, struct ceph_pool_perm, node);
  2026. if (pool < perm->pool)
  2027. p = &(*p)->rb_left;
  2028. else if (pool > perm->pool)
  2029. p = &(*p)->rb_right;
  2030. else {
  2031. int ret = ceph_compare_string(pool_ns,
  2032. perm->pool_ns,
  2033. perm->pool_ns_len);
  2034. if (ret < 0)
  2035. p = &(*p)->rb_left;
  2036. else if (ret > 0)
  2037. p = &(*p)->rb_right;
  2038. else {
  2039. have = perm->perm;
  2040. break;
  2041. }
  2042. }
  2043. }
  2044. up_read(&mdsc->pool_perm_rwsem);
  2045. if (*p)
  2046. goto out;
  2047. if (pool_ns)
  2048. doutc(cl, "pool %lld ns %.*s no perm cached\n", pool,
  2049. (int)pool_ns->len, pool_ns->str);
  2050. else
  2051. doutc(cl, "pool %lld no perm cached\n", pool);
  2052. down_write(&mdsc->pool_perm_rwsem);
  2053. p = &mdsc->pool_perm_tree.rb_node;
  2054. parent = NULL;
  2055. while (*p) {
  2056. parent = *p;
  2057. perm = rb_entry(parent, struct ceph_pool_perm, node);
  2058. if (pool < perm->pool)
  2059. p = &(*p)->rb_left;
  2060. else if (pool > perm->pool)
  2061. p = &(*p)->rb_right;
  2062. else {
  2063. int ret = ceph_compare_string(pool_ns,
  2064. perm->pool_ns,
  2065. perm->pool_ns_len);
  2066. if (ret < 0)
  2067. p = &(*p)->rb_left;
  2068. else if (ret > 0)
  2069. p = &(*p)->rb_right;
  2070. else {
  2071. have = perm->perm;
  2072. break;
  2073. }
  2074. }
  2075. }
  2076. if (*p) {
  2077. up_write(&mdsc->pool_perm_rwsem);
  2078. goto out;
  2079. }
  2080. rd_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL,
  2081. 1, false, GFP_NOFS);
  2082. if (!rd_req) {
  2083. err = -ENOMEM;
  2084. goto out_unlock;
  2085. }
  2086. rd_req->r_flags = CEPH_OSD_FLAG_READ;
  2087. osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0);
  2088. rd_req->r_base_oloc.pool = pool;
  2089. if (pool_ns)
  2090. rd_req->r_base_oloc.pool_ns = ceph_get_string(pool_ns);
  2091. ceph_oid_printf(&rd_req->r_base_oid, "%llx.00000000", ci->i_vino.ino);
  2092. err = ceph_osdc_alloc_messages(rd_req, GFP_NOFS);
  2093. if (err)
  2094. goto out_unlock;
  2095. wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL,
  2096. 1, false, GFP_NOFS);
  2097. if (!wr_req) {
  2098. err = -ENOMEM;
  2099. goto out_unlock;
  2100. }
  2101. wr_req->r_flags = CEPH_OSD_FLAG_WRITE;
  2102. osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL);
  2103. ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc);
  2104. ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid);
  2105. err = ceph_osdc_alloc_messages(wr_req, GFP_NOFS);
  2106. if (err)
  2107. goto out_unlock;
  2108. /* one page should be large enough for STAT data */
  2109. pages = ceph_alloc_page_vector(1, GFP_KERNEL);
  2110. if (IS_ERR(pages)) {
  2111. err = PTR_ERR(pages);
  2112. goto out_unlock;
  2113. }
  2114. osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE,
  2115. 0, false, true);
  2116. ceph_osdc_start_request(&fsc->client->osdc, rd_req);
  2117. wr_req->r_mtime = inode_get_mtime(&ci->netfs.inode);
  2118. ceph_osdc_start_request(&fsc->client->osdc, wr_req);
  2119. err = ceph_osdc_wait_request(&fsc->client->osdc, rd_req);
  2120. err2 = ceph_osdc_wait_request(&fsc->client->osdc, wr_req);
  2121. if (err >= 0 || err == -ENOENT)
  2122. have |= POOL_READ;
  2123. else if (err != -EPERM) {
  2124. if (err == -EBLOCKLISTED)
  2125. fsc->blocklisted = true;
  2126. goto out_unlock;
  2127. }
  2128. if (err2 == 0 || err2 == -EEXIST)
  2129. have |= POOL_WRITE;
  2130. else if (err2 != -EPERM) {
  2131. if (err2 == -EBLOCKLISTED)
  2132. fsc->blocklisted = true;
  2133. err = err2;
  2134. goto out_unlock;
  2135. }
  2136. pool_ns_len = pool_ns ? pool_ns->len : 0;
  2137. perm = kmalloc_flex(*perm, pool_ns, pool_ns_len + 1, GFP_NOFS);
  2138. if (!perm) {
  2139. err = -ENOMEM;
  2140. goto out_unlock;
  2141. }
  2142. perm->pool = pool;
  2143. perm->perm = have;
  2144. perm->pool_ns_len = pool_ns_len;
  2145. if (pool_ns_len > 0)
  2146. memcpy(perm->pool_ns, pool_ns->str, pool_ns_len);
  2147. perm->pool_ns[pool_ns_len] = 0;
  2148. rb_link_node(&perm->node, parent, p);
  2149. rb_insert_color(&perm->node, &mdsc->pool_perm_tree);
  2150. err = 0;
  2151. out_unlock:
  2152. up_write(&mdsc->pool_perm_rwsem);
  2153. ceph_osdc_put_request(rd_req);
  2154. ceph_osdc_put_request(wr_req);
  2155. out:
  2156. if (!err)
  2157. err = have;
  2158. if (pool_ns)
  2159. doutc(cl, "pool %lld ns %.*s result = %d\n", pool,
  2160. (int)pool_ns->len, pool_ns->str, err);
  2161. else
  2162. doutc(cl, "pool %lld result = %d\n", pool, err);
  2163. return err;
  2164. }
  2165. int ceph_pool_perm_check(struct inode *inode, int need)
  2166. {
  2167. struct ceph_client *cl = ceph_inode_to_client(inode);
  2168. struct ceph_inode_info *ci = ceph_inode(inode);
  2169. struct ceph_string *pool_ns;
  2170. s64 pool;
  2171. int ret, flags;
  2172. /* Only need to do this for regular files */
  2173. if (!S_ISREG(inode->i_mode))
  2174. return 0;
  2175. if (ci->i_vino.snap != CEPH_NOSNAP) {
  2176. /*
  2177. * Pool permission check needs to write to the first object.
  2178. * But for snapshot, head of the first object may have already
  2179. * been deleted. Skip check to avoid creating orphan object.
  2180. */
  2181. return 0;
  2182. }
  2183. if (ceph_test_mount_opt(ceph_inode_to_fs_client(inode),
  2184. NOPOOLPERM))
  2185. return 0;
  2186. spin_lock(&ci->i_ceph_lock);
  2187. flags = ci->i_ceph_flags;
  2188. pool = ci->i_layout.pool_id;
  2189. spin_unlock(&ci->i_ceph_lock);
  2190. check:
  2191. if (flags & CEPH_I_POOL_PERM) {
  2192. if ((need & CEPH_CAP_FILE_RD) && !(flags & CEPH_I_POOL_RD)) {
  2193. doutc(cl, "pool %lld no read perm\n", pool);
  2194. return -EPERM;
  2195. }
  2196. if ((need & CEPH_CAP_FILE_WR) && !(flags & CEPH_I_POOL_WR)) {
  2197. doutc(cl, "pool %lld no write perm\n", pool);
  2198. return -EPERM;
  2199. }
  2200. return 0;
  2201. }
  2202. pool_ns = ceph_try_get_string(ci->i_layout.pool_ns);
  2203. ret = __ceph_pool_perm_get(ci, pool, pool_ns);
  2204. ceph_put_string(pool_ns);
  2205. if (ret < 0)
  2206. return ret;
  2207. flags = CEPH_I_POOL_PERM;
  2208. if (ret & POOL_READ)
  2209. flags |= CEPH_I_POOL_RD;
  2210. if (ret & POOL_WRITE)
  2211. flags |= CEPH_I_POOL_WR;
  2212. spin_lock(&ci->i_ceph_lock);
  2213. if (pool == ci->i_layout.pool_id &&
  2214. pool_ns == rcu_dereference_raw(ci->i_layout.pool_ns)) {
  2215. ci->i_ceph_flags |= flags;
  2216. } else {
  2217. pool = ci->i_layout.pool_id;
  2218. flags = ci->i_ceph_flags;
  2219. }
  2220. spin_unlock(&ci->i_ceph_lock);
  2221. goto check;
  2222. }
  2223. void ceph_pool_perm_destroy(struct ceph_mds_client *mdsc)
  2224. {
  2225. struct ceph_pool_perm *perm;
  2226. struct rb_node *n;
  2227. while (!RB_EMPTY_ROOT(&mdsc->pool_perm_tree)) {
  2228. n = rb_first(&mdsc->pool_perm_tree);
  2229. perm = rb_entry(n, struct ceph_pool_perm, node);
  2230. rb_erase(n, &mdsc->pool_perm_tree);
  2231. kfree(perm);
  2232. }
  2233. }