raid56.c 83 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
7277827792780278127822783278427852786278727882789279027912792279327942795279627972798279928002801280228032804280528062807280828092810281128122813281428152816281728182819282028212822282328242825282628272828282928302831283228332834283528362837283828392840284128422843284428452846284728482849285028512852285328542855285628572858285928602861286228632864286528662867286828692870287128722873287428752876287728782879288028812882288328842885288628872888288928902891289228932894289528962897289828992900290129022903290429052906290729082909291029112912291329142915291629172918291929202921292229232924292529262927292829292930293129322933293429352936293729382939294029412942294329442945294629472948294929502951295229532954295529562957295829592960296129622963296429652966296729682969297029712972297329742975297629772978297929802981298229832984298529862987298829892990299129922993299429952996299729982999300030013002300330043005300630073008300930103011301230133014301530163017301830193020302130223023302430253026302730283029303030313032303330343035303630373038303930403041304230433044304530463047304830493050305130523053305430553056305730583059306030613062306330643065306630673068
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Copyright (C) 2012 Fusion-io All rights reserved.
  4. * Copyright (C) 2012 Intel Corp. All rights reserved.
  5. */
  6. #include <linux/sched.h>
  7. #include <linux/bio.h>
  8. #include <linux/slab.h>
  9. #include <linux/blkdev.h>
  10. #include <linux/raid/pq.h>
  11. #include <linux/hash.h>
  12. #include <linux/list_sort.h>
  13. #include <linux/raid/xor.h>
  14. #include <linux/mm.h>
  15. #include "messages.h"
  16. #include "ctree.h"
  17. #include "disk-io.h"
  18. #include "volumes.h"
  19. #include "raid56.h"
  20. #include "async-thread.h"
  21. #include "file-item.h"
  22. #include "btrfs_inode.h"
  23. /* Bit number in rbio->flags: set when additional merges to this rbio are not allowed. */
  24. #define RBIO_RMW_LOCKED_BIT 1
  25. /*
  26. * Bit number in rbio->flags: set when this rbio is sitting in the hash,
  27. * but it is just a cache of past RMW.
  28. */
  29. #define RBIO_CACHE_BIT 2
  30. /*
  31. * Bit number in rbio->flags: set when it is safe to trust the stripe_pages for caching.
  32. */
  33. #define RBIO_CACHE_READY_BIT 3
/*
 * Limit for table->cache_size: once it is exceeded, cache_rbio() prunes the
 * entry at the tail of the cache list (unless that entry is the rbio being
 * inserted).
 */
  34. #define RBIO_CACHE_SIZE 1024
/* The stripe hash table has (1 << BTRFS_STRIPE_HASH_TABLE_BITS) buckets. */
  35. #define BTRFS_STRIPE_HASH_TABLE_BITS 11
  36. static void dump_bioc(const struct btrfs_fs_info *fs_info, const struct btrfs_io_context *bioc)
  37. {
  38. if (unlikely(!bioc)) {
  39. btrfs_crit(fs_info, "bioc=NULL");
  40. return;
  41. }
  42. btrfs_crit(fs_info,
  43. "bioc logical=%llu full_stripe=%llu size=%llu map_type=0x%llx mirror=%u replace_nr_stripes=%u replace_stripe_src=%d num_stripes=%u",
  44. bioc->logical, bioc->full_stripe_logical, bioc->size,
  45. bioc->map_type, bioc->mirror_num, bioc->replace_nr_stripes,
  46. bioc->replace_stripe_src, bioc->num_stripes);
  47. for (int i = 0; i < bioc->num_stripes; i++) {
  48. btrfs_crit(fs_info, " nr=%d devid=%llu physical=%llu",
  49. i, bioc->stripes[i].dev->devid,
  50. bioc->stripes[i].physical);
  51. }
  52. }
  53. static void btrfs_dump_rbio(const struct btrfs_fs_info *fs_info,
  54. const struct btrfs_raid_bio *rbio)
  55. {
  56. if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
  57. return;
  58. dump_bioc(fs_info, rbio->bioc);
  59. btrfs_crit(fs_info,
  60. "rbio flags=0x%lx nr_sectors=%u nr_data=%u real_stripes=%u stripe_nsectors=%u sector_nsteps=%u scrubp=%u dbitmap=0x%lx",
  61. rbio->flags, rbio->nr_sectors, rbio->nr_data,
  62. rbio->real_stripes, rbio->stripe_nsectors,
  63. rbio->sector_nsteps, rbio->scrubp, rbio->dbitmap);
  64. }
  65. #define ASSERT_RBIO(expr, rbio) \
  66. ({ \
  67. if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) { \
  68. const struct btrfs_fs_info *__fs_info = (rbio)->bioc ? \
  69. (rbio)->bioc->fs_info : NULL; \
  70. \
  71. btrfs_dump_rbio(__fs_info, (rbio)); \
  72. } \
  73. ASSERT((expr)); \
  74. })
  75. #define ASSERT_RBIO_STRIPE(expr, rbio, stripe_nr) \
  76. ({ \
  77. if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) { \
  78. const struct btrfs_fs_info *__fs_info = (rbio)->bioc ? \
  79. (rbio)->bioc->fs_info : NULL; \
  80. \
  81. btrfs_dump_rbio(__fs_info, (rbio)); \
  82. btrfs_crit(__fs_info, "stripe_nr=%d", (stripe_nr)); \
  83. } \
  84. ASSERT((expr)); \
  85. })
  86. #define ASSERT_RBIO_SECTOR(expr, rbio, sector_nr) \
  87. ({ \
  88. if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) { \
  89. const struct btrfs_fs_info *__fs_info = (rbio)->bioc ? \
  90. (rbio)->bioc->fs_info : NULL; \
  91. \
  92. btrfs_dump_rbio(__fs_info, (rbio)); \
  93. btrfs_crit(__fs_info, "sector_nr=%d", (sector_nr)); \
  94. } \
  95. ASSERT((expr)); \
  96. })
  97. #define ASSERT_RBIO_LOGICAL(expr, rbio, logical) \
  98. ({ \
  99. if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) { \
  100. const struct btrfs_fs_info *__fs_info = (rbio)->bioc ? \
  101. (rbio)->bioc->fs_info : NULL; \
  102. \
  103. btrfs_dump_rbio(__fs_info, (rbio)); \
  104. btrfs_crit(__fs_info, "logical=%llu", (logical)); \
  105. } \
  106. ASSERT((expr)); \
  107. })
  108. /* One bucket of the stripe hash table, used by the raid56 code to lock stripes for read/modify/write. */
  109. struct btrfs_stripe_hash {
/* Head of this bucket's list; presumably rbios are linked in through rbio->hash_list (insertion is not shown here). */
  110. struct list_head hash_list;
/* Bucket lock; __remove_rbio_from_cache() takes it before rbio->bio_list_lock. */
  111. spinlock_t lock;
  112. };
  113. /* The whole stripe hash table, used by the raid56 code to lock stripes for read/modify/write and to cache past rbios. */
  114. struct btrfs_stripe_hash_table {
/* List of cached rbios, linked through rbio->stripe_cache; most recently cached entries are at the head. */
  115. struct list_head stripe_cache;
/* Held while stripe_cache and cache_size are modified. */
  116. spinlock_t cache_lock;
/* Number of rbios currently on stripe_cache. */
  117. int cache_size;
/* Flexible array of hash buckets, indexed by rbio_bucket(). */
  118. struct btrfs_stripe_hash table[];
  119. };
  120. /*
  121. * Marker for a paddr array entry that has no address assigned.
  122. *
  123. * The PFN may still be valid, but our paddrs should always be block size
  123. * aligned, thus such an all-ones (-1) paddr is definitely not a valid one.
  123. */
  124. #define INVALID_PADDR (~(phys_addr_t)0)
/* Forward declarations of static helpers so they can be referenced ahead of their definitions. */
  125. static void rmw_rbio_work(struct work_struct *work);
  126. static void rmw_rbio_work_locked(struct work_struct *work);
  127. static void index_rbio_pages(struct btrfs_raid_bio *rbio);
  128. static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
  129. static int finish_parity_scrub(struct btrfs_raid_bio *rbio);
  130. static void scrub_rbio_work_locked(struct work_struct *work);
  131. static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio)
  132. {
  133. bitmap_free(rbio->error_bitmap);
  134. bitmap_free(rbio->stripe_uptodate_bitmap);
  135. kfree(rbio->stripe_pages);
  136. kfree(rbio->bio_paddrs);
  137. kfree(rbio->stripe_paddrs);
  138. kfree(rbio->finish_pointers);
  139. }
  140. static void free_raid_bio(struct btrfs_raid_bio *rbio)
  141. {
  142. int i;
  143. if (!refcount_dec_and_test(&rbio->refs))
  144. return;
  145. WARN_ON(!list_empty(&rbio->stripe_cache));
  146. WARN_ON(!list_empty(&rbio->hash_list));
  147. WARN_ON(!bio_list_empty(&rbio->bio_list));
  148. for (i = 0; i < rbio->nr_pages; i++) {
  149. if (rbio->stripe_pages[i]) {
  150. __free_page(rbio->stripe_pages[i]);
  151. rbio->stripe_pages[i] = NULL;
  152. }
  153. }
  154. btrfs_put_bioc(rbio->bioc);
  155. free_raid_bio_pointers(rbio);
  156. kfree(rbio);
  157. }
  158. static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func)
  159. {
  160. INIT_WORK(&rbio->work, work_func);
  161. queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
  162. }
  163. /*
  164. * the stripe hash table is used for locking, and to collect
  165. * bios in hopes of making a full stripe
  166. */
  167. int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
  168. {
  169. struct btrfs_stripe_hash_table *table;
  170. struct btrfs_stripe_hash_table *x;
  171. struct btrfs_stripe_hash *cur;
  172. struct btrfs_stripe_hash *h;
  173. unsigned int num_entries = 1U << BTRFS_STRIPE_HASH_TABLE_BITS;
  174. if (info->stripe_hash_table)
  175. return 0;
  176. /*
  177. * The table is large, starting with order 4 and can go as high as
  178. * order 7 in case lock debugging is turned on.
  179. *
  180. * Try harder to allocate and fallback to vmalloc to lower the chance
  181. * of a failing mount.
  182. */
  183. table = kvzalloc_flex(*table, table, num_entries);
  184. if (!table)
  185. return -ENOMEM;
  186. spin_lock_init(&table->cache_lock);
  187. INIT_LIST_HEAD(&table->stripe_cache);
  188. h = table->table;
  189. for (unsigned int i = 0; i < num_entries; i++) {
  190. cur = h + i;
  191. INIT_LIST_HEAD(&cur->hash_list);
  192. spin_lock_init(&cur->lock);
  193. }
  194. x = cmpxchg(&info->stripe_hash_table, NULL, table);
  195. kvfree(x);
  196. return 0;
  197. }
  198. static void memcpy_from_bio_to_stripe(struct btrfs_raid_bio *rbio, unsigned int sector_nr)
  199. {
  200. const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE);
  201. ASSERT(sector_nr < rbio->nr_sectors);
  202. for (int i = 0; i < rbio->sector_nsteps; i++) {
  203. unsigned int index = sector_nr * rbio->sector_nsteps + i;
  204. phys_addr_t dst = rbio->stripe_paddrs[index];
  205. phys_addr_t src = rbio->bio_paddrs[index];
  206. ASSERT(dst != INVALID_PADDR);
  207. ASSERT(src != INVALID_PADDR);
  208. memcpy_page(phys_to_page(dst), offset_in_page(dst),
  209. phys_to_page(src), offset_in_page(src), step);
  210. }
  211. }
  212. /*
  213. * caching an rbio means to copy anything from the
  214. * bio_sectors array into the stripe_pages array. We
  215. * use the page uptodate bit in the stripe cache array
  216. * to indicate if it has valid data
  217. *
  218. * once the caching is done, we set the cache ready
  219. * bit.
  220. */
  221. static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
  222. {
  223. int i;
  224. int ret;
  225. ret = alloc_rbio_pages(rbio);
  226. if (ret)
  227. return;
  228. for (i = 0; i < rbio->nr_sectors; i++) {
  229. /* Some range not covered by bio (partial write), skip it */
  230. if (rbio->bio_paddrs[i * rbio->sector_nsteps] == INVALID_PADDR) {
  231. /*
  232. * Even if the sector is not covered by bio, if it is
  233. * a data sector it should still be uptodate as it is
  234. * read from disk.
  235. */
  236. if (i < rbio->nr_data * rbio->stripe_nsectors)
  237. ASSERT(test_bit(i, rbio->stripe_uptodate_bitmap));
  238. continue;
  239. }
  240. memcpy_from_bio_to_stripe(rbio, i);
  241. set_bit(i, rbio->stripe_uptodate_bitmap);
  242. }
  243. set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
  244. }
  245. /*
  246. * we hash on the first logical address of the stripe
  247. */
  248. static int rbio_bucket(struct btrfs_raid_bio *rbio)
  249. {
  250. u64 num = rbio->bioc->full_stripe_logical;
  251. /*
  252. * we shift down quite a bit. We're using byte
  253. * addressing, and most of the lower bits are zeros.
  254. * This tends to upset hash_64, and it consistently
  255. * returns just one or two different values.
  256. *
  257. * shifting off the lower bits fixes things.
  258. */
  259. return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
  260. }
  261. /* Get the sector number of the first sector covered by @page_nr. */
  262. static u32 page_nr_to_sector_nr(struct btrfs_raid_bio *rbio, unsigned int page_nr)
  263. {
  264. u32 sector_nr;
  265. ASSERT(page_nr < rbio->nr_pages);
  266. sector_nr = (page_nr << PAGE_SHIFT) >> rbio->bioc->fs_info->sectorsize_bits;
  267. ASSERT(sector_nr < rbio->nr_sectors);
  268. return sector_nr;
  269. }
  270. /*
  271. * Get the number of sectors covered by @page_nr.
  272. *
  273. * For bs > ps cases, the result will always be 1.
  274. * For bs <= ps cases, the result will be ps / bs.
  275. */
  276. static u32 page_nr_to_num_sectors(struct btrfs_raid_bio *rbio, unsigned int page_nr)
  277. {
  278. struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
  279. u32 nr_sectors;
  280. ASSERT(page_nr < rbio->nr_pages);
  281. nr_sectors = round_up(PAGE_SIZE, fs_info->sectorsize) >> fs_info->sectorsize_bits;
  282. ASSERT(nr_sectors > 0);
  283. return nr_sectors;
  284. }
  285. static __maybe_unused bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio,
  286. unsigned int page_nr)
  287. {
  288. const u32 sector_nr = page_nr_to_sector_nr(rbio, page_nr);
  289. const u32 nr_bits = page_nr_to_num_sectors(rbio, page_nr);
  290. int i;
  291. ASSERT(page_nr < rbio->nr_pages);
  292. ASSERT(sector_nr + nr_bits < rbio->nr_sectors);
  293. for (i = sector_nr; i < sector_nr + nr_bits; i++) {
  294. if (!test_bit(i, rbio->stripe_uptodate_bitmap))
  295. return false;
  296. }
  297. return true;
  298. }
  299. /*
  300. * Update the stripe_sectors[] array to use correct page and pgoff
  301. *
  302. * Should be called every time any page pointer in stripes_pages[] got modified.
  303. */
  304. static void index_stripe_sectors(struct btrfs_raid_bio *rbio)
  305. {
  306. const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE);
  307. u32 offset;
  308. int i;
  309. for (i = 0, offset = 0; i < rbio->nr_sectors * rbio->sector_nsteps;
  310. i++, offset += step) {
  311. int page_index = offset >> PAGE_SHIFT;
  312. ASSERT(page_index < rbio->nr_pages);
  313. if (!rbio->stripe_pages[page_index])
  314. continue;
  315. rbio->stripe_paddrs[i] = page_to_phys(rbio->stripe_pages[page_index]) +
  316. offset_in_page(offset);
  317. }
  318. }
  319. static void steal_rbio_page(struct btrfs_raid_bio *src,
  320. struct btrfs_raid_bio *dest, int page_nr)
  321. {
  322. const u32 sector_nr = page_nr_to_sector_nr(src, page_nr);
  323. const u32 nr_bits = page_nr_to_num_sectors(src, page_nr);
  324. ASSERT(page_nr < src->nr_pages);
  325. ASSERT(sector_nr + nr_bits < src->nr_sectors);
  326. if (dest->stripe_pages[page_nr])
  327. __free_page(dest->stripe_pages[page_nr]);
  328. dest->stripe_pages[page_nr] = src->stripe_pages[page_nr];
  329. src->stripe_pages[page_nr] = NULL;
  330. /* Also update the stripe_uptodate_bitmap bits. */
  331. bitmap_set(dest->stripe_uptodate_bitmap, sector_nr, nr_bits);
  332. }
  333. static bool is_data_stripe_page(struct btrfs_raid_bio *rbio, int page_nr)
  334. {
  335. const int sector_nr = page_nr_to_sector_nr(rbio, page_nr);
  336. /*
  337. * We have ensured PAGE_SIZE is aligned with sectorsize, thus
  338. * we won't have a page which is half data half parity.
  339. *
  340. * Thus if the first sector of the page belongs to data stripes, then
  341. * the full page belongs to data stripes.
  342. */
  343. return (sector_nr < rbio->nr_data * rbio->stripe_nsectors);
  344. }
  345. /*
  346. * Stealing an rbio means taking all the uptodate pages from the stripe array
  347. * in the source rbio and putting them into the destination rbio.
  348. *
  349. * This will also update the involved stripe_sectors[] which are referring to
  350. * the old pages.
  351. */
  352. static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
  353. {
  354. int i;
  355. if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
  356. return;
  357. for (i = 0; i < dest->nr_pages; i++) {
  358. struct page *p = src->stripe_pages[i];
  359. /*
  360. * We don't need to steal P/Q pages as they will always be
  361. * regenerated for RMW or full write anyway.
  362. */
  363. if (!is_data_stripe_page(src, i))
  364. continue;
  365. /*
  366. * If @src already has RBIO_CACHE_READY_BIT, it should have
  367. * all data stripe pages present and uptodate.
  368. */
  369. ASSERT(p);
  370. ASSERT(full_page_sectors_uptodate(src, i));
  371. steal_rbio_page(src, dest, i);
  372. }
  373. index_stripe_sectors(dest);
  374. index_stripe_sectors(src);
  375. }
  376. /*
  377. * merging means we take the bio_list from the victim and
  378. * splice it into the destination. The victim should
  379. * be discarded afterwards.
  380. *
  381. * must be called with dest->rbio_list_lock held
  382. */
  383. static void merge_rbio(struct btrfs_raid_bio *dest,
  384. struct btrfs_raid_bio *victim)
  385. {
  386. bio_list_merge_init(&dest->bio_list, &victim->bio_list);
  387. dest->bio_list_bytes += victim->bio_list_bytes;
  388. /* Also inherit the bitmaps from @victim. */
  389. bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap,
  390. dest->stripe_nsectors);
  391. }
  392. /*
  393. * used to prune items that are in the cache. The caller
  394. * must hold the hash table lock.
  395. */
  396. static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
  397. {
  398. int bucket = rbio_bucket(rbio);
  399. struct btrfs_stripe_hash_table *table;
  400. struct btrfs_stripe_hash *h;
  401. int freeit = 0;
  402. /*
  403. * check the bit again under the hash table lock.
  404. */
  405. if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
  406. return;
  407. table = rbio->bioc->fs_info->stripe_hash_table;
  408. h = table->table + bucket;
  409. /* hold the lock for the bucket because we may be
  410. * removing it from the hash table
  411. */
  412. spin_lock(&h->lock);
  413. /*
  414. * hold the lock for the bio list because we need
  415. * to make sure the bio list is empty
  416. */
  417. spin_lock(&rbio->bio_list_lock);
  418. if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
  419. list_del_init(&rbio->stripe_cache);
  420. table->cache_size -= 1;
  421. freeit = 1;
  422. /* if the bio list isn't empty, this rbio is
  423. * still involved in an IO. We take it out
  424. * of the cache list, and drop the ref that
  425. * was held for the list.
  426. *
  427. * If the bio_list was empty, we also remove
  428. * the rbio from the hash_table, and drop
  429. * the corresponding ref
  430. */
  431. if (bio_list_empty(&rbio->bio_list)) {
  432. if (!list_empty(&rbio->hash_list)) {
  433. list_del_init(&rbio->hash_list);
  434. refcount_dec(&rbio->refs);
  435. BUG_ON(!list_empty(&rbio->plug_list));
  436. }
  437. }
  438. }
  439. spin_unlock(&rbio->bio_list_lock);
  440. spin_unlock(&h->lock);
  441. if (freeit)
  442. free_raid_bio(rbio);
  443. }
  444. /*
  445. * prune a given rbio from the cache
  446. */
  447. static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
  448. {
  449. struct btrfs_stripe_hash_table *table;
  450. if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
  451. return;
  452. table = rbio->bioc->fs_info->stripe_hash_table;
  453. spin_lock(&table->cache_lock);
  454. __remove_rbio_from_cache(rbio);
  455. spin_unlock(&table->cache_lock);
  456. }
  457. /*
  458. * remove everything in the cache
  459. */
  460. static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
  461. {
  462. struct btrfs_stripe_hash_table *table;
  463. struct btrfs_raid_bio *rbio;
  464. table = info->stripe_hash_table;
  465. spin_lock(&table->cache_lock);
  466. while (!list_empty(&table->stripe_cache)) {
  467. rbio = list_first_entry(&table->stripe_cache,
  468. struct btrfs_raid_bio, stripe_cache);
  469. __remove_rbio_from_cache(rbio);
  470. }
  471. spin_unlock(&table->cache_lock);
  472. }
  473. /*
  474. * remove all cached entries and free the hash table
  475. * used by unmount
  476. */
  477. void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
  478. {
  479. if (!info->stripe_hash_table)
  480. return;
  481. btrfs_clear_rbio_cache(info);
  482. kvfree(info->stripe_hash_table);
  483. info->stripe_hash_table = NULL;
  484. }
  485. /*
  486. * insert an rbio into the stripe cache. It
  487. * must have already been prepared by calling
  488. * cache_rbio_pages
  489. *
  490. * If this rbio was already cached, it gets
  491. * moved to the front of the lru.
  492. *
  493. * If the size of the rbio cache is too big, we
  494. * prune an item.
  495. */
  496. static void cache_rbio(struct btrfs_raid_bio *rbio)
  497. {
  498. struct btrfs_stripe_hash_table *table;
  499. if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
  500. return;
  501. table = rbio->bioc->fs_info->stripe_hash_table;
  502. spin_lock(&table->cache_lock);
  503. spin_lock(&rbio->bio_list_lock);
  504. /* bump our ref if we were not in the list before */
  505. if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
  506. refcount_inc(&rbio->refs);
  507. if (!list_empty(&rbio->stripe_cache)){
  508. list_move(&rbio->stripe_cache, &table->stripe_cache);
  509. } else {
  510. list_add(&rbio->stripe_cache, &table->stripe_cache);
  511. table->cache_size += 1;
  512. }
  513. spin_unlock(&rbio->bio_list_lock);
  514. if (table->cache_size > RBIO_CACHE_SIZE) {
  515. struct btrfs_raid_bio *found;
  516. found = list_last_entry(&table->stripe_cache,
  517. struct btrfs_raid_bio,
  518. stripe_cache);
  519. if (found != rbio)
  520. __remove_rbio_from_cache(found);
  521. }
  522. spin_unlock(&table->cache_lock);
  523. }
  524. /*
  525. * helper function to run the xor_blocks api. It is only
  526. * able to do MAX_XOR_BLOCKS at a time, so we need to
  527. * loop through.
  528. */
  529. static void run_xor(void **pages, int src_cnt, ssize_t len)
  530. {
  531. int src_off = 0;
  532. int xor_src_cnt = 0;
  533. void *dest = pages[src_cnt];
  534. while(src_cnt > 0) {
  535. xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
  536. xor_blocks(xor_src_cnt, len, dest, pages + src_off);
  537. src_cnt -= xor_src_cnt;
  538. src_off += xor_src_cnt;
  539. }
  540. }
  541. /*
  542. * Returns true if the bio list inside this rbio covers an entire stripe (no
  543. * rmw required).
  544. */
  545. static int rbio_is_full(struct btrfs_raid_bio *rbio)
  546. {
  547. unsigned long size = rbio->bio_list_bytes;
  548. int ret = 1;
  549. spin_lock(&rbio->bio_list_lock);
  550. if (size != rbio->nr_data * BTRFS_STRIPE_LEN)
  551. ret = 0;
  552. BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN);
  553. spin_unlock(&rbio->bio_list_lock);
  554. return ret;
  555. }
  556. /*
  557. * returns 1 if it is safe to merge two rbios together.
  558. * The merging is safe if the two rbios correspond to
  559. * the same stripe and if they are both going in the same
  560. * direction (read vs write), and if neither one is
  561. * locked for final IO
  562. *
  563. * The caller is responsible for locking such that
  564. * rmw_locked is safe to test
  565. */
  566. static int rbio_can_merge(struct btrfs_raid_bio *last,
  567. struct btrfs_raid_bio *cur)
  568. {
  569. if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
  570. test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
  571. return 0;
  572. /*
  573. * we can't merge with cached rbios, since the
  574. * idea is that when we merge the destination
  575. * rbio is going to run our IO for us. We can
  576. * steal from cached rbios though, other functions
  577. * handle that.
  578. */
  579. if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
  580. test_bit(RBIO_CACHE_BIT, &cur->flags))
  581. return 0;
  582. if (last->bioc->full_stripe_logical != cur->bioc->full_stripe_logical)
  583. return 0;
  584. /* we can't merge with different operations */
  585. if (last->operation != cur->operation)
  586. return 0;
  587. /*
  588. * We've need read the full stripe from the drive.
  589. * check and repair the parity and write the new results.
  590. *
  591. * We're not allowed to add any new bios to the
  592. * bio list here, anyone else that wants to
  593. * change this stripe needs to do their own rmw.
  594. */
  595. if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
  596. return 0;
  597. if (last->operation == BTRFS_RBIO_READ_REBUILD)
  598. return 0;
  599. return 1;
  600. }
  601. /* Return the sector index for @stripe_nr and @sector_nr. */
  602. static unsigned int rbio_sector_index(const struct btrfs_raid_bio *rbio,
  603. unsigned int stripe_nr,
  604. unsigned int sector_nr)
  605. {
  606. unsigned int ret;
  607. ASSERT_RBIO_STRIPE(stripe_nr < rbio->real_stripes, rbio, stripe_nr);
  608. ASSERT_RBIO_SECTOR(sector_nr < rbio->stripe_nsectors, rbio, sector_nr);
  609. ret = stripe_nr * rbio->stripe_nsectors + sector_nr;
  610. ASSERT(ret < rbio->nr_sectors);
  611. return ret;
  612. }
  613. /* Return the paddr array index for @stripe_nr, @sector_nr and @step_nr. */
  614. static unsigned int rbio_paddr_index(const struct btrfs_raid_bio *rbio,
  615. unsigned int stripe_nr,
  616. unsigned int sector_nr,
  617. unsigned int step_nr)
  618. {
  619. unsigned int ret;
  620. ASSERT_RBIO_SECTOR(step_nr < rbio->sector_nsteps, rbio, step_nr);
  621. ret = rbio_sector_index(rbio, stripe_nr, sector_nr) * rbio->sector_nsteps + step_nr;
  622. ASSERT(ret < rbio->nr_sectors * rbio->sector_nsteps);
  623. return ret;
  624. }
  625. static phys_addr_t rbio_stripe_paddr(const struct btrfs_raid_bio *rbio,
  626. unsigned int stripe_nr, unsigned int sector_nr,
  627. unsigned int step_nr)
  628. {
  629. return rbio->stripe_paddrs[rbio_paddr_index(rbio, stripe_nr, sector_nr, step_nr)];
  630. }
  631. static phys_addr_t rbio_pstripe_paddr(const struct btrfs_raid_bio *rbio,
  632. unsigned int sector_nr, unsigned int step_nr)
  633. {
  634. return rbio_stripe_paddr(rbio, rbio->nr_data, sector_nr, step_nr);
  635. }
  636. static phys_addr_t rbio_qstripe_paddr(const struct btrfs_raid_bio *rbio,
  637. unsigned int sector_nr, unsigned int step_nr)
  638. {
  639. if (rbio->nr_data + 1 == rbio->real_stripes)
  640. return INVALID_PADDR;
  641. return rbio_stripe_paddr(rbio, rbio->nr_data + 1, sector_nr, step_nr);
  642. }
  643. /* Return a paddr pointer into the rbio::stripe_paddrs[] for the specified sector. */
  644. static phys_addr_t *rbio_stripe_paddrs(const struct btrfs_raid_bio *rbio,
  645. unsigned int stripe_nr, unsigned int sector_nr)
  646. {
  647. return &rbio->stripe_paddrs[rbio_paddr_index(rbio, stripe_nr, sector_nr, 0)];
  648. }
  649. /*
  650. * The first stripe in the table for a logical address
  651. * has the lock. rbios are added in one of three ways:
  652. *
  653. * 1) Nobody has the stripe locked yet. The rbio is given
  654. * the lock and 0 is returned. The caller must start the IO
  655. * themselves.
  656. *
  657. * 2) Someone has the stripe locked, but we're able to merge
  658. * with the lock owner. The rbio is freed and the IO will
  659. * start automatically along with the existing rbio. 1 is returned.
  660. *
  661. * 3) Someone has the stripe locked, but we're not able to merge.
  662. * The rbio is added to the lock owner's plug list, or merged into
  663. * an rbio already on the plug list. When the lock owner unlocks,
  664. * the next rbio on the list is run and the IO is started automatically.
  665. * 1 is returned
  666. *
  667. * If we return 0, the caller still owns the rbio and must continue with
  668. * IO submission. If we return 1, the caller must assume the rbio has
  669. * already been freed.
  670. */
  671. static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
  672. {
  673. struct btrfs_stripe_hash *h;
  674. struct btrfs_raid_bio *cur;
  675. struct btrfs_raid_bio *pending;
  676. struct btrfs_raid_bio *freeit = NULL;
  677. struct btrfs_raid_bio *cache_drop = NULL;
  678. int ret = 0;
  679. h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
  680. spin_lock(&h->lock);
  681. list_for_each_entry(cur, &h->hash_list, hash_list) {
  682. if (cur->bioc->full_stripe_logical != rbio->bioc->full_stripe_logical)
  683. continue;
  684. spin_lock(&cur->bio_list_lock);
  685. /* Can we steal this cached rbio's pages? */
  686. if (bio_list_empty(&cur->bio_list) &&
  687. list_empty(&cur->plug_list) &&
  688. test_bit(RBIO_CACHE_BIT, &cur->flags) &&
  689. !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
  690. list_del_init(&cur->hash_list);
  691. refcount_dec(&cur->refs);
  692. steal_rbio(cur, rbio);
  693. cache_drop = cur;
  694. spin_unlock(&cur->bio_list_lock);
  695. goto lockit;
  696. }
  697. /* Can we merge into the lock owner? */
  698. if (rbio_can_merge(cur, rbio)) {
  699. merge_rbio(cur, rbio);
  700. spin_unlock(&cur->bio_list_lock);
  701. freeit = rbio;
  702. ret = 1;
  703. goto out;
  704. }
  705. /*
  706. * We couldn't merge with the running rbio, see if we can merge
  707. * with the pending ones. We don't have to check for rmw_locked
  708. * because there is no way they are inside finish_rmw right now
  709. */
  710. list_for_each_entry(pending, &cur->plug_list, plug_list) {
  711. if (rbio_can_merge(pending, rbio)) {
  712. merge_rbio(pending, rbio);
  713. spin_unlock(&cur->bio_list_lock);
  714. freeit = rbio;
  715. ret = 1;
  716. goto out;
  717. }
  718. }
  719. /*
  720. * No merging, put us on the tail of the plug list, our rbio
  721. * will be started with the currently running rbio unlocks
  722. */
  723. list_add_tail(&rbio->plug_list, &cur->plug_list);
  724. spin_unlock(&cur->bio_list_lock);
  725. ret = 1;
  726. goto out;
  727. }
  728. lockit:
  729. refcount_inc(&rbio->refs);
  730. list_add(&rbio->hash_list, &h->hash_list);
  731. out:
  732. spin_unlock(&h->lock);
  733. if (cache_drop)
  734. remove_rbio_from_cache(cache_drop);
  735. if (freeit)
  736. free_raid_bio(freeit);
  737. return ret;
  738. }
  739. static void recover_rbio_work_locked(struct work_struct *work);
  740. /*
  741. * called as rmw or parity rebuild is completed. If the plug list has more
  742. * rbios waiting for this stripe, the next one on the list will be started
  743. */
  744. static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
  745. {
  746. int bucket;
  747. struct btrfs_stripe_hash *h;
  748. int keep_cache = 0;
  749. bucket = rbio_bucket(rbio);
  750. h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;
  751. if (list_empty(&rbio->plug_list))
  752. cache_rbio(rbio);
  753. spin_lock(&h->lock);
  754. spin_lock(&rbio->bio_list_lock);
  755. if (!list_empty(&rbio->hash_list)) {
  756. /*
  757. * if we're still cached and there is no other IO
  758. * to perform, just leave this rbio here for others
  759. * to steal from later
  760. */
  761. if (list_empty(&rbio->plug_list) &&
  762. test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
  763. keep_cache = 1;
  764. clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
  765. BUG_ON(!bio_list_empty(&rbio->bio_list));
  766. goto done;
  767. }
  768. list_del_init(&rbio->hash_list);
  769. refcount_dec(&rbio->refs);
  770. /*
  771. * we use the plug list to hold all the rbios
  772. * waiting for the chance to lock this stripe.
  773. * hand the lock over to one of them.
  774. */
  775. if (!list_empty(&rbio->plug_list)) {
  776. struct btrfs_raid_bio *next;
  777. struct list_head *head = rbio->plug_list.next;
  778. next = list_entry(head, struct btrfs_raid_bio,
  779. plug_list);
  780. list_del_init(&rbio->plug_list);
  781. list_add(&next->hash_list, &h->hash_list);
  782. refcount_inc(&next->refs);
  783. spin_unlock(&rbio->bio_list_lock);
  784. spin_unlock(&h->lock);
  785. if (next->operation == BTRFS_RBIO_READ_REBUILD) {
  786. start_async_work(next, recover_rbio_work_locked);
  787. } else if (next->operation == BTRFS_RBIO_WRITE) {
  788. steal_rbio(rbio, next);
  789. start_async_work(next, rmw_rbio_work_locked);
  790. } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
  791. steal_rbio(rbio, next);
  792. start_async_work(next, scrub_rbio_work_locked);
  793. }
  794. goto done_nolock;
  795. }
  796. }
  797. done:
  798. spin_unlock(&rbio->bio_list_lock);
  799. spin_unlock(&h->lock);
  800. done_nolock:
  801. if (!keep_cache)
  802. remove_rbio_from_cache(rbio);
  803. }
  804. static void rbio_endio_bio_list(struct bio *cur, blk_status_t status)
  805. {
  806. struct bio *next;
  807. while (cur) {
  808. next = cur->bi_next;
  809. cur->bi_next = NULL;
  810. cur->bi_status = status;
  811. bio_endio(cur);
  812. cur = next;
  813. }
  814. }
  815. /*
  816. * this frees the rbio and runs through all the bios in the
  817. * bio_list and calls end_io on them
  818. */
  819. static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t status)
  820. {
  821. struct bio *cur = bio_list_get(&rbio->bio_list);
  822. struct bio *extra;
  823. kfree(rbio->csum_buf);
  824. bitmap_free(rbio->csum_bitmap);
  825. rbio->csum_buf = NULL;
  826. rbio->csum_bitmap = NULL;
  827. /*
  828. * Clear the data bitmap, as the rbio may be cached for later usage.
  829. * do this before before unlock_stripe() so there will be no new bio
  830. * for this bio.
  831. */
  832. bitmap_clear(&rbio->dbitmap, 0, rbio->stripe_nsectors);
  833. /*
  834. * At this moment, rbio->bio_list is empty, however since rbio does not
  835. * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
  836. * hash list, rbio may be merged with others so that rbio->bio_list
  837. * becomes non-empty.
  838. * Once unlock_stripe() is done, rbio->bio_list will not be updated any
  839. * more and we can call bio_endio() on all queued bios.
  840. */
  841. unlock_stripe(rbio);
  842. extra = bio_list_get(&rbio->bio_list);
  843. free_raid_bio(rbio);
  844. rbio_endio_bio_list(cur, status);
  845. if (extra)
  846. rbio_endio_bio_list(extra, status);
  847. }
  848. /*
  849. * Get paddr pointer for the sector specified by its @stripe_nr and @sector_nr.
  850. *
  851. * @rbio: The raid bio
  852. * @stripe_nr: Stripe number, valid range [0, real_stripe)
  853. * @sector_nr: Sector number inside the stripe,
  854. * valid range [0, stripe_nsectors)
  855. * @bio_list_only: Whether to use sectors inside the bio list only.
  856. *
  857. * The read/modify/write code wants to reuse the original bio page as much
  858. * as possible, and only use stripe_sectors as fallback.
  859. *
  860. * Return NULL if bio_list_only is set but the specified sector has no
  861. * coresponding bio.
  862. */
  863. static phys_addr_t *sector_paddrs_in_rbio(struct btrfs_raid_bio *rbio,
  864. int stripe_nr, int sector_nr,
  865. bool bio_list_only)
  866. {
  867. phys_addr_t *ret = NULL;
  868. const int index = rbio_paddr_index(rbio, stripe_nr, sector_nr, 0);
  869. ASSERT(index >= 0 && index < rbio->nr_sectors * rbio->sector_nsteps);
  870. scoped_guard(spinlock, &rbio->bio_list_lock) {
  871. if (rbio->bio_paddrs[index] != INVALID_PADDR || bio_list_only) {
  872. /* Don't return sector without a valid page pointer */
  873. if (rbio->bio_paddrs[index] != INVALID_PADDR)
  874. ret = &rbio->bio_paddrs[index];
  875. return ret;
  876. }
  877. }
  878. return &rbio->stripe_paddrs[index];
  879. }
  880. /*
  881. * Similar to sector_paddr_in_rbio(), but with extra consideration for
  882. * bs > ps cases, where we can have multiple steps for a fs block.
  883. */
  884. static phys_addr_t sector_paddr_in_rbio(struct btrfs_raid_bio *rbio,
  885. int stripe_nr, int sector_nr, int step_nr,
  886. bool bio_list_only)
  887. {
  888. phys_addr_t ret = INVALID_PADDR;
  889. const int index = rbio_paddr_index(rbio, stripe_nr, sector_nr, step_nr);
  890. ASSERT(index >= 0 && index < rbio->nr_sectors * rbio->sector_nsteps);
  891. scoped_guard(spinlock, &rbio->bio_list_lock) {
  892. if (rbio->bio_paddrs[index] != INVALID_PADDR || bio_list_only) {
  893. /* Don't return sector without a valid page pointer */
  894. if (rbio->bio_paddrs[index] != INVALID_PADDR)
  895. ret = rbio->bio_paddrs[index];
  896. return ret;
  897. }
  898. }
  899. return rbio->stripe_paddrs[index];
  900. }
  901. /*
  902. * allocation and initial setup for the btrfs_raid_bio. Not
  903. * this does not allocate any pages for rbio->pages.
  904. */
  905. static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
  906. struct btrfs_io_context *bioc)
  907. {
  908. const unsigned int real_stripes = bioc->num_stripes - bioc->replace_nr_stripes;
  909. const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT;
  910. const unsigned int num_pages = stripe_npages * real_stripes;
  911. const unsigned int stripe_nsectors =
  912. BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
  913. const unsigned int num_sectors = stripe_nsectors * real_stripes;
  914. const unsigned int step = min(fs_info->sectorsize, PAGE_SIZE);
  915. const unsigned int sector_nsteps = fs_info->sectorsize / step;
  916. struct btrfs_raid_bio *rbio;
  917. /*
  918. * For bs <= ps cases, ps must be aligned to bs.
  919. * For bs > ps cases, bs must be aligned to ps.
  920. */
  921. ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize) ||
  922. IS_ALIGNED(fs_info->sectorsize, PAGE_SIZE));
  923. /*
  924. * Our current stripe len should be fixed to 64k thus stripe_nsectors
  925. * (at most 16) should be no larger than BITS_PER_LONG.
  926. */
  927. ASSERT(stripe_nsectors <= BITS_PER_LONG);
  928. /*
  929. * Real stripes must be between 2 (2 disks RAID5, aka RAID1) and 256
  930. * (limited by u8).
  931. */
  932. ASSERT(real_stripes >= 2);
  933. ASSERT(real_stripes <= U8_MAX);
  934. rbio = kzalloc_obj(*rbio, GFP_NOFS);
  935. if (!rbio)
  936. return ERR_PTR(-ENOMEM);
  937. rbio->stripe_pages = kzalloc_objs(struct page *, num_pages, GFP_NOFS);
  938. rbio->bio_paddrs = kzalloc_objs(phys_addr_t,
  939. num_sectors * sector_nsteps, GFP_NOFS);
  940. rbio->stripe_paddrs = kzalloc_objs(phys_addr_t,
  941. num_sectors * sector_nsteps,
  942. GFP_NOFS);
  943. rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS);
  944. rbio->error_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS);
  945. rbio->stripe_uptodate_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS);
  946. if (!rbio->stripe_pages || !rbio->bio_paddrs || !rbio->stripe_paddrs ||
  947. !rbio->finish_pointers || !rbio->error_bitmap || !rbio->stripe_uptodate_bitmap) {
  948. free_raid_bio_pointers(rbio);
  949. kfree(rbio);
  950. return ERR_PTR(-ENOMEM);
  951. }
  952. for (int i = 0; i < num_sectors * sector_nsteps; i++) {
  953. rbio->stripe_paddrs[i] = INVALID_PADDR;
  954. rbio->bio_paddrs[i] = INVALID_PADDR;
  955. }
  956. bio_list_init(&rbio->bio_list);
  957. init_waitqueue_head(&rbio->io_wait);
  958. INIT_LIST_HEAD(&rbio->plug_list);
  959. spin_lock_init(&rbio->bio_list_lock);
  960. INIT_LIST_HEAD(&rbio->stripe_cache);
  961. INIT_LIST_HEAD(&rbio->hash_list);
  962. btrfs_get_bioc(bioc);
  963. rbio->bioc = bioc;
  964. rbio->nr_pages = num_pages;
  965. rbio->nr_sectors = num_sectors;
  966. rbio->real_stripes = real_stripes;
  967. rbio->stripe_npages = stripe_npages;
  968. rbio->stripe_nsectors = stripe_nsectors;
  969. rbio->sector_nsteps = sector_nsteps;
  970. refcount_set(&rbio->refs, 1);
  971. atomic_set(&rbio->stripes_pending, 0);
  972. ASSERT(btrfs_nr_parity_stripes(bioc->map_type));
  973. rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type);
  974. ASSERT(rbio->nr_data > 0);
  975. return rbio;
  976. }
  977. /* allocate pages for all the stripes in the bio, including parity */
  978. static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
  979. {
  980. int ret;
  981. ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages, false);
  982. if (ret < 0)
  983. return ret;
  984. /* Mapping all sectors */
  985. index_stripe_sectors(rbio);
  986. return 0;
  987. }
  988. /* only allocate pages for p/q stripes */
  989. static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
  990. {
  991. const int data_pages = rbio->nr_data * rbio->stripe_npages;
  992. int ret;
  993. ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
  994. rbio->stripe_pages + data_pages, false);
  995. if (ret < 0)
  996. return ret;
  997. index_stripe_sectors(rbio);
  998. return 0;
  999. }
  1000. /*
  1001. * Return the total number of errors found in the vertical stripe of @sector_nr.
  1002. *
  1003. * @faila and @failb will also be updated to the first and second stripe
  1004. * number of the errors.
  1005. */
  1006. static int get_rbio_vertical_errors(struct btrfs_raid_bio *rbio, int sector_nr,
  1007. int *faila, int *failb)
  1008. {
  1009. int stripe_nr;
  1010. int found_errors = 0;
  1011. if (faila || failb) {
  1012. /*
  1013. * Both @faila and @failb should be valid pointers if any of
  1014. * them is specified.
  1015. */
  1016. ASSERT(faila && failb);
  1017. *faila = -1;
  1018. *failb = -1;
  1019. }
  1020. for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
  1021. int total_sector_nr = stripe_nr * rbio->stripe_nsectors + sector_nr;
  1022. if (test_bit(total_sector_nr, rbio->error_bitmap)) {
  1023. found_errors++;
  1024. if (faila) {
  1025. /* Update faila and failb. */
  1026. if (*faila < 0)
  1027. *faila = stripe_nr;
  1028. else if (*failb < 0)
  1029. *failb = stripe_nr;
  1030. }
  1031. }
  1032. }
  1033. return found_errors;
  1034. }
  1035. static int bio_add_paddrs(struct bio *bio, phys_addr_t *paddrs, unsigned int nr_steps,
  1036. unsigned int step)
  1037. {
  1038. int added = 0;
  1039. int ret;
  1040. for (int i = 0; i < nr_steps; i++) {
  1041. ret = bio_add_page(bio, phys_to_page(paddrs[i]), step,
  1042. offset_in_page(paddrs[i]));
  1043. if (ret != step)
  1044. goto revert;
  1045. added += ret;
  1046. }
  1047. return added;
  1048. revert:
  1049. /*
  1050. * We don't need to revert the bvec, as the bio will be submitted immediately,
  1051. * as long as the size is reduced the extra bvec will not be accessed.
  1052. */
  1053. bio->bi_iter.bi_size -= added;
  1054. return 0;
  1055. }
  1056. /*
  1057. * Add a single sector @sector into our list of bios for IO.
  1058. *
  1059. * Return 0 if everything went well.
  1060. * Return <0 for error, and no byte will be added to @rbio.
  1061. */
  1062. static int rbio_add_io_paddrs(struct btrfs_raid_bio *rbio, struct bio_list *bio_list,
  1063. phys_addr_t *paddrs, unsigned int stripe_nr,
  1064. unsigned int sector_nr, enum req_op op)
  1065. {
  1066. const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
  1067. const u32 step = min(sectorsize, PAGE_SIZE);
  1068. struct bio *last = bio_list->tail;
  1069. int ret;
  1070. struct bio *bio;
  1071. struct btrfs_io_stripe *stripe;
  1072. u64 disk_start;
  1073. /*
  1074. * Note: here stripe_nr has taken device replace into consideration,
  1075. * thus it can be larger than rbio->real_stripe.
  1076. * So here we check against bioc->num_stripes, not rbio->real_stripes.
  1077. */
  1078. ASSERT_RBIO_STRIPE(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes,
  1079. rbio, stripe_nr);
  1080. ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors,
  1081. rbio, sector_nr);
  1082. ASSERT(paddrs != NULL);
  1083. stripe = &rbio->bioc->stripes[stripe_nr];
  1084. disk_start = stripe->physical + sector_nr * sectorsize;
  1085. /* if the device is missing, just fail this stripe */
  1086. if (!stripe->dev->bdev) {
  1087. int found_errors;
  1088. set_bit(stripe_nr * rbio->stripe_nsectors + sector_nr,
  1089. rbio->error_bitmap);
  1090. /* Check if we have reached tolerance early. */
  1091. found_errors = get_rbio_vertical_errors(rbio, sector_nr,
  1092. NULL, NULL);
  1093. if (unlikely(found_errors > rbio->bioc->max_errors))
  1094. return -EIO;
  1095. return 0;
  1096. }
  1097. /* see if we can add this page onto our existing bio */
  1098. if (last) {
  1099. u64 last_end = last->bi_iter.bi_sector << SECTOR_SHIFT;
  1100. last_end += last->bi_iter.bi_size;
  1101. /*
  1102. * we can't merge these if they are from different
  1103. * devices or if they are not contiguous
  1104. */
  1105. if (last_end == disk_start && !last->bi_status &&
  1106. last->bi_bdev == stripe->dev->bdev) {
  1107. ret = bio_add_paddrs(last, paddrs, rbio->sector_nsteps, step);
  1108. if (ret == sectorsize)
  1109. return 0;
  1110. }
  1111. }
  1112. /* put a new bio on the list */
  1113. bio = bio_alloc(stripe->dev->bdev,
  1114. max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1),
  1115. op, GFP_NOFS);
  1116. bio->bi_iter.bi_sector = disk_start >> SECTOR_SHIFT;
  1117. bio->bi_private = rbio;
  1118. ret = bio_add_paddrs(bio, paddrs, rbio->sector_nsteps, step);
  1119. ASSERT(ret == sectorsize);
  1120. bio_list_add(bio_list, bio);
  1121. return 0;
  1122. }
  1123. static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
  1124. {
  1125. struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
  1126. const u32 step = min(fs_info->sectorsize, PAGE_SIZE);
  1127. const u32 step_bits = min(fs_info->sectorsize_bits, PAGE_SHIFT);
  1128. struct bvec_iter iter = bio->bi_iter;
  1129. phys_addr_t paddr;
  1130. u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
  1131. rbio->bioc->full_stripe_logical;
  1132. btrfs_bio_for_each_block(paddr, bio, &iter, step) {
  1133. unsigned int index = (offset >> step_bits);
  1134. rbio->bio_paddrs[index] = paddr;
  1135. offset += step;
  1136. }
  1137. }
  1138. /*
  1139. * helper function to walk our bio list and populate the bio_pages array with
  1140. * the result. This seems expensive, but it is faster than constantly
  1141. * searching through the bio list as we setup the IO in finish_rmw or stripe
  1142. * reconstruction.
  1143. *
  1144. * This must be called before you trust the answers from page_in_rbio
  1145. */
  1146. static void index_rbio_pages(struct btrfs_raid_bio *rbio)
  1147. {
  1148. struct bio *bio;
  1149. spin_lock(&rbio->bio_list_lock);
  1150. bio_list_for_each(bio, &rbio->bio_list)
  1151. index_one_bio(rbio, bio);
  1152. spin_unlock(&rbio->bio_list_lock);
  1153. }
  1154. static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio,
  1155. struct raid56_bio_trace_info *trace_info)
  1156. {
  1157. const struct btrfs_io_context *bioc = rbio->bioc;
  1158. int i;
  1159. ASSERT(bioc);
  1160. /* We rely on bio->bi_bdev to find the stripe number. */
  1161. if (!bio->bi_bdev)
  1162. goto not_found;
  1163. for (i = 0; i < bioc->num_stripes; i++) {
  1164. if (bio->bi_bdev != bioc->stripes[i].dev->bdev)
  1165. continue;
  1166. trace_info->stripe_nr = i;
  1167. trace_info->devid = bioc->stripes[i].dev->devid;
  1168. trace_info->offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
  1169. bioc->stripes[i].physical;
  1170. return;
  1171. }
  1172. not_found:
  1173. trace_info->devid = -1;
  1174. trace_info->offset = -1;
  1175. trace_info->stripe_nr = -1;
  1176. }
  1177. static inline void bio_list_put(struct bio_list *bio_list)
  1178. {
  1179. struct bio *bio;
  1180. while ((bio = bio_list_pop(bio_list)))
  1181. bio_put(bio);
  1182. }
  1183. static void assert_rbio(struct btrfs_raid_bio *rbio)
  1184. {
  1185. if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
  1186. return;
  1187. /*
  1188. * At least two stripes (2 disks RAID5), and since real_stripes is U8,
  1189. * we won't go beyond 256 disks anyway.
  1190. */
  1191. ASSERT_RBIO(rbio->real_stripes >= 2, rbio);
  1192. ASSERT_RBIO(rbio->nr_data > 0, rbio);
  1193. /*
  1194. * This is another check to make sure nr data stripes is smaller
  1195. * than total stripes.
  1196. */
  1197. ASSERT_RBIO(rbio->nr_data < rbio->real_stripes, rbio);
  1198. }
  1199. static inline void *kmap_local_paddr(phys_addr_t paddr)
  1200. {
  1201. /* The sector pointer must have a page mapped to it. */
  1202. ASSERT(paddr != INVALID_PADDR);
  1203. return kmap_local_page(phys_to_page(paddr)) + offset_in_page(paddr);
  1204. }
  1205. static void generate_pq_vertical_step(struct btrfs_raid_bio *rbio, unsigned int sector_nr,
  1206. unsigned int step_nr)
  1207. {
  1208. void **pointers = rbio->finish_pointers;
  1209. const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE);
  1210. int stripe;
  1211. const bool has_qstripe = rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6;
  1212. /* First collect one sector from each data stripe */
  1213. for (stripe = 0; stripe < rbio->nr_data; stripe++)
  1214. pointers[stripe] = kmap_local_paddr(
  1215. sector_paddr_in_rbio(rbio, stripe, sector_nr, step_nr, 0));
  1216. /* Then add the parity stripe */
  1217. pointers[stripe++] = kmap_local_paddr(rbio_pstripe_paddr(rbio, sector_nr, step_nr));
  1218. if (has_qstripe) {
  1219. /*
  1220. * RAID6, add the qstripe and call the library function
  1221. * to fill in our p/q
  1222. */
  1223. pointers[stripe++] = kmap_local_paddr(
  1224. rbio_qstripe_paddr(rbio, sector_nr, step_nr));
  1225. assert_rbio(rbio);
  1226. raid6_call.gen_syndrome(rbio->real_stripes, step, pointers);
  1227. } else {
  1228. /* raid5 */
  1229. memcpy(pointers[rbio->nr_data], pointers[0], step);
  1230. run_xor(pointers + 1, rbio->nr_data - 1, step);
  1231. }
  1232. for (stripe = stripe - 1; stripe >= 0; stripe--)
  1233. kunmap_local(pointers[stripe]);
  1234. }
  1235. /* Generate PQ for one vertical stripe. */
  1236. static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr)
  1237. {
  1238. const bool has_qstripe = (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6);
  1239. for (int i = 0; i < rbio->sector_nsteps; i++)
  1240. generate_pq_vertical_step(rbio, sectornr, i);
  1241. set_bit(rbio_sector_index(rbio, rbio->nr_data, sectornr),
  1242. rbio->stripe_uptodate_bitmap);
  1243. if (has_qstripe)
  1244. set_bit(rbio_sector_index(rbio, rbio->nr_data + 1, sectornr),
  1245. rbio->stripe_uptodate_bitmap);
  1246. }
  1247. static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
  1248. struct bio_list *bio_list)
  1249. {
  1250. /* The total sector number inside the full stripe. */
  1251. int total_sector_nr;
  1252. int sectornr;
  1253. int stripe;
  1254. int ret;
  1255. ASSERT(bio_list_size(bio_list) == 0);
  1256. /* We should have at least one data sector. */
  1257. ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors));
  1258. /*
  1259. * Reset errors, as we may have errors inherited from from degraded
  1260. * write.
  1261. */
  1262. bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
  1263. /*
  1264. * Start assembly. Make bios for everything from the higher layers (the
  1265. * bio_list in our rbio) and our P/Q. Ignore everything else.
  1266. */
  1267. for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
  1268. total_sector_nr++) {
  1269. phys_addr_t *paddrs;
  1270. stripe = total_sector_nr / rbio->stripe_nsectors;
  1271. sectornr = total_sector_nr % rbio->stripe_nsectors;
  1272. /* This vertical stripe has no data, skip it. */
  1273. if (!test_bit(sectornr, &rbio->dbitmap))
  1274. continue;
  1275. if (stripe < rbio->nr_data) {
  1276. paddrs = sector_paddrs_in_rbio(rbio, stripe, sectornr, 1);
  1277. if (paddrs == NULL)
  1278. continue;
  1279. } else {
  1280. paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr);
  1281. }
  1282. ret = rbio_add_io_paddrs(rbio, bio_list, paddrs, stripe,
  1283. sectornr, REQ_OP_WRITE);
  1284. if (ret)
  1285. goto error;
  1286. }
  1287. if (likely(!rbio->bioc->replace_nr_stripes))
  1288. return 0;
  1289. /*
  1290. * Make a copy for the replace target device.
  1291. *
  1292. * Thus the source stripe number (in replace_stripe_src) should be valid.
  1293. */
  1294. ASSERT(rbio->bioc->replace_stripe_src >= 0);
  1295. for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
  1296. total_sector_nr++) {
  1297. phys_addr_t *paddrs;
  1298. stripe = total_sector_nr / rbio->stripe_nsectors;
  1299. sectornr = total_sector_nr % rbio->stripe_nsectors;
  1300. /*
  1301. * For RAID56, there is only one device that can be replaced,
  1302. * and replace_stripe_src[0] indicates the stripe number we
  1303. * need to copy from.
  1304. */
  1305. if (stripe != rbio->bioc->replace_stripe_src) {
  1306. /*
  1307. * We can skip the whole stripe completely, note
  1308. * total_sector_nr will be increased by one anyway.
  1309. */
  1310. ASSERT(sectornr == 0);
  1311. total_sector_nr += rbio->stripe_nsectors - 1;
  1312. continue;
  1313. }
  1314. /* This vertical stripe has no data, skip it. */
  1315. if (!test_bit(sectornr, &rbio->dbitmap))
  1316. continue;
  1317. if (stripe < rbio->nr_data) {
  1318. paddrs = sector_paddrs_in_rbio(rbio, stripe, sectornr, 1);
  1319. if (paddrs == NULL)
  1320. continue;
  1321. } else {
  1322. paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr);
  1323. }
  1324. ret = rbio_add_io_paddrs(rbio, bio_list, paddrs,
  1325. rbio->real_stripes,
  1326. sectornr, REQ_OP_WRITE);
  1327. if (ret)
  1328. goto error;
  1329. }
  1330. return 0;
  1331. error:
  1332. bio_list_put(bio_list);
  1333. return -EIO;
  1334. }
  1335. static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio)
  1336. {
  1337. struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
  1338. u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
  1339. rbio->bioc->full_stripe_logical;
  1340. int total_nr_sector = offset >> fs_info->sectorsize_bits;
  1341. ASSERT(total_nr_sector < rbio->nr_data * rbio->stripe_nsectors);
  1342. bitmap_set(rbio->error_bitmap, total_nr_sector,
  1343. bio->bi_iter.bi_size >> fs_info->sectorsize_bits);
  1344. /*
  1345. * Special handling for raid56_alloc_missing_rbio() used by
  1346. * scrub/replace. Unlike call path in raid56_parity_recover(), they
  1347. * pass an empty bio here. Thus we have to find out the missing device
  1348. * and mark the stripe error instead.
  1349. */
  1350. if (bio->bi_iter.bi_size == 0) {
  1351. bool found_missing = false;
  1352. int stripe_nr;
  1353. for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
  1354. if (!rbio->bioc->stripes[stripe_nr].dev->bdev) {
  1355. found_missing = true;
  1356. bitmap_set(rbio->error_bitmap,
  1357. stripe_nr * rbio->stripe_nsectors,
  1358. rbio->stripe_nsectors);
  1359. }
  1360. }
  1361. ASSERT(found_missing);
  1362. }
  1363. }
  1364. /*
  1365. * Return the index inside the rbio->stripe_sectors[] array.
  1366. *
  1367. * Return -1 if not found.
  1368. */
  1369. static int find_stripe_sector_nr(struct btrfs_raid_bio *rbio, phys_addr_t paddr)
  1370. {
  1371. for (int i = 0; i < rbio->nr_sectors; i++) {
  1372. if (rbio->stripe_paddrs[i * rbio->sector_nsteps] == paddr)
  1373. return i;
  1374. }
  1375. return -1;
  1376. }
  1377. /*
  1378. * this sets each page in the bio uptodate. It should only be used on private
  1379. * rbio pages, nothing that comes in from the higher layers
  1380. */
  1381. static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
  1382. {
  1383. const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
  1384. const u32 step = min(sectorsize, PAGE_SIZE);
  1385. u32 offset = 0;
  1386. phys_addr_t paddr;
  1387. ASSERT(!bio_flagged(bio, BIO_CLONED));
  1388. btrfs_bio_for_each_block_all(paddr, bio, step) {
  1389. /* Hitting the first step of a sector. */
  1390. if (IS_ALIGNED(offset, sectorsize)) {
  1391. int sector_nr = find_stripe_sector_nr(rbio, paddr);
  1392. ASSERT(sector_nr >= 0);
  1393. if (sector_nr >= 0)
  1394. set_bit(sector_nr, rbio->stripe_uptodate_bitmap);
  1395. }
  1396. offset += step;
  1397. }
  1398. }
  1399. static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio)
  1400. {
  1401. phys_addr_t bvec_paddr = bvec_phys(bio_first_bvec_all(bio));
  1402. int i;
  1403. for (i = 0; i < rbio->nr_sectors; i++) {
  1404. if (rbio->stripe_paddrs[i * rbio->sector_nsteps] == bvec_paddr)
  1405. break;
  1406. if (rbio->bio_paddrs[i * rbio->sector_nsteps] == bvec_paddr)
  1407. break;
  1408. }
  1409. ASSERT(i < rbio->nr_sectors);
  1410. return i;
  1411. }
  1412. static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bio)
  1413. {
  1414. int total_sector_nr = get_bio_sector_nr(rbio, bio);
  1415. u32 bio_size = 0;
  1416. struct bio_vec *bvec;
  1417. int i;
  1418. bio_for_each_bvec_all(bvec, bio, i)
  1419. bio_size += bvec->bv_len;
  1420. /*
  1421. * Since we can have multiple bios touching the error_bitmap, we cannot
  1422. * call bitmap_set() without protection.
  1423. *
  1424. * Instead use set_bit() for each bit, as set_bit() itself is atomic.
  1425. */
  1426. for (i = total_sector_nr; i < total_sector_nr +
  1427. (bio_size >> rbio->bioc->fs_info->sectorsize_bits); i++)
  1428. set_bit(i, rbio->error_bitmap);
  1429. }
  1430. /* Verify the data sectors at read time. */
  1431. static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio,
  1432. struct bio *bio)
  1433. {
  1434. struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
  1435. const u32 step = min(fs_info->sectorsize, PAGE_SIZE);
  1436. const u32 nr_steps = rbio->sector_nsteps;
  1437. int total_sector_nr = get_bio_sector_nr(rbio, bio);
  1438. u32 offset = 0;
  1439. phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE];
  1440. phys_addr_t paddr;
  1441. /* No data csum for the whole stripe, no need to verify. */
  1442. if (!rbio->csum_bitmap || !rbio->csum_buf)
  1443. return;
  1444. /* P/Q stripes, they have no data csum to verify against. */
  1445. if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors)
  1446. return;
  1447. btrfs_bio_for_each_block_all(paddr, bio, step) {
  1448. u8 csum_buf[BTRFS_CSUM_SIZE];
  1449. u8 *expected_csum;
  1450. paddrs[(offset / step) % nr_steps] = paddr;
  1451. offset += step;
  1452. /* Not yet covering the full fs block, continue to the next step. */
  1453. if (!IS_ALIGNED(offset, fs_info->sectorsize))
  1454. continue;
  1455. /* No csum for this sector, skip to the next sector. */
  1456. if (!test_bit(total_sector_nr, rbio->csum_bitmap))
  1457. continue;
  1458. expected_csum = rbio->csum_buf + total_sector_nr * fs_info->csum_size;
  1459. btrfs_calculate_block_csum_pages(fs_info, paddrs, csum_buf);
  1460. if (unlikely(memcmp(csum_buf, expected_csum, fs_info->csum_size) != 0))
  1461. set_bit(total_sector_nr, rbio->error_bitmap);
  1462. total_sector_nr++;
  1463. }
  1464. }
  1465. static void raid_wait_read_end_io(struct bio *bio)
  1466. {
  1467. struct btrfs_raid_bio *rbio = bio->bi_private;
  1468. if (bio->bi_status) {
  1469. rbio_update_error_bitmap(rbio, bio);
  1470. } else {
  1471. set_bio_pages_uptodate(rbio, bio);
  1472. verify_bio_data_sectors(rbio, bio);
  1473. }
  1474. bio_put(bio);
  1475. if (atomic_dec_and_test(&rbio->stripes_pending))
  1476. wake_up(&rbio->io_wait);
  1477. }
  1478. static void submit_read_wait_bio_list(struct btrfs_raid_bio *rbio,
  1479. struct bio_list *bio_list)
  1480. {
  1481. struct bio *bio;
  1482. atomic_set(&rbio->stripes_pending, bio_list_size(bio_list));
  1483. while ((bio = bio_list_pop(bio_list))) {
  1484. bio->bi_end_io = raid_wait_read_end_io;
  1485. if (trace_raid56_read_enabled()) {
  1486. struct raid56_bio_trace_info trace_info = { 0 };
  1487. bio_get_trace_info(rbio, bio, &trace_info);
  1488. trace_raid56_read(rbio, bio, &trace_info);
  1489. }
  1490. submit_bio(bio);
  1491. }
  1492. wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
  1493. }
  1494. static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio)
  1495. {
  1496. const int data_pages = rbio->nr_data * rbio->stripe_npages;
  1497. int ret;
  1498. ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages, false);
  1499. if (ret < 0)
  1500. return ret;
  1501. index_stripe_sectors(rbio);
  1502. return 0;
  1503. }
  /*
   * We use plugging callbacks to collect full stripes.
   *
   * Any time we get a partial stripe write while plugged, we collect it into
   * a list. When the unplug comes down, we sort the list by logical block
   * number and merge everything we can into the same rbios.
   */
  struct btrfs_plug_cb {
  	/*
  	 * Embedded block layer callback; container_of() maps it back to this
  	 * structure.
  	 * NOTE(review): presumably has to stay the first member because
  	 * blk_check_plugged() allocates the object — confirm.
  	 */
  	struct blk_plug_cb cb;
  	/* NULL until first use; raid56_parity_write() then sets it and inits @rbio_list. */
  	struct btrfs_fs_info *info;
  	/* Collected rbios, linked through their plug_list member. */
  	struct list_head rbio_list;
  };
  1516. /*
  1517. * rbios on the plug list are sorted for easier merging.
  1518. */
  1519. static int plug_cmp(void *priv, const struct list_head *a,
  1520. const struct list_head *b)
  1521. {
  1522. const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
  1523. plug_list);
  1524. const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
  1525. plug_list);
  1526. u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
  1527. u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
  1528. if (a_sector < b_sector)
  1529. return -1;
  1530. if (a_sector > b_sector)
  1531. return 1;
  1532. return 0;
  1533. }
  1534. static void raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
  1535. {
  1536. struct btrfs_plug_cb *plug = container_of(cb, struct btrfs_plug_cb, cb);
  1537. struct btrfs_raid_bio *cur;
  1538. struct btrfs_raid_bio *last = NULL;
  1539. list_sort(NULL, &plug->rbio_list, plug_cmp);
  1540. while (!list_empty(&plug->rbio_list)) {
  1541. cur = list_first_entry(&plug->rbio_list,
  1542. struct btrfs_raid_bio, plug_list);
  1543. list_del_init(&cur->plug_list);
  1544. if (rbio_is_full(cur)) {
  1545. /* We have a full stripe, queue it down. */
  1546. start_async_work(cur, rmw_rbio_work);
  1547. continue;
  1548. }
  1549. if (last) {
  1550. if (rbio_can_merge(last, cur)) {
  1551. merge_rbio(last, cur);
  1552. free_raid_bio(cur);
  1553. continue;
  1554. }
  1555. start_async_work(last, rmw_rbio_work);
  1556. }
  1557. last = cur;
  1558. }
  1559. if (last)
  1560. start_async_work(last, rmw_rbio_work);
  1561. kfree(plug);
  1562. }
  1563. /* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */
  1564. static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
  1565. {
  1566. const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
  1567. const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT;
  1568. const u64 full_stripe_start = rbio->bioc->full_stripe_logical;
  1569. const u32 orig_len = orig_bio->bi_iter.bi_size;
  1570. const u32 sectorsize = fs_info->sectorsize;
  1571. u64 cur_logical;
  1572. ASSERT_RBIO_LOGICAL(orig_logical >= full_stripe_start &&
  1573. orig_logical + orig_len <= full_stripe_start +
  1574. rbio->nr_data * BTRFS_STRIPE_LEN,
  1575. rbio, orig_logical);
  1576. bio_list_add(&rbio->bio_list, orig_bio);
  1577. rbio->bio_list_bytes += orig_bio->bi_iter.bi_size;
  1578. /* Update the dbitmap. */
  1579. for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len;
  1580. cur_logical += sectorsize) {
  1581. int bit = ((u32)(cur_logical - full_stripe_start) >>
  1582. fs_info->sectorsize_bits) % rbio->stripe_nsectors;
  1583. set_bit(bit, &rbio->dbitmap);
  1584. }
  1585. }
  1586. /*
  1587. * our main entry point for writes from the rest of the FS.
  1588. */
  1589. void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
  1590. {
  1591. struct btrfs_fs_info *fs_info = bioc->fs_info;
  1592. struct btrfs_raid_bio *rbio;
  1593. struct btrfs_plug_cb *plug = NULL;
  1594. struct blk_plug_cb *cb;
  1595. rbio = alloc_rbio(fs_info, bioc);
  1596. if (IS_ERR(rbio)) {
  1597. bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
  1598. bio_endio(bio);
  1599. return;
  1600. }
  1601. rbio->operation = BTRFS_RBIO_WRITE;
  1602. rbio_add_bio(rbio, bio);
  1603. /*
  1604. * Don't plug on full rbios, just get them out the door
  1605. * as quickly as we can
  1606. */
  1607. if (!rbio_is_full(rbio)) {
  1608. cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug));
  1609. if (cb) {
  1610. plug = container_of(cb, struct btrfs_plug_cb, cb);
  1611. if (!plug->info) {
  1612. plug->info = fs_info;
  1613. INIT_LIST_HEAD(&plug->rbio_list);
  1614. }
  1615. list_add_tail(&rbio->plug_list, &plug->rbio_list);
  1616. return;
  1617. }
  1618. }
  1619. /*
  1620. * Either we don't have any existing plug, or we're doing a full stripe,
  1621. * queue the rmw work now.
  1622. */
  1623. start_async_work(rbio, rmw_rbio_work);
  1624. }
  1625. static int verify_one_sector(struct btrfs_raid_bio *rbio,
  1626. int stripe_nr, int sector_nr)
  1627. {
  1628. struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
  1629. phys_addr_t *paddrs;
  1630. u8 csum_buf[BTRFS_CSUM_SIZE];
  1631. u8 *csum_expected;
  1632. if (!rbio->csum_bitmap || !rbio->csum_buf)
  1633. return 0;
  1634. /* No way to verify P/Q as they are not covered by data csum. */
  1635. if (stripe_nr >= rbio->nr_data)
  1636. return 0;
  1637. /*
  1638. * If we're rebuilding a read, we have to use pages from the
  1639. * bio list if possible.
  1640. */
  1641. if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
  1642. paddrs = sector_paddrs_in_rbio(rbio, stripe_nr, sector_nr, 0);
  1643. } else {
  1644. paddrs = rbio_stripe_paddrs(rbio, stripe_nr, sector_nr);
  1645. }
  1646. csum_expected = rbio->csum_buf +
  1647. (stripe_nr * rbio->stripe_nsectors + sector_nr) *
  1648. fs_info->csum_size;
  1649. btrfs_calculate_block_csum_pages(fs_info, paddrs, csum_buf);
  1650. if (unlikely(memcmp(csum_buf, csum_expected, fs_info->csum_size) != 0))
  1651. return -EIO;
  1652. return 0;
  1653. }
  1654. static void recover_vertical_step(struct btrfs_raid_bio *rbio,
  1655. unsigned int sector_nr,
  1656. unsigned int step_nr,
  1657. int faila, int failb,
  1658. void **pointers, void **unmap_array)
  1659. {
  1660. struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
  1661. const u32 step = min(fs_info->sectorsize, PAGE_SIZE);
  1662. int stripe_nr;
  1663. ASSERT(step_nr < rbio->sector_nsteps);
  1664. ASSERT(sector_nr < rbio->stripe_nsectors);
  1665. /*
  1666. * Setup our array of pointers with sectors from each stripe
  1667. *
  1668. * NOTE: store a duplicate array of pointers to preserve the
  1669. * pointer order.
  1670. */
  1671. for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
  1672. phys_addr_t paddr;
  1673. /*
  1674. * If we're rebuilding a read, we have to use pages from the
  1675. * bio list if possible.
  1676. */
  1677. if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
  1678. paddr = sector_paddr_in_rbio(rbio, stripe_nr, sector_nr, step_nr, 0);
  1679. } else {
  1680. paddr = rbio_stripe_paddr(rbio, stripe_nr, sector_nr, step_nr);
  1681. }
  1682. pointers[stripe_nr] = kmap_local_paddr(paddr);
  1683. unmap_array[stripe_nr] = pointers[stripe_nr];
  1684. }
  1685. /* All raid6 handling here */
  1686. if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
  1687. /* Single failure, rebuild from parity raid5 style */
  1688. if (failb < 0) {
  1689. if (faila == rbio->nr_data)
  1690. /*
  1691. * Just the P stripe has failed, without
  1692. * a bad data or Q stripe.
  1693. * We have nothing to do, just skip the
  1694. * recovery for this stripe.
  1695. */
  1696. goto cleanup;
  1697. /*
  1698. * a single failure in raid6 is rebuilt
  1699. * in the pstripe code below
  1700. */
  1701. goto pstripe;
  1702. }
  1703. /*
  1704. * If the q stripe is failed, do a pstripe reconstruction from
  1705. * the xors.
  1706. * If both the q stripe and the P stripe are failed, we're
  1707. * here due to a crc mismatch and we can't give them the
  1708. * data they want.
  1709. */
  1710. if (failb == rbio->real_stripes - 1) {
  1711. if (faila == rbio->real_stripes - 2)
  1712. /*
  1713. * Only P and Q are corrupted.
  1714. * We only care about data stripes recovery,
  1715. * can skip this vertical stripe.
  1716. */
  1717. goto cleanup;
  1718. /*
  1719. * Otherwise we have one bad data stripe and
  1720. * a good P stripe. raid5!
  1721. */
  1722. goto pstripe;
  1723. }
  1724. if (failb == rbio->real_stripes - 2) {
  1725. raid6_datap_recov(rbio->real_stripes, step,
  1726. faila, pointers);
  1727. } else {
  1728. raid6_2data_recov(rbio->real_stripes, step,
  1729. faila, failb, pointers);
  1730. }
  1731. } else {
  1732. void *p;
  1733. /* Rebuild from P stripe here (raid5 or raid6). */
  1734. ASSERT(failb == -1);
  1735. pstripe:
  1736. /* Copy parity block into failed block to start with */
  1737. memcpy(pointers[faila], pointers[rbio->nr_data], step);
  1738. /* Rearrange the pointer array */
  1739. p = pointers[faila];
  1740. for (stripe_nr = faila; stripe_nr < rbio->nr_data - 1;
  1741. stripe_nr++)
  1742. pointers[stripe_nr] = pointers[stripe_nr + 1];
  1743. pointers[rbio->nr_data - 1] = p;
  1744. /* Xor in the rest */
  1745. run_xor(pointers, rbio->nr_data - 1, step);
  1746. }
  1747. cleanup:
  1748. for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--)
  1749. kunmap_local(unmap_array[stripe_nr]);
  1750. }
  1751. /*
  1752. * Recover a vertical stripe specified by @sector_nr.
  1753. * @*pointers are the pre-allocated pointers by the caller, so we don't
  1754. * need to allocate/free the pointers again and again.
  1755. */
  1756. static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
  1757. void **pointers, void **unmap_array)
  1758. {
  1759. int found_errors;
  1760. int faila;
  1761. int failb;
  1762. int ret = 0;
  1763. /*
  1764. * Now we just use bitmap to mark the horizontal stripes in
  1765. * which we have data when doing parity scrub.
  1766. */
  1767. if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
  1768. !test_bit(sector_nr, &rbio->dbitmap))
  1769. return 0;
  1770. found_errors = get_rbio_vertical_errors(rbio, sector_nr, &faila,
  1771. &failb);
  1772. /*
  1773. * No errors in the vertical stripe, skip it. Can happen for recovery
  1774. * which only part of a stripe failed csum check.
  1775. */
  1776. if (!found_errors)
  1777. return 0;
  1778. if (unlikely(found_errors > rbio->bioc->max_errors))
  1779. return -EIO;
  1780. for (int i = 0; i < rbio->sector_nsteps; i++)
  1781. recover_vertical_step(rbio, sector_nr, i, faila, failb,
  1782. pointers, unmap_array);
  1783. if (faila >= 0) {
  1784. ret = verify_one_sector(rbio, faila, sector_nr);
  1785. if (ret < 0)
  1786. return ret;
  1787. set_bit(rbio_sector_index(rbio, faila, sector_nr),
  1788. rbio->stripe_uptodate_bitmap);
  1789. }
  1790. if (failb >= 0) {
  1791. ret = verify_one_sector(rbio, failb, sector_nr);
  1792. if (ret < 0)
  1793. return ret;
  1794. set_bit(rbio_sector_index(rbio, failb, sector_nr),
  1795. rbio->stripe_uptodate_bitmap);
  1796. }
  1797. return ret;
  1798. }
  1799. static int recover_sectors(struct btrfs_raid_bio *rbio)
  1800. {
  1801. void **pointers = NULL;
  1802. void **unmap_array = NULL;
  1803. int sectornr;
  1804. int ret = 0;
  1805. /*
  1806. * @pointers array stores the pointer for each sector.
  1807. *
  1808. * @unmap_array stores copy of pointers that does not get reordered
  1809. * during reconstruction so that kunmap_local works.
  1810. */
  1811. pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
  1812. unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
  1813. if (!pointers || !unmap_array) {
  1814. ret = -ENOMEM;
  1815. goto out;
  1816. }
  1817. if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
  1818. spin_lock(&rbio->bio_list_lock);
  1819. set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
  1820. spin_unlock(&rbio->bio_list_lock);
  1821. }
  1822. index_rbio_pages(rbio);
  1823. for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
  1824. ret = recover_vertical(rbio, sectornr, pointers, unmap_array);
  1825. if (ret < 0)
  1826. break;
  1827. }
  1828. out:
  1829. kfree(pointers);
  1830. kfree(unmap_array);
  1831. return ret;
  1832. }
  1833. static void recover_rbio(struct btrfs_raid_bio *rbio)
  1834. {
  1835. struct bio_list bio_list = BIO_EMPTY_LIST;
  1836. int total_sector_nr;
  1837. int ret = 0;
  1838. /*
  1839. * Either we're doing recover for a read failure or degraded write,
  1840. * caller should have set error bitmap correctly.
  1841. */
  1842. ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors));
  1843. /* For recovery, we need to read all sectors including P/Q. */
  1844. ret = alloc_rbio_pages(rbio);
  1845. if (ret < 0)
  1846. goto out;
  1847. index_rbio_pages(rbio);
  1848. /*
  1849. * Read everything that hasn't failed. However this time we will
  1850. * not trust any cached sector.
  1851. * As we may read out some stale data but higher layer is not reading
  1852. * that stale part.
  1853. *
  1854. * So here we always re-read everything in recovery path.
  1855. */
  1856. for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
  1857. total_sector_nr++) {
  1858. int stripe = total_sector_nr / rbio->stripe_nsectors;
  1859. int sectornr = total_sector_nr % rbio->stripe_nsectors;
  1860. phys_addr_t *paddrs;
  1861. /*
  1862. * Skip the range which has error. It can be a range which is
  1863. * marked error (for csum mismatch), or it can be a missing
  1864. * device.
  1865. */
  1866. if (!rbio->bioc->stripes[stripe].dev->bdev ||
  1867. test_bit(total_sector_nr, rbio->error_bitmap)) {
  1868. /*
  1869. * Also set the error bit for missing device, which
  1870. * may not yet have its error bit set.
  1871. */
  1872. set_bit(total_sector_nr, rbio->error_bitmap);
  1873. continue;
  1874. }
  1875. paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr);
  1876. ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, stripe,
  1877. sectornr, REQ_OP_READ);
  1878. if (ret < 0) {
  1879. bio_list_put(&bio_list);
  1880. goto out;
  1881. }
  1882. }
  1883. submit_read_wait_bio_list(rbio, &bio_list);
  1884. ret = recover_sectors(rbio);
  1885. out:
  1886. rbio_orig_end_io(rbio, errno_to_blk_status(ret));
  1887. }
  1888. static void recover_rbio_work(struct work_struct *work)
  1889. {
  1890. struct btrfs_raid_bio *rbio;
  1891. rbio = container_of(work, struct btrfs_raid_bio, work);
  1892. if (!lock_stripe_add(rbio))
  1893. recover_rbio(rbio);
  1894. }
  1895. static void recover_rbio_work_locked(struct work_struct *work)
  1896. {
  1897. recover_rbio(container_of(work, struct btrfs_raid_bio, work));
  1898. }
  1899. static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num)
  1900. {
  1901. bool found = false;
  1902. int sector_nr;
  1903. /*
  1904. * This is for RAID6 extra recovery tries, thus mirror number should
  1905. * be large than 2.
  1906. * Mirror 1 means read from data stripes. Mirror 2 means rebuild using
  1907. * RAID5 methods.
  1908. */
  1909. ASSERT(mirror_num > 2);
  1910. for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
  1911. int found_errors;
  1912. int faila;
  1913. int failb;
  1914. found_errors = get_rbio_vertical_errors(rbio, sector_nr,
  1915. &faila, &failb);
  1916. /* This vertical stripe doesn't have errors. */
  1917. if (!found_errors)
  1918. continue;
  1919. /*
  1920. * If we found errors, there should be only one error marked
  1921. * by previous set_rbio_range_error().
  1922. */
  1923. ASSERT(found_errors == 1);
  1924. found = true;
  1925. /* Now select another stripe to mark as error. */
  1926. failb = rbio->real_stripes - (mirror_num - 1);
  1927. if (failb <= faila)
  1928. failb--;
  1929. /* Set the extra bit in error bitmap. */
  1930. if (failb >= 0)
  1931. set_bit(failb * rbio->stripe_nsectors + sector_nr,
  1932. rbio->error_bitmap);
  1933. }
  1934. /* We should found at least one vertical stripe with error.*/
  1935. ASSERT(found);
  1936. }
  1937. /*
  1938. * the main entry point for reads from the higher layers. This
  1939. * is really only called when the normal read path had a failure,
  1940. * so we assume the bio they send down corresponds to a failed part
  1941. * of the drive.
  1942. */
  1943. void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
  1944. int mirror_num)
  1945. {
  1946. struct btrfs_fs_info *fs_info = bioc->fs_info;
  1947. struct btrfs_raid_bio *rbio;
  1948. rbio = alloc_rbio(fs_info, bioc);
  1949. if (IS_ERR(rbio)) {
  1950. bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
  1951. bio_endio(bio);
  1952. return;
  1953. }
  1954. rbio->operation = BTRFS_RBIO_READ_REBUILD;
  1955. rbio_add_bio(rbio, bio);
  1956. set_rbio_range_error(rbio, bio);
  1957. /*
  1958. * Loop retry:
  1959. * for 'mirror == 2', reconstruct from all other stripes.
  1960. * for 'mirror_num > 2', select a stripe to fail on every retry.
  1961. */
  1962. if (mirror_num > 2)
  1963. set_rbio_raid6_extra_error(rbio, mirror_num);
  1964. start_async_work(rbio, recover_rbio_work);
  1965. }
  1966. static void fill_data_csums(struct btrfs_raid_bio *rbio)
  1967. {
  1968. struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
  1969. struct btrfs_root *csum_root;
  1970. const u64 start = rbio->bioc->full_stripe_logical;
  1971. const u32 len = (rbio->nr_data * rbio->stripe_nsectors) <<
  1972. fs_info->sectorsize_bits;
  1973. int ret;
  1974. /* The rbio should not have its csum buffer initialized. */
  1975. ASSERT(!rbio->csum_buf && !rbio->csum_bitmap);
  1976. /*
  1977. * Skip the csum search if:
  1978. *
  1979. * - The rbio doesn't belong to data block groups
  1980. * Then we are doing IO for tree blocks, no need to search csums.
  1981. *
  1982. * - The rbio belongs to mixed block groups
  1983. * This is to avoid deadlock, as we're already holding the full
  1984. * stripe lock, if we trigger a metadata read, and it needs to do
  1985. * raid56 recovery, we will deadlock.
  1986. */
  1987. if (!(rbio->bioc->map_type & BTRFS_BLOCK_GROUP_DATA) ||
  1988. rbio->bioc->map_type & BTRFS_BLOCK_GROUP_METADATA)
  1989. return;
  1990. rbio->csum_buf = kzalloc(rbio->nr_data * rbio->stripe_nsectors *
  1991. fs_info->csum_size, GFP_NOFS);
  1992. rbio->csum_bitmap = bitmap_zalloc(rbio->nr_data * rbio->stripe_nsectors,
  1993. GFP_NOFS);
  1994. if (!rbio->csum_buf || !rbio->csum_bitmap) {
  1995. ret = -ENOMEM;
  1996. goto error;
  1997. }
  1998. csum_root = btrfs_csum_root(fs_info, rbio->bioc->full_stripe_logical);
  1999. if (unlikely(!csum_root)) {
  2000. btrfs_err(fs_info,
  2001. "missing csum root for extent at bytenr %llu",
  2002. rbio->bioc->full_stripe_logical);
  2003. ret = -EUCLEAN;
  2004. goto error;
  2005. }
  2006. ret = btrfs_lookup_csums_bitmap(csum_root, NULL, start, start + len - 1,
  2007. rbio->csum_buf, rbio->csum_bitmap);
  2008. if (ret < 0)
  2009. goto error;
  2010. if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits))
  2011. goto no_csum;
  2012. return;
  2013. error:
  2014. /*
  2015. * We failed to allocate memory or grab the csum, but it's not fatal,
  2016. * we can still continue. But better to warn users that RMW is no
  2017. * longer safe for this particular sub-stripe write.
  2018. */
  2019. btrfs_warn_rl(fs_info,
  2020. "sub-stripe write for full stripe %llu is not safe, failed to get csum: %d",
  2021. rbio->bioc->full_stripe_logical, ret);
  2022. no_csum:
  2023. kfree(rbio->csum_buf);
  2024. bitmap_free(rbio->csum_bitmap);
  2025. rbio->csum_buf = NULL;
  2026. rbio->csum_bitmap = NULL;
  2027. }
  2028. static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio)
  2029. {
  2030. struct bio_list bio_list = BIO_EMPTY_LIST;
  2031. int total_sector_nr;
  2032. int ret = 0;
  2033. /*
  2034. * Fill the data csums we need for data verification. We need to fill
  2035. * the csum_bitmap/csum_buf first, as our endio function will try to
  2036. * verify the data sectors.
  2037. */
  2038. fill_data_csums(rbio);
  2039. /*
  2040. * Build a list of bios to read all sectors (including data and P/Q).
  2041. *
  2042. * This behavior is to compensate the later csum verification and recovery.
  2043. */
  2044. for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
  2045. total_sector_nr++) {
  2046. int stripe = total_sector_nr / rbio->stripe_nsectors;
  2047. int sectornr = total_sector_nr % rbio->stripe_nsectors;
  2048. phys_addr_t *paddrs;
  2049. paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr);
  2050. ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, stripe,
  2051. sectornr, REQ_OP_READ);
  2052. if (ret) {
  2053. bio_list_put(&bio_list);
  2054. return ret;
  2055. }
  2056. }
  2057. /*
  2058. * We may or may not have any corrupted sectors (including missing dev
  2059. * and csum mismatch), just let recover_sectors() to handle them all.
  2060. */
  2061. submit_read_wait_bio_list(rbio, &bio_list);
  2062. return recover_sectors(rbio);
  2063. }
  2064. static void raid_wait_write_end_io(struct bio *bio)
  2065. {
  2066. struct btrfs_raid_bio *rbio = bio->bi_private;
  2067. if (bio->bi_status)
  2068. rbio_update_error_bitmap(rbio, bio);
  2069. bio_put(bio);
  2070. if (atomic_dec_and_test(&rbio->stripes_pending))
  2071. wake_up(&rbio->io_wait);
  2072. }
  2073. static void submit_write_bios(struct btrfs_raid_bio *rbio,
  2074. struct bio_list *bio_list)
  2075. {
  2076. struct bio *bio;
  2077. atomic_set(&rbio->stripes_pending, bio_list_size(bio_list));
  2078. while ((bio = bio_list_pop(bio_list))) {
  2079. bio->bi_end_io = raid_wait_write_end_io;
  2080. if (trace_raid56_write_enabled()) {
  2081. struct raid56_bio_trace_info trace_info = { 0 };
  2082. bio_get_trace_info(rbio, bio, &trace_info);
  2083. trace_raid56_write(rbio, bio, &trace_info);
  2084. }
  2085. submit_bio(bio);
  2086. }
  2087. }
  2088. /*
  2089. * To determine if we need to read any sector from the disk.
  2090. * Should only be utilized in RMW path, to skip cached rbio.
  2091. */
  2092. static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio)
  2093. {
  2094. int i;
  2095. for (i = 0; i < rbio->nr_data * rbio->stripe_nsectors; i++) {
  2096. phys_addr_t paddr = rbio->stripe_paddrs[i * rbio->sector_nsteps];
  2097. /*
  2098. * We have a sector which doesn't have page nor uptodate,
  2099. * thus this rbio can not be cached one, as cached one must
  2100. * have all its data sectors present and uptodate.
  2101. */
  2102. if (paddr == INVALID_PADDR ||
  2103. !test_bit(i, rbio->stripe_uptodate_bitmap))
  2104. return true;
  2105. }
  2106. return false;
  2107. }
  2108. static void rmw_rbio(struct btrfs_raid_bio *rbio)
  2109. {
  2110. struct bio_list bio_list;
  2111. int sectornr;
  2112. int ret = 0;
  2113. /*
  2114. * Allocate the pages for parity first, as P/Q pages will always be
  2115. * needed for both full-stripe and sub-stripe writes.
  2116. */
  2117. ret = alloc_rbio_parity_pages(rbio);
  2118. if (ret < 0)
  2119. goto out;
  2120. /*
  2121. * Either full stripe write, or we have every data sector already
  2122. * cached, can go to write path immediately.
  2123. */
  2124. if (!rbio_is_full(rbio) && need_read_stripe_sectors(rbio)) {
  2125. /*
  2126. * Now we're doing sub-stripe write, also need all data stripes
  2127. * to do the full RMW.
  2128. */
  2129. ret = alloc_rbio_data_pages(rbio);
  2130. if (ret < 0)
  2131. goto out;
  2132. index_rbio_pages(rbio);
  2133. ret = rmw_read_wait_recover(rbio);
  2134. if (ret < 0)
  2135. goto out;
  2136. }
  2137. /*
  2138. * At this stage we're not allowed to add any new bios to the
  2139. * bio list any more, anyone else that wants to change this stripe
  2140. * needs to do their own rmw.
  2141. */
  2142. spin_lock(&rbio->bio_list_lock);
  2143. set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
  2144. spin_unlock(&rbio->bio_list_lock);
  2145. bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
  2146. index_rbio_pages(rbio);
  2147. /*
  2148. * We don't cache full rbios because we're assuming
  2149. * the higher layers are unlikely to use this area of
  2150. * the disk again soon. If they do use it again,
  2151. * hopefully they will send another full bio.
  2152. */
  2153. if (!rbio_is_full(rbio))
  2154. cache_rbio_pages(rbio);
  2155. else
  2156. clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
  2157. for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++)
  2158. generate_pq_vertical(rbio, sectornr);
  2159. bio_list_init(&bio_list);
  2160. ret = rmw_assemble_write_bios(rbio, &bio_list);
  2161. if (ret < 0)
  2162. goto out;
  2163. /* We should have at least one bio assembled. */
  2164. ASSERT(bio_list_size(&bio_list));
  2165. submit_write_bios(rbio, &bio_list);
  2166. wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
  2167. /* We may have more errors than our tolerance during the read. */
  2168. for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
  2169. int found_errors;
  2170. found_errors = get_rbio_vertical_errors(rbio, sectornr, NULL, NULL);
  2171. if (unlikely(found_errors > rbio->bioc->max_errors)) {
  2172. ret = -EIO;
  2173. break;
  2174. }
  2175. }
  2176. out:
  2177. rbio_orig_end_io(rbio, errno_to_blk_status(ret));
  2178. }
  2179. static void rmw_rbio_work(struct work_struct *work)
  2180. {
  2181. struct btrfs_raid_bio *rbio;
  2182. rbio = container_of(work, struct btrfs_raid_bio, work);
  2183. if (lock_stripe_add(rbio) == 0)
  2184. rmw_rbio(rbio);
  2185. }
  2186. static void rmw_rbio_work_locked(struct work_struct *work)
  2187. {
  2188. rmw_rbio(container_of(work, struct btrfs_raid_bio, work));
  2189. }
  2190. /*
  2191. * The following code is used to scrub/replace the parity stripe
  2192. *
  2193. * Caller must have already increased bio_counter for getting @bioc.
  2194. *
  2195. * Note: We need make sure all the pages that add into the scrub/replace
  2196. * raid bio are correct and not be changed during the scrub/replace. That
  2197. * is those pages just hold metadata or file data with checksum.
  2198. */
  2199. struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
  2200. struct btrfs_io_context *bioc,
  2201. struct btrfs_device *scrub_dev,
  2202. unsigned long *dbitmap, int stripe_nsectors)
  2203. {
  2204. struct btrfs_fs_info *fs_info = bioc->fs_info;
  2205. struct btrfs_raid_bio *rbio;
  2206. int i;
  2207. rbio = alloc_rbio(fs_info, bioc);
  2208. if (IS_ERR(rbio))
  2209. return NULL;
  2210. bio_list_add(&rbio->bio_list, bio);
  2211. /*
  2212. * This is a special bio which is used to hold the completion handler
  2213. * and make the scrub rbio is similar to the other types
  2214. */
  2215. ASSERT(!bio->bi_iter.bi_size);
  2216. rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
  2217. /*
  2218. * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted
  2219. * to the end position, so this search can start from the first parity
  2220. * stripe.
  2221. */
  2222. for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
  2223. if (bioc->stripes[i].dev == scrub_dev) {
  2224. rbio->scrubp = i;
  2225. break;
  2226. }
  2227. }
  2228. ASSERT_RBIO_STRIPE(i < rbio->real_stripes, rbio, i);
  2229. bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors);
  2230. return rbio;
  2231. }
  2232. static int alloc_rbio_sector_pages(struct btrfs_raid_bio *rbio,
  2233. int sector_nr)
  2234. {
  2235. const u32 step = min(PAGE_SIZE, rbio->bioc->fs_info->sectorsize);
  2236. const u32 base = sector_nr * rbio->sector_nsteps;
  2237. for (int i = base; i < base + rbio->sector_nsteps; i++) {
  2238. const unsigned int page_index = (i * step) >> PAGE_SHIFT;
  2239. struct page *page;
  2240. if (rbio->stripe_pages[page_index])
  2241. continue;
  2242. page = alloc_page(GFP_NOFS);
  2243. if (!page)
  2244. return -ENOMEM;
  2245. rbio->stripe_pages[page_index] = page;
  2246. }
  2247. return 0;
  2248. }
  2249. /*
  2250. * We just scrub the parity that we have correct data on the same horizontal,
  2251. * so we needn't allocate all pages for all the stripes.
  2252. */
  2253. static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
  2254. {
  2255. int total_sector_nr;
  2256. for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
  2257. total_sector_nr++) {
  2258. int sectornr = total_sector_nr % rbio->stripe_nsectors;
  2259. int ret;
  2260. if (!test_bit(sectornr, &rbio->dbitmap))
  2261. continue;
  2262. ret = alloc_rbio_sector_pages(rbio, total_sector_nr);
  2263. if (ret < 0)
  2264. return ret;
  2265. }
  2266. index_stripe_sectors(rbio);
  2267. return 0;
  2268. }
  2269. /* Return true if the content of the step matches the caclulated one. */
  2270. static bool verify_one_parity_step(struct btrfs_raid_bio *rbio,
  2271. void *pointers[], unsigned int sector_nr,
  2272. unsigned int step_nr)
  2273. {
  2274. const unsigned int nr_data = rbio->nr_data;
  2275. const bool has_qstripe = (rbio->real_stripes - rbio->nr_data == 2);
  2276. const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE);
  2277. void *parity;
  2278. bool ret = false;
  2279. ASSERT(step_nr < rbio->sector_nsteps);
  2280. /* First collect one page from each data stripe. */
  2281. for (int stripe = 0; stripe < nr_data; stripe++)
  2282. pointers[stripe] = kmap_local_paddr(
  2283. sector_paddr_in_rbio(rbio, stripe, sector_nr,
  2284. step_nr, 0));
  2285. if (has_qstripe) {
  2286. assert_rbio(rbio);
  2287. /* RAID6, call the library function to fill in our P/Q. */
  2288. raid6_call.gen_syndrome(rbio->real_stripes, step, pointers);
  2289. } else {
  2290. /* RAID5. */
  2291. memcpy(pointers[nr_data], pointers[0], step);
  2292. run_xor(pointers + 1, nr_data - 1, step);
  2293. }
  2294. /* Check scrubbing parity and repair it. */
  2295. parity = kmap_local_paddr(rbio_stripe_paddr(rbio, rbio->scrubp, sector_nr, step_nr));
  2296. if (memcmp(parity, pointers[rbio->scrubp], step) != 0)
  2297. memcpy(parity, pointers[rbio->scrubp], step);
  2298. else
  2299. ret = true;
  2300. kunmap_local(parity);
  2301. for (int stripe = nr_data - 1; stripe >= 0; stripe--)
  2302. kunmap_local(pointers[stripe]);
  2303. return ret;
  2304. }
  2305. /*
  2306. * The @pointers array should have the P/Q parity already mapped.
  2307. */
  2308. static void verify_one_parity_sector(struct btrfs_raid_bio *rbio,
  2309. void *pointers[], unsigned int sector_nr)
  2310. {
  2311. bool found_error = false;
  2312. for (int step_nr = 0; step_nr < rbio->sector_nsteps; step_nr++) {
  2313. bool match;
  2314. match = verify_one_parity_step(rbio, pointers, sector_nr, step_nr);
  2315. if (!match)
  2316. found_error = true;
  2317. }
  2318. if (!found_error)
  2319. bitmap_clear(&rbio->dbitmap, sector_nr, 1);
  2320. }
  2321. static int finish_parity_scrub(struct btrfs_raid_bio *rbio)
  2322. {
  2323. struct btrfs_io_context *bioc = rbio->bioc;
  2324. void **pointers = rbio->finish_pointers;
  2325. unsigned long *pbitmap = &rbio->finish_pbitmap;
  2326. int nr_data = rbio->nr_data;
  2327. int sectornr;
  2328. bool has_qstripe;
  2329. struct page *page;
  2330. phys_addr_t p_paddr = INVALID_PADDR;
  2331. phys_addr_t q_paddr = INVALID_PADDR;
  2332. struct bio_list bio_list;
  2333. int is_replace = 0;
  2334. int ret;
  2335. bio_list_init(&bio_list);
  2336. if (rbio->real_stripes - rbio->nr_data == 1)
  2337. has_qstripe = false;
  2338. else if (rbio->real_stripes - rbio->nr_data == 2)
  2339. has_qstripe = true;
  2340. else
  2341. BUG();
  2342. /*
  2343. * Replace is running and our P/Q stripe is being replaced, then we
  2344. * need to duplicate the final write to replace target.
  2345. */
  2346. if (bioc->replace_nr_stripes && bioc->replace_stripe_src == rbio->scrubp) {
  2347. is_replace = 1;
  2348. bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors);
  2349. }
  2350. /*
  2351. * Because the higher layers(scrubber) are unlikely to
  2352. * use this area of the disk again soon, so don't cache
  2353. * it.
  2354. */
  2355. clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
  2356. page = alloc_page(GFP_NOFS);
  2357. if (!page)
  2358. return -ENOMEM;
  2359. p_paddr = page_to_phys(page);
  2360. page = NULL;
  2361. pointers[nr_data] = kmap_local_paddr(p_paddr);
  2362. if (has_qstripe) {
  2363. /* RAID6, allocate and map temp space for the Q stripe */
  2364. page = alloc_page(GFP_NOFS);
  2365. if (!page) {
  2366. __free_page(phys_to_page(p_paddr));
  2367. p_paddr = INVALID_PADDR;
  2368. return -ENOMEM;
  2369. }
  2370. q_paddr = page_to_phys(page);
  2371. page = NULL;
  2372. pointers[rbio->real_stripes - 1] = kmap_local_paddr(q_paddr);
  2373. }
  2374. bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
  2375. /* Map the parity stripe just once */
  2376. for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors)
  2377. verify_one_parity_sector(rbio, pointers, sectornr);
  2378. kunmap_local(pointers[nr_data]);
  2379. __free_page(phys_to_page(p_paddr));
  2380. p_paddr = INVALID_PADDR;
  2381. if (q_paddr != INVALID_PADDR) {
  2382. __free_page(phys_to_page(q_paddr));
  2383. q_paddr = INVALID_PADDR;
  2384. }
  2385. /*
  2386. * time to start writing. Make bios for everything from the
  2387. * higher layers (the bio_list in our rbio) and our p/q. Ignore
  2388. * everything else.
  2389. */
  2390. for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
  2391. phys_addr_t *paddrs;
  2392. paddrs = rbio_stripe_paddrs(rbio, rbio->scrubp, sectornr);
  2393. ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, rbio->scrubp,
  2394. sectornr, REQ_OP_WRITE);
  2395. if (ret)
  2396. goto cleanup;
  2397. }
  2398. if (!is_replace)
  2399. goto submit_write;
  2400. /*
  2401. * Replace is running and our parity stripe needs to be duplicated to
  2402. * the target device. Check we have a valid source stripe number.
  2403. */
  2404. ASSERT_RBIO(rbio->bioc->replace_stripe_src >= 0, rbio);
  2405. for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) {
  2406. phys_addr_t *paddrs;
  2407. paddrs = rbio_stripe_paddrs(rbio, rbio->scrubp, sectornr);
  2408. ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, rbio->real_stripes,
  2409. sectornr, REQ_OP_WRITE);
  2410. if (ret)
  2411. goto cleanup;
  2412. }
  2413. submit_write:
  2414. submit_write_bios(rbio, &bio_list);
  2415. return 0;
  2416. cleanup:
  2417. bio_list_put(&bio_list);
  2418. return ret;
  2419. }
  2420. static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
  2421. {
  2422. if (stripe >= 0 && stripe < rbio->nr_data)
  2423. return 1;
  2424. return 0;
  2425. }
  2426. static int recover_scrub_rbio(struct btrfs_raid_bio *rbio)
  2427. {
  2428. void **pointers = NULL;
  2429. void **unmap_array = NULL;
  2430. int sector_nr;
  2431. int ret = 0;
  2432. /*
  2433. * @pointers array stores the pointer for each sector.
  2434. *
  2435. * @unmap_array stores copy of pointers that does not get reordered
  2436. * during reconstruction so that kunmap_local works.
  2437. */
  2438. pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
  2439. unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
  2440. if (!pointers || !unmap_array) {
  2441. ret = -ENOMEM;
  2442. goto out;
  2443. }
  2444. for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
  2445. int dfail = 0, failp = -1;
  2446. int faila;
  2447. int failb;
  2448. int found_errors;
  2449. found_errors = get_rbio_vertical_errors(rbio, sector_nr,
  2450. &faila, &failb);
  2451. if (unlikely(found_errors > rbio->bioc->max_errors)) {
  2452. ret = -EIO;
  2453. goto out;
  2454. }
  2455. if (found_errors == 0)
  2456. continue;
  2457. /* We should have at least one error here. */
  2458. ASSERT(faila >= 0 || failb >= 0);
  2459. if (is_data_stripe(rbio, faila))
  2460. dfail++;
  2461. else if (is_parity_stripe(faila))
  2462. failp = faila;
  2463. if (is_data_stripe(rbio, failb))
  2464. dfail++;
  2465. else if (is_parity_stripe(failb))
  2466. failp = failb;
  2467. /*
  2468. * Because we can not use a scrubbing parity to repair the
  2469. * data, so the capability of the repair is declined. (In the
  2470. * case of RAID5, we can not repair anything.)
  2471. */
  2472. if (unlikely(dfail > rbio->bioc->max_errors - 1)) {
  2473. ret = -EIO;
  2474. goto out;
  2475. }
  2476. /*
  2477. * If all data is good, only parity is correctly, just repair
  2478. * the parity, no need to recover data stripes.
  2479. */
  2480. if (dfail == 0)
  2481. continue;
  2482. /*
  2483. * Here means we got one corrupted data stripe and one
  2484. * corrupted parity on RAID6, if the corrupted parity is
  2485. * scrubbing parity, luckily, use the other one to repair the
  2486. * data, or we can not repair the data stripe.
  2487. */
  2488. if (unlikely(failp != rbio->scrubp)) {
  2489. ret = -EIO;
  2490. goto out;
  2491. }
  2492. ret = recover_vertical(rbio, sector_nr, pointers, unmap_array);
  2493. if (ret < 0)
  2494. goto out;
  2495. }
  2496. out:
  2497. kfree(pointers);
  2498. kfree(unmap_array);
  2499. return ret;
  2500. }
  2501. static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio)
  2502. {
  2503. struct bio_list bio_list = BIO_EMPTY_LIST;
  2504. int total_sector_nr;
  2505. int ret = 0;
  2506. /* Build a list of bios to read all the missing parts. */
  2507. for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
  2508. total_sector_nr++) {
  2509. int sectornr = total_sector_nr % rbio->stripe_nsectors;
  2510. int stripe = total_sector_nr / rbio->stripe_nsectors;
  2511. phys_addr_t *paddrs;
  2512. /* No data in the vertical stripe, no need to read. */
  2513. if (!test_bit(sectornr, &rbio->dbitmap))
  2514. continue;
  2515. /*
  2516. * We want to find all the sectors missing from the rbio and
  2517. * read them from the disk. If sector_paddr_in_rbio() finds a sector
  2518. * in the bio list we don't need to read it off the stripe.
  2519. */
  2520. paddrs = sector_paddrs_in_rbio(rbio, stripe, sectornr, 1);
  2521. if (paddrs == NULL)
  2522. continue;
  2523. paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr);
  2524. /*
  2525. * The bio cache may have handed us an uptodate sector. If so,
  2526. * use it.
  2527. */
  2528. if (test_bit(rbio_sector_index(rbio, stripe, sectornr),
  2529. rbio->stripe_uptodate_bitmap))
  2530. continue;
  2531. ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, stripe,
  2532. sectornr, REQ_OP_READ);
  2533. if (ret) {
  2534. bio_list_put(&bio_list);
  2535. return ret;
  2536. }
  2537. }
  2538. submit_read_wait_bio_list(rbio, &bio_list);
  2539. return 0;
  2540. }
  2541. static void scrub_rbio(struct btrfs_raid_bio *rbio)
  2542. {
  2543. int sector_nr;
  2544. int ret;
  2545. ret = alloc_rbio_essential_pages(rbio);
  2546. if (ret)
  2547. goto out;
  2548. bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
  2549. ret = scrub_assemble_read_bios(rbio);
  2550. if (ret < 0)
  2551. goto out;
  2552. /* We may have some failures, recover the failed sectors first. */
  2553. ret = recover_scrub_rbio(rbio);
  2554. if (ret < 0)
  2555. goto out;
  2556. /*
  2557. * We have every sector properly prepared. Can finish the scrub
  2558. * and writeback the good content.
  2559. */
  2560. ret = finish_parity_scrub(rbio);
  2561. wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
  2562. for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
  2563. int found_errors;
  2564. found_errors = get_rbio_vertical_errors(rbio, sector_nr, NULL, NULL);
  2565. if (unlikely(found_errors > rbio->bioc->max_errors)) {
  2566. ret = -EIO;
  2567. break;
  2568. }
  2569. }
  2570. out:
  2571. rbio_orig_end_io(rbio, errno_to_blk_status(ret));
  2572. }
  2573. static void scrub_rbio_work_locked(struct work_struct *work)
  2574. {
  2575. scrub_rbio(container_of(work, struct btrfs_raid_bio, work));
  2576. }
  2577. void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
  2578. {
  2579. if (!lock_stripe_add(rbio))
  2580. start_async_work(rbio, scrub_rbio_work_locked);
  2581. }
  2582. /*
  2583. * This is for scrub call sites where we already have correct data contents.
  2584. * This allows us to avoid reading data stripes again.
  2585. *
  2586. * Unfortunately here we have to do folio copy, other than reusing the pages.
  2587. * This is due to the fact rbio has its own page management for its cache.
  2588. */
  2589. void raid56_parity_cache_data_folios(struct btrfs_raid_bio *rbio,
  2590. struct folio **data_folios, u64 data_logical)
  2591. {
  2592. struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
  2593. const u64 offset_in_full_stripe = data_logical -
  2594. rbio->bioc->full_stripe_logical;
  2595. unsigned int findex = 0;
  2596. unsigned int foffset = 0;
  2597. int ret;
  2598. /*
  2599. * If we hit ENOMEM temporarily, but later at
  2600. * raid56_parity_submit_scrub_rbio() time it succeeded, we just do
  2601. * the extra read, not a big deal.
  2602. *
  2603. * If we hit ENOMEM later at raid56_parity_submit_scrub_rbio() time,
  2604. * the bio would got proper error number set.
  2605. */
  2606. ret = alloc_rbio_data_pages(rbio);
  2607. if (ret < 0)
  2608. return;
  2609. /* data_logical must be at stripe boundary and inside the full stripe. */
  2610. ASSERT(IS_ALIGNED(offset_in_full_stripe, BTRFS_STRIPE_LEN));
  2611. ASSERT(offset_in_full_stripe < (rbio->nr_data << BTRFS_STRIPE_LEN_SHIFT));
  2612. for (unsigned int cur_off = offset_in_full_stripe;
  2613. cur_off < offset_in_full_stripe + BTRFS_STRIPE_LEN;
  2614. cur_off += PAGE_SIZE) {
  2615. const unsigned int pindex = cur_off >> PAGE_SHIFT;
  2616. void *kaddr;
  2617. kaddr = kmap_local_page(rbio->stripe_pages[pindex]);
  2618. memcpy_from_folio(kaddr, data_folios[findex], foffset, PAGE_SIZE);
  2619. kunmap_local(kaddr);
  2620. foffset += PAGE_SIZE;
  2621. ASSERT(foffset <= folio_size(data_folios[findex]));
  2622. if (foffset == folio_size(data_folios[findex])) {
  2623. findex++;
  2624. foffset = 0;
  2625. }
  2626. }
  2627. bitmap_set(rbio->stripe_uptodate_bitmap,
  2628. offset_in_full_stripe >> fs_info->sectorsize_bits,
  2629. BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits);
  2630. }