raid1.c 93 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500
  1. // SPDX-License-Identifier: GPL-2.0-or-later
  2. /*
  3. * raid1.c : Multiple Devices driver for Linux
  4. *
  5. * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
  6. *
  7. * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
  8. *
  9. * RAID-1 management functions.
  10. *
  11. * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
  12. *
  13. * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk>
  14. * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
  15. *
  16. * Changes by Peter T. Breuer <ptb@it.uc3m.es> 31/1/2003 to support
  17. * bitmapped intelligence in resync:
  18. *
  19. * - bitmap marked during normal i/o
  20. * - bitmap used to skip nondirty blocks during sync
  21. *
  22. * Additions to bitmap code, (C) 2003-2004 Paul Clements, SteelEye Technology:
  23. * - persistent bitmap code
  24. */
  25. #include <linux/slab.h>
  26. #include <linux/delay.h>
  27. #include <linux/blkdev.h>
  28. #include <linux/module.h>
  29. #include <linux/seq_file.h>
  30. #include <linux/ratelimit.h>
  31. #include <linux/interval_tree_generic.h>
  32. #include <trace/events/block.h>
  33. #include "md.h"
  34. #include "raid1.h"
  35. #include "md-bitmap.h"
  36. #include "md-cluster.h"
  /*
   * Bit mask combining the MD_HAS_JOURNAL, MD_JOURNAL_CLEAN, MD_HAS_PPL and
   * MD_HAS_MULTIPLE_PPLS flag bits.
   * NOTE(review): presumably the mddev flags this personality refuses to
   * run with; the place where the mask is tested is outside this view.
   */
  #define UNSUPPORTED_MDDEV_FLAGS \
      ((1L << MD_HAS_JOURNAL) | \
      (1L << MD_JOURNAL_CLEAN) | \
      (1L << MD_HAS_PPL) | \
      (1L << MD_HAS_MULTIPLE_PPLS))
  /*
   * Forward declarations: allow_barrier() and lower_barrier() are called
   * below (raid_end_bio_io(), put_buf()) before their definitions appear.
   */
  static void allow_barrier(struct r1conf *conf, sector_t sector_nr);
  static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
  static void raid1_free(struct mddev *mddev, void *priv);
  /*
   * RAID_1_10_NAME is defined before raid1-10.c is textually included.
   * NOTE(review): presumably consumed by that shared file; confirm there.
   */
  #define RAID_1_10_NAME "raid1"
  #include "raid1-10.c"
  /*
   * Interval tree over struct serial_info, keyed by its start/last fields.
   * The generated helpers carry the raid1_rb prefix; this file uses
   * raid1_rb_insert(), raid1_rb_remove(), raid1_rb_iter_first() and
   * raid1_rb_iter_next().
   */
  #define START(node) ((node)->start)
  #define LAST(node) ((node)->last)
  INTERVAL_TREE_DEFINE(struct serial_info, node, sector_t, _subtree_last,
      START, LAST, static inline, raid1_rb);
  51. static int check_and_add_serial(struct md_rdev *rdev, struct r1bio *r1_bio,
  52. struct serial_info *si, int idx)
  53. {
  54. unsigned long flags;
  55. int ret = 0;
  56. sector_t lo = r1_bio->sector;
  57. sector_t hi = lo + r1_bio->sectors;
  58. struct serial_in_rdev *serial = &rdev->serial[idx];
  59. spin_lock_irqsave(&serial->serial_lock, flags);
  60. /* collision happened */
  61. if (raid1_rb_iter_first(&serial->serial_rb, lo, hi))
  62. ret = -EBUSY;
  63. else {
  64. si->start = lo;
  65. si->last = hi;
  66. raid1_rb_insert(si, &serial->serial_rb);
  67. }
  68. spin_unlock_irqrestore(&serial->serial_lock, flags);
  69. return ret;
  70. }
  71. static void wait_for_serialization(struct md_rdev *rdev, struct r1bio *r1_bio)
  72. {
  73. struct mddev *mddev = rdev->mddev;
  74. struct serial_info *si;
  75. int idx = sector_to_idx(r1_bio->sector);
  76. struct serial_in_rdev *serial = &rdev->serial[idx];
  77. if (WARN_ON(!mddev->serial_info_pool))
  78. return;
  79. si = mempool_alloc(mddev->serial_info_pool, GFP_NOIO);
  80. wait_event(serial->serial_io_wait,
  81. check_and_add_serial(rdev, r1_bio, si, idx) == 0);
  82. }
  83. static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi)
  84. {
  85. struct serial_info *si;
  86. unsigned long flags;
  87. int found = 0;
  88. struct mddev *mddev = rdev->mddev;
  89. int idx = sector_to_idx(lo);
  90. struct serial_in_rdev *serial = &rdev->serial[idx];
  91. spin_lock_irqsave(&serial->serial_lock, flags);
  92. for (si = raid1_rb_iter_first(&serial->serial_rb, lo, hi);
  93. si; si = raid1_rb_iter_next(si, lo, hi)) {
  94. if (si->start == lo && si->last == hi) {
  95. raid1_rb_remove(si, &serial->serial_rb);
  96. mempool_free(si, mddev->serial_info_pool);
  97. found = 1;
  98. break;
  99. }
  100. }
  101. if (!found)
  102. WARN(1, "The write IO is not recorded for serialization\n");
  103. spin_unlock_irqrestore(&serial->serial_lock, flags);
  104. wake_up(&serial->serial_io_wait);
  105. }
  106. /*
  107. * for resync bio, r1bio pointer can be retrieved from the per-bio
  108. * 'struct resync_pages'.
  109. */
  110. static inline struct r1bio *get_resync_r1bio(struct bio *bio)
  111. {
  112. return get_resync_pages(bio)->raid_bio;
  113. }
  114. static void *r1bio_pool_alloc(gfp_t gfp_flags, struct r1conf *conf)
  115. {
  116. int size = offsetof(struct r1bio, bios[conf->raid_disks * 2]);
  117. /* allocate a r1bio with room for raid_disks entries in the bios array */
  118. return kzalloc(size, gfp_flags);
  119. }
  /*
   * Resync sizing constants.  RESYNC_BLOCK_SIZE is defined outside this
   * view.  The *_SECTORS variants are the corresponding sizes shifted right
   * by 9, i.e. divided by 512.
   * NOTE(review): this implies the unshifted values are byte counts and a
   * sector is 512 bytes; confirm against the RESYNC_BLOCK_SIZE definition.
   */
  #define RESYNC_DEPTH 32
  #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
  /* RESYNC_DEPTH resync blocks */
  #define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH)
  #define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9)
  /* sixteen times the regular resync window */
  #define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW)
  #define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)
  126. static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
  127. {
  128. struct r1conf *conf = data;
  129. struct r1bio *r1_bio;
  130. struct bio *bio;
  131. int need_pages;
  132. int j;
  133. struct resync_pages *rps;
  134. r1_bio = r1bio_pool_alloc(gfp_flags, conf);
  135. if (!r1_bio)
  136. return NULL;
  137. rps = kmalloc_objs(struct resync_pages, conf->raid_disks * 2, gfp_flags);
  138. if (!rps)
  139. goto out_free_r1bio;
  140. /*
  141. * Allocate bios : 1 for reading, n-1 for writing
  142. */
  143. for (j = conf->raid_disks * 2; j-- ; ) {
  144. bio = bio_kmalloc(RESYNC_PAGES, gfp_flags);
  145. if (!bio)
  146. goto out_free_bio;
  147. bio_init_inline(bio, NULL, RESYNC_PAGES, 0);
  148. r1_bio->bios[j] = bio;
  149. }
  150. /*
  151. * Allocate RESYNC_PAGES data pages and attach them to
  152. * the first bio.
  153. * If this is a user-requested check/repair, allocate
  154. * RESYNC_PAGES for each bio.
  155. */
  156. if (test_bit(MD_RECOVERY_REQUESTED, &conf->mddev->recovery))
  157. need_pages = conf->raid_disks * 2;
  158. else
  159. need_pages = 1;
  160. for (j = 0; j < conf->raid_disks * 2; j++) {
  161. struct resync_pages *rp = &rps[j];
  162. bio = r1_bio->bios[j];
  163. if (j < need_pages) {
  164. if (resync_alloc_pages(rp, gfp_flags))
  165. goto out_free_pages;
  166. } else {
  167. memcpy(rp, &rps[0], sizeof(*rp));
  168. resync_get_all_pages(rp);
  169. }
  170. rp->raid_bio = r1_bio;
  171. bio->bi_private = rp;
  172. }
  173. r1_bio->master_bio = NULL;
  174. return r1_bio;
  175. out_free_pages:
  176. while (--j >= 0)
  177. resync_free_pages(&rps[j]);
  178. out_free_bio:
  179. while (++j < conf->raid_disks * 2) {
  180. bio_uninit(r1_bio->bios[j]);
  181. kfree(r1_bio->bios[j]);
  182. }
  183. kfree(rps);
  184. out_free_r1bio:
  185. rbio_pool_free(r1_bio, data);
  186. return NULL;
  187. }
  188. static void r1buf_pool_free(void *__r1_bio, void *data)
  189. {
  190. struct r1conf *conf = data;
  191. int i;
  192. struct r1bio *r1bio = __r1_bio;
  193. struct resync_pages *rp = NULL;
  194. for (i = conf->raid_disks * 2; i--; ) {
  195. rp = get_resync_pages(r1bio->bios[i]);
  196. resync_free_pages(rp);
  197. bio_uninit(r1bio->bios[i]);
  198. kfree(r1bio->bios[i]);
  199. }
  200. /* resync pages array stored in the 1st bio's .bi_private */
  201. kfree(rp);
  202. rbio_pool_free(r1bio, data);
  203. }
  204. static void put_all_bios(struct r1conf *conf, struct r1bio *r1_bio)
  205. {
  206. int i;
  207. for (i = 0; i < conf->raid_disks * 2; i++) {
  208. struct bio **bio = r1_bio->bios + i;
  209. if (!BIO_SPECIAL(*bio))
  210. bio_put(*bio);
  211. *bio = NULL;
  212. }
  213. }
  214. static void free_r1bio(struct r1bio *r1_bio)
  215. {
  216. struct r1conf *conf = r1_bio->mddev->private;
  217. put_all_bios(conf, r1_bio);
  218. mempool_free(r1_bio, conf->r1bio_pool);
  219. }
  220. static void put_buf(struct r1bio *r1_bio)
  221. {
  222. struct r1conf *conf = r1_bio->mddev->private;
  223. sector_t sect = r1_bio->sector;
  224. int i;
  225. for (i = 0; i < conf->raid_disks * 2; i++) {
  226. struct bio *bio = r1_bio->bios[i];
  227. if (bio->bi_end_io)
  228. rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev);
  229. }
  230. mempool_free(r1_bio, &conf->r1buf_pool);
  231. lower_barrier(conf, sect);
  232. }
  233. static void reschedule_retry(struct r1bio *r1_bio)
  234. {
  235. unsigned long flags;
  236. struct mddev *mddev = r1_bio->mddev;
  237. struct r1conf *conf = mddev->private;
  238. int idx;
  239. idx = sector_to_idx(r1_bio->sector);
  240. spin_lock_irqsave(&conf->device_lock, flags);
  241. list_add(&r1_bio->retry_list, &conf->retry_list);
  242. atomic_inc(&conf->nr_queued[idx]);
  243. spin_unlock_irqrestore(&conf->device_lock, flags);
  244. wake_up(&conf->wait_barrier);
  245. md_wakeup_thread(mddev->thread);
  246. }
  247. /*
  248. * raid_end_bio_io() is called when we have finished servicing a mirrored
  249. * operation and are ready to return a success/failure code to the buffer
  250. * cache layer.
  251. */
  252. static void call_bio_endio(struct r1bio *r1_bio)
  253. {
  254. struct bio *bio = r1_bio->master_bio;
  255. if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
  256. bio->bi_status = BLK_STS_IOERR;
  257. bio_endio(bio);
  258. }
  259. static void raid_end_bio_io(struct r1bio *r1_bio)
  260. {
  261. struct bio *bio = r1_bio->master_bio;
  262. struct r1conf *conf = r1_bio->mddev->private;
  263. sector_t sector = r1_bio->sector;
  264. /* if nobody has done the final endio yet, do it now */
  265. if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
  266. pr_debug("raid1: sync end %s on sectors %llu-%llu\n",
  267. (bio_data_dir(bio) == WRITE) ? "write" : "read",
  268. (unsigned long long) bio->bi_iter.bi_sector,
  269. (unsigned long long) bio_end_sector(bio) - 1);
  270. call_bio_endio(r1_bio);
  271. }
  272. free_r1bio(r1_bio);
  273. /*
  274. * Wake up any possible resync thread that waits for the device
  275. * to go idle. All I/Os, even write-behind writes, are done.
  276. */
  277. allow_barrier(conf, sector);
  278. }
  279. /*
  280. * Update disk head position estimator based on IRQ completion info.
  281. */
  282. static inline void update_head_pos(int disk, struct r1bio *r1_bio)
  283. {
  284. struct r1conf *conf = r1_bio->mddev->private;
  285. conf->mirrors[disk].head_position =
  286. r1_bio->sector + (r1_bio->sectors);
  287. }
  288. /*
  289. * Find the disk number which triggered given bio
  290. */
  291. static int find_bio_disk(struct r1bio *r1_bio, struct bio *bio)
  292. {
  293. int mirror;
  294. struct r1conf *conf = r1_bio->mddev->private;
  295. int raid_disks = conf->raid_disks;
  296. for (mirror = 0; mirror < raid_disks * 2; mirror++)
  297. if (r1_bio->bios[mirror] == bio)
  298. break;
  299. BUG_ON(mirror == raid_disks * 2);
  300. update_head_pos(mirror, r1_bio);
  301. return mirror;
  302. }
  303. static void raid1_end_read_request(struct bio *bio)
  304. {
  305. int uptodate = !bio->bi_status;
  306. struct r1bio *r1_bio = bio->bi_private;
  307. struct r1conf *conf = r1_bio->mddev->private;
  308. struct md_rdev *rdev = conf->mirrors[r1_bio->read_disk].rdev;
  309. /*
  310. * this branch is our 'one mirror IO has finished' event handler:
  311. */
  312. update_head_pos(r1_bio->read_disk, r1_bio);
  313. if (uptodate) {
  314. set_bit(R1BIO_Uptodate, &r1_bio->state);
  315. } else if (test_bit(FailFast, &rdev->flags) &&
  316. test_bit(R1BIO_FailFast, &r1_bio->state)) {
  317. /* This was a fail-fast read so we definitely
  318. * want to retry */
  319. ;
  320. } else if (!raid1_should_handle_error(bio)) {
  321. uptodate = 1;
  322. } else {
  323. /* If all other devices have failed, we want to return
  324. * the error upwards rather than fail the last device.
  325. * Here we redefine "uptodate" to mean "Don't want to retry"
  326. */
  327. unsigned long flags;
  328. spin_lock_irqsave(&conf->device_lock, flags);
  329. if (r1_bio->mddev->degraded == conf->raid_disks ||
  330. (r1_bio->mddev->degraded == conf->raid_disks-1 &&
  331. test_bit(In_sync, &rdev->flags)))
  332. uptodate = 1;
  333. spin_unlock_irqrestore(&conf->device_lock, flags);
  334. }
  335. if (uptodate) {
  336. raid_end_bio_io(r1_bio);
  337. rdev_dec_pending(rdev, conf->mddev);
  338. } else {
  339. /*
  340. * oops, read error:
  341. */
  342. pr_err_ratelimited("md/raid1:%s: %pg: rescheduling sector %llu\n",
  343. mdname(conf->mddev),
  344. rdev->bdev,
  345. (unsigned long long)r1_bio->sector);
  346. set_bit(R1BIO_ReadError, &r1_bio->state);
  347. reschedule_retry(r1_bio);
  348. /* don't drop the reference on read_disk yet */
  349. }
  350. }
  351. static void close_write(struct r1bio *r1_bio)
  352. {
  353. struct mddev *mddev = r1_bio->mddev;
  354. /* it really is the end of this request */
  355. if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
  356. bio_free_pages(r1_bio->behind_master_bio);
  357. bio_put(r1_bio->behind_master_bio);
  358. r1_bio->behind_master_bio = NULL;
  359. }
  360. if (test_bit(R1BIO_BehindIO, &r1_bio->state))
  361. mddev->bitmap_ops->end_behind_write(mddev);
  362. md_write_end(mddev);
  363. }
  364. static void r1_bio_write_done(struct r1bio *r1_bio)
  365. {
  366. if (!atomic_dec_and_test(&r1_bio->remaining))
  367. return;
  368. if (test_bit(R1BIO_WriteError, &r1_bio->state))
  369. reschedule_retry(r1_bio);
  370. else {
  371. close_write(r1_bio);
  372. if (test_bit(R1BIO_MadeGood, &r1_bio->state))
  373. reschedule_retry(r1_bio);
  374. else
  375. raid_end_bio_io(r1_bio);
  376. }
  377. }
  /*
   * Completion handler for one per-mirror write bio.
   *
   * bio->bi_private is the owning r1bio; the mirror is identified by
   * looking @bio up in r1_bio->bios[].  The handler
   *  - records a write error on the rdev, or marks the r1bio uptodate /
   *    "made good" on success,
   *  - in write-behind mode may end the master bio early,
   *  - removes the serialization record for the written range if one is
   *    in use,
   *  - drops the rdev's pending reference when the slot has been cleared,
   *  - and finally accounts the completion via r1_bio_write_done().
   */
  static void raid1_end_write_request(struct bio *bio)
  {
      struct r1bio *r1_bio = bio->bi_private;
      int behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
      struct r1conf *conf = r1_bio->mddev->private;
      struct bio *to_put = NULL;
      int mirror = find_bio_disk(r1_bio, bio);
      struct md_rdev *rdev = conf->mirrors[mirror].rdev;
      /* same [lo, hi] range that check_and_add_serial() records */
      sector_t lo = r1_bio->sector;
      sector_t hi = r1_bio->sector + r1_bio->sectors;
      /*
       * An error is ignored when raid1_should_handle_error() says it need
       * not be handled, or when the failed bio was a discard.
       */
      bool ignore_error = !raid1_should_handle_error(bio) ||
          (bio->bi_status && bio_op(bio) == REQ_OP_DISCARD);
      /*
       * 'one mirror IO has finished' event handler:
       */
      if (bio->bi_status && !ignore_error) {
          set_bit(WriteErrorSeen, &rdev->flags);
          /* request recovery only the first time WantReplacement is set */
          if (!test_and_set_bit(WantReplacement, &rdev->flags))
              set_bit(MD_RECOVERY_NEEDED, &
                  conf->mddev->recovery);
          if (test_bit(FailFast, &rdev->flags) &&
              (bio->bi_opf & MD_FAILFAST) &&
              /* We never try FailFast to WriteMostly devices */
              !test_bit(WriteMostly, &rdev->flags)) {
              md_error(r1_bio->mddev, rdev);
          }
          /*
           * When the device is faulty, it is not necessary to
           * handle write error.
           */
          if (!test_bit(Faulty, &rdev->flags))
              set_bit(R1BIO_WriteError, &r1_bio->state);
          else {
              /* Finished with this branch */
              r1_bio->bios[mirror] = NULL;
              to_put = bio;
          }
      } else {
          /*
           * Set R1BIO_Uptodate in our master bio, so that we
           * will return a good error code to the higher
           * levels even if IO on some other mirrored buffer
           * fails.
           *
           * The 'master' represents the composite IO operation
           * to user-side. So if something waits for IO, then it
           * will wait for the 'master' bio.
           */
          r1_bio->bios[mirror] = NULL;
          to_put = bio;
          /*
           * Do not set R1BIO_Uptodate if the current device is
           * rebuilding or Faulty. This is because we cannot use
           * such device for properly reading the data back (we could
           * potentially use it, if the current write would have fallen
           * before rdev->recovery_offset, but for simplicity we don't
           * check this here.
           */
          if (test_bit(In_sync, &rdev->flags) &&
              !test_bit(Faulty, &rdev->flags))
              set_bit(R1BIO_Uptodate, &r1_bio->state);
          /*
           * Maybe we can clear some bad blocks: the slot is re-marked
           * IO_MADE_GOOD (so it is no longer NULL below) and the r1bio is
           * flagged R1BIO_MadeGood.
           */
          if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors) &&
              !ignore_error) {
              r1_bio->bios[mirror] = IO_MADE_GOOD;
              set_bit(R1BIO_MadeGood, &r1_bio->state);
          }
      }
      if (behind) {
          /* drop the serialization record taken for this rdev, if any */
          if (test_bit(CollisionCheck, &rdev->flags))
              remove_serial(rdev, lo, hi);
          if (test_bit(WriteMostly, &rdev->flags))
              atomic_dec(&r1_bio->behind_remaining);
          /*
           * In behind mode, we ACK the master bio once the I/O
           * has safely reached all non-writemostly
           * disks. Setting the Returned bit ensures that this
           * gets done only once -- we don't ever want to return
           * -EIO here, instead we'll wait
           */
          if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
              test_bit(R1BIO_Uptodate, &r1_bio->state)) {
              /* Maybe we can return now */
              if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
                  struct bio *mbio = r1_bio->master_bio;
                  pr_debug("raid1: behind end write sectors"
                      " %llu-%llu\n",
                      (unsigned long long) mbio->bi_iter.bi_sector,
                      (unsigned long long) bio_end_sector(mbio) - 1);
                  call_bio_endio(r1_bio);
              }
          }
      } else if (test_bit(MD_SERIALIZE_POLICY, &rdev->mddev->flags))
          remove_serial(rdev, lo, hi);
      /*
       * The slot is NULL when this mirror needs no further handling; only
       * then is the pending reference on the rdev dropped here.
       */
      if (r1_bio->bios[mirror] == NULL)
          rdev_dec_pending(rdev, conf->mddev);
      /*
       * Let's see if all mirrored write operations have finished
       * already.
       */
      r1_bio_write_done(r1_bio);
      /* the bio reference is released last, after the accounting above */
      if (to_put)
          bio_put(to_put);
  }
  482. static sector_t align_to_barrier_unit_end(sector_t start_sector,
  483. sector_t sectors)
  484. {
  485. sector_t len;
  486. WARN_ON(sectors == 0);
  487. /*
  488. * len is the number of sectors from start_sector to end of the
  489. * barrier unit which start_sector belongs to.
  490. */
  491. len = round_up(start_sector + 1, BARRIER_UNIT_SECTOR_SIZE) -
  492. start_sector;
  493. if (len > sectors)
  494. len = sectors;
  495. return len;
  496. }
  497. static void update_read_sectors(struct r1conf *conf, int disk,
  498. sector_t this_sector, int len)
  499. {
  500. struct raid1_info *info = &conf->mirrors[disk];
  501. atomic_inc(&info->rdev->nr_pending);
  502. if (info->next_seq_sect != this_sector)
  503. info->seq_start = this_sector;
  504. info->next_seq_sect = this_sector + len;
  505. }
  506. static int choose_first_rdev(struct r1conf *conf, struct r1bio *r1_bio,
  507. int *max_sectors)
  508. {
  509. sector_t this_sector = r1_bio->sector;
  510. int len = r1_bio->sectors;
  511. int disk;
  512. for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
  513. struct md_rdev *rdev;
  514. int read_len;
  515. if (r1_bio->bios[disk] == IO_BLOCKED)
  516. continue;
  517. rdev = conf->mirrors[disk].rdev;
  518. if (!rdev || test_bit(Faulty, &rdev->flags))
  519. continue;
  520. /* choose the first disk even if it has some bad blocks. */
  521. read_len = raid1_check_read_range(rdev, this_sector, &len);
  522. if (read_len > 0) {
  523. update_read_sectors(conf, disk, this_sector, read_len);
  524. *max_sectors = read_len;
  525. return disk;
  526. }
  527. }
  528. return -1;
  529. }
  530. static bool rdev_in_recovery(struct md_rdev *rdev, struct r1bio *r1_bio)
  531. {
  532. return !test_bit(In_sync, &rdev->flags) &&
  533. rdev->recovery_offset < r1_bio->sector + r1_bio->sectors;
  534. }
  535. static int choose_bb_rdev(struct r1conf *conf, struct r1bio *r1_bio,
  536. int *max_sectors)
  537. {
  538. sector_t this_sector = r1_bio->sector;
  539. int best_disk = -1;
  540. int best_len = 0;
  541. int disk;
  542. for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
  543. struct md_rdev *rdev;
  544. int len;
  545. int read_len;
  546. if (r1_bio->bios[disk] == IO_BLOCKED)
  547. continue;
  548. rdev = conf->mirrors[disk].rdev;
  549. if (!rdev || test_bit(Faulty, &rdev->flags) ||
  550. rdev_in_recovery(rdev, r1_bio) ||
  551. test_bit(WriteMostly, &rdev->flags))
  552. continue;
  553. /* keep track of the disk with the most readable sectors. */
  554. len = r1_bio->sectors;
  555. read_len = raid1_check_read_range(rdev, this_sector, &len);
  556. if (read_len > best_len) {
  557. best_disk = disk;
  558. best_len = read_len;
  559. }
  560. }
  561. if (best_disk != -1) {
  562. *max_sectors = best_len;
  563. update_read_sectors(conf, best_disk, this_sector, best_len);
  564. }
  565. return best_disk;
  566. }
  567. static int choose_slow_rdev(struct r1conf *conf, struct r1bio *r1_bio,
  568. int *max_sectors)
  569. {
  570. sector_t this_sector = r1_bio->sector;
  571. int bb_disk = -1;
  572. int bb_read_len = 0;
  573. int disk;
  574. for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
  575. struct md_rdev *rdev;
  576. int len;
  577. int read_len;
  578. if (r1_bio->bios[disk] == IO_BLOCKED)
  579. continue;
  580. rdev = conf->mirrors[disk].rdev;
  581. if (!rdev || test_bit(Faulty, &rdev->flags) ||
  582. !test_bit(WriteMostly, &rdev->flags) ||
  583. rdev_in_recovery(rdev, r1_bio))
  584. continue;
  585. /* there are no bad blocks, we can use this disk */
  586. len = r1_bio->sectors;
  587. read_len = raid1_check_read_range(rdev, this_sector, &len);
  588. if (read_len == r1_bio->sectors) {
  589. *max_sectors = read_len;
  590. update_read_sectors(conf, disk, this_sector, read_len);
  591. return disk;
  592. }
  593. /*
  594. * there are partial bad blocks, choose the rdev with largest
  595. * read length.
  596. */
  597. if (read_len > bb_read_len) {
  598. bb_disk = disk;
  599. bb_read_len = read_len;
  600. }
  601. }
  602. if (bb_disk != -1) {
  603. *max_sectors = bb_read_len;
  604. update_read_sectors(conf, bb_disk, this_sector, bb_read_len);
  605. }
  606. return bb_disk;
  607. }
  608. static bool is_sequential(struct r1conf *conf, int disk, struct r1bio *r1_bio)
  609. {
  610. /* TODO: address issues with this check and concurrency. */
  611. return conf->mirrors[disk].next_seq_sect == r1_bio->sector ||
  612. conf->mirrors[disk].head_position == r1_bio->sector;
  613. }
  614. /*
  615. * If buffered sequential IO size exceeds optimal iosize, check if there is idle
  616. * disk. If yes, choose the idle disk.
  617. */
  618. static bool should_choose_next(struct r1conf *conf, int disk)
  619. {
  620. struct raid1_info *mirror = &conf->mirrors[disk];
  621. int opt_iosize;
  622. if (!test_bit(Nonrot, &mirror->rdev->flags))
  623. return false;
  624. opt_iosize = bdev_io_opt(mirror->rdev->bdev) >> 9;
  625. return opt_iosize > 0 && mirror->seq_start != MaxSector &&
  626. mirror->next_seq_sect > opt_iosize &&
  627. mirror->next_seq_sect - opt_iosize >= mirror->seq_start;
  628. }
  629. static bool rdev_readable(struct md_rdev *rdev, struct r1bio *r1_bio)
  630. {
  631. if (!rdev || test_bit(Faulty, &rdev->flags))
  632. return false;
  633. if (rdev_in_recovery(rdev, r1_bio))
  634. return false;
  635. /* don't read from slow disk unless have to */
  636. if (test_bit(WriteMostly, &rdev->flags))
  637. return false;
  638. /* don't split IO for bad blocks unless have to */
  639. if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors))
  640. return false;
  641. return true;
  642. }
  643. struct read_balance_ctl {
  644. sector_t closest_dist;
  645. int closest_dist_disk;
  646. int min_pending;
  647. int min_pending_disk;
  648. int sequential_disk;
  649. int readable_disks;
  650. };
  651. static int choose_best_rdev(struct r1conf *conf, struct r1bio *r1_bio)
  652. {
  653. int disk;
  654. struct read_balance_ctl ctl = {
  655. .closest_dist_disk = -1,
  656. .closest_dist = MaxSector,
  657. .min_pending_disk = -1,
  658. .min_pending = UINT_MAX,
  659. .sequential_disk = -1,
  660. };
  661. for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
  662. struct md_rdev *rdev;
  663. sector_t dist;
  664. unsigned int pending;
  665. if (r1_bio->bios[disk] == IO_BLOCKED)
  666. continue;
  667. rdev = conf->mirrors[disk].rdev;
  668. if (!rdev_readable(rdev, r1_bio))
  669. continue;
  670. /* At least two disks to choose from so failfast is OK */
  671. if (ctl.readable_disks++ == 1)
  672. set_bit(R1BIO_FailFast, &r1_bio->state);
  673. pending = atomic_read(&rdev->nr_pending);
  674. dist = abs(r1_bio->sector - conf->mirrors[disk].head_position);
  675. /* Don't change to another disk for sequential reads */
  676. if (is_sequential(conf, disk, r1_bio)) {
  677. if (!should_choose_next(conf, disk))
  678. return disk;
  679. /*
  680. * Add 'pending' to avoid choosing this disk if
  681. * there is other idle disk.
  682. */
  683. pending++;
  684. /*
  685. * If there is no other idle disk, this disk
  686. * will be chosen.
  687. */
  688. ctl.sequential_disk = disk;
  689. }
  690. if (ctl.min_pending > pending) {
  691. ctl.min_pending = pending;
  692. ctl.min_pending_disk = disk;
  693. }
  694. if (ctl.closest_dist > dist) {
  695. ctl.closest_dist = dist;
  696. ctl.closest_dist_disk = disk;
  697. }
  698. }
  699. /*
  700. * sequential IO size exceeds optimal iosize, however, there is no other
  701. * idle disk, so choose the sequential disk.
  702. */
  703. if (ctl.sequential_disk != -1 && ctl.min_pending != 0)
  704. return ctl.sequential_disk;
  705. /*
  706. * If all disks are rotational, choose the closest disk. If any disk is
  707. * non-rotational, choose the disk with less pending request even the
  708. * disk is rotational, which might/might not be optimal for raids with
  709. * mixed ratation/non-rotational disks depending on workload.
  710. */
  711. if (ctl.min_pending_disk != -1 &&
  712. (READ_ONCE(conf->nonrot_disks) || ctl.min_pending == 0))
  713. return ctl.min_pending_disk;
  714. else
  715. return ctl.closest_dist_disk;
  716. }
  717. /*
  718. * This routine returns the disk from which the requested read should be done.
  719. *
  720. * 1) If resync is in progress, find the first usable disk and use it even if it
  721. * has some bad blocks.
  722. *
  723. * 2) Now that there is no resync, loop through all disks and skipping slow
  724. * disks and disks with bad blocks for now. Only pay attention to key disk
  725. * choice.
  726. *
  727. * 3) If we've made it this far, now look for disks with bad blocks and choose
  728. * the one with most number of sectors.
  729. *
  730. * 4) If we are all the way at the end, we have no choice but to use a disk even
  731. * if it is write mostly.
  732. *
  733. * The rdev for the device selected will have nr_pending incremented.
  734. */
  735. static int read_balance(struct r1conf *conf, struct r1bio *r1_bio,
  736. int *max_sectors)
  737. {
  738. int disk;
  739. clear_bit(R1BIO_FailFast, &r1_bio->state);
  740. if (raid1_should_read_first(conf->mddev, r1_bio->sector,
  741. r1_bio->sectors))
  742. return choose_first_rdev(conf, r1_bio, max_sectors);
  743. disk = choose_best_rdev(conf, r1_bio);
  744. if (disk >= 0) {
  745. *max_sectors = r1_bio->sectors;
  746. update_read_sectors(conf, disk, r1_bio->sector,
  747. r1_bio->sectors);
  748. return disk;
  749. }
  750. /*
  751. * If we are here it means we didn't find a perfectly good disk so
  752. * now spend a bit more time trying to find one with the most good
  753. * sectors.
  754. */
  755. disk = choose_bb_rdev(conf, r1_bio, max_sectors);
  756. if (disk >= 0)
  757. return disk;
  758. return choose_slow_rdev(conf, r1_bio, max_sectors);
  759. }
  760. static void wake_up_barrier(struct r1conf *conf)
  761. {
  762. if (wq_has_sleeper(&conf->wait_barrier))
  763. wake_up(&conf->wait_barrier);
  764. }
  765. static void flush_bio_list(struct r1conf *conf, struct bio *bio)
  766. {
  767. /* flush any pending bitmap writes to disk before proceeding w/ I/O */
  768. raid1_prepare_flush_writes(conf->mddev);
  769. wake_up_barrier(conf);
  770. while (bio) { /* submit pending writes */
  771. struct bio *next = bio->bi_next;
  772. raid1_submit_write(bio);
  773. bio = next;
  774. cond_resched();
  775. }
  776. }
  777. static void flush_pending_writes(struct r1conf *conf)
  778. {
  779. /* Any writes that have been queued but are awaiting
  780. * bitmap updates get flushed here.
  781. */
  782. spin_lock_irq(&conf->device_lock);
  783. if (conf->pending_bio_list.head) {
  784. struct blk_plug plug;
  785. struct bio *bio;
  786. bio = bio_list_get(&conf->pending_bio_list);
  787. spin_unlock_irq(&conf->device_lock);
  788. /*
  789. * As this is called in a wait_event() loop (see freeze_array),
  790. * current->state might be TASK_UNINTERRUPTIBLE which will
  791. * cause a warning when we prepare to wait again. As it is
  792. * rare that this path is taken, it is perfectly safe to force
  793. * us to go around the wait_event() loop again, so the warning
  794. * is a false-positive. Silence the warning by resetting
  795. * thread state
  796. */
  797. __set_current_state(TASK_RUNNING);
  798. blk_start_plug(&plug);
  799. flush_bio_list(conf, bio);
  800. blk_finish_plug(&plug);
  801. } else
  802. spin_unlock_irq(&conf->device_lock);
  803. }
  804. /* Barriers....
  805. * Sometimes we need to suspend IO while we do something else,
  806. * either some resync/recovery, or reconfigure the array.
  807. * To do this we raise a 'barrier'.
  808. * The 'barrier' is a counter that can be raised multiple times
  809. * to count how many activities are happening which preclude
  810. * normal IO.
  811. * We can only raise the barrier if there is no pending IO.
  812. * i.e. if nr_pending == 0.
  813. * We choose only to raise the barrier if no-one is waiting for the
  814. * barrier to go down. This means that as soon as an IO request
  815. * is ready, no other operations which require a barrier will start
  816. * until the IO request has had a chance.
  817. *
  818. * So: regular IO calls 'wait_barrier'. When that returns there
  819. * is no backgroup IO happening, It must arrange to call
  820. * allow_barrier when it has finished its IO.
  821. * backgroup IO calls must call raise_barrier. Once that returns
  822. * there is no normal IO happeing. It must arrange to call
  823. * lower_barrier when the particular background IO completes.
  824. *
  825. * If resync/recovery is interrupted, returns -EINTR;
  826. * Otherwise, returns 0.
  827. */
  828. static int raise_barrier(struct r1conf *conf, sector_t sector_nr)
  829. {
  830. int idx = sector_to_idx(sector_nr);
  831. spin_lock_irq(&conf->resync_lock);
  832. /* Wait until no block IO is waiting */
  833. wait_event_lock_irq(conf->wait_barrier,
  834. !atomic_read(&conf->nr_waiting[idx]),
  835. conf->resync_lock);
  836. /* block any new IO from starting */
  837. atomic_inc(&conf->barrier[idx]);
  838. /*
  839. * In raise_barrier() we firstly increase conf->barrier[idx] then
  840. * check conf->nr_pending[idx]. In _wait_barrier() we firstly
  841. * increase conf->nr_pending[idx] then check conf->barrier[idx].
  842. * A memory barrier here to make sure conf->nr_pending[idx] won't
  843. * be fetched before conf->barrier[idx] is increased. Otherwise
  844. * there will be a race between raise_barrier() and _wait_barrier().
  845. */
  846. smp_mb__after_atomic();
  847. /* For these conditions we must wait:
  848. * A: while the array is in frozen state
  849. * B: while conf->nr_pending[idx] is not 0, meaning regular I/O
  850. * existing in corresponding I/O barrier bucket.
  851. * C: while conf->barrier[idx] >= RESYNC_DEPTH, meaning reaches
  852. * max resync count which allowed on current I/O barrier bucket.
  853. */
  854. wait_event_lock_irq(conf->wait_barrier,
  855. (!conf->array_frozen &&
  856. !atomic_read(&conf->nr_pending[idx]) &&
  857. atomic_read(&conf->barrier[idx]) < RESYNC_DEPTH) ||
  858. test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery),
  859. conf->resync_lock);
  860. if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
  861. atomic_dec(&conf->barrier[idx]);
  862. spin_unlock_irq(&conf->resync_lock);
  863. wake_up(&conf->wait_barrier);
  864. return -EINTR;
  865. }
  866. atomic_inc(&conf->nr_sync_pending);
  867. spin_unlock_irq(&conf->resync_lock);
  868. return 0;
  869. }
  870. static void lower_barrier(struct r1conf *conf, sector_t sector_nr)
  871. {
  872. int idx = sector_to_idx(sector_nr);
  873. BUG_ON(atomic_read(&conf->barrier[idx]) <= 0);
  874. atomic_dec(&conf->barrier[idx]);
  875. atomic_dec(&conf->nr_sync_pending);
  876. wake_up(&conf->wait_barrier);
  877. }
  878. static bool _wait_barrier(struct r1conf *conf, int idx, bool nowait)
  879. {
  880. bool ret = true;
  881. /*
  882. * We need to increase conf->nr_pending[idx] very early here,
  883. * then raise_barrier() can be blocked when it waits for
  884. * conf->nr_pending[idx] to be 0. Then we can avoid holding
  885. * conf->resync_lock when there is no barrier raised in same
  886. * barrier unit bucket. Also if the array is frozen, I/O
  887. * should be blocked until array is unfrozen.
  888. */
  889. atomic_inc(&conf->nr_pending[idx]);
  890. /*
  891. * In _wait_barrier() we firstly increase conf->nr_pending[idx], then
  892. * check conf->barrier[idx]. In raise_barrier() we firstly increase
  893. * conf->barrier[idx], then check conf->nr_pending[idx]. A memory
  894. * barrier is necessary here to make sure conf->barrier[idx] won't be
  895. * fetched before conf->nr_pending[idx] is increased. Otherwise there
  896. * will be a race between _wait_barrier() and raise_barrier().
  897. */
  898. smp_mb__after_atomic();
  899. /*
  900. * Don't worry about checking two atomic_t variables at same time
  901. * here. If during we check conf->barrier[idx], the array is
  902. * frozen (conf->array_frozen is 1), and chonf->barrier[idx] is
  903. * 0, it is safe to return and make the I/O continue. Because the
  904. * array is frozen, all I/O returned here will eventually complete
  905. * or be queued, no race will happen. See code comment in
  906. * frozen_array().
  907. */
  908. if (!READ_ONCE(conf->array_frozen) &&
  909. !atomic_read(&conf->barrier[idx]))
  910. return ret;
  911. /*
  912. * After holding conf->resync_lock, conf->nr_pending[idx]
  913. * should be decreased before waiting for barrier to drop.
  914. * Otherwise, we may encounter a race condition because
  915. * raise_barrer() might be waiting for conf->nr_pending[idx]
  916. * to be 0 at same time.
  917. */
  918. spin_lock_irq(&conf->resync_lock);
  919. atomic_inc(&conf->nr_waiting[idx]);
  920. atomic_dec(&conf->nr_pending[idx]);
  921. /*
  922. * In case freeze_array() is waiting for
  923. * get_unqueued_pending() == extra
  924. */
  925. wake_up_barrier(conf);
  926. /* Wait for the barrier in same barrier unit bucket to drop. */
  927. /* Return false when nowait flag is set */
  928. if (nowait) {
  929. ret = false;
  930. } else {
  931. wait_event_lock_irq(conf->wait_barrier,
  932. !conf->array_frozen &&
  933. !atomic_read(&conf->barrier[idx]),
  934. conf->resync_lock);
  935. atomic_inc(&conf->nr_pending[idx]);
  936. }
  937. atomic_dec(&conf->nr_waiting[idx]);
  938. spin_unlock_irq(&conf->resync_lock);
  939. return ret;
  940. }
  941. static bool wait_read_barrier(struct r1conf *conf, sector_t sector_nr, bool nowait)
  942. {
  943. int idx = sector_to_idx(sector_nr);
  944. bool ret = true;
  945. /*
  946. * Very similar to _wait_barrier(). The difference is, for read
  947. * I/O we don't need wait for sync I/O, but if the whole array
  948. * is frozen, the read I/O still has to wait until the array is
  949. * unfrozen. Since there is no ordering requirement with
  950. * conf->barrier[idx] here, memory barrier is unnecessary as well.
  951. */
  952. atomic_inc(&conf->nr_pending[idx]);
  953. if (!READ_ONCE(conf->array_frozen))
  954. return ret;
  955. spin_lock_irq(&conf->resync_lock);
  956. atomic_inc(&conf->nr_waiting[idx]);
  957. atomic_dec(&conf->nr_pending[idx]);
  958. /*
  959. * In case freeze_array() is waiting for
  960. * get_unqueued_pending() == extra
  961. */
  962. wake_up_barrier(conf);
  963. /* Wait for array to be unfrozen */
  964. /* Return false when nowait flag is set */
  965. if (nowait) {
  966. /* Return false when nowait flag is set */
  967. ret = false;
  968. } else {
  969. wait_event_lock_irq(conf->wait_barrier,
  970. !conf->array_frozen,
  971. conf->resync_lock);
  972. atomic_inc(&conf->nr_pending[idx]);
  973. }
  974. atomic_dec(&conf->nr_waiting[idx]);
  975. spin_unlock_irq(&conf->resync_lock);
  976. return ret;
  977. }
  978. static bool wait_barrier(struct r1conf *conf, sector_t sector_nr, bool nowait)
  979. {
  980. int idx = sector_to_idx(sector_nr);
  981. return _wait_barrier(conf, idx, nowait);
  982. }
  983. static void _allow_barrier(struct r1conf *conf, int idx)
  984. {
  985. atomic_dec(&conf->nr_pending[idx]);
  986. wake_up_barrier(conf);
  987. }
  988. static void allow_barrier(struct r1conf *conf, sector_t sector_nr)
  989. {
  990. int idx = sector_to_idx(sector_nr);
  991. _allow_barrier(conf, idx);
  992. }
  993. /* conf->resync_lock should be held */
  994. static int get_unqueued_pending(struct r1conf *conf)
  995. {
  996. int idx, ret;
  997. ret = atomic_read(&conf->nr_sync_pending);
  998. for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++)
  999. ret += atomic_read(&conf->nr_pending[idx]) -
  1000. atomic_read(&conf->nr_queued[idx]);
  1001. return ret;
  1002. }
  1003. static void freeze_array(struct r1conf *conf, int extra)
  1004. {
  1005. /* Stop sync I/O and normal I/O and wait for everything to
  1006. * go quiet.
  1007. * This is called in two situations:
  1008. * 1) management command handlers (reshape, remove disk, quiesce).
  1009. * 2) one normal I/O request failed.
  1010. * After array_frozen is set to 1, new sync IO will be blocked at
  1011. * raise_barrier(), and new normal I/O will blocked at _wait_barrier()
  1012. * or wait_read_barrier(). The flying I/Os will either complete or be
  1013. * queued. When everything goes quite, there are only queued I/Os left.
  1014. * Every flying I/O contributes to a conf->nr_pending[idx], idx is the
  1015. * barrier bucket index which this I/O request hits. When all sync and
  1016. * normal I/O are queued, sum of all conf->nr_pending[] will match sum
  1017. * of all conf->nr_queued[]. But normal I/O failure is an exception,
  1018. * in handle_read_error(), we may call freeze_array() before trying to
  1019. * fix the read error. In this case, the error read I/O is not queued,
  1020. * so get_unqueued_pending() == 1.
  1021. *
  1022. * Therefore before this function returns, we need to wait until
  1023. * get_unqueued_pendings(conf) gets equal to extra. For
  1024. * normal I/O context, extra is 1, in rested situations extra is 0.
  1025. */
  1026. spin_lock_irq(&conf->resync_lock);
  1027. conf->array_frozen = 1;
  1028. mddev_add_trace_msg(conf->mddev, "raid1 wait freeze");
  1029. wait_event_lock_irq_cmd(
  1030. conf->wait_barrier,
  1031. get_unqueued_pending(conf) == extra,
  1032. conf->resync_lock,
  1033. flush_pending_writes(conf));
  1034. spin_unlock_irq(&conf->resync_lock);
  1035. }
  1036. static void unfreeze_array(struct r1conf *conf)
  1037. {
  1038. /* reverse the effect of the freeze */
  1039. spin_lock_irq(&conf->resync_lock);
  1040. conf->array_frozen = 0;
  1041. spin_unlock_irq(&conf->resync_lock);
  1042. wake_up(&conf->wait_barrier);
  1043. }
  1044. static void alloc_behind_master_bio(struct r1bio *r1_bio,
  1045. struct bio *bio)
  1046. {
  1047. int size = bio->bi_iter.bi_size;
  1048. unsigned vcnt = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
  1049. int i = 0;
  1050. struct bio *behind_bio = NULL;
  1051. behind_bio = bio_alloc_bioset(NULL, vcnt, bio->bi_opf, GFP_NOIO,
  1052. &r1_bio->mddev->bio_set);
  1053. /* discard op, we don't support writezero/writesame yet */
  1054. if (!bio_has_data(bio)) {
  1055. behind_bio->bi_iter.bi_size = size;
  1056. goto skip_copy;
  1057. }
  1058. while (i < vcnt && size) {
  1059. struct page *page;
  1060. int len = min_t(int, PAGE_SIZE, size);
  1061. page = alloc_page(GFP_NOIO);
  1062. if (unlikely(!page))
  1063. goto free_pages;
  1064. if (!bio_add_page(behind_bio, page, len, 0)) {
  1065. put_page(page);
  1066. goto free_pages;
  1067. }
  1068. size -= len;
  1069. i++;
  1070. }
  1071. bio_copy_data(behind_bio, bio);
  1072. skip_copy:
  1073. r1_bio->behind_master_bio = behind_bio;
  1074. set_bit(R1BIO_BehindIO, &r1_bio->state);
  1075. return;
  1076. free_pages:
  1077. pr_debug("%dB behind alloc failed, doing sync I/O\n",
  1078. bio->bi_iter.bi_size);
  1079. bio_free_pages(behind_bio);
  1080. bio_put(behind_bio);
  1081. }
  1082. static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
  1083. {
  1084. struct raid1_plug_cb *plug = container_of(cb, struct raid1_plug_cb,
  1085. cb);
  1086. struct mddev *mddev = plug->cb.data;
  1087. struct r1conf *conf = mddev->private;
  1088. struct bio *bio;
  1089. if (from_schedule) {
  1090. spin_lock_irq(&conf->device_lock);
  1091. bio_list_merge(&conf->pending_bio_list, &plug->pending);
  1092. spin_unlock_irq(&conf->device_lock);
  1093. wake_up_barrier(conf);
  1094. md_wakeup_thread(mddev->thread);
  1095. kfree(plug);
  1096. return;
  1097. }
  1098. /* we aren't scheduling, so we can do the write-out directly. */
  1099. bio = bio_list_get(&plug->pending);
  1100. flush_bio_list(conf, bio);
  1101. kfree(plug);
  1102. }
  1103. static void init_r1bio(struct r1bio *r1_bio, struct mddev *mddev, struct bio *bio)
  1104. {
  1105. r1_bio->master_bio = bio;
  1106. r1_bio->sectors = bio_sectors(bio);
  1107. r1_bio->state = 0;
  1108. r1_bio->mddev = mddev;
  1109. r1_bio->sector = bio->bi_iter.bi_sector;
  1110. }
  1111. static inline struct r1bio *
  1112. alloc_r1bio(struct mddev *mddev, struct bio *bio)
  1113. {
  1114. struct r1conf *conf = mddev->private;
  1115. struct r1bio *r1_bio;
  1116. r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
  1117. memset(r1_bio, 0, offsetof(struct r1bio, bios[conf->raid_disks * 2]));
  1118. init_r1bio(r1_bio, mddev, bio);
  1119. return r1_bio;
  1120. }
  1121. static void raid1_read_request(struct mddev *mddev, struct bio *bio,
  1122. int max_read_sectors, struct r1bio *r1_bio)
  1123. {
  1124. struct r1conf *conf = mddev->private;
  1125. struct raid1_info *mirror;
  1126. struct bio *read_bio;
  1127. int max_sectors;
  1128. int rdisk;
  1129. bool r1bio_existed = !!r1_bio;
  1130. /*
  1131. * If r1_bio is set, we are blocking the raid1d thread
  1132. * so there is a tiny risk of deadlock. So ask for
  1133. * emergency memory if needed.
  1134. */
  1135. gfp_t gfp = r1_bio ? (GFP_NOIO | __GFP_HIGH) : GFP_NOIO;
  1136. /*
  1137. * Still need barrier for READ in case that whole
  1138. * array is frozen.
  1139. */
  1140. if (!wait_read_barrier(conf, bio->bi_iter.bi_sector,
  1141. bio->bi_opf & REQ_NOWAIT)) {
  1142. bio_wouldblock_error(bio);
  1143. return;
  1144. }
  1145. if (!r1_bio)
  1146. r1_bio = alloc_r1bio(mddev, bio);
  1147. else
  1148. init_r1bio(r1_bio, mddev, bio);
  1149. r1_bio->sectors = max_read_sectors;
  1150. /*
  1151. * make_request() can abort the operation when read-ahead is being
  1152. * used and no empty request is available.
  1153. */
  1154. rdisk = read_balance(conf, r1_bio, &max_sectors);
  1155. if (rdisk < 0) {
  1156. /* couldn't find anywhere to read from */
  1157. if (r1bio_existed)
  1158. pr_crit_ratelimited("md/raid1:%s: %pg: unrecoverable I/O read error for block %llu\n",
  1159. mdname(mddev),
  1160. conf->mirrors[r1_bio->read_disk].rdev->bdev,
  1161. r1_bio->sector);
  1162. raid_end_bio_io(r1_bio);
  1163. return;
  1164. }
  1165. mirror = conf->mirrors + rdisk;
  1166. if (r1bio_existed)
  1167. pr_info_ratelimited("md/raid1:%s: redirecting sector %llu to other mirror: %pg\n",
  1168. mdname(mddev),
  1169. (unsigned long long)r1_bio->sector,
  1170. mirror->rdev->bdev);
  1171. if (test_bit(WriteMostly, &mirror->rdev->flags) &&
  1172. md_bitmap_enabled(mddev, false)) {
  1173. /*
  1174. * Reading from a write-mostly device must take care not to
  1175. * over-take any writes that are 'behind'
  1176. */
  1177. mddev_add_trace_msg(mddev, "raid1 wait behind writes");
  1178. mddev->bitmap_ops->wait_behind_writes(mddev);
  1179. }
  1180. if (max_sectors < bio_sectors(bio)) {
  1181. bio = bio_submit_split_bioset(bio, max_sectors,
  1182. &conf->bio_split);
  1183. if (!bio) {
  1184. set_bit(R1BIO_Returned, &r1_bio->state);
  1185. goto err_handle;
  1186. }
  1187. r1_bio->master_bio = bio;
  1188. r1_bio->sectors = max_sectors;
  1189. }
  1190. r1_bio->read_disk = rdisk;
  1191. if (!r1bio_existed) {
  1192. md_account_bio(mddev, &bio);
  1193. r1_bio->master_bio = bio;
  1194. }
  1195. read_bio = bio_alloc_clone(mirror->rdev->bdev, bio, gfp,
  1196. &mddev->bio_set);
  1197. read_bio->bi_opf &= ~REQ_NOWAIT;
  1198. r1_bio->bios[rdisk] = read_bio;
  1199. read_bio->bi_iter.bi_sector = r1_bio->sector +
  1200. mirror->rdev->data_offset;
  1201. read_bio->bi_end_io = raid1_end_read_request;
  1202. if (test_bit(FailFast, &mirror->rdev->flags) &&
  1203. test_bit(R1BIO_FailFast, &r1_bio->state))
  1204. read_bio->bi_opf |= MD_FAILFAST;
  1205. read_bio->bi_private = r1_bio;
  1206. mddev_trace_remap(mddev, read_bio, r1_bio->sector);
  1207. submit_bio_noacct(read_bio);
  1208. return;
  1209. err_handle:
  1210. atomic_dec(&mirror->rdev->nr_pending);
  1211. raid_end_bio_io(r1_bio);
  1212. }
  1213. static bool wait_blocked_rdev(struct mddev *mddev, struct bio *bio)
  1214. {
  1215. struct r1conf *conf = mddev->private;
  1216. int disks = conf->raid_disks * 2;
  1217. int i;
  1218. retry:
  1219. for (i = 0; i < disks; i++) {
  1220. struct md_rdev *rdev = conf->mirrors[i].rdev;
  1221. if (!rdev)
  1222. continue;
  1223. /* don't write here until the bad block is acknowledged */
  1224. if (test_bit(WriteErrorSeen, &rdev->flags) &&
  1225. rdev_has_badblock(rdev, bio->bi_iter.bi_sector,
  1226. bio_sectors(bio)) < 0)
  1227. set_bit(BlockedBadBlocks, &rdev->flags);
  1228. if (rdev_blocked(rdev)) {
  1229. if (bio->bi_opf & REQ_NOWAIT)
  1230. return false;
  1231. mddev_add_trace_msg(rdev->mddev, "raid1 wait rdev %d blocked",
  1232. rdev->raid_disk);
  1233. atomic_inc(&rdev->nr_pending);
  1234. md_wait_for_blocked_rdev(rdev, rdev->mddev);
  1235. goto retry;
  1236. }
  1237. }
  1238. return true;
  1239. }
  1240. static void raid1_start_write_behind(struct mddev *mddev, struct r1bio *r1_bio,
  1241. struct bio *bio)
  1242. {
  1243. unsigned long max_write_behind = mddev->bitmap_info.max_write_behind;
  1244. struct md_bitmap_stats stats;
  1245. int err;
  1246. /* behind write rely on bitmap, see bitmap_operations */
  1247. if (!md_bitmap_enabled(mddev, false))
  1248. return;
  1249. err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
  1250. if (err)
  1251. return;
  1252. /* Don't do behind IO if reader is waiting, or there are too many. */
  1253. if (!stats.behind_wait && stats.behind_writes < max_write_behind)
  1254. alloc_behind_master_bio(r1_bio, bio);
  1255. if (test_bit(R1BIO_BehindIO, &r1_bio->state))
  1256. mddev->bitmap_ops->start_behind_write(mddev);
  1257. }
  1258. static void raid1_write_request(struct mddev *mddev, struct bio *bio,
  1259. int max_write_sectors)
  1260. {
  1261. struct r1conf *conf = mddev->private;
  1262. struct r1bio *r1_bio;
  1263. int i, disks, k;
  1264. unsigned long flags;
  1265. int first_clone;
  1266. int max_sectors;
  1267. bool write_behind = false;
  1268. bool is_discard = (bio_op(bio) == REQ_OP_DISCARD);
  1269. if (mddev_is_clustered(mddev) &&
  1270. mddev->cluster_ops->area_resyncing(mddev, WRITE,
  1271. bio->bi_iter.bi_sector, bio_end_sector(bio))) {
  1272. DEFINE_WAIT(w);
  1273. if (bio->bi_opf & REQ_NOWAIT) {
  1274. bio_wouldblock_error(bio);
  1275. return;
  1276. }
  1277. for (;;) {
  1278. prepare_to_wait(&conf->wait_barrier,
  1279. &w, TASK_IDLE);
  1280. if (!mddev->cluster_ops->area_resyncing(mddev, WRITE,
  1281. bio->bi_iter.bi_sector,
  1282. bio_end_sector(bio)))
  1283. break;
  1284. schedule();
  1285. }
  1286. finish_wait(&conf->wait_barrier, &w);
  1287. }
  1288. /*
  1289. * Register the new request and wait if the reconstruction
  1290. * thread has put up a bar for new requests.
  1291. * Continue immediately if no resync is active currently.
  1292. */
  1293. if (!wait_barrier(conf, bio->bi_iter.bi_sector,
  1294. bio->bi_opf & REQ_NOWAIT)) {
  1295. bio_wouldblock_error(bio);
  1296. return;
  1297. }
  1298. if (!wait_blocked_rdev(mddev, bio)) {
  1299. bio_wouldblock_error(bio);
  1300. return;
  1301. }
  1302. r1_bio = alloc_r1bio(mddev, bio);
  1303. r1_bio->sectors = max_write_sectors;
  1304. /* first select target devices under rcu_lock and
  1305. * inc refcount on their rdev. Record them by setting
  1306. * bios[x] to bio
  1307. * If there are known/acknowledged bad blocks on any device on
  1308. * which we have seen a write error, we want to avoid writing those
  1309. * blocks.
  1310. * This potentially requires several writes to write around
  1311. * the bad blocks. Each set of writes gets it's own r1bio
  1312. * with a set of bios attached.
  1313. */
  1314. disks = conf->raid_disks * 2;
  1315. max_sectors = r1_bio->sectors;
  1316. for (i = 0; i < disks; i++) {
  1317. struct md_rdev *rdev = conf->mirrors[i].rdev;
  1318. /*
  1319. * The write-behind io is only attempted on drives marked as
  1320. * write-mostly, which means we could allocate write behind
  1321. * bio later.
  1322. */
  1323. if (!is_discard && rdev && test_bit(WriteMostly, &rdev->flags))
  1324. write_behind = true;
  1325. r1_bio->bios[i] = NULL;
  1326. if (!rdev || test_bit(Faulty, &rdev->flags))
  1327. continue;
  1328. atomic_inc(&rdev->nr_pending);
  1329. if (test_bit(WriteErrorSeen, &rdev->flags)) {
  1330. sector_t first_bad;
  1331. sector_t bad_sectors;
  1332. int is_bad;
  1333. is_bad = is_badblock(rdev, r1_bio->sector, max_sectors,
  1334. &first_bad, &bad_sectors);
  1335. if (is_bad && first_bad <= r1_bio->sector) {
  1336. /* Cannot write here at all */
  1337. bad_sectors -= (r1_bio->sector - first_bad);
  1338. if (bad_sectors < max_sectors)
  1339. /* mustn't write more than bad_sectors
  1340. * to other devices yet
  1341. */
  1342. max_sectors = bad_sectors;
  1343. rdev_dec_pending(rdev, mddev);
  1344. continue;
  1345. }
  1346. if (is_bad) {
  1347. int good_sectors;
  1348. /*
  1349. * We cannot atomically write this, so just
  1350. * error in that case. It could be possible to
  1351. * atomically write other mirrors, but the
  1352. * complexity of supporting that is not worth
  1353. * the benefit.
  1354. */
  1355. if (bio->bi_opf & REQ_ATOMIC)
  1356. goto err_handle;
  1357. good_sectors = first_bad - r1_bio->sector;
  1358. if (good_sectors < max_sectors)
  1359. max_sectors = good_sectors;
  1360. }
  1361. }
  1362. r1_bio->bios[i] = bio;
  1363. }
  1364. /*
  1365. * When using a bitmap, we may call alloc_behind_master_bio below.
  1366. * alloc_behind_master_bio allocates a copy of the data payload a page
  1367. * at a time and thus needs a new bio that can fit the whole payload
  1368. * of this bio in page sized chunks.
  1369. */
  1370. if (write_behind && mddev->bitmap)
  1371. max_sectors = min_t(int, max_sectors,
  1372. BIO_MAX_VECS * (PAGE_SIZE >> 9));
  1373. if (max_sectors < bio_sectors(bio)) {
  1374. bio = bio_submit_split_bioset(bio, max_sectors,
  1375. &conf->bio_split);
  1376. if (!bio) {
  1377. set_bit(R1BIO_Returned, &r1_bio->state);
  1378. goto err_handle;
  1379. }
  1380. r1_bio->master_bio = bio;
  1381. r1_bio->sectors = max_sectors;
  1382. }
  1383. md_account_bio(mddev, &bio);
  1384. r1_bio->master_bio = bio;
  1385. atomic_set(&r1_bio->remaining, 1);
  1386. atomic_set(&r1_bio->behind_remaining, 0);
  1387. first_clone = 1;
  1388. for (i = 0; i < disks; i++) {
  1389. struct bio *mbio = NULL;
  1390. struct md_rdev *rdev = conf->mirrors[i].rdev;
  1391. if (!r1_bio->bios[i])
  1392. continue;
  1393. if (first_clone) {
  1394. if (write_behind)
  1395. raid1_start_write_behind(mddev, r1_bio, bio);
  1396. first_clone = 0;
  1397. }
  1398. if (r1_bio->behind_master_bio) {
  1399. mbio = bio_alloc_clone(rdev->bdev,
  1400. r1_bio->behind_master_bio,
  1401. GFP_NOIO, &mddev->bio_set);
  1402. if (test_bit(CollisionCheck, &rdev->flags))
  1403. wait_for_serialization(rdev, r1_bio);
  1404. if (test_bit(WriteMostly, &rdev->flags))
  1405. atomic_inc(&r1_bio->behind_remaining);
  1406. } else {
  1407. mbio = bio_alloc_clone(rdev->bdev, bio, GFP_NOIO,
  1408. &mddev->bio_set);
  1409. if (test_bit(MD_SERIALIZE_POLICY, &mddev->flags))
  1410. wait_for_serialization(rdev, r1_bio);
  1411. }
  1412. mbio->bi_opf &= ~REQ_NOWAIT;
  1413. r1_bio->bios[i] = mbio;
  1414. mbio->bi_iter.bi_sector = (r1_bio->sector + rdev->data_offset);
  1415. mbio->bi_end_io = raid1_end_write_request;
  1416. if (test_bit(FailFast, &rdev->flags) &&
  1417. !test_bit(WriteMostly, &rdev->flags) &&
  1418. conf->raid_disks - mddev->degraded > 1)
  1419. mbio->bi_opf |= MD_FAILFAST;
  1420. mbio->bi_private = r1_bio;
  1421. atomic_inc(&r1_bio->remaining);
  1422. mddev_trace_remap(mddev, mbio, r1_bio->sector);
  1423. /* flush_pending_writes() needs access to the rdev so...*/
  1424. mbio->bi_bdev = (void *)rdev;
  1425. if (!raid1_add_bio_to_plug(mddev, mbio, raid1_unplug, disks)) {
  1426. spin_lock_irqsave(&conf->device_lock, flags);
  1427. bio_list_add(&conf->pending_bio_list, mbio);
  1428. spin_unlock_irqrestore(&conf->device_lock, flags);
  1429. md_wakeup_thread(mddev->thread);
  1430. }
  1431. }
  1432. r1_bio_write_done(r1_bio);
  1433. /* In case raid1d snuck in to freeze_array */
  1434. wake_up_barrier(conf);
  1435. return;
  1436. err_handle:
  1437. for (k = 0; k < i; k++) {
  1438. if (r1_bio->bios[k]) {
  1439. rdev_dec_pending(conf->mirrors[k].rdev, mddev);
  1440. r1_bio->bios[k] = NULL;
  1441. }
  1442. }
  1443. raid_end_bio_io(r1_bio);
  1444. }
  1445. static bool raid1_make_request(struct mddev *mddev, struct bio *bio)
  1446. {
  1447. sector_t sectors;
  1448. if (unlikely(bio->bi_opf & REQ_PREFLUSH)
  1449. && md_flush_request(mddev, bio))
  1450. return true;
  1451. /*
  1452. * There is a limit to the maximum size, but
  1453. * the read/write handler might find a lower limit
  1454. * due to bad blocks. To avoid multiple splits,
  1455. * we pass the maximum number of sectors down
  1456. * and let the lower level perform the split.
  1457. */
  1458. sectors = align_to_barrier_unit_end(
  1459. bio->bi_iter.bi_sector, bio_sectors(bio));
  1460. if (bio_data_dir(bio) == READ)
  1461. raid1_read_request(mddev, bio, sectors, NULL);
  1462. else {
  1463. md_write_start(mddev,bio);
  1464. raid1_write_request(mddev, bio, sectors);
  1465. }
  1466. return true;
  1467. }
  1468. static void raid1_status(struct seq_file *seq, struct mddev *mddev)
  1469. {
  1470. struct r1conf *conf = mddev->private;
  1471. int i;
  1472. lockdep_assert_held(&mddev->lock);
  1473. seq_printf(seq, " [%d/%d] [", conf->raid_disks,
  1474. conf->raid_disks - mddev->degraded);
  1475. for (i = 0; i < conf->raid_disks; i++) {
  1476. struct md_rdev *rdev = READ_ONCE(conf->mirrors[i].rdev);
  1477. seq_printf(seq, "%s",
  1478. rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
  1479. }
  1480. seq_printf(seq, "]");
  1481. }
  1482. /**
  1483. * raid1_error() - RAID1 error handler.
  1484. * @mddev: affected md device.
  1485. * @rdev: member device to fail.
  1486. *
  1487. * The routine acknowledges &rdev failure and determines new @mddev state.
  1488. * If it failed, then:
  1489. * - &MD_BROKEN flag is set in &mddev->flags.
  1490. * - recovery is disabled.
  1491. * Otherwise, it must be degraded:
  1492. * - recovery is interrupted.
  1493. * - &mddev->degraded is bumped.
  1494. *
  1495. * @rdev is marked as &Faulty excluding case when array is failed and
  1496. * MD_FAILLAST_DEV is not set.
  1497. */
  1498. static void raid1_error(struct mddev *mddev, struct md_rdev *rdev)
  1499. {
  1500. struct r1conf *conf = mddev->private;
  1501. unsigned long flags;
  1502. spin_lock_irqsave(&conf->device_lock, flags);
  1503. if (test_bit(In_sync, &rdev->flags) &&
  1504. (conf->raid_disks - mddev->degraded) == 1) {
  1505. set_bit(MD_BROKEN, &mddev->flags);
  1506. if (!test_bit(MD_FAILLAST_DEV, &mddev->flags)) {
  1507. spin_unlock_irqrestore(&conf->device_lock, flags);
  1508. return;
  1509. }
  1510. }
  1511. set_bit(Blocked, &rdev->flags);
  1512. if (test_and_clear_bit(In_sync, &rdev->flags))
  1513. mddev->degraded++;
  1514. set_bit(Faulty, &rdev->flags);
  1515. spin_unlock_irqrestore(&conf->device_lock, flags);
  1516. /*
  1517. * if recovery is running, make sure it aborts.
  1518. */
  1519. set_bit(MD_RECOVERY_INTR, &mddev->recovery);
  1520. set_mask_bits(&mddev->sb_flags, 0,
  1521. BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
  1522. pr_crit("md/raid1:%s: Disk failure on %pg, disabling device.\n"
  1523. "md/raid1:%s: Operation continuing on %d devices.\n",
  1524. mdname(mddev), rdev->bdev,
  1525. mdname(mddev), conf->raid_disks - mddev->degraded);
  1526. }
  1527. static void print_conf(struct r1conf *conf)
  1528. {
  1529. int i;
  1530. pr_debug("RAID1 conf printout:\n");
  1531. if (!conf) {
  1532. pr_debug("(!conf)\n");
  1533. return;
  1534. }
  1535. pr_debug(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
  1536. conf->raid_disks);
  1537. lockdep_assert_held(&conf->mddev->reconfig_mutex);
  1538. for (i = 0; i < conf->raid_disks; i++) {
  1539. struct md_rdev *rdev = conf->mirrors[i].rdev;
  1540. if (rdev)
  1541. pr_debug(" disk %d, wo:%d, o:%d, dev:%pg\n",
  1542. i, !test_bit(In_sync, &rdev->flags),
  1543. !test_bit(Faulty, &rdev->flags),
  1544. rdev->bdev);
  1545. }
  1546. }
  1547. static void close_sync(struct r1conf *conf)
  1548. {
  1549. int idx;
  1550. for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) {
  1551. _wait_barrier(conf, idx, false);
  1552. _allow_barrier(conf, idx);
  1553. }
  1554. mempool_exit(&conf->r1buf_pool);
  1555. }
  1556. static int raid1_spare_active(struct mddev *mddev)
  1557. {
  1558. int i;
  1559. struct r1conf *conf = mddev->private;
  1560. int count = 0;
  1561. unsigned long flags;
  1562. /*
  1563. * Find all failed disks within the RAID1 configuration
  1564. * and mark them readable.
  1565. * Called under mddev lock, so rcu protection not needed.
  1566. * device_lock used to avoid races with raid1_end_read_request
  1567. * which expects 'In_sync' flags and ->degraded to be consistent.
  1568. */
  1569. spin_lock_irqsave(&conf->device_lock, flags);
  1570. for (i = 0; i < conf->raid_disks; i++) {
  1571. struct md_rdev *rdev = conf->mirrors[i].rdev;
  1572. struct md_rdev *repl = conf->mirrors[conf->raid_disks + i].rdev;
  1573. if (repl
  1574. && !test_bit(Candidate, &repl->flags)
  1575. && repl->recovery_offset == MaxSector
  1576. && !test_bit(Faulty, &repl->flags)
  1577. && !test_and_set_bit(In_sync, &repl->flags)) {
  1578. /* replacement has just become active */
  1579. if (!rdev ||
  1580. !test_and_clear_bit(In_sync, &rdev->flags))
  1581. count++;
  1582. if (rdev) {
  1583. /* Replaced device not technically
  1584. * faulty, but we need to be sure
  1585. * it gets removed and never re-added
  1586. */
  1587. set_bit(Faulty, &rdev->flags);
  1588. sysfs_notify_dirent_safe(
  1589. rdev->sysfs_state);
  1590. }
  1591. }
  1592. if (rdev
  1593. && rdev->recovery_offset == MaxSector
  1594. && !test_bit(Faulty, &rdev->flags)
  1595. && !test_and_set_bit(In_sync, &rdev->flags)) {
  1596. count++;
  1597. sysfs_notify_dirent_safe(rdev->sysfs_state);
  1598. }
  1599. }
  1600. mddev->degraded -= count;
  1601. spin_unlock_irqrestore(&conf->device_lock, flags);
  1602. print_conf(conf);
  1603. return count;
  1604. }
  1605. static bool raid1_add_conf(struct r1conf *conf, struct md_rdev *rdev, int disk,
  1606. bool replacement)
  1607. {
  1608. struct raid1_info *info = conf->mirrors + disk;
  1609. if (replacement)
  1610. info += conf->raid_disks;
  1611. if (info->rdev)
  1612. return false;
  1613. if (bdev_nonrot(rdev->bdev)) {
  1614. set_bit(Nonrot, &rdev->flags);
  1615. WRITE_ONCE(conf->nonrot_disks, conf->nonrot_disks + 1);
  1616. }
  1617. rdev->raid_disk = disk;
  1618. info->head_position = 0;
  1619. info->seq_start = MaxSector;
  1620. WRITE_ONCE(info->rdev, rdev);
  1621. return true;
  1622. }
  1623. static bool raid1_remove_conf(struct r1conf *conf, int disk)
  1624. {
  1625. struct raid1_info *info = conf->mirrors + disk;
  1626. struct md_rdev *rdev = info->rdev;
  1627. if (!rdev || test_bit(In_sync, &rdev->flags) ||
  1628. atomic_read(&rdev->nr_pending))
  1629. return false;
  1630. /* Only remove non-faulty devices if recovery is not possible. */
  1631. if (!test_bit(Faulty, &rdev->flags) &&
  1632. rdev->mddev->degraded < conf->raid_disks)
  1633. return false;
  1634. if (test_and_clear_bit(Nonrot, &rdev->flags))
  1635. WRITE_ONCE(conf->nonrot_disks, conf->nonrot_disks - 1);
  1636. WRITE_ONCE(info->rdev, NULL);
  1637. return true;
  1638. }
  1639. static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
  1640. {
  1641. struct r1conf *conf = mddev->private;
  1642. int err = -EEXIST;
  1643. int mirror = 0, repl_slot = -1;
  1644. struct raid1_info *p;
  1645. int first = 0;
  1646. int last = conf->raid_disks - 1;
  1647. if (rdev->raid_disk >= 0)
  1648. first = last = rdev->raid_disk;
  1649. /*
  1650. * find the disk ... but prefer rdev->saved_raid_disk
  1651. * if possible.
  1652. */
  1653. if (rdev->saved_raid_disk >= 0 &&
  1654. rdev->saved_raid_disk >= first &&
  1655. rdev->saved_raid_disk < conf->raid_disks &&
  1656. conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
  1657. first = last = rdev->saved_raid_disk;
  1658. for (mirror = first; mirror <= last; mirror++) {
  1659. p = conf->mirrors + mirror;
  1660. if (!p->rdev) {
  1661. err = mddev_stack_new_rdev(mddev, rdev);
  1662. if (err)
  1663. return err;
  1664. raid1_add_conf(conf, rdev, mirror, false);
  1665. /* As all devices are equivalent, we don't need a full recovery
  1666. * if this was recently any drive of the array
  1667. */
  1668. if (rdev->saved_raid_disk < 0)
  1669. conf->fullsync = 1;
  1670. break;
  1671. }
  1672. if (test_bit(WantReplacement, &p->rdev->flags) &&
  1673. p[conf->raid_disks].rdev == NULL && repl_slot < 0)
  1674. repl_slot = mirror;
  1675. }
  1676. if (err && repl_slot >= 0) {
  1677. /* Add this device as a replacement */
  1678. clear_bit(In_sync, &rdev->flags);
  1679. set_bit(Replacement, &rdev->flags);
  1680. raid1_add_conf(conf, rdev, repl_slot, true);
  1681. err = 0;
  1682. conf->fullsync = 1;
  1683. }
  1684. print_conf(conf);
  1685. return err;
  1686. }
  1687. static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
  1688. {
  1689. struct r1conf *conf = mddev->private;
  1690. int err = 0;
  1691. int number = rdev->raid_disk;
  1692. struct raid1_info *p = conf->mirrors + number;
  1693. if (unlikely(number >= conf->raid_disks))
  1694. goto abort;
  1695. if (rdev != p->rdev) {
  1696. number += conf->raid_disks;
  1697. p = conf->mirrors + number;
  1698. }
  1699. print_conf(conf);
  1700. if (rdev == p->rdev) {
  1701. if (!raid1_remove_conf(conf, number)) {
  1702. err = -EBUSY;
  1703. goto abort;
  1704. }
  1705. if (number < conf->raid_disks &&
  1706. conf->mirrors[conf->raid_disks + number].rdev) {
  1707. /* We just removed a device that is being replaced.
  1708. * Move down the replacement. We drain all IO before
  1709. * doing this to avoid confusion.
  1710. */
  1711. struct md_rdev *repl =
  1712. conf->mirrors[conf->raid_disks + number].rdev;
  1713. freeze_array(conf, 0);
  1714. if (atomic_read(&repl->nr_pending)) {
  1715. /* It means that some queued IO of retry_list
  1716. * hold repl. Thus, we cannot set replacement
  1717. * as NULL, avoiding rdev NULL pointer
  1718. * dereference in sync_request_write and
  1719. * handle_write_finished.
  1720. */
  1721. err = -EBUSY;
  1722. unfreeze_array(conf);
  1723. goto abort;
  1724. }
  1725. clear_bit(Replacement, &repl->flags);
  1726. WRITE_ONCE(p->rdev, repl);
  1727. conf->mirrors[conf->raid_disks + number].rdev = NULL;
  1728. unfreeze_array(conf);
  1729. }
  1730. clear_bit(WantReplacement, &rdev->flags);
  1731. err = md_integrity_register(mddev);
  1732. }
  1733. abort:
  1734. print_conf(conf);
  1735. return err;
  1736. }
  1737. static void end_sync_read(struct bio *bio)
  1738. {
  1739. struct r1bio *r1_bio = get_resync_r1bio(bio);
  1740. update_head_pos(r1_bio->read_disk, r1_bio);
  1741. /*
  1742. * we have read a block, now it needs to be re-written,
  1743. * or re-read if the read failed.
  1744. * We don't do much here, just schedule handling by raid1d
  1745. */
  1746. if (!bio->bi_status)
  1747. set_bit(R1BIO_Uptodate, &r1_bio->state);
  1748. if (atomic_dec_and_test(&r1_bio->remaining))
  1749. reschedule_retry(r1_bio);
  1750. }
  1751. static void abort_sync_write(struct mddev *mddev, struct r1bio *r1_bio)
  1752. {
  1753. sector_t sync_blocks = 0;
  1754. sector_t s = r1_bio->sector;
  1755. long sectors_to_go = r1_bio->sectors;
  1756. /* make sure these bits don't get cleared. */
  1757. do {
  1758. md_bitmap_end_sync(mddev, s, &sync_blocks);
  1759. s += sync_blocks;
  1760. sectors_to_go -= sync_blocks;
  1761. } while (sectors_to_go > 0);
  1762. }
  1763. static void put_sync_write_buf(struct r1bio *r1_bio)
  1764. {
  1765. if (atomic_dec_and_test(&r1_bio->remaining)) {
  1766. struct mddev *mddev = r1_bio->mddev;
  1767. int s = r1_bio->sectors;
  1768. if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
  1769. test_bit(R1BIO_WriteError, &r1_bio->state))
  1770. reschedule_retry(r1_bio);
  1771. else {
  1772. put_buf(r1_bio);
  1773. md_done_sync(mddev, s);
  1774. }
  1775. }
  1776. }
  1777. static void end_sync_write(struct bio *bio)
  1778. {
  1779. struct r1bio *r1_bio = get_resync_r1bio(bio);
  1780. struct mddev *mddev = r1_bio->mddev;
  1781. struct r1conf *conf = mddev->private;
  1782. struct md_rdev *rdev = conf->mirrors[find_bio_disk(r1_bio, bio)].rdev;
  1783. if (bio->bi_status) {
  1784. abort_sync_write(mddev, r1_bio);
  1785. set_bit(WriteErrorSeen, &rdev->flags);
  1786. if (!test_and_set_bit(WantReplacement, &rdev->flags))
  1787. set_bit(MD_RECOVERY_NEEDED, &
  1788. mddev->recovery);
  1789. set_bit(R1BIO_WriteError, &r1_bio->state);
  1790. } else if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors) &&
  1791. !rdev_has_badblock(conf->mirrors[r1_bio->read_disk].rdev,
  1792. r1_bio->sector, r1_bio->sectors)) {
  1793. set_bit(R1BIO_MadeGood, &r1_bio->state);
  1794. }
  1795. put_sync_write_buf(r1_bio);
  1796. }
  1797. static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector,
  1798. int sectors, struct page *page, blk_opf_t rw)
  1799. {
  1800. if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
  1801. /* success */
  1802. return 1;
  1803. if (rw == REQ_OP_WRITE) {
  1804. set_bit(WriteErrorSeen, &rdev->flags);
  1805. if (!test_and_set_bit(WantReplacement,
  1806. &rdev->flags))
  1807. set_bit(MD_RECOVERY_NEEDED, &
  1808. rdev->mddev->recovery);
  1809. }
  1810. /* need to record an error - either for the block or the device */
  1811. rdev_set_badblocks(rdev, sector, sectors, 0);
  1812. return 0;
  1813. }
  1814. static int fix_sync_read_error(struct r1bio *r1_bio)
  1815. {
  1816. /* Try some synchronous reads of other devices to get
  1817. * good data, much like with normal read errors. Only
  1818. * read into the pages we already have so we don't
  1819. * need to re-issue the read request.
  1820. * We don't need to freeze the array, because being in an
  1821. * active sync request, there is no normal IO, and
  1822. * no overlapping syncs.
  1823. * We don't need to check is_badblock() again as we
  1824. * made sure that anything with a bad block in range
  1825. * will have bi_end_io clear.
  1826. */
  1827. struct mddev *mddev = r1_bio->mddev;
  1828. struct r1conf *conf = mddev->private;
  1829. struct bio *bio = r1_bio->bios[r1_bio->read_disk];
  1830. struct page **pages = get_resync_pages(bio)->pages;
  1831. sector_t sect = r1_bio->sector;
  1832. int sectors = r1_bio->sectors;
  1833. int idx = 0;
  1834. struct md_rdev *rdev;
  1835. rdev = conf->mirrors[r1_bio->read_disk].rdev;
  1836. if (test_bit(FailFast, &rdev->flags)) {
  1837. /* Don't try recovering from here - just fail it
  1838. * ... unless it is the last working device of course */
  1839. md_error(mddev, rdev);
  1840. if (test_bit(Faulty, &rdev->flags))
  1841. /* Don't try to read from here, but make sure
  1842. * put_buf does it's thing
  1843. */
  1844. bio->bi_end_io = end_sync_write;
  1845. }
  1846. while(sectors) {
  1847. int s = sectors;
  1848. int d = r1_bio->read_disk;
  1849. int success = 0;
  1850. int start;
  1851. if (s > (PAGE_SIZE>>9))
  1852. s = PAGE_SIZE >> 9;
  1853. do {
  1854. if (r1_bio->bios[d]->bi_end_io == end_sync_read) {
  1855. /* No rcu protection needed here devices
  1856. * can only be removed when no resync is
  1857. * active, and resync is currently active
  1858. */
  1859. rdev = conf->mirrors[d].rdev;
  1860. if (sync_page_io(rdev, sect, s<<9,
  1861. pages[idx],
  1862. REQ_OP_READ, false)) {
  1863. success = 1;
  1864. break;
  1865. }
  1866. }
  1867. d++;
  1868. if (d == conf->raid_disks * 2)
  1869. d = 0;
  1870. } while (!success && d != r1_bio->read_disk);
  1871. if (!success) {
  1872. int abort = 0;
  1873. /* Cannot read from anywhere, this block is lost.
  1874. * Record a bad block on each device. If that doesn't
  1875. * work just disable and interrupt the recovery.
  1876. * Don't fail devices as that won't really help.
  1877. */
  1878. pr_crit_ratelimited("md/raid1:%s: %pg: unrecoverable I/O read error for block %llu\n",
  1879. mdname(mddev), bio->bi_bdev,
  1880. (unsigned long long)r1_bio->sector);
  1881. for (d = 0; d < conf->raid_disks * 2; d++) {
  1882. rdev = conf->mirrors[d].rdev;
  1883. if (!rdev || test_bit(Faulty, &rdev->flags))
  1884. continue;
  1885. if (!rdev_set_badblocks(rdev, sect, s, 0))
  1886. abort = 1;
  1887. }
  1888. if (abort)
  1889. return 0;
  1890. /* Try next page */
  1891. sectors -= s;
  1892. sect += s;
  1893. idx++;
  1894. continue;
  1895. }
  1896. start = d;
  1897. /* write it back and re-read */
  1898. while (d != r1_bio->read_disk) {
  1899. if (d == 0)
  1900. d = conf->raid_disks * 2;
  1901. d--;
  1902. if (r1_bio->bios[d]->bi_end_io != end_sync_read)
  1903. continue;
  1904. rdev = conf->mirrors[d].rdev;
  1905. if (r1_sync_page_io(rdev, sect, s,
  1906. pages[idx],
  1907. REQ_OP_WRITE) == 0) {
  1908. r1_bio->bios[d]->bi_end_io = NULL;
  1909. rdev_dec_pending(rdev, mddev);
  1910. }
  1911. }
  1912. d = start;
  1913. while (d != r1_bio->read_disk) {
  1914. if (d == 0)
  1915. d = conf->raid_disks * 2;
  1916. d--;
  1917. if (r1_bio->bios[d]->bi_end_io != end_sync_read)
  1918. continue;
  1919. rdev = conf->mirrors[d].rdev;
  1920. if (r1_sync_page_io(rdev, sect, s,
  1921. pages[idx],
  1922. REQ_OP_READ) != 0)
  1923. atomic_add(s, &rdev->corrected_errors);
  1924. }
  1925. sectors -= s;
  1926. sect += s;
  1927. idx ++;
  1928. }
  1929. set_bit(R1BIO_Uptodate, &r1_bio->state);
  1930. bio->bi_status = 0;
  1931. return 1;
  1932. }
  1933. static void process_checks(struct r1bio *r1_bio)
  1934. {
  1935. /* We have read all readable devices. If we haven't
  1936. * got the block, then there is no hope left.
  1937. * If we have, then we want to do a comparison
  1938. * and skip the write if everything is the same.
  1939. * If any blocks failed to read, then we need to
  1940. * attempt an over-write
  1941. */
  1942. struct mddev *mddev = r1_bio->mddev;
  1943. struct r1conf *conf = mddev->private;
  1944. int primary;
  1945. int i;
  1946. int vcnt;
  1947. /* Fix variable parts of all bios */
  1948. vcnt = (r1_bio->sectors + PAGE_SIZE / 512 - 1) >> (PAGE_SHIFT - 9);
  1949. for (i = 0; i < conf->raid_disks * 2; i++) {
  1950. blk_status_t status;
  1951. struct bio *b = r1_bio->bios[i];
  1952. struct resync_pages *rp = get_resync_pages(b);
  1953. if (b->bi_end_io != end_sync_read)
  1954. continue;
  1955. /* fixup the bio for reuse, but preserve errno */
  1956. status = b->bi_status;
  1957. bio_reset(b, conf->mirrors[i].rdev->bdev, REQ_OP_READ);
  1958. b->bi_status = status;
  1959. b->bi_iter.bi_sector = r1_bio->sector +
  1960. conf->mirrors[i].rdev->data_offset;
  1961. b->bi_end_io = end_sync_read;
  1962. rp->raid_bio = r1_bio;
  1963. b->bi_private = rp;
  1964. /* initialize bvec table again */
  1965. md_bio_reset_resync_pages(b, rp, r1_bio->sectors << 9);
  1966. }
  1967. for (primary = 0; primary < conf->raid_disks * 2; primary++)
  1968. if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
  1969. !r1_bio->bios[primary]->bi_status) {
  1970. r1_bio->bios[primary]->bi_end_io = NULL;
  1971. rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
  1972. break;
  1973. }
  1974. r1_bio->read_disk = primary;
  1975. for (i = 0; i < conf->raid_disks * 2; i++) {
  1976. int j = 0;
  1977. struct bio *pbio = r1_bio->bios[primary];
  1978. struct bio *sbio = r1_bio->bios[i];
  1979. blk_status_t status = sbio->bi_status;
  1980. struct page **ppages = get_resync_pages(pbio)->pages;
  1981. struct page **spages = get_resync_pages(sbio)->pages;
  1982. struct bio_vec *bi;
  1983. int page_len[RESYNC_PAGES] = { 0 };
  1984. struct bvec_iter_all iter_all;
  1985. if (sbio->bi_end_io != end_sync_read)
  1986. continue;
  1987. /* Now we can 'fixup' the error value */
  1988. sbio->bi_status = 0;
  1989. bio_for_each_segment_all(bi, sbio, iter_all)
  1990. page_len[j++] = bi->bv_len;
  1991. if (!status) {
  1992. for (j = vcnt; j-- ; ) {
  1993. if (memcmp(page_address(ppages[j]),
  1994. page_address(spages[j]),
  1995. page_len[j]))
  1996. break;
  1997. }
  1998. } else
  1999. j = 0;
  2000. if (j >= 0)
  2001. atomic64_add(r1_bio->sectors, &mddev->resync_mismatches);
  2002. if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
  2003. && !status)) {
  2004. /* No need to write to this device. */
  2005. sbio->bi_end_io = NULL;
  2006. rdev_dec_pending(conf->mirrors[i].rdev, mddev);
  2007. continue;
  2008. }
  2009. bio_copy_data(sbio, pbio);
  2010. }
  2011. }
  2012. static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
  2013. {
  2014. struct r1conf *conf = mddev->private;
  2015. int i;
  2016. int disks = conf->raid_disks * 2;
  2017. struct bio *wbio;
  2018. if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) {
  2019. /*
  2020. * ouch - failed to read all of that.
  2021. * No need to fix read error for check/repair
  2022. * because all member disks are read.
  2023. */
  2024. if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) ||
  2025. !fix_sync_read_error(r1_bio)) {
  2026. md_done_sync(mddev, r1_bio->sectors);
  2027. md_sync_error(mddev);
  2028. put_buf(r1_bio);
  2029. return;
  2030. }
  2031. }
  2032. if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
  2033. process_checks(r1_bio);
  2034. /*
  2035. * schedule writes
  2036. */
  2037. atomic_set(&r1_bio->remaining, 1);
  2038. for (i = 0; i < disks ; i++) {
  2039. wbio = r1_bio->bios[i];
  2040. if (wbio->bi_end_io == NULL ||
  2041. (wbio->bi_end_io == end_sync_read &&
  2042. (i == r1_bio->read_disk ||
  2043. !test_bit(MD_RECOVERY_SYNC, &mddev->recovery))))
  2044. continue;
  2045. if (test_bit(Faulty, &conf->mirrors[i].rdev->flags)) {
  2046. abort_sync_write(mddev, r1_bio);
  2047. continue;
  2048. }
  2049. wbio->bi_opf = REQ_OP_WRITE;
  2050. if (test_bit(FailFast, &conf->mirrors[i].rdev->flags))
  2051. wbio->bi_opf |= MD_FAILFAST;
  2052. wbio->bi_end_io = end_sync_write;
  2053. atomic_inc(&r1_bio->remaining);
  2054. submit_bio_noacct(wbio);
  2055. }
  2056. put_sync_write_buf(r1_bio);
  2057. }
  2058. /*
  2059. * This is a kernel thread which:
  2060. *
  2061. * 1. Retries failed read operations on working mirrors.
  2062. * 2. Updates the raid superblock when problems are encountered.
  2063. * 3. Performs writes following reads for array synchronising.
  2064. */
  2065. static void fix_read_error(struct r1conf *conf, struct r1bio *r1_bio)
  2066. {
  2067. sector_t sect = r1_bio->sector;
  2068. int sectors = r1_bio->sectors;
  2069. int read_disk = r1_bio->read_disk;
  2070. struct mddev *mddev = conf->mddev;
  2071. struct md_rdev *rdev = conf->mirrors[read_disk].rdev;
  2072. if (exceed_read_errors(mddev, rdev)) {
  2073. r1_bio->bios[r1_bio->read_disk] = IO_BLOCKED;
  2074. return;
  2075. }
  2076. while(sectors) {
  2077. int s = sectors;
  2078. int d = read_disk;
  2079. int success = 0;
  2080. int start;
  2081. if (s > (PAGE_SIZE>>9))
  2082. s = PAGE_SIZE >> 9;
  2083. do {
  2084. rdev = conf->mirrors[d].rdev;
  2085. if (rdev &&
  2086. (test_bit(In_sync, &rdev->flags) ||
  2087. (!test_bit(Faulty, &rdev->flags) &&
  2088. rdev->recovery_offset >= sect + s)) &&
  2089. rdev_has_badblock(rdev, sect, s) == 0) {
  2090. atomic_inc(&rdev->nr_pending);
  2091. if (sync_page_io(rdev, sect, s<<9,
  2092. conf->tmppage, REQ_OP_READ, false))
  2093. success = 1;
  2094. rdev_dec_pending(rdev, mddev);
  2095. if (success)
  2096. break;
  2097. }
  2098. d++;
  2099. if (d == conf->raid_disks * 2)
  2100. d = 0;
  2101. } while (d != read_disk);
  2102. if (!success) {
  2103. /* Cannot read from anywhere - mark it bad */
  2104. struct md_rdev *rdev = conf->mirrors[read_disk].rdev;
  2105. rdev_set_badblocks(rdev, sect, s, 0);
  2106. break;
  2107. }
  2108. /* write it back and re-read */
  2109. start = d;
  2110. while (d != read_disk) {
  2111. if (d==0)
  2112. d = conf->raid_disks * 2;
  2113. d--;
  2114. rdev = conf->mirrors[d].rdev;
  2115. if (rdev &&
  2116. !test_bit(Faulty, &rdev->flags)) {
  2117. atomic_inc(&rdev->nr_pending);
  2118. r1_sync_page_io(rdev, sect, s,
  2119. conf->tmppage, REQ_OP_WRITE);
  2120. rdev_dec_pending(rdev, mddev);
  2121. }
  2122. }
  2123. d = start;
  2124. while (d != read_disk) {
  2125. if (d==0)
  2126. d = conf->raid_disks * 2;
  2127. d--;
  2128. rdev = conf->mirrors[d].rdev;
  2129. if (rdev &&
  2130. !test_bit(Faulty, &rdev->flags)) {
  2131. atomic_inc(&rdev->nr_pending);
  2132. if (r1_sync_page_io(rdev, sect, s,
  2133. conf->tmppage, REQ_OP_READ)) {
  2134. atomic_add(s, &rdev->corrected_errors);
  2135. pr_info("md/raid1:%s: read error corrected (%d sectors at %llu on %pg)\n",
  2136. mdname(mddev), s,
  2137. (unsigned long long)(sect +
  2138. rdev->data_offset),
  2139. rdev->bdev);
  2140. }
  2141. rdev_dec_pending(rdev, mddev);
  2142. }
  2143. }
  2144. sectors -= s;
  2145. sect += s;
  2146. }
  2147. }
  2148. static void narrow_write_error(struct r1bio *r1_bio, int i)
  2149. {
  2150. struct mddev *mddev = r1_bio->mddev;
  2151. struct r1conf *conf = mddev->private;
  2152. struct md_rdev *rdev = conf->mirrors[i].rdev;
  2153. /* bio has the data to be written to device 'i' where
  2154. * we just recently had a write error.
  2155. * We repeatedly clone the bio and trim down to one block,
  2156. * then try the write. Where the write fails we record
  2157. * a bad block.
  2158. * It is conceivable that the bio doesn't exactly align with
  2159. * blocks. We must handle this somehow.
  2160. *
  2161. * We currently own a reference on the rdev.
  2162. */
  2163. int block_sectors, lbs = bdev_logical_block_size(rdev->bdev) >> 9;
  2164. sector_t sector;
  2165. int sectors;
  2166. int sect_to_write = r1_bio->sectors;
  2167. if (rdev->badblocks.shift < 0)
  2168. block_sectors = lbs;
  2169. else
  2170. block_sectors = roundup(1 << rdev->badblocks.shift, lbs);
  2171. sector = r1_bio->sector;
  2172. sectors = ((sector + block_sectors)
  2173. & ~(sector_t)(block_sectors - 1))
  2174. - sector;
  2175. while (sect_to_write) {
  2176. struct bio *wbio;
  2177. if (sectors > sect_to_write)
  2178. sectors = sect_to_write;
  2179. /* Write at 'sector' for 'sectors'*/
  2180. if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
  2181. wbio = bio_alloc_clone(rdev->bdev,
  2182. r1_bio->behind_master_bio,
  2183. GFP_NOIO, &mddev->bio_set);
  2184. } else {
  2185. wbio = bio_alloc_clone(rdev->bdev, r1_bio->master_bio,
  2186. GFP_NOIO, &mddev->bio_set);
  2187. }
  2188. wbio->bi_opf = REQ_OP_WRITE;
  2189. wbio->bi_iter.bi_sector = r1_bio->sector;
  2190. wbio->bi_iter.bi_size = r1_bio->sectors << 9;
  2191. bio_trim(wbio, sector - r1_bio->sector, sectors);
  2192. wbio->bi_iter.bi_sector += rdev->data_offset;
  2193. if (submit_bio_wait(wbio) &&
  2194. !rdev_set_badblocks(rdev, sector, sectors, 0)) {
  2195. /*
  2196. * Badblocks set failed, disk marked Faulty.
  2197. * No further operations needed.
  2198. */
  2199. bio_put(wbio);
  2200. break;
  2201. }
  2202. bio_put(wbio);
  2203. sect_to_write -= sectors;
  2204. sector += sectors;
  2205. sectors = block_sectors;
  2206. }
  2207. }
  2208. static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
  2209. {
  2210. int m;
  2211. int s = r1_bio->sectors;
  2212. for (m = 0; m < conf->raid_disks * 2 ; m++) {
  2213. struct md_rdev *rdev = conf->mirrors[m].rdev;
  2214. struct bio *bio = r1_bio->bios[m];
  2215. if (bio->bi_end_io == NULL)
  2216. continue;
  2217. if (!bio->bi_status &&
  2218. test_bit(R1BIO_MadeGood, &r1_bio->state))
  2219. rdev_clear_badblocks(rdev, r1_bio->sector, s, 0);
  2220. if (bio->bi_status &&
  2221. test_bit(R1BIO_WriteError, &r1_bio->state))
  2222. rdev_set_badblocks(rdev, r1_bio->sector, s, 0);
  2223. }
  2224. put_buf(r1_bio);
  2225. md_done_sync(conf->mddev, s);
  2226. }
  2227. static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
  2228. {
  2229. int m, idx;
  2230. bool fail = false;
  2231. for (m = 0; m < conf->raid_disks * 2 ; m++)
  2232. if (r1_bio->bios[m] == IO_MADE_GOOD) {
  2233. struct md_rdev *rdev = conf->mirrors[m].rdev;
  2234. rdev_clear_badblocks(rdev,
  2235. r1_bio->sector,
  2236. r1_bio->sectors, 0);
  2237. rdev_dec_pending(rdev, conf->mddev);
  2238. } else if (r1_bio->bios[m] != NULL) {
  2239. /* This drive got a write error. We need to
  2240. * narrow down and record precise write
  2241. * errors.
  2242. */
  2243. fail = true;
  2244. narrow_write_error(r1_bio, m);
  2245. rdev_dec_pending(conf->mirrors[m].rdev,
  2246. conf->mddev);
  2247. }
  2248. if (fail) {
  2249. spin_lock_irq(&conf->device_lock);
  2250. list_add(&r1_bio->retry_list, &conf->bio_end_io_list);
  2251. idx = sector_to_idx(r1_bio->sector);
  2252. atomic_inc(&conf->nr_queued[idx]);
  2253. spin_unlock_irq(&conf->device_lock);
  2254. /*
  2255. * In case freeze_array() is waiting for condition
  2256. * get_unqueued_pending() == extra to be true.
  2257. */
  2258. wake_up(&conf->wait_barrier);
  2259. md_wakeup_thread(conf->mddev->thread);
  2260. } else {
  2261. if (test_bit(R1BIO_WriteError, &r1_bio->state))
  2262. close_write(r1_bio);
  2263. raid_end_bio_io(r1_bio);
  2264. }
  2265. }
  2266. static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
  2267. {
  2268. struct mddev *mddev = conf->mddev;
  2269. struct bio *bio;
  2270. struct md_rdev *rdev;
  2271. sector_t sector;
  2272. clear_bit(R1BIO_ReadError, &r1_bio->state);
  2273. /* we got a read error. Maybe the drive is bad. Maybe just
  2274. * the block and we can fix it.
  2275. * We freeze all other IO, and try reading the block from
  2276. * other devices. When we find one, we re-write
  2277. * and check it that fixes the read error.
  2278. * This is all done synchronously while the array is
  2279. * frozen
  2280. */
  2281. bio = r1_bio->bios[r1_bio->read_disk];
  2282. bio_put(bio);
  2283. r1_bio->bios[r1_bio->read_disk] = NULL;
  2284. rdev = conf->mirrors[r1_bio->read_disk].rdev;
  2285. if (mddev->ro == 0
  2286. && !test_bit(FailFast, &rdev->flags)) {
  2287. freeze_array(conf, 1);
  2288. fix_read_error(conf, r1_bio);
  2289. unfreeze_array(conf);
  2290. } else if (mddev->ro == 0 && test_bit(FailFast, &rdev->flags)) {
  2291. md_error(mddev, rdev);
  2292. } else {
  2293. r1_bio->bios[r1_bio->read_disk] = IO_BLOCKED;
  2294. }
  2295. rdev_dec_pending(rdev, conf->mddev);
  2296. sector = r1_bio->sector;
  2297. bio = r1_bio->master_bio;
  2298. /* Reuse the old r1_bio so that the IO_BLOCKED settings are preserved */
  2299. r1_bio->state = 0;
  2300. raid1_read_request(mddev, bio, r1_bio->sectors, r1_bio);
  2301. allow_barrier(conf, sector);
  2302. }
  2303. static void raid1d(struct md_thread *thread)
  2304. {
  2305. struct mddev *mddev = thread->mddev;
  2306. struct r1bio *r1_bio;
  2307. unsigned long flags;
  2308. struct r1conf *conf = mddev->private;
  2309. struct list_head *head = &conf->retry_list;
  2310. struct blk_plug plug;
  2311. int idx;
  2312. md_check_recovery(mddev);
  2313. if (!list_empty_careful(&conf->bio_end_io_list) &&
  2314. !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
  2315. LIST_HEAD(tmp);
  2316. spin_lock_irqsave(&conf->device_lock, flags);
  2317. if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
  2318. list_splice_init(&conf->bio_end_io_list, &tmp);
  2319. spin_unlock_irqrestore(&conf->device_lock, flags);
  2320. while (!list_empty(&tmp)) {
  2321. r1_bio = list_first_entry(&tmp, struct r1bio,
  2322. retry_list);
  2323. list_del(&r1_bio->retry_list);
  2324. idx = sector_to_idx(r1_bio->sector);
  2325. atomic_dec(&conf->nr_queued[idx]);
  2326. if (test_bit(R1BIO_WriteError, &r1_bio->state))
  2327. close_write(r1_bio);
  2328. raid_end_bio_io(r1_bio);
  2329. }
  2330. }
  2331. blk_start_plug(&plug);
  2332. for (;;) {
  2333. flush_pending_writes(conf);
  2334. spin_lock_irqsave(&conf->device_lock, flags);
  2335. if (list_empty(head)) {
  2336. spin_unlock_irqrestore(&conf->device_lock, flags);
  2337. break;
  2338. }
  2339. r1_bio = list_entry(head->prev, struct r1bio, retry_list);
  2340. list_del(head->prev);
  2341. idx = sector_to_idx(r1_bio->sector);
  2342. atomic_dec(&conf->nr_queued[idx]);
  2343. spin_unlock_irqrestore(&conf->device_lock, flags);
  2344. mddev = r1_bio->mddev;
  2345. conf = mddev->private;
  2346. if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
  2347. if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
  2348. test_bit(R1BIO_WriteError, &r1_bio->state))
  2349. handle_sync_write_finished(conf, r1_bio);
  2350. else
  2351. sync_request_write(mddev, r1_bio);
  2352. } else if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
  2353. test_bit(R1BIO_WriteError, &r1_bio->state))
  2354. handle_write_finished(conf, r1_bio);
  2355. else if (test_bit(R1BIO_ReadError, &r1_bio->state))
  2356. handle_read_error(conf, r1_bio);
  2357. else
  2358. WARN_ON_ONCE(1);
  2359. cond_resched();
  2360. if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING))
  2361. md_check_recovery(mddev);
  2362. }
  2363. blk_finish_plug(&plug);
  2364. }
  2365. static int init_resync(struct r1conf *conf)
  2366. {
  2367. int buffs;
  2368. buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
  2369. BUG_ON(mempool_initialized(&conf->r1buf_pool));
  2370. return mempool_init(&conf->r1buf_pool, buffs, r1buf_pool_alloc,
  2371. r1buf_pool_free, conf);
  2372. }
  2373. static struct r1bio *raid1_alloc_init_r1buf(struct r1conf *conf)
  2374. {
  2375. struct r1bio *r1bio = mempool_alloc(&conf->r1buf_pool, GFP_NOIO);
  2376. struct resync_pages *rps;
  2377. struct bio *bio;
  2378. int i;
  2379. for (i = conf->raid_disks * 2; i--; ) {
  2380. bio = r1bio->bios[i];
  2381. rps = bio->bi_private;
  2382. bio_reset(bio, NULL, 0);
  2383. bio->bi_private = rps;
  2384. }
  2385. r1bio->master_bio = NULL;
  2386. return r1bio;
  2387. }
  2388. /*
  2389. * perform a "sync" on one "block"
  2390. *
  2391. * We need to make sure that no normal I/O request - particularly write
  2392. * requests - conflict with active sync requests.
  2393. *
  2394. * This is achieved by tracking pending requests and a 'barrier' concept
  2395. * that can be installed to exclude normal IO requests.
  2396. */
  2397. static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
  2398. sector_t max_sector, int *skipped)
  2399. {
  2400. struct r1conf *conf = mddev->private;
  2401. struct r1bio *r1_bio;
  2402. struct bio *bio;
  2403. sector_t nr_sectors;
  2404. int disk = -1;
  2405. int i;
  2406. int wonly = -1;
  2407. int write_targets = 0, read_targets = 0;
  2408. sector_t sync_blocks;
  2409. bool still_degraded = false;
  2410. int good_sectors = RESYNC_SECTORS;
  2411. int min_bad = 0; /* number of sectors that are bad in all devices */
  2412. int idx = sector_to_idx(sector_nr);
  2413. int page_idx = 0;
  2414. if (!mempool_initialized(&conf->r1buf_pool))
  2415. if (init_resync(conf))
  2416. return 0;
  2417. if (sector_nr >= max_sector) {
  2418. /* If we aborted, we need to abort the
  2419. * sync on the 'current' bitmap chunk (there will
  2420. * only be one in raid1 resync.
  2421. * We can find the current addess in mddev->curr_resync
  2422. */
  2423. if (mddev->curr_resync < max_sector) /* aborted */
  2424. md_bitmap_end_sync(mddev, mddev->curr_resync,
  2425. &sync_blocks);
  2426. else /* completed sync */
  2427. conf->fullsync = 0;
  2428. if (md_bitmap_enabled(mddev, false))
  2429. mddev->bitmap_ops->close_sync(mddev);
  2430. close_sync(conf);
  2431. if (mddev_is_clustered(mddev)) {
  2432. conf->cluster_sync_low = 0;
  2433. conf->cluster_sync_high = 0;
  2434. }
  2435. return 0;
  2436. }
  2437. if (mddev->bitmap == NULL &&
  2438. mddev->resync_offset == MaxSector &&
  2439. !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
  2440. conf->fullsync == 0) {
  2441. *skipped = 1;
  2442. return max_sector - sector_nr;
  2443. }
  2444. /* before building a request, check if we can skip these blocks..
  2445. * This call the bitmap_start_sync doesn't actually record anything
  2446. */
  2447. if (!md_bitmap_start_sync(mddev, sector_nr, &sync_blocks, true) &&
  2448. !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
  2449. /* We can skip this block, and probably several more */
  2450. *skipped = 1;
  2451. return sync_blocks;
  2452. }
  2453. /*
  2454. * If there is non-resync activity waiting for a turn, then let it
  2455. * though before starting on this new sync request.
  2456. */
  2457. if (atomic_read(&conf->nr_waiting[idx]))
  2458. schedule_timeout_uninterruptible(1);
  2459. /* we are incrementing sector_nr below. To be safe, we check against
  2460. * sector_nr + two times RESYNC_SECTORS
  2461. */
  2462. if (md_bitmap_enabled(mddev, false))
  2463. mddev->bitmap_ops->cond_end_sync(mddev, sector_nr,
  2464. mddev_is_clustered(mddev) &&
  2465. (sector_nr + 2 * RESYNC_SECTORS >
  2466. conf->cluster_sync_high));
  2467. if (raise_barrier(conf, sector_nr))
  2468. return 0;
  2469. r1_bio = raid1_alloc_init_r1buf(conf);
  2470. /*
  2471. * If we get a correctably read error during resync or recovery,
  2472. * we might want to read from a different device. So we
  2473. * flag all drives that could conceivably be read from for READ,
  2474. * and any others (which will be non-In_sync devices) for WRITE.
  2475. * If a read fails, we try reading from something else for which READ
  2476. * is OK.
  2477. */
  2478. r1_bio->mddev = mddev;
  2479. r1_bio->sector = sector_nr;
  2480. r1_bio->state = 0;
  2481. set_bit(R1BIO_IsSync, &r1_bio->state);
  2482. /* make sure good_sectors won't go across barrier unit boundary */
  2483. good_sectors = align_to_barrier_unit_end(sector_nr, good_sectors);
  2484. for (i = 0; i < conf->raid_disks * 2; i++) {
  2485. struct md_rdev *rdev;
  2486. bio = r1_bio->bios[i];
  2487. rdev = conf->mirrors[i].rdev;
  2488. if (rdev == NULL ||
  2489. test_bit(Faulty, &rdev->flags)) {
  2490. if (i < conf->raid_disks)
  2491. still_degraded = true;
  2492. } else if (!test_bit(In_sync, &rdev->flags)) {
  2493. bio->bi_opf = REQ_OP_WRITE;
  2494. bio->bi_end_io = end_sync_write;
  2495. write_targets ++;
  2496. } else {
  2497. /* may need to read from here */
  2498. sector_t first_bad = MaxSector;
  2499. sector_t bad_sectors;
  2500. if (is_badblock(rdev, sector_nr, good_sectors,
  2501. &first_bad, &bad_sectors)) {
  2502. if (first_bad > sector_nr)
  2503. good_sectors = first_bad - sector_nr;
  2504. else {
  2505. bad_sectors -= (sector_nr - first_bad);
  2506. if (min_bad == 0 ||
  2507. min_bad > bad_sectors)
  2508. min_bad = bad_sectors;
  2509. }
  2510. }
  2511. if (sector_nr < first_bad) {
  2512. if (test_bit(WriteMostly, &rdev->flags)) {
  2513. if (wonly < 0)
  2514. wonly = i;
  2515. } else {
  2516. if (disk < 0)
  2517. disk = i;
  2518. }
  2519. bio->bi_opf = REQ_OP_READ;
  2520. bio->bi_end_io = end_sync_read;
  2521. read_targets++;
  2522. } else if (!test_bit(WriteErrorSeen, &rdev->flags) &&
  2523. test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
  2524. !test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
  2525. /*
  2526. * The device is suitable for reading (InSync),
  2527. * but has bad block(s) here. Let's try to correct them,
  2528. * if we are doing resync or repair. Otherwise, leave
  2529. * this device alone for this sync request.
  2530. */
  2531. bio->bi_opf = REQ_OP_WRITE;
  2532. bio->bi_end_io = end_sync_write;
  2533. write_targets++;
  2534. }
  2535. }
  2536. if (rdev && bio->bi_end_io) {
  2537. atomic_inc(&rdev->nr_pending);
  2538. bio->bi_iter.bi_sector = sector_nr + rdev->data_offset;
  2539. bio_set_dev(bio, rdev->bdev);
  2540. if (test_bit(FailFast, &rdev->flags))
  2541. bio->bi_opf |= MD_FAILFAST;
  2542. }
  2543. }
  2544. if (disk < 0)
  2545. disk = wonly;
  2546. r1_bio->read_disk = disk;
  2547. if (read_targets == 0 && min_bad > 0) {
  2548. /* These sectors are bad on all InSync devices, so we
  2549. * need to mark them bad on all write targets
  2550. */
  2551. int ok = 1;
  2552. for (i = 0 ; i < conf->raid_disks * 2 ; i++)
  2553. if (r1_bio->bios[i]->bi_end_io == end_sync_write) {
  2554. struct md_rdev *rdev = conf->mirrors[i].rdev;
  2555. ok = rdev_set_badblocks(rdev, sector_nr,
  2556. min_bad, 0
  2557. ) && ok;
  2558. }
  2559. set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
  2560. *skipped = 1;
  2561. put_buf(r1_bio);
  2562. if (!ok)
  2563. /* Cannot record the badblocks, md_error has set INTR,
  2564. * abort the resync.
  2565. */
  2566. return 0;
  2567. else
  2568. return min_bad;
  2569. }
  2570. if (min_bad > 0 && min_bad < good_sectors) {
  2571. /* only resync enough to reach the next bad->good
  2572. * transition */
  2573. good_sectors = min_bad;
  2574. }
  2575. if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > 0)
  2576. /* extra read targets are also write targets */
  2577. write_targets += read_targets-1;
  2578. if (write_targets == 0 || read_targets == 0) {
  2579. /* There is nowhere to write, so all non-sync
  2580. * drives must be failed - so we are finished
  2581. */
  2582. sector_t rv;
  2583. if (min_bad > 0)
  2584. max_sector = sector_nr + min_bad;
  2585. rv = max_sector - sector_nr;
  2586. *skipped = 1;
  2587. put_buf(r1_bio);
  2588. return rv;
  2589. }
  2590. if (max_sector > mddev->resync_max)
  2591. max_sector = mddev->resync_max; /* Don't do IO beyond here */
  2592. if (max_sector > sector_nr + good_sectors)
  2593. max_sector = sector_nr + good_sectors;
  2594. nr_sectors = 0;
  2595. sync_blocks = 0;
  2596. do {
  2597. struct page *page;
  2598. int len = PAGE_SIZE;
  2599. if (sector_nr + (len>>9) > max_sector)
  2600. len = (max_sector - sector_nr) << 9;
  2601. if (len == 0)
  2602. break;
  2603. if (sync_blocks == 0) {
  2604. if (!md_bitmap_start_sync(mddev, sector_nr,
  2605. &sync_blocks, still_degraded) &&
  2606. !conf->fullsync &&
  2607. !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
  2608. break;
  2609. if ((len >> 9) > sync_blocks)
  2610. len = sync_blocks<<9;
  2611. }
  2612. for (i = 0 ; i < conf->raid_disks * 2; i++) {
  2613. struct resync_pages *rp;
  2614. bio = r1_bio->bios[i];
  2615. rp = get_resync_pages(bio);
  2616. if (bio->bi_end_io) {
  2617. page = resync_fetch_page(rp, page_idx);
  2618. /*
  2619. * won't fail because the vec table is big
  2620. * enough to hold all these pages
  2621. */
  2622. __bio_add_page(bio, page, len, 0);
  2623. }
  2624. }
  2625. nr_sectors += len>>9;
  2626. sector_nr += len>>9;
  2627. sync_blocks -= (len>>9);
  2628. } while (++page_idx < RESYNC_PAGES);
  2629. r1_bio->sectors = nr_sectors;
  2630. if (mddev_is_clustered(mddev) &&
  2631. conf->cluster_sync_high < sector_nr + nr_sectors) {
  2632. conf->cluster_sync_low = mddev->curr_resync_completed;
  2633. conf->cluster_sync_high = conf->cluster_sync_low + CLUSTER_RESYNC_WINDOW_SECTORS;
  2634. /* Send resync message */
  2635. mddev->cluster_ops->resync_info_update(mddev,
  2636. conf->cluster_sync_low,
  2637. conf->cluster_sync_high);
  2638. }
  2639. /* For a user-requested sync, we read all readable devices and do a
  2640. * compare
  2641. */
  2642. if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
  2643. atomic_set(&r1_bio->remaining, read_targets);
  2644. for (i = 0; i < conf->raid_disks * 2 && read_targets; i++) {
  2645. bio = r1_bio->bios[i];
  2646. if (bio->bi_end_io == end_sync_read) {
  2647. read_targets--;
  2648. if (read_targets == 1)
  2649. bio->bi_opf &= ~MD_FAILFAST;
  2650. submit_bio_noacct(bio);
  2651. }
  2652. }
  2653. } else {
  2654. atomic_set(&r1_bio->remaining, 1);
  2655. bio = r1_bio->bios[r1_bio->read_disk];
  2656. if (read_targets == 1)
  2657. bio->bi_opf &= ~MD_FAILFAST;
  2658. submit_bio_noacct(bio);
  2659. }
  2660. return nr_sectors;
  2661. }
  2662. static sector_t raid1_size(struct mddev *mddev, sector_t sectors, int raid_disks)
  2663. {
  2664. if (sectors)
  2665. return sectors;
  2666. return mddev->dev_sectors;
  2667. }
  2668. static struct r1conf *setup_conf(struct mddev *mddev)
  2669. {
  2670. struct r1conf *conf;
  2671. int i;
  2672. struct raid1_info *disk;
  2673. struct md_rdev *rdev;
  2674. size_t r1bio_size;
  2675. int err = -ENOMEM;
  2676. conf = kzalloc_obj(struct r1conf);
  2677. if (!conf)
  2678. goto abort;
  2679. conf->nr_pending = kzalloc_objs(atomic_t, BARRIER_BUCKETS_NR);
  2680. if (!conf->nr_pending)
  2681. goto abort;
  2682. conf->nr_waiting = kzalloc_objs(atomic_t, BARRIER_BUCKETS_NR);
  2683. if (!conf->nr_waiting)
  2684. goto abort;
  2685. conf->nr_queued = kzalloc_objs(atomic_t, BARRIER_BUCKETS_NR);
  2686. if (!conf->nr_queued)
  2687. goto abort;
  2688. conf->barrier = kzalloc_objs(atomic_t, BARRIER_BUCKETS_NR);
  2689. if (!conf->barrier)
  2690. goto abort;
  2691. conf->mirrors = kzalloc(array3_size(sizeof(struct raid1_info),
  2692. mddev->raid_disks, 2),
  2693. GFP_KERNEL);
  2694. if (!conf->mirrors)
  2695. goto abort;
  2696. conf->tmppage = alloc_page(GFP_KERNEL);
  2697. if (!conf->tmppage)
  2698. goto abort;
  2699. r1bio_size = offsetof(struct r1bio, bios[mddev->raid_disks * 2]);
  2700. conf->r1bio_pool = mempool_create_kmalloc_pool(NR_RAID_BIOS, r1bio_size);
  2701. if (!conf->r1bio_pool)
  2702. goto abort;
  2703. err = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0);
  2704. if (err)
  2705. goto abort;
  2706. err = -EINVAL;
  2707. spin_lock_init(&conf->device_lock);
  2708. conf->raid_disks = mddev->raid_disks;
  2709. rdev_for_each(rdev, mddev) {
  2710. int disk_idx = rdev->raid_disk;
  2711. if (disk_idx >= conf->raid_disks || disk_idx < 0)
  2712. continue;
  2713. if (!raid1_add_conf(conf, rdev, disk_idx,
  2714. test_bit(Replacement, &rdev->flags)))
  2715. goto abort;
  2716. }
  2717. conf->mddev = mddev;
  2718. INIT_LIST_HEAD(&conf->retry_list);
  2719. INIT_LIST_HEAD(&conf->bio_end_io_list);
  2720. spin_lock_init(&conf->resync_lock);
  2721. init_waitqueue_head(&conf->wait_barrier);
  2722. bio_list_init(&conf->pending_bio_list);
  2723. err = -EIO;
  2724. for (i = 0; i < conf->raid_disks * 2; i++) {
  2725. disk = conf->mirrors + i;
  2726. if (i < conf->raid_disks &&
  2727. disk[conf->raid_disks].rdev) {
  2728. /* This slot has a replacement. */
  2729. if (!disk->rdev) {
  2730. /* No original, just make the replacement
  2731. * a recovering spare
  2732. */
  2733. disk->rdev =
  2734. disk[conf->raid_disks].rdev;
  2735. disk[conf->raid_disks].rdev = NULL;
  2736. } else if (!test_bit(In_sync, &disk->rdev->flags))
  2737. /* Original is not in_sync - bad */
  2738. goto abort;
  2739. }
  2740. if (!disk->rdev ||
  2741. !test_bit(In_sync, &disk->rdev->flags)) {
  2742. disk->head_position = 0;
  2743. if (disk->rdev &&
  2744. (disk->rdev->saved_raid_disk < 0))
  2745. conf->fullsync = 1;
  2746. }
  2747. }
  2748. err = -ENOMEM;
  2749. rcu_assign_pointer(conf->thread,
  2750. md_register_thread(raid1d, mddev, "raid1"));
  2751. if (!conf->thread)
  2752. goto abort;
  2753. return conf;
  2754. abort:
  2755. if (conf) {
  2756. mempool_destroy(conf->r1bio_pool);
  2757. kfree(conf->mirrors);
  2758. safe_put_page(conf->tmppage);
  2759. kfree(conf->nr_pending);
  2760. kfree(conf->nr_waiting);
  2761. kfree(conf->nr_queued);
  2762. kfree(conf->barrier);
  2763. bioset_exit(&conf->bio_split);
  2764. kfree(conf);
  2765. }
  2766. return ERR_PTR(err);
  2767. }
  2768. static int raid1_set_limits(struct mddev *mddev)
  2769. {
  2770. struct queue_limits lim;
  2771. int err;
  2772. md_init_stacking_limits(&lim);
  2773. lim.max_write_zeroes_sectors = 0;
  2774. lim.max_hw_wzeroes_unmap_sectors = 0;
  2775. lim.logical_block_size = mddev->logical_block_size;
  2776. lim.features |= BLK_FEAT_ATOMIC_WRITES;
  2777. err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY);
  2778. if (err)
  2779. return err;
  2780. return queue_limits_set(mddev->gendisk->queue, &lim);
  2781. }
  2782. static int raid1_run(struct mddev *mddev)
  2783. {
  2784. struct r1conf *conf;
  2785. int i;
  2786. int ret;
  2787. if (mddev->level != 1) {
  2788. pr_warn("md/raid1:%s: raid level not set to mirroring (%d)\n",
  2789. mdname(mddev), mddev->level);
  2790. return -EIO;
  2791. }
  2792. if (mddev->reshape_position != MaxSector) {
  2793. pr_warn("md/raid1:%s: reshape_position set but not supported\n",
  2794. mdname(mddev));
  2795. return -EIO;
  2796. }
  2797. /*
  2798. * copy the already verified devices into our private RAID1
  2799. * bookkeeping area. [whatever we allocate in run(),
  2800. * should be freed in raid1_free()]
  2801. */
  2802. if (mddev->private == NULL)
  2803. conf = setup_conf(mddev);
  2804. else
  2805. conf = mddev->private;
  2806. if (IS_ERR(conf))
  2807. return PTR_ERR(conf);
  2808. if (!mddev_is_dm(mddev)) {
  2809. ret = raid1_set_limits(mddev);
  2810. if (ret) {
  2811. md_unregister_thread(mddev, &conf->thread);
  2812. if (!mddev->private)
  2813. raid1_free(mddev, conf);
  2814. return ret;
  2815. }
  2816. }
  2817. mddev->degraded = 0;
  2818. for (i = 0; i < conf->raid_disks; i++)
  2819. if (conf->mirrors[i].rdev == NULL ||
  2820. !test_bit(In_sync, &conf->mirrors[i].rdev->flags) ||
  2821. test_bit(Faulty, &conf->mirrors[i].rdev->flags))
  2822. mddev->degraded++;
  2823. /*
  2824. * RAID1 needs at least one disk in active
  2825. */
  2826. if (conf->raid_disks - mddev->degraded < 1) {
  2827. md_unregister_thread(mddev, &conf->thread);
  2828. if (!mddev->private)
  2829. raid1_free(mddev, conf);
  2830. return -EINVAL;
  2831. }
  2832. if (conf->raid_disks - mddev->degraded == 1)
  2833. mddev->resync_offset = MaxSector;
  2834. if (mddev->resync_offset != MaxSector)
  2835. pr_info("md/raid1:%s: not clean -- starting background reconstruction\n",
  2836. mdname(mddev));
  2837. pr_info("md/raid1:%s: active with %d out of %d mirrors\n",
  2838. mdname(mddev), mddev->raid_disks - mddev->degraded,
  2839. mddev->raid_disks);
  2840. /*
  2841. * Ok, everything is just fine now
  2842. */
  2843. rcu_assign_pointer(mddev->thread, conf->thread);
  2844. rcu_assign_pointer(conf->thread, NULL);
  2845. mddev->private = conf;
  2846. set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
  2847. md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
  2848. ret = md_integrity_register(mddev);
  2849. if (ret)
  2850. md_unregister_thread(mddev, &mddev->thread);
  2851. return ret;
  2852. }
  2853. static void raid1_free(struct mddev *mddev, void *priv)
  2854. {
  2855. struct r1conf *conf = priv;
  2856. mempool_destroy(conf->r1bio_pool);
  2857. kfree(conf->mirrors);
  2858. safe_put_page(conf->tmppage);
  2859. kfree(conf->nr_pending);
  2860. kfree(conf->nr_waiting);
  2861. kfree(conf->nr_queued);
  2862. kfree(conf->barrier);
  2863. bioset_exit(&conf->bio_split);
  2864. kfree(conf);
  2865. }
  2866. static int raid1_resize(struct mddev *mddev, sector_t sectors)
  2867. {
  2868. /* no resync is happening, and there is enough space
  2869. * on all devices, so we can resize.
  2870. * We need to make sure resync covers any new space.
  2871. * If the array is shrinking we should possibly wait until
  2872. * any io in the removed space completes, but it hardly seems
  2873. * worth it.
  2874. */
  2875. sector_t newsize = raid1_size(mddev, sectors, 0);
  2876. if (mddev->external_size &&
  2877. mddev->array_sectors > newsize)
  2878. return -EINVAL;
  2879. if (md_bitmap_enabled(mddev, false)) {
  2880. int ret = mddev->bitmap_ops->resize(mddev, newsize, 0);
  2881. if (ret)
  2882. return ret;
  2883. }
  2884. md_set_array_sectors(mddev, newsize);
  2885. if (sectors > mddev->dev_sectors &&
  2886. mddev->resync_offset > mddev->dev_sectors) {
  2887. mddev->resync_offset = mddev->dev_sectors;
  2888. set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
  2889. }
  2890. mddev->dev_sectors = sectors;
  2891. mddev->resync_max_sectors = sectors;
  2892. return 0;
  2893. }
  2894. static int raid1_reshape(struct mddev *mddev)
  2895. {
  2896. /* We need to:
  2897. * 1/ resize the r1bio_pool
  2898. * 2/ resize conf->mirrors
  2899. *
  2900. * We allocate a new r1bio_pool if we can.
  2901. * Then raise a device barrier and wait until all IO stops.
  2902. * Then resize conf->mirrors and swap in the new r1bio pool.
  2903. *
  2904. * At the same time, we "pack" the devices so that all the missing
  2905. * devices have the higher raid_disk numbers.
  2906. */
  2907. mempool_t *newpool, *oldpool;
  2908. size_t new_r1bio_size;
  2909. struct raid1_info *newmirrors;
  2910. struct r1conf *conf = mddev->private;
  2911. int cnt, raid_disks;
  2912. unsigned long flags;
  2913. int d, d2;
  2914. /* Cannot change chunk_size, layout, or level */
  2915. if (mddev->chunk_sectors != mddev->new_chunk_sectors ||
  2916. mddev->layout != mddev->new_layout ||
  2917. mddev->level != mddev->new_level) {
  2918. mddev->new_chunk_sectors = mddev->chunk_sectors;
  2919. mddev->new_layout = mddev->layout;
  2920. mddev->new_level = mddev->level;
  2921. return -EINVAL;
  2922. }
  2923. if (!mddev_is_clustered(mddev))
  2924. md_allow_write(mddev);
  2925. raid_disks = mddev->raid_disks + mddev->delta_disks;
  2926. if (raid_disks < conf->raid_disks) {
  2927. cnt=0;
  2928. for (d= 0; d < conf->raid_disks; d++)
  2929. if (conf->mirrors[d].rdev)
  2930. cnt++;
  2931. if (cnt > raid_disks)
  2932. return -EBUSY;
  2933. }
  2934. new_r1bio_size = offsetof(struct r1bio, bios[raid_disks * 2]);
  2935. newpool = mempool_create_kmalloc_pool(NR_RAID_BIOS, new_r1bio_size);
  2936. if (!newpool) {
  2937. return -ENOMEM;
  2938. }
  2939. newmirrors = kzalloc(array3_size(sizeof(struct raid1_info),
  2940. raid_disks, 2),
  2941. GFP_KERNEL);
  2942. if (!newmirrors) {
  2943. mempool_destroy(newpool);
  2944. return -ENOMEM;
  2945. }
  2946. freeze_array(conf, 0);
  2947. /* ok, everything is stopped */
  2948. oldpool = conf->r1bio_pool;
  2949. conf->r1bio_pool = newpool;
  2950. for (d = d2 = 0; d < conf->raid_disks; d++) {
  2951. struct md_rdev *rdev = conf->mirrors[d].rdev;
  2952. if (rdev && rdev->raid_disk != d2) {
  2953. sysfs_unlink_rdev(mddev, rdev);
  2954. rdev->raid_disk = d2;
  2955. sysfs_unlink_rdev(mddev, rdev);
  2956. if (sysfs_link_rdev(mddev, rdev))
  2957. pr_warn("md/raid1:%s: cannot register rd%d\n",
  2958. mdname(mddev), rdev->raid_disk);
  2959. }
  2960. if (rdev)
  2961. newmirrors[d2++].rdev = rdev;
  2962. }
  2963. kfree(conf->mirrors);
  2964. conf->mirrors = newmirrors;
  2965. spin_lock_irqsave(&conf->device_lock, flags);
  2966. mddev->degraded += (raid_disks - conf->raid_disks);
  2967. spin_unlock_irqrestore(&conf->device_lock, flags);
  2968. conf->raid_disks = mddev->raid_disks = raid_disks;
  2969. mddev->delta_disks = 0;
  2970. unfreeze_array(conf);
  2971. set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
  2972. set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
  2973. md_wakeup_thread(mddev->thread);
  2974. mempool_destroy(oldpool);
  2975. return 0;
  2976. }
  2977. static void raid1_quiesce(struct mddev *mddev, int quiesce)
  2978. {
  2979. struct r1conf *conf = mddev->private;
  2980. if (quiesce)
  2981. freeze_array(conf, 0);
  2982. else
  2983. unfreeze_array(conf);
  2984. }
  2985. static void *raid1_takeover(struct mddev *mddev)
  2986. {
  2987. /* raid1 can take over:
  2988. * raid5 with 2 devices, any layout or chunk size
  2989. */
  2990. if (mddev->level == 5 && mddev->raid_disks == 2) {
  2991. struct r1conf *conf;
  2992. mddev->new_level = 1;
  2993. mddev->new_layout = 0;
  2994. mddev->new_chunk_sectors = 0;
  2995. conf = setup_conf(mddev);
  2996. if (!IS_ERR(conf)) {
  2997. /* Array must appear to be quiesced */
  2998. conf->array_frozen = 1;
  2999. mddev_clear_unsupported_flags(mddev,
  3000. UNSUPPORTED_MDDEV_FLAGS);
  3001. }
  3002. return conf;
  3003. }
  3004. return ERR_PTR(-EINVAL);
  3005. }
  3006. static struct md_personality raid1_personality =
  3007. {
  3008. .head = {
  3009. .type = MD_PERSONALITY,
  3010. .id = ID_RAID1,
  3011. .name = "raid1",
  3012. .owner = THIS_MODULE,
  3013. },
  3014. .make_request = raid1_make_request,
  3015. .run = raid1_run,
  3016. .free = raid1_free,
  3017. .status = raid1_status,
  3018. .error_handler = raid1_error,
  3019. .hot_add_disk = raid1_add_disk,
  3020. .hot_remove_disk= raid1_remove_disk,
  3021. .spare_active = raid1_spare_active,
  3022. .sync_request = raid1_sync_request,
  3023. .resize = raid1_resize,
  3024. .size = raid1_size,
  3025. .check_reshape = raid1_reshape,
  3026. .quiesce = raid1_quiesce,
  3027. .takeover = raid1_takeover,
  3028. };
  3029. static int __init raid1_init(void)
  3030. {
  3031. return register_md_submodule(&raid1_personality.head);
  3032. }
  3033. static void __exit raid1_exit(void)
  3034. {
  3035. unregister_md_submodule(&raid1_personality.head);
  3036. }
/* Hook raid1_init()/raid1_exit() up as the module's entry and exit points. */
module_init(raid1_init);
module_exit(raid1_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("RAID1 (mirroring) personality for MD");
/* Alias names declared for this module. */
MODULE_ALIAS("md-personality-3"); /* RAID1 */
MODULE_ALIAS("md-raid1");
MODULE_ALIAS("md-level-1");