/* dm-writecache.c */
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * Copyright (C) 2018 Red Hat. All rights reserved.
  4. *
  5. * This file is released under the GPL.
  6. */
  7. #include <linux/device-mapper.h>
  8. #include <linux/module.h>
  9. #include <linux/init.h>
  10. #include <linux/vmalloc.h>
  11. #include <linux/kthread.h>
  12. #include <linux/dm-io.h>
  13. #include <linux/dm-kcopyd.h>
  14. #include <linux/dax.h>
  15. #include <linux/libnvdimm.h>
  16. #include <linux/delay.h>
  17. #include "dm-io-tracker.h"
  /* Message tag; presumably picked up by the DM logging macros (e.g. DMERR). */
  #define DM_MSG_PREFIX "writecache"

  /*
   * Default tunables.  Most of these are consumed outside the functions
   * that follow, so only what the definitions themselves show is noted.
   */
  #define HIGH_WATERMARK 50
  #define LOW_WATERMARK 45

  /* The smaller of 0x10000000 bytes expressed in pages and 1/16 of all RAM pages. */
  #define MAX_WRITEBACK_JOBS min(0x10000000 / PAGE_SIZE, totalram_pages() / 16)

  #define ENDIO_LATENCY 16
  #define WRITEBACK_LATENCY 64
  #define AUTOCOMMIT_BLOCKS_SSD 65536
  #define AUTOCOMMIT_BLOCKS_PMEM 64
  #define AUTOCOMMIT_MSEC 1000
  #define MAX_AGE_DIV 16
  #define MAX_AGE_UNSPECIFIED -1UL
  #define PAUSE_WRITEBACK (HZ * 3)

  /*
   * Number of bytes of the metadata memory map covered by one bit of the
   * dirty bitmap.  Never smaller than a page.
   */
  #define BITMAP_GRANULARITY 65536
  #if BITMAP_GRANULARITY < PAGE_SIZE
  #undef BITMAP_GRANULARITY
  #define BITMAP_GRANULARITY PAGE_SIZE
  #endif
  35. #if IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && IS_ENABLED(CONFIG_FS_DAX)
  36. #define DM_WRITECACHE_HAS_PMEM
  37. #endif
  38. #ifdef DM_WRITECACHE_HAS_PMEM
  39. #define pmem_assign(dest, src) \
  40. do { \
  41. typeof(dest) uniq = (src); \
  42. memcpy_flushcache(&(dest), &uniq, sizeof(dest)); \
  43. } while (0)
  44. #else
  45. #define pmem_assign(dest, src) ((dest) = (src))
  46. #endif
  47. #if IS_ENABLED(CONFIG_ARCH_HAS_COPY_MC) && defined(DM_WRITECACHE_HAS_PMEM)
  48. #define DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
  49. #endif
  50. #define MEMORY_SUPERBLOCK_MAGIC 0x23489321
  51. #define MEMORY_SUPERBLOCK_VERSION 1
  52. struct wc_memory_entry {
  53. __le64 original_sector;
  54. __le64 seq_count;
  55. };
  56. struct wc_memory_superblock {
  57. union {
  58. struct {
  59. __le32 magic;
  60. __le32 version;
  61. __le32 block_size;
  62. __le32 pad;
  63. __le64 n_blocks;
  64. __le64 seq_count;
  65. };
  66. __le64 padding[8];
  67. };
  68. struct wc_memory_entry entries[];
  69. };
  70. struct wc_entry {
  71. struct rb_node rb_node;
  72. struct list_head lru;
  73. unsigned short wc_list_contiguous;
  74. #if BITS_PER_LONG == 64
  75. bool write_in_progress : 1;
  76. unsigned long index : 47;
  77. #else
  78. bool write_in_progress;
  79. unsigned long index;
  80. #endif
  81. unsigned long age;
  82. #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
  83. uint64_t original_sector;
  84. uint64_t seq_count;
  85. #endif
  86. };
  /*
   * Mode predicates.  Without pmem support both PMEM and FUA modes are the
   * constant false.
   */
  #ifdef DM_WRITECACHE_HAS_PMEM
  #define WC_MODE_PMEM(wc) ((wc)->pmem_mode)
  #define WC_MODE_FUA(wc) ((wc)->writeback_fua)
  #else
  #define WC_MODE_PMEM(wc) false
  #define WC_MODE_FUA(wc) false
  #endif

  /* The free entries are kept in a sorted rb-tree exactly when not in pmem mode. */
  #define WC_MODE_SORT_FREELIST(wc) (!WC_MODE_PMEM(wc))
  95. struct dm_writecache {
  96. struct mutex lock;
  97. struct list_head lru;
  98. union {
  99. struct list_head freelist;
  100. struct {
  101. struct rb_root freetree;
  102. struct wc_entry *current_free;
  103. };
  104. };
  105. struct rb_root tree;
  106. size_t freelist_size;
  107. size_t writeback_size;
  108. size_t freelist_high_watermark;
  109. size_t freelist_low_watermark;
  110. unsigned long max_age;
  111. unsigned long pause;
  112. unsigned int uncommitted_blocks;
  113. unsigned int autocommit_blocks;
  114. unsigned int max_writeback_jobs;
  115. int error;
  116. unsigned long autocommit_jiffies;
  117. struct timer_list autocommit_timer;
  118. struct wait_queue_head freelist_wait;
  119. struct timer_list max_age_timer;
  120. atomic_t bio_in_progress[2];
  121. struct wait_queue_head bio_in_progress_wait[2];
  122. struct dm_target *ti;
  123. struct dm_dev *dev;
  124. struct dm_dev *ssd_dev;
  125. sector_t start_sector;
  126. void *memory_map;
  127. uint64_t memory_map_size;
  128. size_t metadata_sectors;
  129. size_t n_blocks;
  130. uint64_t seq_count;
  131. sector_t data_device_sectors;
  132. void *block_start;
  133. struct wc_entry *entries;
  134. unsigned int block_size;
  135. unsigned char block_size_bits;
  136. bool pmem_mode:1;
  137. bool writeback_fua:1;
  138. bool overwrote_committed:1;
  139. bool memory_vmapped:1;
  140. bool start_sector_set:1;
  141. bool high_wm_percent_set:1;
  142. bool low_wm_percent_set:1;
  143. bool max_writeback_jobs_set:1;
  144. bool autocommit_blocks_set:1;
  145. bool autocommit_time_set:1;
  146. bool max_age_set:1;
  147. bool writeback_fua_set:1;
  148. bool flush_on_suspend:1;
  149. bool cleaner:1;
  150. bool cleaner_set:1;
  151. bool metadata_only:1;
  152. bool pause_set:1;
  153. unsigned int high_wm_percent_value;
  154. unsigned int low_wm_percent_value;
  155. unsigned int autocommit_time_value;
  156. unsigned int max_age_value;
  157. unsigned int pause_value;
  158. unsigned int writeback_all;
  159. struct workqueue_struct *writeback_wq;
  160. struct work_struct writeback_work;
  161. struct work_struct flush_work;
  162. struct dm_io_tracker iot;
  163. struct dm_io_client *dm_io;
  164. raw_spinlock_t endio_list_lock;
  165. struct list_head endio_list;
  166. struct task_struct *endio_thread;
  167. struct task_struct *flush_thread;
  168. struct bio_list flush_list;
  169. struct dm_kcopyd_client *dm_kcopyd;
  170. unsigned long *dirty_bitmap;
  171. unsigned int dirty_bitmap_size;
  172. struct bio_set bio_set;
  173. mempool_t copy_pool;
  174. struct {
  175. unsigned long long reads;
  176. unsigned long long read_hits;
  177. unsigned long long writes;
  178. unsigned long long write_hits_uncommitted;
  179. unsigned long long write_hits_committed;
  180. unsigned long long writes_around;
  181. unsigned long long writes_allocate;
  182. unsigned long long writes_blocked_on_freelist;
  183. unsigned long long flushes;
  184. unsigned long long discards;
  185. } stats;
  186. };
  187. #define WB_LIST_INLINE 16
  188. struct writeback_struct {
  189. struct list_head endio_entry;
  190. struct dm_writecache *wc;
  191. struct wc_entry **wc_list;
  192. unsigned int wc_list_n;
  193. struct wc_entry *wc_list_inline[WB_LIST_INLINE];
  194. struct bio bio;
  195. };
  196. struct copy_struct {
  197. struct list_head endio_entry;
  198. struct dm_writecache *wc;
  199. struct wc_entry *e;
  200. unsigned int n_entries;
  201. int error;
  202. };
  203. DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(dm_writecache_throttle,
  204. "A percentage of time allocated for data copying");
  205. static void wc_lock(struct dm_writecache *wc)
  206. {
  207. mutex_lock(&wc->lock);
  208. }
  209. static void wc_unlock(struct dm_writecache *wc)
  210. {
  211. mutex_unlock(&wc->lock);
  212. }
  213. #ifdef DM_WRITECACHE_HAS_PMEM
  214. static int persistent_memory_claim(struct dm_writecache *wc)
  215. {
  216. int r;
  217. loff_t s;
  218. long p, da;
  219. unsigned long pfn;
  220. int id;
  221. struct page **pages;
  222. sector_t offset;
  223. wc->memory_vmapped = false;
  224. s = wc->memory_map_size;
  225. p = s >> PAGE_SHIFT;
  226. if (!p) {
  227. r = -EINVAL;
  228. goto err1;
  229. }
  230. if (p != s >> PAGE_SHIFT) {
  231. r = -EOVERFLOW;
  232. goto err1;
  233. }
  234. offset = get_start_sect(wc->ssd_dev->bdev);
  235. if (offset & (PAGE_SIZE / 512 - 1)) {
  236. r = -EINVAL;
  237. goto err1;
  238. }
  239. offset >>= PAGE_SHIFT - 9;
  240. id = dax_read_lock();
  241. da = dax_direct_access(wc->ssd_dev->dax_dev, offset, p, DAX_ACCESS,
  242. &wc->memory_map, &pfn);
  243. if (da < 0) {
  244. wc->memory_map = NULL;
  245. r = da;
  246. goto err2;
  247. }
  248. if (!pfn_valid(pfn)) {
  249. wc->memory_map = NULL;
  250. r = -EOPNOTSUPP;
  251. goto err2;
  252. }
  253. if (da != p) {
  254. long i;
  255. wc->memory_map = NULL;
  256. pages = vmalloc_array(p, sizeof(struct page *));
  257. if (!pages) {
  258. r = -ENOMEM;
  259. goto err2;
  260. }
  261. i = 0;
  262. do {
  263. long daa;
  264. daa = dax_direct_access(wc->ssd_dev->dax_dev, offset + i,
  265. p - i, DAX_ACCESS, NULL, &pfn);
  266. if (daa <= 0) {
  267. r = daa ? daa : -EINVAL;
  268. goto err3;
  269. }
  270. if (!pfn_valid(pfn)) {
  271. r = -EOPNOTSUPP;
  272. goto err3;
  273. }
  274. while (daa-- && i < p) {
  275. pages[i++] = pfn_to_page(pfn);
  276. pfn++;
  277. if (!(i & 15))
  278. cond_resched();
  279. }
  280. } while (i < p);
  281. wc->memory_map = vmap(pages, p, VM_MAP, PAGE_KERNEL);
  282. if (!wc->memory_map) {
  283. r = -ENOMEM;
  284. goto err3;
  285. }
  286. vfree(pages);
  287. wc->memory_vmapped = true;
  288. }
  289. dax_read_unlock(id);
  290. wc->memory_map += (size_t)wc->start_sector << SECTOR_SHIFT;
  291. wc->memory_map_size -= (size_t)wc->start_sector << SECTOR_SHIFT;
  292. return 0;
  293. err3:
  294. vfree(pages);
  295. err2:
  296. dax_read_unlock(id);
  297. err1:
  298. return r;
  299. }
  300. #else
  301. static int persistent_memory_claim(struct dm_writecache *wc)
  302. {
  303. return -EOPNOTSUPP;
  304. }
  305. #endif
  306. static void persistent_memory_release(struct dm_writecache *wc)
  307. {
  308. if (wc->memory_vmapped)
  309. vunmap(wc->memory_map - ((size_t)wc->start_sector << SECTOR_SHIFT));
  310. }
  /* Return the struct page behind addr, for vmalloc-range or other kernel addresses. */
  static struct page *persistent_memory_page(void *addr)
  {
      if (!is_vmalloc_addr(addr))
          return virt_to_page(addr);

      return vmalloc_to_page(addr);
  }
  318. static unsigned int persistent_memory_page_offset(void *addr)
  319. {
  320. return (unsigned long)addr & (PAGE_SIZE - 1);
  321. }
  322. static void persistent_memory_flush_cache(void *ptr, size_t size)
  323. {
  324. if (is_vmalloc_addr(ptr))
  325. flush_kernel_vmap_range(ptr, size);
  326. }
  327. static void persistent_memory_invalidate_cache(void *ptr, size_t size)
  328. {
  329. if (is_vmalloc_addr(ptr))
  330. invalidate_kernel_vmap_range(ptr, size);
  331. }
  332. static struct wc_memory_superblock *sb(struct dm_writecache *wc)
  333. {
  334. return wc->memory_map;
  335. }
  336. static struct wc_memory_entry *memory_entry(struct dm_writecache *wc, struct wc_entry *e)
  337. {
  338. return &sb(wc)->entries[e->index];
  339. }
  340. static void *memory_data(struct dm_writecache *wc, struct wc_entry *e)
  341. {
  342. return (char *)wc->block_start + (e->index << wc->block_size_bits);
  343. }
  344. static sector_t cache_sector(struct dm_writecache *wc, struct wc_entry *e)
  345. {
  346. return wc->start_sector + wc->metadata_sectors +
  347. ((sector_t)e->index << (wc->block_size_bits - SECTOR_SHIFT));
  348. }
  349. static uint64_t read_original_sector(struct dm_writecache *wc, struct wc_entry *e)
  350. {
  351. #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
  352. return e->original_sector;
  353. #else
  354. return le64_to_cpu(memory_entry(wc, e)->original_sector);
  355. #endif
  356. }
  357. static uint64_t read_seq_count(struct dm_writecache *wc, struct wc_entry *e)
  358. {
  359. #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
  360. return e->seq_count;
  361. #else
  362. return le64_to_cpu(memory_entry(wc, e)->seq_count);
  363. #endif
  364. }
  365. static void clear_seq_count(struct dm_writecache *wc, struct wc_entry *e)
  366. {
  367. #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
  368. e->seq_count = -1;
  369. #endif
  370. pmem_assign(memory_entry(wc, e)->seq_count, cpu_to_le64(-1));
  371. }
  372. static void write_original_sector_seq_count(struct dm_writecache *wc, struct wc_entry *e,
  373. uint64_t original_sector, uint64_t seq_count)
  374. {
  375. struct wc_memory_entry me;
  376. #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
  377. e->original_sector = original_sector;
  378. e->seq_count = seq_count;
  379. #endif
  380. me.original_sector = cpu_to_le64(original_sector);
  381. me.seq_count = cpu_to_le64(seq_count);
  382. pmem_assign(*memory_entry(wc, e), me);
  383. }
  /*
   * writecache_error(wc, err, msg, ...) - record an error on the target.
   *
   * Only the first error is stored (and logged); later ones leave
   * wc->error untouched.  Waiters on freelist_wait are woken every time.
   */
  #define writecache_error(wc, err, msg, arg...)              \
  do {                                                        \
      if (cmpxchg(&(wc)->error, 0, err) == 0)                 \
          DMERR(msg, ##arg);                                  \
      wake_up(&(wc)->freelist_wait);                          \
  } while (0)

  /* True once an error has been recorded on the target. */
  #define writecache_has_error(wc) (unlikely(READ_ONCE((wc)->error)))
  391. static void writecache_flush_all_metadata(struct dm_writecache *wc)
  392. {
  393. if (!WC_MODE_PMEM(wc))
  394. memset(wc->dirty_bitmap, -1, wc->dirty_bitmap_size);
  395. }
  396. static void writecache_flush_region(struct dm_writecache *wc, void *ptr, size_t size)
  397. {
  398. if (!WC_MODE_PMEM(wc))
  399. __set_bit(((char *)ptr - (char *)wc->memory_map) / BITMAP_GRANULARITY,
  400. wc->dirty_bitmap);
  401. }
  402. static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev);
  403. struct io_notify {
  404. struct dm_writecache *wc;
  405. struct completion c;
  406. atomic_t count;
  407. };
  408. static void writecache_notify_io(unsigned long error, void *context)
  409. {
  410. struct io_notify *endio = context;
  411. if (unlikely(error != 0))
  412. writecache_error(endio->wc, -EIO, "error writing metadata");
  413. BUG_ON(atomic_read(&endio->count) <= 0);
  414. if (atomic_dec_and_test(&endio->count))
  415. complete(&endio->c);
  416. }
  417. static void writecache_wait_for_ios(struct dm_writecache *wc, int direction)
  418. {
  419. wait_event(wc->bio_in_progress_wait[direction],
  420. !atomic_read(&wc->bio_in_progress[direction]));
  421. }
  422. static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
  423. {
  424. struct dm_io_region region;
  425. struct dm_io_request req;
  426. struct io_notify endio = {
  427. wc,
  428. COMPLETION_INITIALIZER_ONSTACK(endio.c),
  429. ATOMIC_INIT(1),
  430. };
  431. unsigned int bitmap_bits = wc->dirty_bitmap_size * 8;
  432. unsigned int i = 0;
  433. while (1) {
  434. unsigned int j;
  435. i = find_next_bit(wc->dirty_bitmap, bitmap_bits, i);
  436. if (unlikely(i == bitmap_bits))
  437. break;
  438. j = find_next_zero_bit(wc->dirty_bitmap, bitmap_bits, i);
  439. region.bdev = wc->ssd_dev->bdev;
  440. region.sector = (sector_t)i * (BITMAP_GRANULARITY >> SECTOR_SHIFT);
  441. region.count = (sector_t)(j - i) * (BITMAP_GRANULARITY >> SECTOR_SHIFT);
  442. if (unlikely(region.sector >= wc->metadata_sectors))
  443. break;
  444. if (unlikely(region.sector + region.count > wc->metadata_sectors))
  445. region.count = wc->metadata_sectors - region.sector;
  446. region.sector += wc->start_sector;
  447. atomic_inc(&endio.count);
  448. req.bi_opf = REQ_OP_WRITE | REQ_SYNC;
  449. req.mem.type = DM_IO_VMA;
  450. req.mem.ptr.vma = (char *)wc->memory_map + (size_t)i * BITMAP_GRANULARITY;
  451. req.client = wc->dm_io;
  452. req.notify.fn = writecache_notify_io;
  453. req.notify.context = &endio;
  454. /* writing via async dm-io (implied by notify.fn above) won't return an error */
  455. (void) dm_io(&req, 1, &region, NULL, IOPRIO_DEFAULT);
  456. i = j;
  457. }
  458. writecache_notify_io(0, &endio);
  459. wait_for_completion_io(&endio.c);
  460. if (wait_for_ios)
  461. writecache_wait_for_ios(wc, WRITE);
  462. writecache_disk_flush(wc, wc->ssd_dev);
  463. memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size);
  464. }
  465. static void ssd_commit_superblock(struct dm_writecache *wc)
  466. {
  467. int r;
  468. struct dm_io_region region;
  469. struct dm_io_request req;
  470. region.bdev = wc->ssd_dev->bdev;
  471. region.sector = 0;
  472. region.count = max(4096U, wc->block_size) >> SECTOR_SHIFT;
  473. if (unlikely(region.sector + region.count > wc->metadata_sectors))
  474. region.count = wc->metadata_sectors - region.sector;
  475. region.sector += wc->start_sector;
  476. req.bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_FUA;
  477. req.mem.type = DM_IO_VMA;
  478. req.mem.ptr.vma = (char *)wc->memory_map;
  479. req.client = wc->dm_io;
  480. req.notify.fn = NULL;
  481. req.notify.context = NULL;
  482. r = dm_io(&req, 1, &region, NULL, IOPRIO_DEFAULT);
  483. if (unlikely(r))
  484. writecache_error(wc, r, "error writing superblock");
  485. }
  486. static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
  487. {
  488. if (WC_MODE_PMEM(wc))
  489. pmem_wmb();
  490. else
  491. ssd_commit_flushed(wc, wait_for_ios);
  492. }
  493. static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
  494. {
  495. int r;
  496. struct dm_io_region region;
  497. struct dm_io_request req;
  498. region.bdev = dev->bdev;
  499. region.sector = 0;
  500. region.count = 0;
  501. req.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
  502. req.mem.type = DM_IO_KMEM;
  503. req.mem.ptr.addr = NULL;
  504. req.client = wc->dm_io;
  505. req.notify.fn = NULL;
  506. r = dm_io(&req, 1, &region, NULL, IOPRIO_DEFAULT);
  507. if (unlikely(r))
  508. writecache_error(wc, r, "error flushing metadata: %d", r);
  509. }
  510. #define WFE_RETURN_FOLLOWING 1
  511. #define WFE_LOWEST_SEQ 2
  512. static struct wc_entry *writecache_find_entry(struct dm_writecache *wc,
  513. uint64_t block, int flags)
  514. {
  515. struct wc_entry *e;
  516. struct rb_node *node = wc->tree.rb_node;
  517. if (unlikely(!node))
  518. return NULL;
  519. while (1) {
  520. e = container_of(node, struct wc_entry, rb_node);
  521. if (read_original_sector(wc, e) == block)
  522. break;
  523. node = (read_original_sector(wc, e) >= block ?
  524. e->rb_node.rb_left : e->rb_node.rb_right);
  525. if (unlikely(!node)) {
  526. if (!(flags & WFE_RETURN_FOLLOWING))
  527. return NULL;
  528. if (read_original_sector(wc, e) >= block)
  529. return e;
  530. node = rb_next(&e->rb_node);
  531. if (unlikely(!node))
  532. return NULL;
  533. e = container_of(node, struct wc_entry, rb_node);
  534. return e;
  535. }
  536. }
  537. while (1) {
  538. struct wc_entry *e2;
  539. if (flags & WFE_LOWEST_SEQ)
  540. node = rb_prev(&e->rb_node);
  541. else
  542. node = rb_next(&e->rb_node);
  543. if (unlikely(!node))
  544. return e;
  545. e2 = container_of(node, struct wc_entry, rb_node);
  546. if (read_original_sector(wc, e2) != block)
  547. return e;
  548. e = e2;
  549. }
  550. }
  551. static void writecache_insert_entry(struct dm_writecache *wc, struct wc_entry *ins)
  552. {
  553. struct wc_entry *e;
  554. struct rb_node **node = &wc->tree.rb_node, *parent = NULL;
  555. while (*node) {
  556. e = container_of(*node, struct wc_entry, rb_node);
  557. parent = &e->rb_node;
  558. if (read_original_sector(wc, e) > read_original_sector(wc, ins))
  559. node = &parent->rb_left;
  560. else
  561. node = &parent->rb_right;
  562. }
  563. rb_link_node(&ins->rb_node, parent, node);
  564. rb_insert_color(&ins->rb_node, &wc->tree);
  565. list_add(&ins->lru, &wc->lru);
  566. ins->age = jiffies;
  567. }
  568. static void writecache_unlink(struct dm_writecache *wc, struct wc_entry *e)
  569. {
  570. list_del(&e->lru);
  571. rb_erase(&e->rb_node, &wc->tree);
  572. }
  573. static void writecache_add_to_freelist(struct dm_writecache *wc, struct wc_entry *e)
  574. {
  575. if (WC_MODE_SORT_FREELIST(wc)) {
  576. struct rb_node **node = &wc->freetree.rb_node, *parent = NULL;
  577. if (unlikely(!*node))
  578. wc->current_free = e;
  579. while (*node) {
  580. parent = *node;
  581. if (&e->rb_node < *node)
  582. node = &parent->rb_left;
  583. else
  584. node = &parent->rb_right;
  585. }
  586. rb_link_node(&e->rb_node, parent, node);
  587. rb_insert_color(&e->rb_node, &wc->freetree);
  588. } else {
  589. list_add_tail(&e->lru, &wc->freelist);
  590. }
  591. wc->freelist_size++;
  592. }
  593. static inline void writecache_verify_watermark(struct dm_writecache *wc)
  594. {
  595. if (unlikely(wc->freelist_size + wc->writeback_size <= wc->freelist_high_watermark))
  596. queue_work(wc->writeback_wq, &wc->writeback_work);
  597. }
  598. static void writecache_max_age_timer(struct timer_list *t)
  599. {
  600. struct dm_writecache *wc = timer_container_of(wc, t, max_age_timer);
  601. if (!dm_suspended(wc->ti) && !writecache_has_error(wc)) {
  602. queue_work(wc->writeback_wq, &wc->writeback_work);
  603. mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV);
  604. }
  605. }
  606. static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc, sector_t expected_sector)
  607. {
  608. struct wc_entry *e;
  609. if (WC_MODE_SORT_FREELIST(wc)) {
  610. struct rb_node *next;
  611. if (unlikely(!wc->current_free))
  612. return NULL;
  613. e = wc->current_free;
  614. if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
  615. return NULL;
  616. next = rb_next(&e->rb_node);
  617. rb_erase(&e->rb_node, &wc->freetree);
  618. if (unlikely(!next))
  619. next = rb_first(&wc->freetree);
  620. wc->current_free = next ? container_of(next, struct wc_entry, rb_node) : NULL;
  621. } else {
  622. if (unlikely(list_empty(&wc->freelist)))
  623. return NULL;
  624. e = container_of(wc->freelist.next, struct wc_entry, lru);
  625. if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
  626. return NULL;
  627. list_del(&e->lru);
  628. }
  629. wc->freelist_size--;
  630. writecache_verify_watermark(wc);
  631. return e;
  632. }
  633. static void writecache_free_entry(struct dm_writecache *wc, struct wc_entry *e)
  634. {
  635. writecache_unlink(wc, e);
  636. writecache_add_to_freelist(wc, e);
  637. clear_seq_count(wc, e);
  638. writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
  639. if (unlikely(waitqueue_active(&wc->freelist_wait)))
  640. wake_up(&wc->freelist_wait);
  641. }
  642. static void writecache_wait_on_freelist(struct dm_writecache *wc)
  643. {
  644. DEFINE_WAIT(wait);
  645. prepare_to_wait(&wc->freelist_wait, &wait, TASK_UNINTERRUPTIBLE);
  646. wc_unlock(wc);
  647. io_schedule();
  648. finish_wait(&wc->freelist_wait, &wait);
  649. wc_lock(wc);
  650. }
  651. static void writecache_poison_lists(struct dm_writecache *wc)
  652. {
  653. /*
  654. * Catch incorrect access to these values while the device is suspended.
  655. */
  656. memset(&wc->tree, -1, sizeof(wc->tree));
  657. wc->lru.next = LIST_POISON1;
  658. wc->lru.prev = LIST_POISON2;
  659. wc->freelist.next = LIST_POISON1;
  660. wc->freelist.prev = LIST_POISON2;
  661. }
  662. static void writecache_flush_entry(struct dm_writecache *wc, struct wc_entry *e)
  663. {
  664. writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
  665. if (WC_MODE_PMEM(wc))
  666. writecache_flush_region(wc, memory_data(wc, e), wc->block_size);
  667. }
  668. static bool writecache_entry_is_committed(struct dm_writecache *wc, struct wc_entry *e)
  669. {
  670. return read_seq_count(wc, e) < wc->seq_count;
  671. }
  672. static void writecache_flush(struct dm_writecache *wc)
  673. {
  674. struct wc_entry *e, *e2;
  675. bool need_flush_after_free;
  676. wc->uncommitted_blocks = 0;
  677. timer_delete(&wc->autocommit_timer);
  678. if (list_empty(&wc->lru))
  679. return;
  680. e = container_of(wc->lru.next, struct wc_entry, lru);
  681. if (writecache_entry_is_committed(wc, e)) {
  682. if (wc->overwrote_committed) {
  683. writecache_wait_for_ios(wc, WRITE);
  684. writecache_disk_flush(wc, wc->ssd_dev);
  685. wc->overwrote_committed = false;
  686. }
  687. return;
  688. }
  689. while (1) {
  690. writecache_flush_entry(wc, e);
  691. if (unlikely(e->lru.next == &wc->lru))
  692. break;
  693. e2 = container_of(e->lru.next, struct wc_entry, lru);
  694. if (writecache_entry_is_committed(wc, e2))
  695. break;
  696. e = e2;
  697. cond_resched();
  698. }
  699. writecache_commit_flushed(wc, true);
  700. wc->seq_count++;
  701. pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
  702. if (WC_MODE_PMEM(wc))
  703. writecache_commit_flushed(wc, false);
  704. else
  705. ssd_commit_superblock(wc);
  706. wc->overwrote_committed = false;
  707. need_flush_after_free = false;
  708. while (1) {
  709. /* Free another committed entry with lower seq-count */
  710. struct rb_node *rb_node = rb_prev(&e->rb_node);
  711. if (rb_node) {
  712. e2 = container_of(rb_node, struct wc_entry, rb_node);
  713. if (read_original_sector(wc, e2) == read_original_sector(wc, e) &&
  714. likely(!e2->write_in_progress)) {
  715. writecache_free_entry(wc, e2);
  716. need_flush_after_free = true;
  717. }
  718. }
  719. if (unlikely(e->lru.prev == &wc->lru))
  720. break;
  721. e = container_of(e->lru.prev, struct wc_entry, lru);
  722. cond_resched();
  723. }
  724. if (need_flush_after_free)
  725. writecache_commit_flushed(wc, false);
  726. }
  727. static void writecache_flush_work(struct work_struct *work)
  728. {
  729. struct dm_writecache *wc = container_of(work, struct dm_writecache, flush_work);
  730. wc_lock(wc);
  731. writecache_flush(wc);
  732. wc_unlock(wc);
  733. }
  734. static void writecache_autocommit_timer(struct timer_list *t)
  735. {
  736. struct dm_writecache *wc = timer_container_of(wc, t, autocommit_timer);
  737. if (!writecache_has_error(wc))
  738. queue_work(wc->writeback_wq, &wc->flush_work);
  739. }
  740. static void writecache_schedule_autocommit(struct dm_writecache *wc)
  741. {
  742. if (!timer_pending(&wc->autocommit_timer))
  743. mod_timer(&wc->autocommit_timer, jiffies + wc->autocommit_jiffies);
  744. }
  745. static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_t end)
  746. {
  747. struct wc_entry *e;
  748. bool discarded_something = false;
  749. e = writecache_find_entry(wc, start, WFE_RETURN_FOLLOWING | WFE_LOWEST_SEQ);
  750. if (unlikely(!e))
  751. return;
  752. while (read_original_sector(wc, e) < end) {
  753. struct rb_node *node = rb_next(&e->rb_node);
  754. if (likely(!e->write_in_progress)) {
  755. if (!discarded_something) {
  756. if (!WC_MODE_PMEM(wc)) {
  757. writecache_wait_for_ios(wc, READ);
  758. writecache_wait_for_ios(wc, WRITE);
  759. }
  760. discarded_something = true;
  761. }
  762. if (!writecache_entry_is_committed(wc, e))
  763. wc->uncommitted_blocks--;
  764. writecache_free_entry(wc, e);
  765. }
  766. if (unlikely(!node))
  767. break;
  768. e = container_of(node, struct wc_entry, rb_node);
  769. }
  770. if (discarded_something)
  771. writecache_commit_flushed(wc, false);
  772. }
  773. static bool writecache_wait_for_writeback(struct dm_writecache *wc)
  774. {
  775. if (wc->writeback_size) {
  776. writecache_wait_on_freelist(wc);
  777. return true;
  778. }
  779. return false;
  780. }
  781. static void writecache_suspend(struct dm_target *ti)
  782. {
  783. struct dm_writecache *wc = ti->private;
  784. bool flush_on_suspend;
  785. timer_delete_sync(&wc->autocommit_timer);
  786. timer_delete_sync(&wc->max_age_timer);
  787. wc_lock(wc);
  788. writecache_flush(wc);
  789. flush_on_suspend = wc->flush_on_suspend;
  790. if (flush_on_suspend) {
  791. wc->flush_on_suspend = false;
  792. wc->writeback_all++;
  793. queue_work(wc->writeback_wq, &wc->writeback_work);
  794. }
  795. wc_unlock(wc);
  796. drain_workqueue(wc->writeback_wq);
  797. wc_lock(wc);
  798. if (flush_on_suspend)
  799. wc->writeback_all--;
  800. while (writecache_wait_for_writeback(wc))
  801. ;
  802. if (WC_MODE_PMEM(wc))
  803. persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);
  804. writecache_poison_lists(wc);
  805. wc_unlock(wc);
  806. }
  807. static int writecache_alloc_entries(struct dm_writecache *wc)
  808. {
  809. size_t b;
  810. if (wc->entries)
  811. return 0;
  812. wc->entries = vmalloc_array(wc->n_blocks, sizeof(struct wc_entry));
  813. if (!wc->entries)
  814. return -ENOMEM;
  815. for (b = 0; b < wc->n_blocks; b++) {
  816. struct wc_entry *e = &wc->entries[b];
  817. e->index = b;
  818. e->write_in_progress = false;
  819. cond_resched();
  820. }
  821. return 0;
  822. }
  823. static int writecache_read_metadata(struct dm_writecache *wc, sector_t n_sectors)
  824. {
  825. struct dm_io_region region;
  826. struct dm_io_request req;
  827. region.bdev = wc->ssd_dev->bdev;
  828. region.sector = wc->start_sector;
  829. region.count = n_sectors;
  830. req.bi_opf = REQ_OP_READ | REQ_SYNC;
  831. req.mem.type = DM_IO_VMA;
  832. req.mem.ptr.vma = (char *)wc->memory_map;
  833. req.client = wc->dm_io;
  834. req.notify.fn = NULL;
  835. return dm_io(&req, 1, &region, NULL, IOPRIO_DEFAULT);
  836. }
/*
 * Resume the target: reload the metadata and rebuild the in-core state.
 *
 * Under wc_lock this refreshes wc->data_device_sectors, re-reads (non-PMEM
 * mode) or cache-invalidates (PMEM mode) the metadata area, reinitializes
 * wc->tree, wc->lru and the free pool, loads wc->seq_count from the
 * superblock and then walks all wc->n_blocks entries, putting each one
 * either on the free pool or into the tree.  If any metadata was modified
 * during the walk it is flushed and committed.  Finally the max_age timer
 * is armed when a max_age is configured.
 */
static void writecache_resume(struct dm_target *ti)
{
	struct dm_writecache *wc = ti->private;
	size_t b;
	bool need_flush = false;
	__le64 sb_seq_count;
	int r;

	wc_lock(wc);

	wc->data_device_sectors = bdev_nr_sectors(wc->dev->bdev);

	if (WC_MODE_PMEM(wc)) {
		persistent_memory_invalidate_cache(wc->memory_map, wc->memory_map_size);
	} else {
		r = writecache_read_metadata(wc, wc->metadata_sectors);
		if (r) {
			size_t sb_entries_offset;

			writecache_error(wc, r, "unable to read metadata: %d", r);
			/*
			 * Fill the metadata area from the superblock's entries
			 * array onward with 0xff bytes; the part of the
			 * superblock before "entries" is left as it is.
			 */
			sb_entries_offset = offsetof(struct wc_memory_superblock, entries);
			memset((char *)wc->memory_map + sb_entries_offset, -1,
			       (wc->metadata_sectors << SECTOR_SHIFT) - sb_entries_offset);
		}
	}

	/* Start from empty in-core structures. */
	wc->tree = RB_ROOT;
	INIT_LIST_HEAD(&wc->lru);
	if (WC_MODE_SORT_FREELIST(wc)) {
		wc->freetree = RB_ROOT;
		wc->current_free = NULL;
	} else {
		INIT_LIST_HEAD(&wc->freelist);
	}
	wc->freelist_size = 0;

	/* Load the committed sequence count; fall back to 0 on a read error. */
	r = copy_mc_to_kernel(&sb_seq_count, &sb(wc)->seq_count,
			      sizeof(uint64_t));
	if (r) {
		writecache_error(wc, r, "hardware memory error when reading superblock: %d", r);
		sb_seq_count = cpu_to_le64(0);
	}
	wc->seq_count = le64_to_cpu(sb_seq_count);

#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	/*
	 * Copy every metadata record into the in-core entry.  Once the cache
	 * is in an error state, or when a record cannot be read, the entry's
	 * original_sector and seq_count are set to -1.
	 */
	for (b = 0; b < wc->n_blocks; b++) {
		struct wc_entry *e = &wc->entries[b];
		struct wc_memory_entry wme;

		if (writecache_has_error(wc)) {
			e->original_sector = -1;
			e->seq_count = -1;
			continue;
		}
		r = copy_mc_to_kernel(&wme, memory_entry(wc, e),
				      sizeof(struct wc_memory_entry));
		if (r) {
			writecache_error(wc, r, "hardware memory error when reading metadata entry %lu: %d",
					 (unsigned long)b, r);
			e->original_sector = -1;
			e->seq_count = -1;
		} else {
			e->original_sector = le64_to_cpu(wme.original_sector);
			e->seq_count = le64_to_cpu(wme.seq_count);
		}
		cond_resched();
	}
#endif
	/* Classify every entry as free or as live cached data. */
	for (b = 0; b < wc->n_blocks; b++) {
		struct wc_entry *e = &wc->entries[b];

		if (!writecache_entry_is_committed(wc, e)) {
			/*
			 * Not committed: the entry goes to the free pool.  Its
			 * sequence count is cleared (and a flush requested)
			 * unless it already reads as -1.
			 */
			if (read_seq_count(wc, e) != -1) {
erase_this:
				/*
				 * Also reached via goto from the committed
				 * branch below when an already inserted entry
				 * for the same sector has a higher seq count.
				 */
				clear_seq_count(wc, e);
				need_flush = true;
			}
			writecache_add_to_freelist(wc, e);
		} else {
			struct wc_entry *old;

			/* Is another entry for the same sector already inserted? */
			old = writecache_find_entry(wc, read_original_sector(wc, e), 0);
			if (!old) {
				writecache_insert_entry(wc, e);
			} else {
				/*
				 * Equal sequence counts are reported as an
				 * error; execution then continues below, where
				 * the "else" branch replaces old with e.
				 */
				if (read_seq_count(wc, old) == read_seq_count(wc, e)) {
					writecache_error(wc, -EINVAL,
						 "two identical entries, position %llu, sector %llu, sequence %llu",
						 (unsigned long long)b, (unsigned long long)read_original_sector(wc, e),
						 (unsigned long long)read_seq_count(wc, e));
				}
				if (read_seq_count(wc, old) > read_seq_count(wc, e)) {
					/* old has the higher seq count: discard e. */
					goto erase_this;
				} else {
					/* Otherwise e replaces old. */
					writecache_free_entry(wc, old);
					writecache_insert_entry(wc, e);
					need_flush = true;
				}
			}
		}
		cond_resched();
	}

	if (need_flush) {
		writecache_flush_all_metadata(wc);
		writecache_commit_flushed(wc, false);
	}

	writecache_verify_watermark(wc);

	if (wc->max_age != MAX_AGE_UNSPECIFIED)
		mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV);

	wc_unlock(wc);
}
  938. static int process_flush_mesg(unsigned int argc, char **argv, struct dm_writecache *wc)
  939. {
  940. if (argc != 1)
  941. return -EINVAL;
  942. wc_lock(wc);
  943. if (dm_suspended(wc->ti)) {
  944. wc_unlock(wc);
  945. return -EBUSY;
  946. }
  947. if (writecache_has_error(wc)) {
  948. wc_unlock(wc);
  949. return -EIO;
  950. }
  951. writecache_flush(wc);
  952. wc->writeback_all++;
  953. queue_work(wc->writeback_wq, &wc->writeback_work);
  954. wc_unlock(wc);
  955. flush_workqueue(wc->writeback_wq);
  956. wc_lock(wc);
  957. wc->writeback_all--;
  958. if (writecache_has_error(wc)) {
  959. wc_unlock(wc);
  960. return -EIO;
  961. }
  962. wc_unlock(wc);
  963. return 0;
  964. }
  965. static int process_flush_on_suspend_mesg(unsigned int argc, char **argv, struct dm_writecache *wc)
  966. {
  967. if (argc != 1)
  968. return -EINVAL;
  969. wc_lock(wc);
  970. wc->flush_on_suspend = true;
  971. wc_unlock(wc);
  972. return 0;
  973. }
  974. static void activate_cleaner(struct dm_writecache *wc)
  975. {
  976. wc->flush_on_suspend = true;
  977. wc->cleaner = true;
  978. wc->freelist_high_watermark = wc->n_blocks;
  979. wc->freelist_low_watermark = wc->n_blocks;
  980. }
  981. static int process_cleaner_mesg(unsigned int argc, char **argv, struct dm_writecache *wc)
  982. {
  983. if (argc != 1)
  984. return -EINVAL;
  985. wc_lock(wc);
  986. activate_cleaner(wc);
  987. if (!dm_suspended(wc->ti))
  988. writecache_verify_watermark(wc);
  989. wc_unlock(wc);
  990. return 0;
  991. }
  992. static int process_clear_stats_mesg(unsigned int argc, char **argv, struct dm_writecache *wc)
  993. {
  994. if (argc != 1)
  995. return -EINVAL;
  996. wc_lock(wc);
  997. memset(&wc->stats, 0, sizeof(wc->stats));
  998. wc_unlock(wc);
  999. return 0;
  1000. }
  1001. static int writecache_message(struct dm_target *ti, unsigned int argc, char **argv,
  1002. char *result, unsigned int maxlen)
  1003. {
  1004. int r = -EINVAL;
  1005. struct dm_writecache *wc = ti->private;
  1006. if (!strcasecmp(argv[0], "flush"))
  1007. r = process_flush_mesg(argc, argv, wc);
  1008. else if (!strcasecmp(argv[0], "flush_on_suspend"))
  1009. r = process_flush_on_suspend_mesg(argc, argv, wc);
  1010. else if (!strcasecmp(argv[0], "cleaner"))
  1011. r = process_cleaner_mesg(argc, argv, wc);
  1012. else if (!strcasecmp(argv[0], "clear_stats"))
  1013. r = process_clear_stats_mesg(argc, argv, wc);
  1014. else
  1015. DMERR("unrecognised message received: %s", argv[0]);
  1016. return r;
  1017. }
  1018. static void memcpy_flushcache_optimized(void *dest, void *source, size_t size)
  1019. {
  1020. /*
  1021. * clflushopt performs better with block size 1024, 2048, 4096
  1022. * non-temporal stores perform better with block size 512
  1023. *
  1024. * block size 512 1024 2048 4096
  1025. * movnti 496 MB/s 642 MB/s 725 MB/s 744 MB/s
  1026. * clflushopt 373 MB/s 688 MB/s 1.1 GB/s 1.2 GB/s
  1027. *
  1028. * We see that movnti performs better for 512-byte blocks, and
  1029. * clflushopt performs better for 1024-byte and larger blocks. So, we
  1030. * prefer clflushopt for sizes >= 768.
  1031. *
  1032. * NOTE: this happens to be the case now (with dm-writecache's single
  1033. * threaded model) but re-evaluate this once memcpy_flushcache() is
  1034. * enabled to use movdir64b which might invalidate this performance
  1035. * advantage seen with cache-allocating-writes plus flushing.
  1036. */
  1037. #ifdef CONFIG_X86
  1038. if (static_cpu_has(X86_FEATURE_CLFLUSHOPT) &&
  1039. likely(boot_cpu_data.x86_clflush_size == 64) &&
  1040. likely(size >= 768)) {
  1041. do {
  1042. memcpy((void *)dest, (void *)source, 64);
  1043. clflushopt((void *)dest);
  1044. dest += 64;
  1045. source += 64;
  1046. size -= 64;
  1047. } while (size >= 64);
  1048. return;
  1049. }
  1050. #endif
  1051. memcpy_flushcache(dest, source, size);
  1052. }
  1053. static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data)
  1054. {
  1055. void *buf;
  1056. unsigned int size;
  1057. int rw = bio_data_dir(bio);
  1058. unsigned int remaining_size = wc->block_size;
  1059. do {
  1060. struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter);
  1061. buf = bvec_kmap_local(&bv);
  1062. size = bv.bv_len;
  1063. if (unlikely(size > remaining_size))
  1064. size = remaining_size;
  1065. if (rw == READ) {
  1066. int r;
  1067. r = copy_mc_to_kernel(buf, data, size);
  1068. flush_dcache_page(bio_page(bio));
  1069. if (unlikely(r)) {
  1070. writecache_error(wc, r, "hardware memory error when reading data: %d", r);
  1071. bio->bi_status = BLK_STS_IOERR;
  1072. }
  1073. } else {
  1074. flush_dcache_page(bio_page(bio));
  1075. memcpy_flushcache_optimized(data, buf, size);
  1076. }
  1077. kunmap_local(buf);
  1078. data = (char *)data + size;
  1079. remaining_size -= size;
  1080. bio_advance(bio, size);
  1081. } while (unlikely(remaining_size));
  1082. }
  1083. static int writecache_flush_thread(void *data)
  1084. {
  1085. struct dm_writecache *wc = data;
  1086. while (1) {
  1087. struct bio *bio;
  1088. wc_lock(wc);
  1089. bio = bio_list_pop(&wc->flush_list);
  1090. if (!bio) {
  1091. set_current_state(TASK_INTERRUPTIBLE);
  1092. wc_unlock(wc);
  1093. if (unlikely(kthread_should_stop())) {
  1094. set_current_state(TASK_RUNNING);
  1095. break;
  1096. }
  1097. schedule();
  1098. continue;
  1099. }
  1100. if (bio_op(bio) == REQ_OP_DISCARD) {
  1101. writecache_discard(wc, bio->bi_iter.bi_sector,
  1102. bio_end_sector(bio));
  1103. wc_unlock(wc);
  1104. bio_set_dev(bio, wc->dev->bdev);
  1105. submit_bio_noacct(bio);
  1106. } else {
  1107. writecache_flush(wc);
  1108. wc_unlock(wc);
  1109. if (writecache_has_error(wc))
  1110. bio->bi_status = BLK_STS_IOERR;
  1111. bio_endio(bio);
  1112. }
  1113. }
  1114. return 0;
  1115. }
  1116. static void writecache_offload_bio(struct dm_writecache *wc, struct bio *bio)
  1117. {
  1118. if (bio_list_empty(&wc->flush_list))
  1119. wake_up_process(wc->flush_thread);
  1120. bio_list_add(&wc->flush_list, bio);
  1121. }
/*
 * Result of mapping a bio.  writecache_map() switches on this value to
 * decide how handling of the bio is finished.
 */
enum wc_map_op {
	/* writecache_map() completes the bio with bio_endio(). */
	WC_MAP_SUBMIT,
	/* Bio was remapped; it is counted in wc->bio_in_progress[]. */
	WC_MAP_REMAP,
	/* Bio is redirected to the origin device (wc->dev). */
	WC_MAP_REMAP_ORIGIN,
	/* writecache_map() does nothing further with the bio. */
	WC_MAP_RETURN,
	/* writecache_map() fails the bio with bio_io_error(). */
	WC_MAP_ERROR,
};
  1129. static void writecache_map_remap_origin(struct dm_writecache *wc, struct bio *bio,
  1130. struct wc_entry *e)
  1131. {
  1132. if (e) {
  1133. sector_t next_boundary =
  1134. read_original_sector(wc, e) - bio->bi_iter.bi_sector;
  1135. if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT)
  1136. dm_accept_partial_bio(bio, next_boundary);
  1137. }
  1138. }
  1139. static enum wc_map_op writecache_map_read(struct dm_writecache *wc, struct bio *bio)
  1140. {
  1141. enum wc_map_op map_op;
  1142. struct wc_entry *e;
  1143. read_next_block:
  1144. wc->stats.reads++;
  1145. e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
  1146. if (e && read_original_sector(wc, e) == bio->bi_iter.bi_sector) {
  1147. wc->stats.read_hits++;
  1148. if (WC_MODE_PMEM(wc)) {
  1149. bio_copy_block(wc, bio, memory_data(wc, e));
  1150. if (bio->bi_iter.bi_size)
  1151. goto read_next_block;
  1152. map_op = WC_MAP_SUBMIT;
  1153. } else {
  1154. dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
  1155. bio_set_dev(bio, wc->ssd_dev->bdev);
  1156. bio->bi_iter.bi_sector = cache_sector(wc, e);
  1157. if (!writecache_entry_is_committed(wc, e))
  1158. writecache_wait_for_ios(wc, WRITE);
  1159. map_op = WC_MAP_REMAP;
  1160. }
  1161. } else {
  1162. writecache_map_remap_origin(wc, bio, e);
  1163. wc->stats.reads += (bio->bi_iter.bi_size - wc->block_size) >> wc->block_size_bits;
  1164. map_op = WC_MAP_REMAP_ORIGIN;
  1165. }
  1166. return map_op;
  1167. }
  1168. static void writecache_bio_copy_ssd(struct dm_writecache *wc, struct bio *bio,
  1169. struct wc_entry *e, bool search_used)
  1170. {
  1171. unsigned int bio_size = wc->block_size;
  1172. sector_t start_cache_sec = cache_sector(wc, e);
  1173. sector_t current_cache_sec = start_cache_sec + (bio_size >> SECTOR_SHIFT);
  1174. while (bio_size < bio->bi_iter.bi_size) {
  1175. if (!search_used) {
  1176. struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec);
  1177. if (!f)
  1178. break;
  1179. write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector +
  1180. (bio_size >> SECTOR_SHIFT), wc->seq_count);
  1181. writecache_insert_entry(wc, f);
  1182. wc->uncommitted_blocks++;
  1183. } else {
  1184. struct wc_entry *f;
  1185. struct rb_node *next = rb_next(&e->rb_node);
  1186. if (!next)
  1187. break;
  1188. f = container_of(next, struct wc_entry, rb_node);
  1189. if (f != e + 1)
  1190. break;
  1191. if (read_original_sector(wc, f) !=
  1192. read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
  1193. break;
  1194. if (unlikely(f->write_in_progress))
  1195. break;
  1196. if (writecache_entry_is_committed(wc, f))
  1197. wc->overwrote_committed = true;
  1198. e = f;
  1199. }
  1200. bio_size += wc->block_size;
  1201. current_cache_sec += wc->block_size >> SECTOR_SHIFT;
  1202. }
  1203. bio_set_dev(bio, wc->ssd_dev->bdev);
  1204. bio->bi_iter.bi_sector = start_cache_sec;
  1205. dm_accept_partial_bio(bio, bio_size >> SECTOR_SHIFT);
  1206. wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits;
  1207. wc->stats.writes_allocate += (bio->bi_iter.bi_size - wc->block_size) >> wc->block_size_bits;
  1208. if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) {
  1209. wc->uncommitted_blocks = 0;
  1210. queue_work(wc->writeback_wq, &wc->flush_work);
  1211. } else {
  1212. writecache_schedule_autocommit(wc);
  1213. }
  1214. }
/*
 * Map a write bio.
 *
 * Each loop iteration handles the block at the bio's current sector:
 *  - an existing uncommitted entry is overwritten in place;
 *  - an existing committed entry is overwritten in place only in SSD mode
 *    when it has no write in progress, otherwise a new entry is allocated;
 *  - with no existing entry, the write bypasses the cache in cleaner mode
 *    or when metadata_only is set and the bio lacks REQ_META;
 *  - otherwise a new entry is taken from the free pool; when the pool is
 *    empty the write either goes to the origin (SSD mode, no existing
 *    entry) or waits for a free entry and retries.
 *
 * In SSD mode the function returns WC_MAP_REMAP after the first block
 * (writecache_bio_copy_ssd() deals with the rest of the bio).  In PMEM
 * mode the loop continues until the bio is fully copied and
 * WC_MAP_SUBMIT is returned.  WC_MAP_ERROR is returned when the cache is
 * in an error state, WC_MAP_REMAP_ORIGIN for a direct write.
 */
static enum wc_map_op writecache_map_write(struct dm_writecache *wc, struct bio *bio)
{
	struct wc_entry *e;

	do {
		/* found_entry: a committed entry exists that cannot be reused. */
		bool found_entry = false;
		/* search_used: reuse existing entries instead of allocating. */
		bool search_used = false;

		if (writecache_has_error(wc)) {
			wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits;
			return WC_MAP_ERROR;
		}
		e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0);
		if (e) {
			if (!writecache_entry_is_committed(wc, e)) {
				wc->stats.write_hits_uncommitted++;
				search_used = true;
				goto bio_copy;
			}
			wc->stats.write_hits_committed++;
			if (!WC_MODE_PMEM(wc) && !e->write_in_progress) {
				wc->overwrote_committed = true;
				search_used = true;
				goto bio_copy;
			}
			found_entry = true;
		} else {
			if (unlikely(wc->cleaner) ||
			    (wc->metadata_only && !(bio->bi_opf & REQ_META)))
				goto direct_write;
		}
		e = writecache_pop_from_freelist(wc, (sector_t)-1);
		if (unlikely(!e)) {
			if (!WC_MODE_PMEM(wc) && !found_entry) {
direct_write:
				/*
				 * Bypass the cache: trim the bio to the next
				 * cached entry (if any) and send it to the
				 * origin device.
				 */
				e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
				writecache_map_remap_origin(wc, bio, e);
				wc->stats.writes_around += bio->bi_iter.bi_size >> wc->block_size_bits;
				wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits;
				return WC_MAP_REMAP_ORIGIN;
			}
			/* No free entry: wait for one and retry this block. */
			wc->stats.writes_blocked_on_freelist++;
			writecache_wait_on_freelist(wc);
			continue;
		}
		/* Fresh entry: bind it to the bio's sector, uncommitted. */
		write_original_sector_seq_count(wc, e, bio->bi_iter.bi_sector, wc->seq_count);
		writecache_insert_entry(wc, e);
		wc->uncommitted_blocks++;
		wc->stats.writes_allocate++;

bio_copy:
		if (WC_MODE_PMEM(wc)) {
			bio_copy_block(wc, bio, memory_data(wc, e));
			wc->stats.writes++;
		} else {
			writecache_bio_copy_ssd(wc, bio, e, search_used);
			return WC_MAP_REMAP;
		}
	} while (bio->bi_iter.bi_size);

	/* PMEM only: commit now for FUA or when the autocommit limit is hit. */
	if (unlikely(bio->bi_opf & REQ_FUA || wc->uncommitted_blocks >= wc->autocommit_blocks))
		writecache_flush(wc);
	else
		writecache_schedule_autocommit(wc);

	return WC_MAP_SUBMIT;
}
  1277. static enum wc_map_op writecache_map_flush(struct dm_writecache *wc, struct bio *bio)
  1278. {
  1279. if (writecache_has_error(wc))
  1280. return WC_MAP_ERROR;
  1281. if (WC_MODE_PMEM(wc)) {
  1282. wc->stats.flushes++;
  1283. writecache_flush(wc);
  1284. if (writecache_has_error(wc))
  1285. return WC_MAP_ERROR;
  1286. else if (unlikely(wc->cleaner) || unlikely(wc->metadata_only))
  1287. return WC_MAP_REMAP_ORIGIN;
  1288. return WC_MAP_SUBMIT;
  1289. }
  1290. /* SSD: */
  1291. if (dm_bio_get_target_bio_nr(bio))
  1292. return WC_MAP_REMAP_ORIGIN;
  1293. wc->stats.flushes++;
  1294. writecache_offload_bio(wc, bio);
  1295. return WC_MAP_RETURN;
  1296. }
  1297. static enum wc_map_op writecache_map_discard(struct dm_writecache *wc, struct bio *bio)
  1298. {
  1299. wc->stats.discards += bio->bi_iter.bi_size >> wc->block_size_bits;
  1300. if (writecache_has_error(wc))
  1301. return WC_MAP_ERROR;
  1302. if (WC_MODE_PMEM(wc)) {
  1303. writecache_discard(wc, bio->bi_iter.bi_sector, bio_end_sector(bio));
  1304. return WC_MAP_REMAP_ORIGIN;
  1305. }
  1306. /* SSD: */
  1307. writecache_offload_bio(wc, bio);
  1308. return WC_MAP_RETURN;
  1309. }
  1310. static int writecache_map(struct dm_target *ti, struct bio *bio)
  1311. {
  1312. struct dm_writecache *wc = ti->private;
  1313. enum wc_map_op map_op;
  1314. bio->bi_private = NULL;
  1315. wc_lock(wc);
  1316. if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
  1317. map_op = writecache_map_flush(wc, bio);
  1318. goto done;
  1319. }
  1320. bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
  1321. if (unlikely((((unsigned int)bio->bi_iter.bi_sector | bio_sectors(bio)) &
  1322. (wc->block_size / 512 - 1)) != 0)) {
  1323. DMERR("I/O is not aligned, sector %llu, size %u, block size %u",
  1324. (unsigned long long)bio->bi_iter.bi_sector,
  1325. bio->bi_iter.bi_size, wc->block_size);
  1326. map_op = WC_MAP_ERROR;
  1327. goto done;
  1328. }
  1329. if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
  1330. map_op = writecache_map_discard(wc, bio);
  1331. goto done;
  1332. }
  1333. if (bio_data_dir(bio) == READ)
  1334. map_op = writecache_map_read(wc, bio);
  1335. else
  1336. map_op = writecache_map_write(wc, bio);
  1337. done:
  1338. switch (map_op) {
  1339. case WC_MAP_REMAP_ORIGIN:
  1340. if (likely(wc->pause != 0)) {
  1341. if (bio_op(bio) == REQ_OP_WRITE) {
  1342. dm_iot_io_begin(&wc->iot, 1);
  1343. bio->bi_private = (void *)2;
  1344. }
  1345. }
  1346. bio_set_dev(bio, wc->dev->bdev);
  1347. wc_unlock(wc);
  1348. return DM_MAPIO_REMAPPED;
  1349. case WC_MAP_REMAP:
  1350. /* make sure that writecache_end_io decrements bio_in_progress: */
  1351. bio->bi_private = (void *)1;
  1352. atomic_inc(&wc->bio_in_progress[bio_data_dir(bio)]);
  1353. wc_unlock(wc);
  1354. return DM_MAPIO_REMAPPED;
  1355. case WC_MAP_SUBMIT:
  1356. wc_unlock(wc);
  1357. bio_endio(bio);
  1358. return DM_MAPIO_SUBMITTED;
  1359. case WC_MAP_RETURN:
  1360. wc_unlock(wc);
  1361. return DM_MAPIO_SUBMITTED;
  1362. case WC_MAP_ERROR:
  1363. wc_unlock(wc);
  1364. bio_io_error(bio);
  1365. return DM_MAPIO_SUBMITTED;
  1366. default:
  1367. BUG();
  1368. wc_unlock(wc);
  1369. return DM_MAPIO_KILL;
  1370. }
  1371. }
  1372. static int writecache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *status)
  1373. {
  1374. struct dm_writecache *wc = ti->private;
  1375. if (bio->bi_private == (void *)1) {
  1376. int dir = bio_data_dir(bio);
  1377. if (atomic_dec_and_test(&wc->bio_in_progress[dir]))
  1378. if (unlikely(waitqueue_active(&wc->bio_in_progress_wait[dir])))
  1379. wake_up(&wc->bio_in_progress_wait[dir]);
  1380. } else if (bio->bi_private == (void *)2) {
  1381. dm_iot_io_end(&wc->iot, 1);
  1382. }
  1383. return 0;
  1384. }
  1385. static int writecache_iterate_devices(struct dm_target *ti,
  1386. iterate_devices_callout_fn fn, void *data)
  1387. {
  1388. struct dm_writecache *wc = ti->private;
  1389. return fn(ti, wc->dev, 0, ti->len, data);
  1390. }
  1391. static void writecache_io_hints(struct dm_target *ti, struct queue_limits *limits)
  1392. {
  1393. struct dm_writecache *wc = ti->private;
  1394. if (limits->logical_block_size < wc->block_size)
  1395. limits->logical_block_size = wc->block_size;
  1396. if (limits->physical_block_size < wc->block_size)
  1397. limits->physical_block_size = wc->block_size;
  1398. if (limits->io_min < wc->block_size)
  1399. limits->io_min = wc->block_size;
  1400. }
  1401. static void writecache_writeback_endio(struct bio *bio)
  1402. {
  1403. struct writeback_struct *wb = container_of(bio, struct writeback_struct, bio);
  1404. struct dm_writecache *wc = wb->wc;
  1405. unsigned long flags;
  1406. raw_spin_lock_irqsave(&wc->endio_list_lock, flags);
  1407. if (unlikely(list_empty(&wc->endio_list)))
  1408. wake_up_process(wc->endio_thread);
  1409. list_add_tail(&wb->endio_entry, &wc->endio_list);
  1410. raw_spin_unlock_irqrestore(&wc->endio_list_lock, flags);
  1411. }
  1412. static void writecache_copy_endio(int read_err, unsigned long write_err, void *ptr)
  1413. {
  1414. struct copy_struct *c = ptr;
  1415. struct dm_writecache *wc = c->wc;
  1416. c->error = likely(!(read_err | write_err)) ? 0 : -EIO;
  1417. raw_spin_lock_irq(&wc->endio_list_lock);
  1418. if (unlikely(list_empty(&wc->endio_list)))
  1419. wake_up_process(wc->endio_thread);
  1420. list_add_tail(&c->endio_entry, &wc->endio_list);
  1421. raw_spin_unlock_irq(&wc->endio_list_lock);
  1422. }
  1423. static void __writecache_endio_pmem(struct dm_writecache *wc, struct list_head *list)
  1424. {
  1425. unsigned int i;
  1426. struct writeback_struct *wb;
  1427. struct wc_entry *e;
  1428. unsigned long n_walked = 0;
  1429. do {
  1430. wb = list_entry(list->next, struct writeback_struct, endio_entry);
  1431. list_del(&wb->endio_entry);
  1432. if (unlikely(wb->bio.bi_status != BLK_STS_OK))
  1433. writecache_error(wc, blk_status_to_errno(wb->bio.bi_status),
  1434. "write error %d", wb->bio.bi_status);
  1435. i = 0;
  1436. do {
  1437. e = wb->wc_list[i];
  1438. BUG_ON(!e->write_in_progress);
  1439. e->write_in_progress = false;
  1440. INIT_LIST_HEAD(&e->lru);
  1441. if (!writecache_has_error(wc))
  1442. writecache_free_entry(wc, e);
  1443. BUG_ON(!wc->writeback_size);
  1444. wc->writeback_size--;
  1445. n_walked++;
  1446. if (unlikely(n_walked >= ENDIO_LATENCY)) {
  1447. writecache_commit_flushed(wc, false);
  1448. wc_unlock(wc);
  1449. wc_lock(wc);
  1450. n_walked = 0;
  1451. }
  1452. } while (++i < wb->wc_list_n);
  1453. if (wb->wc_list != wb->wc_list_inline)
  1454. kfree(wb->wc_list);
  1455. bio_put(&wb->bio);
  1456. } while (!list_empty(list));
  1457. }
  1458. static void __writecache_endio_ssd(struct dm_writecache *wc, struct list_head *list)
  1459. {
  1460. struct copy_struct *c;
  1461. struct wc_entry *e;
  1462. do {
  1463. c = list_entry(list->next, struct copy_struct, endio_entry);
  1464. list_del(&c->endio_entry);
  1465. if (unlikely(c->error))
  1466. writecache_error(wc, c->error, "copy error");
  1467. e = c->e;
  1468. do {
  1469. BUG_ON(!e->write_in_progress);
  1470. e->write_in_progress = false;
  1471. INIT_LIST_HEAD(&e->lru);
  1472. if (!writecache_has_error(wc))
  1473. writecache_free_entry(wc, e);
  1474. BUG_ON(!wc->writeback_size);
  1475. wc->writeback_size--;
  1476. e++;
  1477. } while (--c->n_entries);
  1478. mempool_free(c, &wc->copy_pool);
  1479. } while (!list_empty(list));
  1480. }
  1481. static int writecache_endio_thread(void *data)
  1482. {
  1483. struct dm_writecache *wc = data;
  1484. while (1) {
  1485. struct list_head list;
  1486. raw_spin_lock_irq(&wc->endio_list_lock);
  1487. if (!list_empty(&wc->endio_list))
  1488. goto pop_from_list;
  1489. set_current_state(TASK_INTERRUPTIBLE);
  1490. raw_spin_unlock_irq(&wc->endio_list_lock);
  1491. if (unlikely(kthread_should_stop())) {
  1492. set_current_state(TASK_RUNNING);
  1493. break;
  1494. }
  1495. schedule();
  1496. continue;
  1497. pop_from_list:
  1498. list = wc->endio_list;
  1499. list.next->prev = list.prev->next = &list;
  1500. INIT_LIST_HEAD(&wc->endio_list);
  1501. raw_spin_unlock_irq(&wc->endio_list_lock);
  1502. if (!WC_MODE_FUA(wc))
  1503. writecache_disk_flush(wc, wc->dev);
  1504. wc_lock(wc);
  1505. if (WC_MODE_PMEM(wc)) {
  1506. __writecache_endio_pmem(wc, &list);
  1507. } else {
  1508. __writecache_endio_ssd(wc, &list);
  1509. writecache_wait_for_ios(wc, READ);
  1510. }
  1511. writecache_commit_flushed(wc, false);
  1512. wc_unlock(wc);
  1513. }
  1514. return 0;
  1515. }
  1516. static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e)
  1517. {
  1518. struct dm_writecache *wc = wb->wc;
  1519. unsigned int block_size = wc->block_size;
  1520. void *address = memory_data(wc, e);
  1521. persistent_memory_flush_cache(address, block_size);
  1522. if (unlikely(bio_end_sector(&wb->bio) >= wc->data_device_sectors))
  1523. return true;
  1524. return bio_add_page(&wb->bio, persistent_memory_page(address),
  1525. block_size, persistent_memory_page_offset(address)) != 0;
  1526. }
/*
 * Batch of entries queued for writeback.
 */
struct writeback_list {
	/* Entries are linked here through their lru member. */
	struct list_head list;
	/* Decremented each time an entry is removed from the list. */
	size_t size;
};
  1531. static void __writeback_throttle(struct dm_writecache *wc, struct writeback_list *wbl)
  1532. {
  1533. if (unlikely(wc->max_writeback_jobs)) {
  1534. if (READ_ONCE(wc->writeback_size) - wbl->size >= wc->max_writeback_jobs) {
  1535. wc_lock(wc);
  1536. while (wc->writeback_size - wbl->size >= wc->max_writeback_jobs)
  1537. writecache_wait_on_freelist(wc);
  1538. wc_unlock(wc);
  1539. }
  1540. }
  1541. cond_resched();
  1542. }
  1543. static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeback_list *wbl)
  1544. {
  1545. struct wc_entry *e, *f;
  1546. struct bio *bio;
  1547. struct writeback_struct *wb;
  1548. unsigned int max_pages;
  1549. while (wbl->size) {
  1550. wbl->size--;
  1551. e = container_of(wbl->list.prev, struct wc_entry, lru);
  1552. list_del(&e->lru);
  1553. max_pages = e->wc_list_contiguous;
  1554. bio = bio_alloc_bioset(wc->dev->bdev, max_pages, REQ_OP_WRITE,
  1555. GFP_NOIO, &wc->bio_set);
  1556. wb = container_of(bio, struct writeback_struct, bio);
  1557. wb->wc = wc;
  1558. bio->bi_end_io = writecache_writeback_endio;
  1559. bio->bi_iter.bi_sector = read_original_sector(wc, e);
  1560. if (unlikely(max_pages > WB_LIST_INLINE))
  1561. wb->wc_list = kmalloc_objs(struct wc_entry *, max_pages,
  1562. GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
  1563. if (likely(max_pages <= WB_LIST_INLINE) || unlikely(!wb->wc_list)) {
  1564. wb->wc_list = wb->wc_list_inline;
  1565. max_pages = WB_LIST_INLINE;
  1566. }
  1567. BUG_ON(!wc_add_block(wb, e));
  1568. wb->wc_list[0] = e;
  1569. wb->wc_list_n = 1;
  1570. while (wbl->size && wb->wc_list_n < max_pages) {
  1571. f = container_of(wbl->list.prev, struct wc_entry, lru);
  1572. if (read_original_sector(wc, f) !=
  1573. read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
  1574. break;
  1575. if (!wc_add_block(wb, f))
  1576. break;
  1577. wbl->size--;
  1578. list_del(&f->lru);
  1579. wb->wc_list[wb->wc_list_n++] = f;
  1580. e = f;
  1581. }
  1582. if (WC_MODE_FUA(wc))
  1583. bio->bi_opf |= REQ_FUA;
  1584. if (writecache_has_error(wc)) {
  1585. bio->bi_status = BLK_STS_IOERR;
  1586. bio_endio(bio);
  1587. } else if (unlikely(!bio_sectors(bio))) {
  1588. bio->bi_status = BLK_STS_OK;
  1589. bio_endio(bio);
  1590. } else {
  1591. submit_bio(bio);
  1592. }
  1593. __writeback_throttle(wc, wbl);
  1594. }
  1595. }
  1596. static void __writecache_writeback_ssd(struct dm_writecache *wc, struct writeback_list *wbl)
  1597. {
  1598. struct wc_entry *e, *f;
  1599. struct dm_io_region from, to;
  1600. struct copy_struct *c;
  1601. while (wbl->size) {
  1602. unsigned int n_sectors;
  1603. wbl->size--;
  1604. e = container_of(wbl->list.prev, struct wc_entry, lru);
  1605. list_del(&e->lru);
  1606. n_sectors = e->wc_list_contiguous << (wc->block_size_bits - SECTOR_SHIFT);
  1607. from.bdev = wc->ssd_dev->bdev;
  1608. from.sector = cache_sector(wc, e);
  1609. from.count = n_sectors;
  1610. to.bdev = wc->dev->bdev;
  1611. to.sector = read_original_sector(wc, e);
  1612. to.count = n_sectors;
  1613. c = mempool_alloc(&wc->copy_pool, GFP_NOIO);
  1614. c->wc = wc;
  1615. c->e = e;
  1616. c->n_entries = e->wc_list_contiguous;
  1617. while ((n_sectors -= wc->block_size >> SECTOR_SHIFT)) {
  1618. wbl->size--;
  1619. f = container_of(wbl->list.prev, struct wc_entry, lru);
  1620. BUG_ON(f != e + 1);
  1621. list_del(&f->lru);
  1622. e = f;
  1623. }
  1624. if (unlikely(to.sector + to.count > wc->data_device_sectors)) {
  1625. if (to.sector >= wc->data_device_sectors) {
  1626. writecache_copy_endio(0, 0, c);
  1627. continue;
  1628. }
  1629. from.count = to.count = wc->data_device_sectors - to.sector;
  1630. }
  1631. dm_kcopyd_copy(wc->dm_kcopyd, &from, 1, &to, 0, writecache_copy_endio, c);
  1632. __writeback_throttle(wc, wbl);
  1633. }
  1634. }
  1635. static void writecache_writeback(struct work_struct *work)
  1636. {
  1637. struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work);
  1638. struct blk_plug plug;
  1639. struct wc_entry *f, *g, *e = NULL;
  1640. struct rb_node *node, *next_node;
  1641. struct list_head skipped;
  1642. struct writeback_list wbl;
  1643. unsigned long n_walked;
  1644. if (!WC_MODE_PMEM(wc)) {
  1645. /* Wait for any active kcopyd work on behalf of ssd writeback */
  1646. dm_kcopyd_client_flush(wc->dm_kcopyd);
  1647. }
  1648. if (likely(wc->pause != 0)) {
  1649. while (1) {
  1650. unsigned long idle;
  1651. if (unlikely(wc->cleaner) || unlikely(wc->writeback_all) ||
  1652. unlikely(dm_suspended(wc->ti)))
  1653. break;
  1654. idle = dm_iot_idle_time(&wc->iot);
  1655. if (idle >= wc->pause)
  1656. break;
  1657. idle = wc->pause - idle;
  1658. if (idle > HZ)
  1659. idle = HZ;
  1660. schedule_timeout_idle(idle);
  1661. }
  1662. }
  1663. wc_lock(wc);
  1664. restart:
  1665. if (writecache_has_error(wc)) {
  1666. wc_unlock(wc);
  1667. return;
  1668. }
  1669. if (unlikely(wc->writeback_all)) {
  1670. if (writecache_wait_for_writeback(wc))
  1671. goto restart;
  1672. }
  1673. if (wc->overwrote_committed)
  1674. writecache_wait_for_ios(wc, WRITE);
  1675. n_walked = 0;
  1676. INIT_LIST_HEAD(&skipped);
  1677. INIT_LIST_HEAD(&wbl.list);
  1678. wbl.size = 0;
  1679. while (!list_empty(&wc->lru) &&
  1680. (wc->writeback_all ||
  1681. wc->freelist_size + wc->writeback_size <= wc->freelist_low_watermark ||
  1682. (jiffies - container_of(wc->lru.prev, struct wc_entry, lru)->age >=
  1683. wc->max_age - wc->max_age / MAX_AGE_DIV))) {
  1684. n_walked++;
  1685. if (unlikely(n_walked > WRITEBACK_LATENCY) &&
  1686. likely(!wc->writeback_all)) {
  1687. if (likely(!dm_suspended(wc->ti)))
  1688. queue_work(wc->writeback_wq, &wc->writeback_work);
  1689. break;
  1690. }
  1691. if (unlikely(wc->writeback_all)) {
  1692. if (unlikely(!e)) {
  1693. writecache_flush(wc);
  1694. e = container_of(rb_first(&wc->tree), struct wc_entry, rb_node);
  1695. } else
  1696. e = g;
  1697. } else
  1698. e = container_of(wc->lru.prev, struct wc_entry, lru);
  1699. BUG_ON(e->write_in_progress);
  1700. if (unlikely(!writecache_entry_is_committed(wc, e)))
  1701. writecache_flush(wc);
  1702. node = rb_prev(&e->rb_node);
  1703. if (node) {
  1704. f = container_of(node, struct wc_entry, rb_node);
  1705. if (unlikely(read_original_sector(wc, f) ==
  1706. read_original_sector(wc, e))) {
  1707. BUG_ON(!f->write_in_progress);
  1708. list_move(&e->lru, &skipped);
  1709. cond_resched();
  1710. continue;
  1711. }
  1712. }
  1713. wc->writeback_size++;
  1714. list_move(&e->lru, &wbl.list);
  1715. wbl.size++;
  1716. e->write_in_progress = true;
  1717. e->wc_list_contiguous = 1;
  1718. f = e;
  1719. while (1) {
  1720. next_node = rb_next(&f->rb_node);
  1721. if (unlikely(!next_node))
  1722. break;
  1723. g = container_of(next_node, struct wc_entry, rb_node);
  1724. if (unlikely(read_original_sector(wc, g) ==
  1725. read_original_sector(wc, f))) {
  1726. f = g;
  1727. continue;
  1728. }
  1729. if (read_original_sector(wc, g) !=
  1730. read_original_sector(wc, f) + (wc->block_size >> SECTOR_SHIFT))
  1731. break;
  1732. if (unlikely(g->write_in_progress))
  1733. break;
  1734. if (unlikely(!writecache_entry_is_committed(wc, g)))
  1735. break;
  1736. if (!WC_MODE_PMEM(wc)) {
  1737. if (g != f + 1)
  1738. break;
  1739. }
  1740. n_walked++;
  1741. //if (unlikely(n_walked > WRITEBACK_LATENCY) && likely(!wc->writeback_all))
  1742. // break;
  1743. wc->writeback_size++;
  1744. list_move(&g->lru, &wbl.list);
  1745. wbl.size++;
  1746. g->write_in_progress = true;
  1747. g->wc_list_contiguous = BIO_MAX_VECS;
  1748. f = g;
  1749. e->wc_list_contiguous++;
  1750. if (unlikely(e->wc_list_contiguous == BIO_MAX_VECS)) {
  1751. if (unlikely(wc->writeback_all)) {
  1752. next_node = rb_next(&f->rb_node);
  1753. if (likely(next_node))
  1754. g = container_of(next_node, struct wc_entry, rb_node);
  1755. }
  1756. break;
  1757. }
  1758. }
  1759. cond_resched();
  1760. }
  1761. if (!list_empty(&skipped)) {
  1762. list_splice_tail(&skipped, &wc->lru);
  1763. /*
  1764. * If we didn't do any progress, we must wait until some
  1765. * writeback finishes to avoid burning CPU in a loop
  1766. */
  1767. if (unlikely(!wbl.size))
  1768. writecache_wait_for_writeback(wc);
  1769. }
  1770. wc_unlock(wc);
  1771. blk_start_plug(&plug);
  1772. if (WC_MODE_PMEM(wc))
  1773. __writecache_writeback_pmem(wc, &wbl);
  1774. else
  1775. __writecache_writeback_ssd(wc, &wbl);
  1776. blk_finish_plug(&plug);
  1777. if (unlikely(wc->writeback_all)) {
  1778. wc_lock(wc);
  1779. while (writecache_wait_for_writeback(wc))
  1780. ;
  1781. wc_unlock(wc);
  1782. }
  1783. }
  1784. static int calculate_memory_size(uint64_t device_size, unsigned int block_size,
  1785. size_t *n_blocks_p, size_t *n_metadata_blocks_p)
  1786. {
  1787. uint64_t n_blocks, offset;
  1788. struct wc_entry e;
  1789. n_blocks = device_size;
  1790. do_div(n_blocks, block_size + sizeof(struct wc_memory_entry));
  1791. while (1) {
  1792. if (!n_blocks)
  1793. return -ENOSPC;
  1794. /* Verify the following entries[n_blocks] won't overflow */
  1795. if (n_blocks >= ((size_t)-sizeof(struct wc_memory_superblock) /
  1796. sizeof(struct wc_memory_entry)))
  1797. return -EFBIG;
  1798. offset = offsetof(struct wc_memory_superblock, entries[n_blocks]);
  1799. offset = (offset + block_size - 1) & ~(uint64_t)(block_size - 1);
  1800. if (offset + n_blocks * block_size <= device_size)
  1801. break;
  1802. n_blocks--;
  1803. }
  1804. /* check if the bit field overflows */
  1805. e.index = n_blocks;
  1806. if (e.index != n_blocks)
  1807. return -EFBIG;
  1808. if (n_blocks_p)
  1809. *n_blocks_p = n_blocks;
  1810. if (n_metadata_blocks_p)
  1811. *n_metadata_blocks_p = offset >> __ffs(block_size);
  1812. return 0;
  1813. }
  1814. static int init_memory(struct dm_writecache *wc)
  1815. {
  1816. size_t b;
  1817. int r;
  1818. r = calculate_memory_size(wc->memory_map_size, wc->block_size, &wc->n_blocks, NULL);
  1819. if (r)
  1820. return r;
  1821. r = writecache_alloc_entries(wc);
  1822. if (r)
  1823. return r;
  1824. for (b = 0; b < ARRAY_SIZE(sb(wc)->padding); b++)
  1825. pmem_assign(sb(wc)->padding[b], cpu_to_le64(0));
  1826. pmem_assign(sb(wc)->version, cpu_to_le32(MEMORY_SUPERBLOCK_VERSION));
  1827. pmem_assign(sb(wc)->block_size, cpu_to_le32(wc->block_size));
  1828. pmem_assign(sb(wc)->n_blocks, cpu_to_le64(wc->n_blocks));
  1829. pmem_assign(sb(wc)->seq_count, cpu_to_le64(0));
  1830. for (b = 0; b < wc->n_blocks; b++) {
  1831. write_original_sector_seq_count(wc, &wc->entries[b], -1, -1);
  1832. cond_resched();
  1833. }
  1834. writecache_flush_all_metadata(wc);
  1835. writecache_commit_flushed(wc, false);
  1836. pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC));
  1837. writecache_flush_region(wc, &sb(wc)->magic, sizeof(sb(wc)->magic));
  1838. writecache_commit_flushed(wc, false);
  1839. return 0;
  1840. }
  1841. static void writecache_dtr(struct dm_target *ti)
  1842. {
  1843. struct dm_writecache *wc = ti->private;
  1844. if (!wc)
  1845. return;
  1846. if (wc->endio_thread)
  1847. kthread_stop(wc->endio_thread);
  1848. if (wc->flush_thread)
  1849. kthread_stop(wc->flush_thread);
  1850. bioset_exit(&wc->bio_set);
  1851. mempool_exit(&wc->copy_pool);
  1852. if (wc->writeback_wq)
  1853. destroy_workqueue(wc->writeback_wq);
  1854. if (wc->dev)
  1855. dm_put_device(ti, wc->dev);
  1856. if (wc->ssd_dev)
  1857. dm_put_device(ti, wc->ssd_dev);
  1858. vfree(wc->entries);
  1859. if (wc->memory_map) {
  1860. if (WC_MODE_PMEM(wc))
  1861. persistent_memory_release(wc);
  1862. else
  1863. vfree(wc->memory_map);
  1864. }
  1865. if (wc->dm_kcopyd)
  1866. dm_kcopyd_client_destroy(wc->dm_kcopyd);
  1867. if (wc->dm_io)
  1868. dm_io_client_destroy(wc->dm_io);
  1869. vfree(wc->dirty_bitmap);
  1870. kfree(wc);
  1871. }
  1872. static int writecache_ctr(struct dm_target *ti, unsigned int argc, char **argv)
  1873. {
  1874. struct dm_writecache *wc;
  1875. struct dm_arg_set as;
  1876. const char *string;
  1877. unsigned int opt_params;
  1878. size_t offset, data_size;
  1879. int i, r;
  1880. char dummy;
  1881. int high_wm_percent = HIGH_WATERMARK;
  1882. int low_wm_percent = LOW_WATERMARK;
  1883. uint64_t x;
  1884. struct wc_memory_superblock s;
  1885. static struct dm_arg _args[] = {
  1886. {0, 18, "Invalid number of feature args"},
  1887. };
  1888. as.argc = argc;
  1889. as.argv = argv;
  1890. wc = kzalloc_obj(struct dm_writecache);
  1891. if (!wc) {
  1892. ti->error = "Cannot allocate writecache structure";
  1893. r = -ENOMEM;
  1894. goto bad;
  1895. }
  1896. ti->private = wc;
  1897. wc->ti = ti;
  1898. mutex_init(&wc->lock);
  1899. wc->max_age = MAX_AGE_UNSPECIFIED;
  1900. writecache_poison_lists(wc);
  1901. init_waitqueue_head(&wc->freelist_wait);
  1902. timer_setup(&wc->autocommit_timer, writecache_autocommit_timer, 0);
  1903. timer_setup(&wc->max_age_timer, writecache_max_age_timer, 0);
  1904. for (i = 0; i < 2; i++) {
  1905. atomic_set(&wc->bio_in_progress[i], 0);
  1906. init_waitqueue_head(&wc->bio_in_progress_wait[i]);
  1907. }
  1908. wc->dm_io = dm_io_client_create();
  1909. if (IS_ERR(wc->dm_io)) {
  1910. r = PTR_ERR(wc->dm_io);
  1911. ti->error = "Unable to allocate dm-io client";
  1912. wc->dm_io = NULL;
  1913. goto bad;
  1914. }
  1915. wc->writeback_wq = alloc_workqueue("writecache-writeback",
  1916. WQ_MEM_RECLAIM | WQ_PERCPU, 1);
  1917. if (!wc->writeback_wq) {
  1918. r = -ENOMEM;
  1919. ti->error = "Could not allocate writeback workqueue";
  1920. goto bad;
  1921. }
  1922. INIT_WORK(&wc->writeback_work, writecache_writeback);
  1923. INIT_WORK(&wc->flush_work, writecache_flush_work);
  1924. dm_iot_init(&wc->iot);
  1925. raw_spin_lock_init(&wc->endio_list_lock);
  1926. INIT_LIST_HEAD(&wc->endio_list);
  1927. wc->endio_thread = kthread_run(writecache_endio_thread, wc, "writecache_endio");
  1928. if (IS_ERR(wc->endio_thread)) {
  1929. r = PTR_ERR(wc->endio_thread);
  1930. wc->endio_thread = NULL;
  1931. ti->error = "Couldn't spawn endio thread";
  1932. goto bad;
  1933. }
  1934. /*
  1935. * Parse the mode (pmem or ssd)
  1936. */
  1937. string = dm_shift_arg(&as);
  1938. if (!string)
  1939. goto bad_arguments;
  1940. if (!strcasecmp(string, "s")) {
  1941. wc->pmem_mode = false;
  1942. } else if (!strcasecmp(string, "p")) {
  1943. #ifdef DM_WRITECACHE_HAS_PMEM
  1944. wc->pmem_mode = true;
  1945. wc->writeback_fua = true;
  1946. #else
  1947. /*
  1948. * If the architecture doesn't support persistent memory or
  1949. * the kernel doesn't support any DAX drivers, this driver can
  1950. * only be used in SSD-only mode.
  1951. */
  1952. r = -EOPNOTSUPP;
  1953. ti->error = "Persistent memory or DAX not supported on this system";
  1954. goto bad;
  1955. #endif
  1956. } else {
  1957. goto bad_arguments;
  1958. }
  1959. if (WC_MODE_PMEM(wc)) {
  1960. r = bioset_init(&wc->bio_set, BIO_POOL_SIZE,
  1961. offsetof(struct writeback_struct, bio),
  1962. BIOSET_NEED_BVECS);
  1963. if (r) {
  1964. ti->error = "Could not allocate bio set";
  1965. goto bad;
  1966. }
  1967. } else {
  1968. wc->pause = PAUSE_WRITEBACK;
  1969. r = mempool_init_kmalloc_pool(&wc->copy_pool, 1, sizeof(struct copy_struct));
  1970. if (r) {
  1971. ti->error = "Could not allocate mempool";
  1972. goto bad;
  1973. }
  1974. }
  1975. /*
  1976. * Parse the origin data device
  1977. */
  1978. string = dm_shift_arg(&as);
  1979. if (!string)
  1980. goto bad_arguments;
  1981. r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->dev);
  1982. if (r) {
  1983. ti->error = "Origin data device lookup failed";
  1984. goto bad;
  1985. }
  1986. /*
  1987. * Parse cache data device (be it pmem or ssd)
  1988. */
  1989. string = dm_shift_arg(&as);
  1990. if (!string)
  1991. goto bad_arguments;
  1992. r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->ssd_dev);
  1993. if (r) {
  1994. ti->error = "Cache data device lookup failed";
  1995. goto bad;
  1996. }
  1997. wc->memory_map_size = bdev_nr_bytes(wc->ssd_dev->bdev);
  1998. /*
  1999. * Parse the cache block size
  2000. */
  2001. string = dm_shift_arg(&as);
  2002. if (!string)
  2003. goto bad_arguments;
  2004. if (sscanf(string, "%u%c", &wc->block_size, &dummy) != 1 ||
  2005. wc->block_size < 512 || wc->block_size > PAGE_SIZE ||
  2006. (wc->block_size & (wc->block_size - 1))) {
  2007. r = -EINVAL;
  2008. ti->error = "Invalid block size";
  2009. goto bad;
  2010. }
  2011. if (wc->block_size < bdev_logical_block_size(wc->dev->bdev) ||
  2012. wc->block_size < bdev_logical_block_size(wc->ssd_dev->bdev)) {
  2013. r = -EINVAL;
  2014. ti->error = "Block size is smaller than device logical block size";
  2015. goto bad;
  2016. }
  2017. wc->block_size_bits = __ffs(wc->block_size);
  2018. wc->max_writeback_jobs = MAX_WRITEBACK_JOBS;
  2019. wc->autocommit_blocks = !WC_MODE_PMEM(wc) ? AUTOCOMMIT_BLOCKS_SSD : AUTOCOMMIT_BLOCKS_PMEM;
  2020. wc->autocommit_jiffies = msecs_to_jiffies(AUTOCOMMIT_MSEC);
  2021. /*
  2022. * Parse optional arguments
  2023. */
  2024. r = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
  2025. if (r)
  2026. goto bad;
  2027. while (opt_params) {
  2028. string = dm_shift_arg(&as), opt_params--;
  2029. if (!strcasecmp(string, "start_sector") && opt_params >= 1) {
  2030. unsigned long long start_sector;
  2031. string = dm_shift_arg(&as), opt_params--;
  2032. if (sscanf(string, "%llu%c", &start_sector, &dummy) != 1)
  2033. goto invalid_optional;
  2034. wc->start_sector = start_sector;
  2035. wc->start_sector_set = true;
  2036. if (wc->start_sector != start_sector ||
  2037. wc->start_sector >= wc->memory_map_size >> SECTOR_SHIFT)
  2038. goto invalid_optional;
  2039. } else if (!strcasecmp(string, "high_watermark") && opt_params >= 1) {
  2040. string = dm_shift_arg(&as), opt_params--;
  2041. if (sscanf(string, "%d%c", &high_wm_percent, &dummy) != 1)
  2042. goto invalid_optional;
  2043. if (high_wm_percent < 0 || high_wm_percent > 100)
  2044. goto invalid_optional;
  2045. wc->high_wm_percent_value = high_wm_percent;
  2046. wc->high_wm_percent_set = true;
  2047. } else if (!strcasecmp(string, "low_watermark") && opt_params >= 1) {
  2048. string = dm_shift_arg(&as), opt_params--;
  2049. if (sscanf(string, "%d%c", &low_wm_percent, &dummy) != 1)
  2050. goto invalid_optional;
  2051. if (low_wm_percent < 0 || low_wm_percent > 100)
  2052. goto invalid_optional;
  2053. wc->low_wm_percent_value = low_wm_percent;
  2054. wc->low_wm_percent_set = true;
  2055. } else if (!strcasecmp(string, "writeback_jobs") && opt_params >= 1) {
  2056. string = dm_shift_arg(&as), opt_params--;
  2057. if (sscanf(string, "%u%c", &wc->max_writeback_jobs, &dummy) != 1)
  2058. goto invalid_optional;
  2059. wc->max_writeback_jobs_set = true;
  2060. } else if (!strcasecmp(string, "autocommit_blocks") && opt_params >= 1) {
  2061. string = dm_shift_arg(&as), opt_params--;
  2062. if (sscanf(string, "%u%c", &wc->autocommit_blocks, &dummy) != 1)
  2063. goto invalid_optional;
  2064. wc->autocommit_blocks_set = true;
  2065. } else if (!strcasecmp(string, "autocommit_time") && opt_params >= 1) {
  2066. unsigned int autocommit_msecs;
  2067. string = dm_shift_arg(&as), opt_params--;
  2068. if (sscanf(string, "%u%c", &autocommit_msecs, &dummy) != 1)
  2069. goto invalid_optional;
  2070. if (autocommit_msecs > 3600000)
  2071. goto invalid_optional;
  2072. wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs);
  2073. wc->autocommit_time_value = autocommit_msecs;
  2074. wc->autocommit_time_set = true;
  2075. } else if (!strcasecmp(string, "max_age") && opt_params >= 1) {
  2076. unsigned int max_age_msecs;
  2077. string = dm_shift_arg(&as), opt_params--;
  2078. if (sscanf(string, "%u%c", &max_age_msecs, &dummy) != 1)
  2079. goto invalid_optional;
  2080. if (max_age_msecs > 86400000)
  2081. goto invalid_optional;
  2082. wc->max_age = msecs_to_jiffies(max_age_msecs);
  2083. wc->max_age_set = true;
  2084. wc->max_age_value = max_age_msecs;
  2085. } else if (!strcasecmp(string, "cleaner")) {
  2086. wc->cleaner_set = true;
  2087. wc->cleaner = true;
  2088. } else if (!strcasecmp(string, "fua")) {
  2089. if (WC_MODE_PMEM(wc)) {
  2090. wc->writeback_fua = true;
  2091. wc->writeback_fua_set = true;
  2092. } else
  2093. goto invalid_optional;
  2094. } else if (!strcasecmp(string, "nofua")) {
  2095. if (WC_MODE_PMEM(wc)) {
  2096. wc->writeback_fua = false;
  2097. wc->writeback_fua_set = true;
  2098. } else
  2099. goto invalid_optional;
  2100. } else if (!strcasecmp(string, "metadata_only")) {
  2101. wc->metadata_only = true;
  2102. } else if (!strcasecmp(string, "pause_writeback") && opt_params >= 1) {
  2103. unsigned int pause_msecs;
  2104. if (WC_MODE_PMEM(wc))
  2105. goto invalid_optional;
  2106. string = dm_shift_arg(&as), opt_params--;
  2107. if (sscanf(string, "%u%c", &pause_msecs, &dummy) != 1)
  2108. goto invalid_optional;
  2109. if (pause_msecs > 60000)
  2110. goto invalid_optional;
  2111. wc->pause = msecs_to_jiffies(pause_msecs);
  2112. wc->pause_set = true;
  2113. wc->pause_value = pause_msecs;
  2114. } else {
  2115. invalid_optional:
  2116. r = -EINVAL;
  2117. ti->error = "Invalid optional argument";
  2118. goto bad;
  2119. }
  2120. }
  2121. if (high_wm_percent < low_wm_percent) {
  2122. r = -EINVAL;
  2123. ti->error = "High watermark must be greater than or equal to low watermark";
  2124. goto bad;
  2125. }
  2126. if (WC_MODE_PMEM(wc)) {
  2127. if (!dax_synchronous(wc->ssd_dev->dax_dev)) {
  2128. r = -EOPNOTSUPP;
  2129. ti->error = "Asynchronous persistent memory not supported as pmem cache";
  2130. goto bad;
  2131. }
  2132. r = persistent_memory_claim(wc);
  2133. if (r) {
  2134. ti->error = "Unable to map persistent memory for cache";
  2135. goto bad;
  2136. }
  2137. } else {
  2138. size_t n_blocks, n_metadata_blocks;
  2139. uint64_t n_bitmap_bits;
  2140. wc->memory_map_size -= (uint64_t)wc->start_sector << SECTOR_SHIFT;
  2141. bio_list_init(&wc->flush_list);
  2142. wc->flush_thread = kthread_run(writecache_flush_thread, wc, "dm_writecache_flush");
  2143. if (IS_ERR(wc->flush_thread)) {
  2144. r = PTR_ERR(wc->flush_thread);
  2145. wc->flush_thread = NULL;
  2146. ti->error = "Couldn't spawn flush thread";
  2147. goto bad;
  2148. }
  2149. r = calculate_memory_size(wc->memory_map_size, wc->block_size,
  2150. &n_blocks, &n_metadata_blocks);
  2151. if (r) {
  2152. ti->error = "Invalid device size";
  2153. goto bad;
  2154. }
  2155. n_bitmap_bits = (((uint64_t)n_metadata_blocks << wc->block_size_bits) +
  2156. BITMAP_GRANULARITY - 1) / BITMAP_GRANULARITY;
  2157. /* this is limitation of test_bit functions */
  2158. if (n_bitmap_bits > 1U << 31) {
  2159. r = -EFBIG;
  2160. ti->error = "Invalid device size";
  2161. goto bad;
  2162. }
  2163. wc->memory_map = vmalloc(n_metadata_blocks << wc->block_size_bits);
  2164. if (!wc->memory_map) {
  2165. r = -ENOMEM;
  2166. ti->error = "Unable to allocate memory for metadata";
  2167. goto bad;
  2168. }
  2169. wc->dm_kcopyd = dm_kcopyd_client_create(&dm_kcopyd_throttle);
  2170. if (IS_ERR(wc->dm_kcopyd)) {
  2171. r = PTR_ERR(wc->dm_kcopyd);
  2172. ti->error = "Unable to allocate dm-kcopyd client";
  2173. wc->dm_kcopyd = NULL;
  2174. goto bad;
  2175. }
  2176. wc->metadata_sectors = n_metadata_blocks << (wc->block_size_bits - SECTOR_SHIFT);
  2177. wc->dirty_bitmap_size = (n_bitmap_bits + BITS_PER_LONG - 1) /
  2178. BITS_PER_LONG * sizeof(unsigned long);
  2179. wc->dirty_bitmap = vzalloc(wc->dirty_bitmap_size);
  2180. if (!wc->dirty_bitmap) {
  2181. r = -ENOMEM;
  2182. ti->error = "Unable to allocate dirty bitmap";
  2183. goto bad;
  2184. }
  2185. r = writecache_read_metadata(wc, wc->block_size >> SECTOR_SHIFT);
  2186. if (r) {
  2187. ti->error = "Unable to read first block of metadata";
  2188. goto bad;
  2189. }
  2190. }
  2191. r = copy_mc_to_kernel(&s, sb(wc), sizeof(struct wc_memory_superblock));
  2192. if (r) {
  2193. ti->error = "Hardware memory error when reading superblock";
  2194. goto bad;
  2195. }
  2196. if (!le32_to_cpu(s.magic) && !le32_to_cpu(s.version)) {
  2197. r = init_memory(wc);
  2198. if (r) {
  2199. ti->error = "Unable to initialize device";
  2200. goto bad;
  2201. }
  2202. r = copy_mc_to_kernel(&s, sb(wc),
  2203. sizeof(struct wc_memory_superblock));
  2204. if (r) {
  2205. ti->error = "Hardware memory error when reading superblock";
  2206. goto bad;
  2207. }
  2208. }
  2209. if (le32_to_cpu(s.magic) != MEMORY_SUPERBLOCK_MAGIC) {
  2210. ti->error = "Invalid magic in the superblock";
  2211. r = -EINVAL;
  2212. goto bad;
  2213. }
  2214. if (le32_to_cpu(s.version) != MEMORY_SUPERBLOCK_VERSION) {
  2215. ti->error = "Invalid version in the superblock";
  2216. r = -EINVAL;
  2217. goto bad;
  2218. }
  2219. if (le32_to_cpu(s.block_size) != wc->block_size) {
  2220. ti->error = "Block size does not match superblock";
  2221. r = -EINVAL;
  2222. goto bad;
  2223. }
  2224. wc->n_blocks = le64_to_cpu(s.n_blocks);
  2225. offset = wc->n_blocks * sizeof(struct wc_memory_entry);
  2226. if (offset / sizeof(struct wc_memory_entry) != le64_to_cpu(sb(wc)->n_blocks)) {
  2227. overflow:
  2228. ti->error = "Overflow in size calculation";
  2229. r = -EINVAL;
  2230. goto bad;
  2231. }
  2232. offset += sizeof(struct wc_memory_superblock);
  2233. if (offset < sizeof(struct wc_memory_superblock))
  2234. goto overflow;
  2235. offset = (offset + wc->block_size - 1) & ~(size_t)(wc->block_size - 1);
  2236. data_size = wc->n_blocks * (size_t)wc->block_size;
  2237. if (!offset || (data_size / wc->block_size != wc->n_blocks) ||
  2238. (offset + data_size < offset))
  2239. goto overflow;
  2240. if (offset + data_size > wc->memory_map_size) {
  2241. ti->error = "Memory area is too small";
  2242. r = -EINVAL;
  2243. goto bad;
  2244. }
  2245. wc->metadata_sectors = offset >> SECTOR_SHIFT;
  2246. wc->block_start = (char *)sb(wc) + offset;
  2247. x = (uint64_t)wc->n_blocks * (100 - high_wm_percent);
  2248. x += 50;
  2249. do_div(x, 100);
  2250. wc->freelist_high_watermark = x;
  2251. x = (uint64_t)wc->n_blocks * (100 - low_wm_percent);
  2252. x += 50;
  2253. do_div(x, 100);
  2254. wc->freelist_low_watermark = x;
  2255. if (wc->cleaner)
  2256. activate_cleaner(wc);
  2257. r = writecache_alloc_entries(wc);
  2258. if (r) {
  2259. ti->error = "Cannot allocate memory";
  2260. goto bad;
  2261. }
  2262. ti->num_flush_bios = WC_MODE_PMEM(wc) ? 1 : 2;
  2263. ti->flush_supported = true;
  2264. ti->num_discard_bios = 1;
  2265. if (WC_MODE_PMEM(wc))
  2266. persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);
  2267. return 0;
  2268. bad_arguments:
  2269. r = -EINVAL;
  2270. ti->error = "Bad arguments";
  2271. bad:
  2272. writecache_dtr(ti);
  2273. return r;
  2274. }
  2275. static void writecache_status(struct dm_target *ti, status_type_t type,
  2276. unsigned int status_flags, char *result, unsigned int maxlen)
  2277. {
  2278. struct dm_writecache *wc = ti->private;
  2279. unsigned int extra_args;
  2280. unsigned int sz = 0;
  2281. switch (type) {
  2282. case STATUSTYPE_INFO:
  2283. DMEMIT("%ld %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu",
  2284. writecache_has_error(wc),
  2285. (unsigned long long)wc->n_blocks, (unsigned long long)wc->freelist_size,
  2286. (unsigned long long)wc->writeback_size,
  2287. wc->stats.reads,
  2288. wc->stats.read_hits,
  2289. wc->stats.writes,
  2290. wc->stats.write_hits_uncommitted,
  2291. wc->stats.write_hits_committed,
  2292. wc->stats.writes_around,
  2293. wc->stats.writes_allocate,
  2294. wc->stats.writes_blocked_on_freelist,
  2295. wc->stats.flushes,
  2296. wc->stats.discards);
  2297. break;
  2298. case STATUSTYPE_TABLE:
  2299. DMEMIT("%c %s %s %u ", WC_MODE_PMEM(wc) ? 'p' : 's',
  2300. wc->dev->name, wc->ssd_dev->name, wc->block_size);
  2301. extra_args = 0;
  2302. if (wc->start_sector_set)
  2303. extra_args += 2;
  2304. if (wc->high_wm_percent_set)
  2305. extra_args += 2;
  2306. if (wc->low_wm_percent_set)
  2307. extra_args += 2;
  2308. if (wc->max_writeback_jobs_set)
  2309. extra_args += 2;
  2310. if (wc->autocommit_blocks_set)
  2311. extra_args += 2;
  2312. if (wc->autocommit_time_set)
  2313. extra_args += 2;
  2314. if (wc->max_age_set)
  2315. extra_args += 2;
  2316. if (wc->cleaner_set)
  2317. extra_args++;
  2318. if (wc->writeback_fua_set)
  2319. extra_args++;
  2320. if (wc->metadata_only)
  2321. extra_args++;
  2322. if (wc->pause_set)
  2323. extra_args += 2;
  2324. DMEMIT("%u", extra_args);
  2325. if (wc->start_sector_set)
  2326. DMEMIT(" start_sector %llu", (unsigned long long)wc->start_sector);
  2327. if (wc->high_wm_percent_set)
  2328. DMEMIT(" high_watermark %u", wc->high_wm_percent_value);
  2329. if (wc->low_wm_percent_set)
  2330. DMEMIT(" low_watermark %u", wc->low_wm_percent_value);
  2331. if (wc->max_writeback_jobs_set)
  2332. DMEMIT(" writeback_jobs %u", wc->max_writeback_jobs);
  2333. if (wc->autocommit_blocks_set)
  2334. DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks);
  2335. if (wc->autocommit_time_set)
  2336. DMEMIT(" autocommit_time %u", wc->autocommit_time_value);
  2337. if (wc->max_age_set)
  2338. DMEMIT(" max_age %u", wc->max_age_value);
  2339. if (wc->cleaner_set)
  2340. DMEMIT(" cleaner");
  2341. if (wc->writeback_fua_set)
  2342. DMEMIT(" %sfua", wc->writeback_fua ? "" : "no");
  2343. if (wc->metadata_only)
  2344. DMEMIT(" metadata_only");
  2345. if (wc->pause_set)
  2346. DMEMIT(" pause_writeback %u", wc->pause_value);
  2347. break;
  2348. case STATUSTYPE_IMA:
  2349. *result = '\0';
  2350. break;
  2351. }
  2352. }
  2353. static struct target_type writecache_target = {
  2354. .name = "writecache",
  2355. .version = {1, 6, 0},
  2356. .module = THIS_MODULE,
  2357. .ctr = writecache_ctr,
  2358. .dtr = writecache_dtr,
  2359. .status = writecache_status,
  2360. .postsuspend = writecache_suspend,
  2361. .resume = writecache_resume,
  2362. .message = writecache_message,
  2363. .map = writecache_map,
  2364. .end_io = writecache_end_io,
  2365. .iterate_devices = writecache_iterate_devices,
  2366. .io_hints = writecache_io_hints,
  2367. };
  2368. module_dm(writecache);
  2369. MODULE_DESCRIPTION(DM_NAME " writecache target");
  2370. MODULE_AUTHOR("Mikulas Patocka <dm-devel@lists.linux.dev>");
  2371. MODULE_LICENSE("GPL");