dm-delay.c 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * Copyright (C) 2005-2007 Red Hat GmbH
  4. *
  5. * A target that delays reads and/or writes and can send
  6. * them to different devices.
  7. *
  8. * This file is released under the GPL.
  9. */
  10. #include <linux/module.h>
  11. #include <linux/init.h>
  12. #include <linux/blkdev.h>
  13. #include <linux/bio.h>
  14. #include <linux/slab.h>
  15. #include <linux/kthread.h>
  16. #include <linux/delay.h>
  17. #include <linux/device-mapper.h>
  /* Message prefix for this target (presumably picked up by DMERR() and friends). */
  #define DM_MSG_PREFIX "delay"

  /*
   * Shift applied to the smallest configured delay in delay_ctr() when
   * choosing how long the flush worker sleeps between polls.
   */
  #define SLEEP_SHIFT 3
  20. struct delay_class {
  21. struct dm_dev *dev;
  22. sector_t start;
  23. unsigned int delay;
  24. unsigned int ops;
  25. };
  26. struct delay_c {
  27. struct timer_list delay_timer;
  28. struct mutex process_bios_lock; /* hold while removing bios to be processed from list */
  29. spinlock_t delayed_bios_lock; /* hold on all accesses to delayed_bios list */
  30. struct workqueue_struct *kdelayd_wq;
  31. struct work_struct flush_expired_bios;
  32. struct list_head delayed_bios;
  33. struct task_struct *worker;
  34. unsigned int worker_sleep_us;
  35. bool may_delay;
  36. struct delay_class read;
  37. struct delay_class write;
  38. struct delay_class flush;
  39. int argc;
  40. };
  41. struct dm_delay_info {
  42. struct delay_c *context;
  43. struct delay_class *class;
  44. struct list_head list;
  45. unsigned long expires;
  46. };
  47. static void handle_delayed_timer(struct timer_list *t)
  48. {
  49. struct delay_c *dc = timer_container_of(dc, t, delay_timer);
  50. queue_work(dc->kdelayd_wq, &dc->flush_expired_bios);
  51. }
  52. static void queue_timeout(struct delay_c *dc, unsigned long expires)
  53. {
  54. timer_reduce(&dc->delay_timer, expires);
  55. }
  56. static inline bool delay_is_fast(struct delay_c *dc)
  57. {
  58. return !!dc->worker;
  59. }
  60. static void flush_bios(struct bio *bio)
  61. {
  62. struct bio *n;
  63. while (bio) {
  64. n = bio->bi_next;
  65. bio->bi_next = NULL;
  66. dm_submit_bio_remap(bio, NULL);
  67. bio = n;
  68. }
  69. }
  70. static void flush_delayed_bios(struct delay_c *dc, bool flush_all)
  71. {
  72. struct dm_delay_info *delayed, *next;
  73. struct bio_list flush_bio_list;
  74. LIST_HEAD(local_list);
  75. unsigned long next_expires = 0;
  76. bool start_timer = false;
  77. bio_list_init(&flush_bio_list);
  78. mutex_lock(&dc->process_bios_lock);
  79. spin_lock(&dc->delayed_bios_lock);
  80. list_replace_init(&dc->delayed_bios, &local_list);
  81. spin_unlock(&dc->delayed_bios_lock);
  82. list_for_each_entry_safe(delayed, next, &local_list, list) {
  83. cond_resched();
  84. if (flush_all || time_after_eq(jiffies, delayed->expires)) {
  85. struct bio *bio = dm_bio_from_per_bio_data(delayed,
  86. sizeof(struct dm_delay_info));
  87. list_del(&delayed->list);
  88. bio_list_add(&flush_bio_list, bio);
  89. delayed->class->ops--;
  90. continue;
  91. }
  92. if (!delay_is_fast(dc)) {
  93. if (!start_timer) {
  94. start_timer = true;
  95. next_expires = delayed->expires;
  96. } else {
  97. next_expires = min(next_expires, delayed->expires);
  98. }
  99. }
  100. }
  101. spin_lock(&dc->delayed_bios_lock);
  102. list_splice(&local_list, &dc->delayed_bios);
  103. spin_unlock(&dc->delayed_bios_lock);
  104. mutex_unlock(&dc->process_bios_lock);
  105. if (start_timer)
  106. queue_timeout(dc, next_expires);
  107. flush_bios(bio_list_get(&flush_bio_list));
  108. }
  109. static int flush_worker_fn(void *data)
  110. {
  111. struct delay_c *dc = data;
  112. while (!kthread_should_stop()) {
  113. flush_delayed_bios(dc, false);
  114. spin_lock(&dc->delayed_bios_lock);
  115. if (unlikely(list_empty(&dc->delayed_bios))) {
  116. set_current_state(TASK_INTERRUPTIBLE);
  117. spin_unlock(&dc->delayed_bios_lock);
  118. schedule();
  119. } else {
  120. spin_unlock(&dc->delayed_bios_lock);
  121. fsleep(dc->worker_sleep_us);
  122. cond_resched();
  123. }
  124. }
  125. return 0;
  126. }
  127. static void flush_expired_bios(struct work_struct *work)
  128. {
  129. struct delay_c *dc;
  130. dc = container_of(work, struct delay_c, flush_expired_bios);
  131. flush_delayed_bios(dc, false);
  132. }
  133. static void delay_dtr(struct dm_target *ti)
  134. {
  135. struct delay_c *dc = ti->private;
  136. if (dc->kdelayd_wq) {
  137. timer_shutdown_sync(&dc->delay_timer);
  138. destroy_workqueue(dc->kdelayd_wq);
  139. }
  140. if (dc->read.dev)
  141. dm_put_device(ti, dc->read.dev);
  142. if (dc->write.dev)
  143. dm_put_device(ti, dc->write.dev);
  144. if (dc->flush.dev)
  145. dm_put_device(ti, dc->flush.dev);
  146. if (dc->worker)
  147. kthread_stop(dc->worker);
  148. mutex_destroy(&dc->process_bios_lock);
  149. kfree(dc);
  150. }
  151. static int delay_class_ctr(struct dm_target *ti, struct delay_class *c, char **argv)
  152. {
  153. int ret;
  154. unsigned long long tmpll;
  155. char dummy;
  156. if (sscanf(argv[1], "%llu%c", &tmpll, &dummy) != 1 || tmpll != (sector_t)tmpll) {
  157. ti->error = "Invalid device sector";
  158. return -EINVAL;
  159. }
  160. c->start = tmpll;
  161. if (sscanf(argv[2], "%u%c", &c->delay, &dummy) != 1) {
  162. ti->error = "Invalid delay";
  163. return -EINVAL;
  164. }
  165. ret = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &c->dev);
  166. if (ret) {
  167. ti->error = "Device lookup failed";
  168. return ret;
  169. }
  170. return 0;
  171. }
  172. /*
  173. * Mapping parameters:
  174. * <device> <offset> <delay> [<write_device> <write_offset> <write_delay>]
  175. *
  176. * With separate write parameters, the first set is only used for reads.
  177. * Offsets are specified in sectors.
  178. * Delays are specified in milliseconds.
  179. */
  180. static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
  181. {
  182. struct delay_c *dc;
  183. int ret;
  184. unsigned int max_delay, min_delay;
  185. if (argc != 3 && argc != 6 && argc != 9) {
  186. ti->error = "Requires exactly 3, 6 or 9 arguments";
  187. return -EINVAL;
  188. }
  189. dc = kzalloc_obj(*dc);
  190. if (!dc) {
  191. ti->error = "Cannot allocate context";
  192. return -ENOMEM;
  193. }
  194. ti->private = dc;
  195. INIT_LIST_HEAD(&dc->delayed_bios);
  196. mutex_init(&dc->process_bios_lock);
  197. spin_lock_init(&dc->delayed_bios_lock);
  198. dc->may_delay = true;
  199. dc->argc = argc;
  200. ret = delay_class_ctr(ti, &dc->read, argv);
  201. if (ret)
  202. goto bad;
  203. min_delay = max_delay = dc->read.delay;
  204. if (argc == 3) {
  205. ret = delay_class_ctr(ti, &dc->write, argv);
  206. if (ret)
  207. goto bad;
  208. ret = delay_class_ctr(ti, &dc->flush, argv);
  209. if (ret)
  210. goto bad;
  211. goto out;
  212. }
  213. ret = delay_class_ctr(ti, &dc->write, argv + 3);
  214. if (ret)
  215. goto bad;
  216. max_delay = max(max_delay, dc->write.delay);
  217. min_delay = min_not_zero(min_delay, dc->write.delay);
  218. if (argc == 6) {
  219. ret = delay_class_ctr(ti, &dc->flush, argv + 3);
  220. if (ret)
  221. goto bad;
  222. goto out;
  223. }
  224. ret = delay_class_ctr(ti, &dc->flush, argv + 6);
  225. if (ret)
  226. goto bad;
  227. max_delay = max(max_delay, dc->flush.delay);
  228. min_delay = min_not_zero(min_delay, dc->flush.delay);
  229. out:
  230. if (max_delay < 50) {
  231. if (min_delay >> SLEEP_SHIFT)
  232. dc->worker_sleep_us = 1000;
  233. else
  234. dc->worker_sleep_us = (min_delay * 1000) >> SLEEP_SHIFT;
  235. /*
  236. * In case of small requested delays, use kthread instead of
  237. * timers and workqueue to achieve better latency.
  238. */
  239. dc->worker = kthread_run(&flush_worker_fn, dc, "dm-delay-flush-worker");
  240. if (IS_ERR(dc->worker)) {
  241. ret = PTR_ERR(dc->worker);
  242. dc->worker = NULL;
  243. goto bad;
  244. }
  245. } else {
  246. timer_setup(&dc->delay_timer, handle_delayed_timer, 0);
  247. INIT_WORK(&dc->flush_expired_bios, flush_expired_bios);
  248. dc->kdelayd_wq = alloc_workqueue("kdelayd",
  249. WQ_MEM_RECLAIM | WQ_PERCPU,
  250. 0);
  251. if (!dc->kdelayd_wq) {
  252. ret = -EINVAL;
  253. DMERR("Couldn't start kdelayd");
  254. goto bad;
  255. }
  256. }
  257. ti->num_flush_bios = 1;
  258. ti->num_discard_bios = 1;
  259. ti->accounts_remapped_io = true;
  260. ti->per_io_data_size = sizeof(struct dm_delay_info);
  261. return 0;
  262. bad:
  263. delay_dtr(ti);
  264. return ret;
  265. }
  266. static int delay_bio(struct delay_c *dc, struct delay_class *c, struct bio *bio)
  267. {
  268. struct dm_delay_info *delayed;
  269. unsigned long expires = 0;
  270. if (!c->delay)
  271. return DM_MAPIO_REMAPPED;
  272. delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info));
  273. delayed->context = dc;
  274. delayed->expires = expires = jiffies + msecs_to_jiffies(c->delay);
  275. spin_lock(&dc->delayed_bios_lock);
  276. if (unlikely(!dc->may_delay)) {
  277. spin_unlock(&dc->delayed_bios_lock);
  278. return DM_MAPIO_REMAPPED;
  279. }
  280. c->ops++;
  281. list_add_tail(&delayed->list, &dc->delayed_bios);
  282. spin_unlock(&dc->delayed_bios_lock);
  283. if (delay_is_fast(dc))
  284. wake_up_process(dc->worker);
  285. else
  286. queue_timeout(dc, expires);
  287. return DM_MAPIO_SUBMITTED;
  288. }
  289. static void delay_presuspend(struct dm_target *ti)
  290. {
  291. struct delay_c *dc = ti->private;
  292. spin_lock(&dc->delayed_bios_lock);
  293. dc->may_delay = false;
  294. spin_unlock(&dc->delayed_bios_lock);
  295. if (!delay_is_fast(dc))
  296. timer_delete(&dc->delay_timer);
  297. flush_delayed_bios(dc, true);
  298. }
  299. static void delay_resume(struct dm_target *ti)
  300. {
  301. struct delay_c *dc = ti->private;
  302. dc->may_delay = true;
  303. }
  304. static int delay_map(struct dm_target *ti, struct bio *bio)
  305. {
  306. struct delay_c *dc = ti->private;
  307. struct delay_class *c;
  308. struct dm_delay_info *delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info));
  309. if (bio_data_dir(bio) == WRITE) {
  310. if (unlikely(bio->bi_opf & REQ_PREFLUSH))
  311. c = &dc->flush;
  312. else
  313. c = &dc->write;
  314. } else {
  315. c = &dc->read;
  316. }
  317. delayed->class = c;
  318. bio_set_dev(bio, c->dev->bdev);
  319. bio->bi_iter.bi_sector = c->start + dm_target_offset(ti, bio->bi_iter.bi_sector);
  320. return delay_bio(dc, c, bio);
  321. }
  #ifdef CONFIG_BLK_DEV_ZONED
  /*
   * Zone report hook: forward the request to the read class's device,
   * translating the target-relative sector into a device sector.
   */
  static int delay_report_zones(struct dm_target *ti,
                                struct dm_report_zones_args *args,
                                unsigned int nr_zones)
  {
      struct delay_c *dc = ti->private;
      struct block_device *bdev = dc->read.dev->bdev;
      sector_t base = dc->read.start;
      sector_t first = base + dm_target_offset(ti, args->next_sector);

      return dm_report_zones(bdev, base, first, args, nr_zones);
  }
  #else
  /* Without zoned block device support there is no report_zones hook. */
  #define delay_report_zones NULL
  #endif
  /*
   * Emit "<device name> <start sector> <delay>" for delay class @c.
   * Expands to DMEMIT(), so it is only usable where DMEMIT() is
   * (see delay_status()).
   */
  #define DMEMIT_DELAY_CLASS(c) \
      DMEMIT("%s %llu %u", (c)->dev->name, (unsigned long long)(c)->start, (c)->delay)
  337. static void delay_status(struct dm_target *ti, status_type_t type,
  338. unsigned int status_flags, char *result, unsigned int maxlen)
  339. {
  340. struct delay_c *dc = ti->private;
  341. int sz = 0;
  342. switch (type) {
  343. case STATUSTYPE_INFO:
  344. DMEMIT("%u %u %u", dc->read.ops, dc->write.ops, dc->flush.ops);
  345. break;
  346. case STATUSTYPE_TABLE:
  347. DMEMIT_DELAY_CLASS(&dc->read);
  348. if (dc->argc >= 6) {
  349. DMEMIT(" ");
  350. DMEMIT_DELAY_CLASS(&dc->write);
  351. }
  352. if (dc->argc >= 9) {
  353. DMEMIT(" ");
  354. DMEMIT_DELAY_CLASS(&dc->flush);
  355. }
  356. break;
  357. case STATUSTYPE_IMA:
  358. *result = '\0';
  359. break;
  360. }
  361. }
  362. static int delay_iterate_devices(struct dm_target *ti,
  363. iterate_devices_callout_fn fn, void *data)
  364. {
  365. struct delay_c *dc = ti->private;
  366. int ret = 0;
  367. ret = fn(ti, dc->read.dev, dc->read.start, ti->len, data);
  368. if (ret)
  369. goto out;
  370. ret = fn(ti, dc->write.dev, dc->write.start, ti->len, data);
  371. if (ret)
  372. goto out;
  373. ret = fn(ti, dc->flush.dev, dc->flush.start, ti->len, data);
  374. if (ret)
  375. goto out;
  376. out:
  377. return ret;
  378. }
  379. static struct target_type delay_target = {
  380. .name = "delay",
  381. .version = {1, 5, 0},
  382. .features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_ZONED_HM,
  383. .module = THIS_MODULE,
  384. .ctr = delay_ctr,
  385. .dtr = delay_dtr,
  386. .map = delay_map,
  387. .report_zones = delay_report_zones,
  388. .presuspend = delay_presuspend,
  389. .resume = delay_resume,
  390. .status = delay_status,
  391. .iterate_devices = delay_iterate_devices,
  392. };
  393. module_dm(delay);
  394. MODULE_DESCRIPTION(DM_NAME " delay target");
  395. MODULE_AUTHOR("Heinz Mauelshagen <mauelshagen@redhat.com>");
  396. MODULE_LICENSE("GPL");