stripe.c 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396
  1. // SPDX-License-Identifier: GPL-2.0
  2. #include "kublk.h"
  /* Upper bound on the number of per-file stripes tracked for one request. */
  #define NR_STRIPE MAX_BACK_FILES

  /*
   * Stripe geometry of one device. An instance is allocated in the target
   * init path and hung off dev->private_data.
   */
  struct stripe_conf {
      unsigned int nr_files;  /* number of backing files striped across */
      unsigned int shift;     /* log2 of the chunk size in bytes */
  };
  8. struct stripe {
  9. loff_t start;
  10. unsigned nr_sects;
  11. int seq;
  12. struct iovec *vec;
  13. unsigned nr_vec;
  14. unsigned cap;
  15. };
  16. struct stripe_array {
  17. struct stripe s[NR_STRIPE];
  18. unsigned nr;
  19. struct iovec _vec[];
  20. };
  21. static inline const struct stripe_conf *get_chunk_shift(const struct ublk_queue *q)
  22. {
  23. return (struct stripe_conf *)q->dev->private_data;
  24. }
  25. static inline unsigned calculate_nr_vec(const struct stripe_conf *conf,
  26. const struct ublksrv_io_desc *iod)
  27. {
  28. const unsigned shift = conf->shift - 9;
  29. const unsigned unit_sects = conf->nr_files << shift;
  30. loff_t start = iod->start_sector;
  31. loff_t end = start + iod->nr_sectors;
  32. return (end / unit_sects) - (start / unit_sects) + 1;
  33. }
  34. static struct stripe_array *alloc_stripe_array(const struct stripe_conf *conf,
  35. const struct ublksrv_io_desc *iod)
  36. {
  37. unsigned nr_vecs = calculate_nr_vec(conf, iod);
  38. unsigned total = nr_vecs * conf->nr_files;
  39. struct stripe_array *s;
  40. int i;
  41. s = malloc(sizeof(*s) + total * sizeof(struct iovec));
  42. s->nr = 0;
  43. for (i = 0; i < conf->nr_files; i++) {
  44. struct stripe *t = &s->s[i];
  45. t->nr_vec = 0;
  46. t->vec = &s->_vec[i * nr_vecs];
  47. t->nr_sects = 0;
  48. t->cap = nr_vecs;
  49. }
  50. return s;
  51. }
  /*
   * Release an array obtained from alloc_stripe_array(). The header and the
   * iovec storage are one allocation, so a single free() covers both.
   */
  static void free_stripe_array(struct stripe_array *s)
  {
      free(s);
  }
  56. static void calculate_stripe_array(const struct stripe_conf *conf,
  57. const struct ublksrv_io_desc *iod, struct stripe_array *s, void *base)
  58. {
  59. const unsigned shift = conf->shift - 9;
  60. const unsigned chunk_sects = 1 << shift;
  61. const unsigned unit_sects = conf->nr_files << shift;
  62. off64_t start = iod->start_sector;
  63. off64_t end = start + iod->nr_sectors;
  64. unsigned long done = 0;
  65. unsigned idx = 0;
  66. while (start < end) {
  67. unsigned nr_sects = chunk_sects - (start & (chunk_sects - 1));
  68. loff_t unit_off = (start / unit_sects) * unit_sects;
  69. unsigned seq = (start - unit_off) >> shift;
  70. struct stripe *this = &s->s[idx];
  71. loff_t stripe_off = (unit_off / conf->nr_files) +
  72. (start & (chunk_sects - 1));
  73. if (nr_sects > end - start)
  74. nr_sects = end - start;
  75. if (this->nr_sects == 0) {
  76. this->nr_sects = nr_sects;
  77. this->start = stripe_off;
  78. this->seq = seq;
  79. s->nr += 1;
  80. } else {
  81. ublk_assert(seq == this->seq);
  82. ublk_assert(this->start + this->nr_sects == stripe_off);
  83. this->nr_sects += nr_sects;
  84. }
  85. ublk_assert(this->nr_vec < this->cap);
  86. this->vec[this->nr_vec].iov_base = (void *)(base + done);
  87. this->vec[this->nr_vec++].iov_len = nr_sects << 9;
  88. start += nr_sects;
  89. done += nr_sects << 9;
  90. idx = (idx + 1) % conf->nr_files;
  91. }
  92. }
  93. static inline enum io_uring_op stripe_to_uring_op(
  94. const struct ublksrv_io_desc *iod, int zc)
  95. {
  96. unsigned ublk_op = ublksrv_get_op(iod);
  97. if (ublk_op == UBLK_IO_OP_READ)
  98. return zc ? IORING_OP_READV_FIXED : IORING_OP_READV;
  99. else if (ublk_op == UBLK_IO_OP_WRITE)
  100. return zc ? IORING_OP_WRITEV_FIXED : IORING_OP_WRITEV;
  101. ublk_assert(0);
  102. }
  103. static int stripe_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q,
  104. const struct ublksrv_io_desc *iod, int tag)
  105. {
  106. const struct stripe_conf *conf = get_chunk_shift(q);
  107. unsigned auto_zc = (ublk_queue_use_auto_zc(q) != 0);
  108. unsigned zc = (ublk_queue_use_zc(q) != 0);
  109. enum io_uring_op op = stripe_to_uring_op(iod, zc | auto_zc);
  110. struct io_uring_sqe *sqe[NR_STRIPE];
  111. struct stripe_array *s = alloc_stripe_array(conf, iod);
  112. struct ublk_io *io = ublk_get_io(q, tag);
  113. int i, extra = zc ? 2 : 0;
  114. void *base = io->buf_addr;
  115. unsigned short buf_idx = ublk_io_buf_idx(t, q, tag);
  116. io->private_data = s;
  117. calculate_stripe_array(conf, iod, s, base);
  118. ublk_io_alloc_sqes(t, sqe, s->nr + extra);
  119. if (zc) {
  120. io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, buf_idx);
  121. sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK;
  122. sqe[0]->user_data = build_user_data(tag,
  123. ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1);
  124. }
  125. for (i = zc; i < s->nr + extra - zc; i++) {
  126. struct stripe *t = &s->s[i - zc];
  127. io_uring_prep_rw(op, sqe[i],
  128. t->seq + 1,
  129. (void *)t->vec,
  130. t->nr_vec,
  131. t->start << 9);
  132. io_uring_sqe_set_flags(sqe[i], IOSQE_FIXED_FILE);
  133. if (auto_zc || zc) {
  134. sqe[i]->buf_index = buf_idx;
  135. if (zc)
  136. sqe[i]->flags |= IOSQE_IO_HARDLINK;
  137. }
  138. /* bit63 marks us as tgt io */
  139. sqe[i]->user_data = build_user_data(tag, ublksrv_get_op(iod), i - zc, q->q_id, 1);
  140. }
  141. if (zc) {
  142. struct io_uring_sqe *unreg = sqe[s->nr + 1];
  143. io_uring_prep_buf_unregister(unreg, q, tag, q->q_id, buf_idx);
  144. unreg->user_data = build_user_data(
  145. tag, ublk_cmd_op_nr(unreg->cmd_op), 0, q->q_id, 1);
  146. }
  147. /* register buffer is skip_success */
  148. return s->nr + zc;
  149. }
  150. static int handle_flush(struct ublk_thread *t, struct ublk_queue *q,
  151. const struct ublksrv_io_desc *iod, int tag)
  152. {
  153. const struct stripe_conf *conf = get_chunk_shift(q);
  154. struct io_uring_sqe *sqe[NR_STRIPE];
  155. int i;
  156. ublk_io_alloc_sqes(t, sqe, conf->nr_files);
  157. for (i = 0; i < conf->nr_files; i++) {
  158. io_uring_prep_fsync(sqe[i], i + 1, IORING_FSYNC_DATASYNC);
  159. io_uring_sqe_set_flags(sqe[i], IOSQE_FIXED_FILE);
  160. sqe[i]->user_data = build_user_data(tag, UBLK_IO_OP_FLUSH, 0, q->q_id, 1);
  161. }
  162. return conf->nr_files;
  163. }
  164. static int stripe_queue_tgt_io(struct ublk_thread *t, struct ublk_queue *q,
  165. int tag)
  166. {
  167. const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag);
  168. unsigned ublk_op = ublksrv_get_op(iod);
  169. int ret = 0;
  170. switch (ublk_op) {
  171. case UBLK_IO_OP_FLUSH:
  172. ret = handle_flush(t, q, iod, tag);
  173. break;
  174. case UBLK_IO_OP_WRITE_ZEROES:
  175. case UBLK_IO_OP_DISCARD:
  176. ret = -ENOTSUP;
  177. break;
  178. case UBLK_IO_OP_READ:
  179. case UBLK_IO_OP_WRITE:
  180. ret = stripe_queue_tgt_rw_io(t, q, iod, tag);
  181. break;
  182. default:
  183. ret = -EINVAL;
  184. break;
  185. }
  186. ublk_dbg(UBLK_DBG_IO, "%s: tag %d ublk io %x %llx %u ret %d\n", __func__, tag,
  187. iod->op_flags, iod->start_sector, iod->nr_sectors << 9, ret);
  188. return ret;
  189. }
  /*
   * queue_io hook: queue the target I/O for @tag and hand the outcome (SQE
   * count or negative errno) to ublk_queued_tgt_io(). Always returns 0.
   */
  static int ublk_stripe_queue_io(struct ublk_thread *t, struct ublk_queue *q,
          int tag)
  {
      const int nr_queued = stripe_queue_tgt_io(t, q, tag);

      ublk_queued_tgt_io(t, q, tag, nr_queued);

      return 0;
  }
  197. static void ublk_stripe_io_done(struct ublk_thread *t, struct ublk_queue *q,
  198. const struct io_uring_cqe *cqe)
  199. {
  200. unsigned tag = user_data_to_tag(cqe->user_data);
  201. const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag);
  202. unsigned op = user_data_to_op(cqe->user_data);
  203. struct ublk_io *io = ublk_get_io(q, tag);
  204. int res = cqe->res;
  205. if (res < 0 || op != ublk_cmd_op_nr(UBLK_U_IO_UNREGISTER_IO_BUF)) {
  206. if (!io->result)
  207. io->result = res;
  208. if (res < 0)
  209. ublk_err("%s: io failure %d tag %u\n", __func__, res, tag);
  210. }
  211. /* buffer register op is IOSQE_CQE_SKIP_SUCCESS */
  212. if (op == ublk_cmd_op_nr(UBLK_U_IO_REGISTER_IO_BUF))
  213. io->tgt_ios += 1;
  214. /* fail short READ/WRITE simply */
  215. if (op == UBLK_IO_OP_READ || op == UBLK_IO_OP_WRITE) {
  216. unsigned seq = user_data_to_tgt_data(cqe->user_data);
  217. struct stripe_array *s = io->private_data;
  218. if (res < s->s[seq].nr_sects << 9) {
  219. io->result = -EIO;
  220. ublk_err("%s: short rw op %u res %d exp %u tag %u\n",
  221. __func__, op, res, s->s[seq].vec->iov_len, tag);
  222. }
  223. }
  224. if (ublk_completed_tgt_io(t, q, tag)) {
  225. int res = io->result;
  226. if (!res)
  227. res = iod->nr_sectors << 9;
  228. ublk_complete_io(t, q, tag, res);
  229. free_stripe_array(io->private_data);
  230. io->private_data = NULL;
  231. }
  232. }
  233. static int ublk_stripe_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev)
  234. {
  235. struct ublk_params p = {
  236. .types = UBLK_PARAM_TYPE_BASIC,
  237. .basic = {
  238. .attrs = UBLK_ATTR_VOLATILE_CACHE,
  239. .logical_bs_shift = 9,
  240. .physical_bs_shift = 12,
  241. .io_opt_shift = 12,
  242. .io_min_shift = 9,
  243. .max_sectors = dev->dev_info.max_io_buf_bytes >> 9,
  244. },
  245. };
  246. unsigned chunk_size = ctx->stripe.chunk_size;
  247. struct stripe_conf *conf;
  248. unsigned chunk_shift;
  249. loff_t bytes = 0;
  250. int ret, i, mul = 1;
  251. if (ctx->auto_zc_fallback) {
  252. ublk_err("%s: not support auto_zc_fallback\n", __func__);
  253. return -EINVAL;
  254. }
  255. if (ctx->metadata_size) {
  256. ublk_err("%s: integrity not supported\n", __func__);
  257. return -EINVAL;
  258. }
  259. if ((chunk_size & (chunk_size - 1)) || !chunk_size) {
  260. ublk_err("invalid chunk size %u\n", chunk_size);
  261. return -EINVAL;
  262. }
  263. if (chunk_size < 4096 || chunk_size > 512 * 1024) {
  264. ublk_err("invalid chunk size %u\n", chunk_size);
  265. return -EINVAL;
  266. }
  267. chunk_shift = ilog2(chunk_size);
  268. ret = backing_file_tgt_init(dev, dev->tgt.nr_backing_files);
  269. if (ret)
  270. return ret;
  271. if (!dev->tgt.nr_backing_files || dev->tgt.nr_backing_files > NR_STRIPE)
  272. return -EINVAL;
  273. ublk_assert(dev->nr_fds == dev->tgt.nr_backing_files + 1);
  274. for (i = 0; i < dev->tgt.nr_backing_files; i++)
  275. dev->tgt.backing_file_size[i] &= ~((1 << chunk_shift) - 1);
  276. for (i = 0; i < dev->tgt.nr_backing_files; i++) {
  277. unsigned long size = dev->tgt.backing_file_size[i];
  278. if (size != dev->tgt.backing_file_size[0])
  279. return -EINVAL;
  280. bytes += size;
  281. }
  282. conf = malloc(sizeof(*conf));
  283. conf->shift = chunk_shift;
  284. conf->nr_files = dev->tgt.nr_backing_files;
  285. dev->private_data = conf;
  286. dev->tgt.dev_size = bytes;
  287. p.basic.dev_sectors = bytes >> 9;
  288. dev->tgt.params = p;
  289. if (dev->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY)
  290. mul = 2;
  291. dev->tgt.sq_depth = mul * dev->dev_info.queue_depth * conf->nr_files;
  292. dev->tgt.cq_depth = mul * dev->dev_info.queue_depth * conf->nr_files;
  293. printf("%s: shift %u files %u\n", __func__, conf->shift, conf->nr_files);
  294. return 0;
  295. }
  296. static void ublk_stripe_tgt_deinit(struct ublk_dev *dev)
  297. {
  298. free(dev->private_data);
  299. backing_file_tgt_deinit(dev);
  300. }
  301. static void ublk_stripe_cmd_line(struct dev_ctx *ctx, int argc, char *argv[])
  302. {
  303. static const struct option longopts[] = {
  304. { "chunk_size", 1, NULL, 0 },
  305. { 0, 0, 0, 0 }
  306. };
  307. int option_idx, opt;
  308. ctx->stripe.chunk_size = 65536;
  309. while ((opt = getopt_long(argc, argv, "",
  310. longopts, &option_idx)) != -1) {
  311. switch (opt) {
  312. case 0:
  313. if (!strcmp(longopts[option_idx].name, "chunk_size"))
  314. ctx->stripe.chunk_size = strtol(optarg, NULL, 10);
  315. }
  316. }
  317. }
  /* usage hook: print the stripe-specific option summary to stdout. */
  static void ublk_stripe_usage(const struct ublk_tgt_ops *ops)
  {
      fputs("\tstripe: [--chunk_size chunk_size (default 65536)]\n", stdout);
  }
  322. const struct ublk_tgt_ops stripe_tgt_ops = {
  323. .name = "stripe",
  324. .init_tgt = ublk_stripe_tgt_init,
  325. .deinit_tgt = ublk_stripe_tgt_deinit,
  326. .queue_io = ublk_stripe_queue_io,
  327. .tgt_io_done = ublk_stripe_io_done,
  328. .parse_cmd_line = ublk_stripe_cmd_line,
  329. .usage = ublk_stripe_usage,
  330. };