kublk.c 48 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019
  1. /* SPDX-License-Identifier: MIT */
  2. /*
  3. * Description: uring_cmd based ublk
  4. */
  5. #include <linux/fs.h>
  6. #include "kublk.h"
  7. #define MAX_NR_TGT_ARG 64
  8. unsigned int ublk_dbg_mask = UBLK_LOG;
  9. static const struct ublk_tgt_ops *tgt_ops_list[] = {
  10. &null_tgt_ops,
  11. &loop_tgt_ops,
  12. &stripe_tgt_ops,
  13. &fault_inject_tgt_ops,
  14. };
  15. static const struct ublk_tgt_ops *ublk_find_tgt(const char *name)
  16. {
  17. int i;
  18. if (name == NULL)
  19. return NULL;
  20. for (i = 0; i < ARRAY_SIZE(tgt_ops_list); i++)
  21. if (strcmp(tgt_ops_list[i]->name, name) == 0)
  22. return tgt_ops_list[i];
  23. return NULL;
  24. }
  25. static inline int ublk_setup_ring(struct io_uring *r, int depth,
  26. int cq_depth, unsigned flags)
  27. {
  28. struct io_uring_params p;
  29. memset(&p, 0, sizeof(p));
  30. p.flags = flags | IORING_SETUP_CQSIZE;
  31. p.cq_entries = cq_depth;
  32. return io_uring_queue_init_params(depth, r, &p);
  33. }
  34. static void ublk_ctrl_init_cmd(struct ublk_dev *dev,
  35. struct io_uring_sqe *sqe,
  36. struct ublk_ctrl_cmd_data *data)
  37. {
  38. struct ublksrv_ctrl_dev_info *info = &dev->dev_info;
  39. struct ublksrv_ctrl_cmd *cmd = (struct ublksrv_ctrl_cmd *)ublk_get_sqe_cmd(sqe);
  40. sqe->fd = dev->ctrl_fd;
  41. sqe->opcode = IORING_OP_URING_CMD;
  42. sqe->ioprio = 0;
  43. if (data->flags & CTRL_CMD_HAS_BUF) {
  44. cmd->addr = data->addr;
  45. cmd->len = data->len;
  46. }
  47. if (data->flags & CTRL_CMD_HAS_DATA)
  48. cmd->data[0] = data->data[0];
  49. cmd->dev_id = info->dev_id;
  50. cmd->queue_id = -1;
  51. ublk_set_sqe_cmd_op(sqe, data->cmd_op);
  52. io_uring_sqe_set_data(sqe, cmd);
  53. }
  54. static int __ublk_ctrl_cmd(struct ublk_dev *dev,
  55. struct ublk_ctrl_cmd_data *data)
  56. {
  57. struct io_uring_sqe *sqe;
  58. struct io_uring_cqe *cqe;
  59. int ret = -EINVAL;
  60. sqe = io_uring_get_sqe(&dev->ring);
  61. if (!sqe) {
  62. ublk_err("%s: can't get sqe ret %d\n", __func__, ret);
  63. return ret;
  64. }
  65. ublk_ctrl_init_cmd(dev, sqe, data);
  66. ret = io_uring_submit(&dev->ring);
  67. if (ret < 0) {
  68. ublk_err("uring submit ret %d\n", ret);
  69. return ret;
  70. }
  71. ret = io_uring_wait_cqe(&dev->ring, &cqe);
  72. if (ret < 0) {
  73. ublk_err("wait cqe: %s\n", strerror(-ret));
  74. return ret;
  75. }
  76. io_uring_cqe_seen(&dev->ring, cqe);
  77. return cqe->res;
  78. }
  79. static int ublk_ctrl_stop_dev(struct ublk_dev *dev)
  80. {
  81. struct ublk_ctrl_cmd_data data = {
  82. .cmd_op = UBLK_U_CMD_STOP_DEV,
  83. };
  84. return __ublk_ctrl_cmd(dev, &data);
  85. }
  86. static int ublk_ctrl_try_stop_dev(struct ublk_dev *dev)
  87. {
  88. struct ublk_ctrl_cmd_data data = {
  89. .cmd_op = UBLK_U_CMD_TRY_STOP_DEV,
  90. };
  91. return __ublk_ctrl_cmd(dev, &data);
  92. }
  93. static int ublk_ctrl_start_dev(struct ublk_dev *dev,
  94. int daemon_pid)
  95. {
  96. struct ublk_ctrl_cmd_data data = {
  97. .cmd_op = UBLK_U_CMD_START_DEV,
  98. .flags = CTRL_CMD_HAS_DATA,
  99. };
  100. dev->dev_info.ublksrv_pid = data.data[0] = daemon_pid;
  101. return __ublk_ctrl_cmd(dev, &data);
  102. }
  103. static int ublk_ctrl_start_user_recovery(struct ublk_dev *dev)
  104. {
  105. struct ublk_ctrl_cmd_data data = {
  106. .cmd_op = UBLK_U_CMD_START_USER_RECOVERY,
  107. };
  108. return __ublk_ctrl_cmd(dev, &data);
  109. }
  110. static int ublk_ctrl_end_user_recovery(struct ublk_dev *dev, int daemon_pid)
  111. {
  112. struct ublk_ctrl_cmd_data data = {
  113. .cmd_op = UBLK_U_CMD_END_USER_RECOVERY,
  114. .flags = CTRL_CMD_HAS_DATA,
  115. };
  116. dev->dev_info.ublksrv_pid = data.data[0] = daemon_pid;
  117. return __ublk_ctrl_cmd(dev, &data);
  118. }
  119. static int ublk_ctrl_add_dev(struct ublk_dev *dev)
  120. {
  121. struct ublk_ctrl_cmd_data data = {
  122. .cmd_op = UBLK_U_CMD_ADD_DEV,
  123. .flags = CTRL_CMD_HAS_BUF,
  124. .addr = (__u64) (uintptr_t) &dev->dev_info,
  125. .len = sizeof(struct ublksrv_ctrl_dev_info),
  126. };
  127. return __ublk_ctrl_cmd(dev, &data);
  128. }
  129. static int ublk_ctrl_del_dev(struct ublk_dev *dev)
  130. {
  131. struct ublk_ctrl_cmd_data data = {
  132. .cmd_op = UBLK_U_CMD_DEL_DEV,
  133. .flags = 0,
  134. };
  135. return __ublk_ctrl_cmd(dev, &data);
  136. }
  137. static int ublk_ctrl_get_info(struct ublk_dev *dev)
  138. {
  139. struct ublk_ctrl_cmd_data data = {
  140. .cmd_op = UBLK_U_CMD_GET_DEV_INFO,
  141. .flags = CTRL_CMD_HAS_BUF,
  142. .addr = (__u64) (uintptr_t) &dev->dev_info,
  143. .len = sizeof(struct ublksrv_ctrl_dev_info),
  144. };
  145. return __ublk_ctrl_cmd(dev, &data);
  146. }
  147. static int ublk_ctrl_set_params(struct ublk_dev *dev,
  148. struct ublk_params *params)
  149. {
  150. struct ublk_ctrl_cmd_data data = {
  151. .cmd_op = UBLK_U_CMD_SET_PARAMS,
  152. .flags = CTRL_CMD_HAS_BUF,
  153. .addr = (__u64) (uintptr_t) params,
  154. .len = sizeof(*params),
  155. };
  156. params->len = sizeof(*params);
  157. return __ublk_ctrl_cmd(dev, &data);
  158. }
  159. static int ublk_ctrl_get_params(struct ublk_dev *dev,
  160. struct ublk_params *params)
  161. {
  162. struct ublk_ctrl_cmd_data data = {
  163. .cmd_op = UBLK_U_CMD_GET_PARAMS,
  164. .flags = CTRL_CMD_HAS_BUF,
  165. .addr = (__u64)params,
  166. .len = sizeof(*params),
  167. };
  168. params->len = sizeof(*params);
  169. return __ublk_ctrl_cmd(dev, &data);
  170. }
  171. static int ublk_ctrl_get_features(struct ublk_dev *dev,
  172. __u64 *features)
  173. {
  174. struct ublk_ctrl_cmd_data data = {
  175. .cmd_op = UBLK_U_CMD_GET_FEATURES,
  176. .flags = CTRL_CMD_HAS_BUF,
  177. .addr = (__u64) (uintptr_t) features,
  178. .len = sizeof(*features),
  179. };
  180. return __ublk_ctrl_cmd(dev, &data);
  181. }
  182. static int ublk_ctrl_update_size(struct ublk_dev *dev,
  183. __u64 nr_sects)
  184. {
  185. struct ublk_ctrl_cmd_data data = {
  186. .cmd_op = UBLK_U_CMD_UPDATE_SIZE,
  187. .flags = CTRL_CMD_HAS_DATA,
  188. };
  189. data.data[0] = nr_sects;
  190. return __ublk_ctrl_cmd(dev, &data);
  191. }
  192. static int ublk_ctrl_quiesce_dev(struct ublk_dev *dev,
  193. unsigned int timeout_ms)
  194. {
  195. struct ublk_ctrl_cmd_data data = {
  196. .cmd_op = UBLK_U_CMD_QUIESCE_DEV,
  197. .flags = CTRL_CMD_HAS_DATA,
  198. };
  199. data.data[0] = timeout_ms;
  200. return __ublk_ctrl_cmd(dev, &data);
  201. }
  202. static const char *ublk_dev_state_desc(struct ublk_dev *dev)
  203. {
  204. switch (dev->dev_info.state) {
  205. case UBLK_S_DEV_DEAD:
  206. return "DEAD";
  207. case UBLK_S_DEV_LIVE:
  208. return "LIVE";
  209. case UBLK_S_DEV_QUIESCED:
  210. return "QUIESCED";
  211. default:
  212. return "UNKNOWN";
  213. };
  214. }
  215. static void ublk_print_cpu_set(const cpu_set_t *set, char *buf, unsigned len)
  216. {
  217. unsigned done = 0;
  218. int i;
  219. for (i = 0; i < CPU_SETSIZE; i++) {
  220. if (CPU_ISSET(i, set))
  221. done += snprintf(&buf[done], len - done, "%d ", i);
  222. }
  223. }
  224. static void ublk_adjust_affinity(cpu_set_t *set)
  225. {
  226. int j, updated = 0;
  227. /*
  228. * Just keep the 1st CPU now.
  229. *
  230. * In future, auto affinity selection can be tried.
  231. */
  232. for (j = 0; j < CPU_SETSIZE; j++) {
  233. if (CPU_ISSET(j, set)) {
  234. if (!updated) {
  235. updated = 1;
  236. continue;
  237. }
  238. CPU_CLR(j, set);
  239. }
  240. }
  241. }
  242. /* Caller must free the allocated buffer */
  243. static int ublk_ctrl_get_affinity(struct ublk_dev *ctrl_dev, cpu_set_t **ptr_buf)
  244. {
  245. struct ublk_ctrl_cmd_data data = {
  246. .cmd_op = UBLK_U_CMD_GET_QUEUE_AFFINITY,
  247. .flags = CTRL_CMD_HAS_DATA | CTRL_CMD_HAS_BUF,
  248. };
  249. cpu_set_t *buf;
  250. int i, ret;
  251. buf = malloc(sizeof(cpu_set_t) * ctrl_dev->dev_info.nr_hw_queues);
  252. if (!buf)
  253. return -ENOMEM;
  254. for (i = 0; i < ctrl_dev->dev_info.nr_hw_queues; i++) {
  255. data.data[0] = i;
  256. data.len = sizeof(cpu_set_t);
  257. data.addr = (__u64)&buf[i];
  258. ret = __ublk_ctrl_cmd(ctrl_dev, &data);
  259. if (ret < 0) {
  260. free(buf);
  261. return ret;
  262. }
  263. ublk_adjust_affinity(&buf[i]);
  264. }
  265. *ptr_buf = buf;
  266. return 0;
  267. }
  268. static void ublk_ctrl_dump(struct ublk_dev *dev)
  269. {
  270. struct ublksrv_ctrl_dev_info *info = &dev->dev_info;
  271. struct ublk_params p;
  272. cpu_set_t *affinity;
  273. int ret;
  274. ret = ublk_ctrl_get_params(dev, &p);
  275. if (ret < 0) {
  276. ublk_err("failed to get params %d %s\n", ret, strerror(-ret));
  277. return;
  278. }
  279. ret = ublk_ctrl_get_affinity(dev, &affinity);
  280. if (ret < 0) {
  281. ublk_err("failed to get affinity %m\n");
  282. return;
  283. }
  284. ublk_log("dev id %d: nr_hw_queues %d queue_depth %d block size %d dev_capacity %lld\n",
  285. info->dev_id, info->nr_hw_queues, info->queue_depth,
  286. 1 << p.basic.logical_bs_shift, p.basic.dev_sectors);
  287. ublk_log("\tmax rq size %d daemon pid %d flags 0x%llx state %s\n",
  288. info->max_io_buf_bytes, info->ublksrv_pid, info->flags,
  289. ublk_dev_state_desc(dev));
  290. if (affinity) {
  291. char buf[512];
  292. int i;
  293. for (i = 0; i < info->nr_hw_queues; i++) {
  294. ublk_print_cpu_set(&affinity[i], buf, sizeof(buf));
  295. printf("\tqueue %u: affinity(%s)\n",
  296. i, buf);
  297. }
  298. free(affinity);
  299. }
  300. fflush(stdout);
  301. }
  302. static void ublk_ctrl_deinit(struct ublk_dev *dev)
  303. {
  304. close(dev->ctrl_fd);
  305. free(dev);
  306. }
  307. static struct ublk_dev *ublk_ctrl_init(void)
  308. {
  309. struct ublk_dev *dev = (struct ublk_dev *)calloc(1, sizeof(*dev));
  310. struct ublksrv_ctrl_dev_info *info = &dev->dev_info;
  311. int ret;
  312. dev->ctrl_fd = open(CTRL_DEV, O_RDWR);
  313. if (dev->ctrl_fd < 0) {
  314. free(dev);
  315. return NULL;
  316. }
  317. info->max_io_buf_bytes = UBLK_IO_MAX_BYTES;
  318. ret = ublk_setup_ring(&dev->ring, UBLK_CTRL_RING_DEPTH,
  319. UBLK_CTRL_RING_DEPTH, IORING_SETUP_SQE128);
  320. if (ret < 0) {
  321. ublk_err("queue_init: %s\n", strerror(-ret));
  322. free(dev);
  323. return NULL;
  324. }
  325. dev->nr_fds = 1;
  326. return dev;
  327. }
  328. static int __ublk_queue_cmd_buf_sz(unsigned depth)
  329. {
  330. int size = depth * sizeof(struct ublksrv_io_desc);
  331. unsigned int page_sz = getpagesize();
  332. return round_up(size, page_sz);
  333. }
  334. static int ublk_queue_max_cmd_buf_sz(void)
  335. {
  336. return __ublk_queue_cmd_buf_sz(UBLK_MAX_QUEUE_DEPTH);
  337. }
  338. static int ublk_queue_cmd_buf_sz(struct ublk_queue *q)
  339. {
  340. return __ublk_queue_cmd_buf_sz(q->q_depth);
  341. }
  342. static void ublk_queue_deinit(struct ublk_queue *q)
  343. {
  344. int i;
  345. int nr_ios = q->q_depth;
  346. if (q->io_cmd_buf)
  347. munmap(q->io_cmd_buf, ublk_queue_cmd_buf_sz(q));
  348. for (i = 0; i < nr_ios; i++) {
  349. free(q->ios[i].buf_addr);
  350. free(q->ios[i].integrity_buf);
  351. }
  352. }
  353. static void ublk_thread_deinit(struct ublk_thread *t)
  354. {
  355. io_uring_unregister_buffers(&t->ring);
  356. ublk_batch_free_buf(t);
  357. io_uring_unregister_ring_fd(&t->ring);
  358. if (t->ring.ring_fd > 0) {
  359. io_uring_unregister_files(&t->ring);
  360. close(t->ring.ring_fd);
  361. t->ring.ring_fd = -1;
  362. }
  363. }
  364. static int ublk_queue_init(struct ublk_queue *q, unsigned long long extra_flags,
  365. __u8 metadata_size)
  366. {
  367. struct ublk_dev *dev = q->dev;
  368. int depth = dev->dev_info.queue_depth;
  369. int i;
  370. int cmd_buf_size, io_buf_size, integrity_size;
  371. unsigned long off;
  372. pthread_spin_init(&q->lock, PTHREAD_PROCESS_PRIVATE);
  373. q->tgt_ops = dev->tgt.ops;
  374. q->flags = 0;
  375. q->q_depth = depth;
  376. q->flags = dev->dev_info.flags;
  377. q->flags |= extra_flags;
  378. q->metadata_size = metadata_size;
  379. /* Cache fd in queue for fast path access */
  380. q->ublk_fd = dev->fds[0];
  381. cmd_buf_size = ublk_queue_cmd_buf_sz(q);
  382. off = UBLKSRV_CMD_BUF_OFFSET + q->q_id * ublk_queue_max_cmd_buf_sz();
  383. q->io_cmd_buf = mmap(0, cmd_buf_size, PROT_READ,
  384. MAP_SHARED | MAP_POPULATE, dev->fds[0], off);
  385. if (q->io_cmd_buf == MAP_FAILED) {
  386. ublk_err("ublk dev %d queue %d map io_cmd_buf failed %m\n",
  387. q->dev->dev_info.dev_id, q->q_id);
  388. goto fail;
  389. }
  390. io_buf_size = dev->dev_info.max_io_buf_bytes;
  391. integrity_size = ublk_integrity_len(q, io_buf_size);
  392. for (i = 0; i < q->q_depth; i++) {
  393. q->ios[i].buf_addr = NULL;
  394. q->ios[i].flags = UBLKS_IO_NEED_FETCH_RQ | UBLKS_IO_FREE;
  395. q->ios[i].tag = i;
  396. if (integrity_size) {
  397. q->ios[i].integrity_buf = malloc(integrity_size);
  398. if (!q->ios[i].integrity_buf) {
  399. ublk_err("ublk dev %d queue %d io %d malloc(%d) failed: %m\n",
  400. dev->dev_info.dev_id, q->q_id, i,
  401. integrity_size);
  402. goto fail;
  403. }
  404. }
  405. if (ublk_queue_no_buf(q))
  406. continue;
  407. if (posix_memalign((void **)&q->ios[i].buf_addr,
  408. getpagesize(), io_buf_size)) {
  409. ublk_err("ublk dev %d queue %d io %d posix_memalign failed %m\n",
  410. dev->dev_info.dev_id, q->q_id, i);
  411. goto fail;
  412. }
  413. }
  414. return 0;
  415. fail:
  416. ublk_queue_deinit(q);
  417. ublk_err("ublk dev %d queue %d failed\n",
  418. dev->dev_info.dev_id, q->q_id);
  419. return -ENOMEM;
  420. }
  421. static int ublk_thread_init(struct ublk_thread *t, unsigned long long extra_flags)
  422. {
  423. struct ublk_dev *dev = t->dev;
  424. unsigned long long flags = dev->dev_info.flags | extra_flags;
  425. int ring_depth = dev->tgt.sq_depth, cq_depth = dev->tgt.cq_depth;
  426. int ret;
  427. /* FETCH_IO_CMDS is multishot, so increase cq depth for BATCH_IO */
  428. if (ublk_dev_batch_io(dev))
  429. cq_depth += dev->dev_info.queue_depth * 2;
  430. ret = ublk_setup_ring(&t->ring, ring_depth, cq_depth,
  431. IORING_SETUP_COOP_TASKRUN |
  432. IORING_SETUP_SINGLE_ISSUER |
  433. IORING_SETUP_DEFER_TASKRUN);
  434. if (ret < 0) {
  435. ublk_err("ublk dev %d thread %d setup io_uring failed %d\n",
  436. dev->dev_info.dev_id, t->idx, ret);
  437. goto fail;
  438. }
  439. if (dev->dev_info.flags & (UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_AUTO_BUF_REG)) {
  440. unsigned nr_ios = dev->dev_info.queue_depth * dev->dev_info.nr_hw_queues;
  441. unsigned max_nr_ios_per_thread = nr_ios / dev->nthreads;
  442. max_nr_ios_per_thread += !!(nr_ios % dev->nthreads);
  443. t->nr_bufs = max_nr_ios_per_thread;
  444. } else {
  445. t->nr_bufs = 0;
  446. }
  447. if (ublk_dev_batch_io(dev))
  448. ublk_batch_prepare(t);
  449. if (t->nr_bufs) {
  450. ret = io_uring_register_buffers_sparse(&t->ring, t->nr_bufs);
  451. if (ret) {
  452. ublk_err("ublk dev %d thread %d register spare buffers failed %d\n",
  453. dev->dev_info.dev_id, t->idx, ret);
  454. goto fail;
  455. }
  456. }
  457. if (ublk_dev_batch_io(dev)) {
  458. ret = ublk_batch_alloc_buf(t);
  459. if (ret) {
  460. ublk_err("ublk dev %d thread %d alloc batch buf failed %d\n",
  461. dev->dev_info.dev_id, t->idx, ret);
  462. goto fail;
  463. }
  464. }
  465. io_uring_register_ring_fd(&t->ring);
  466. if (flags & UBLKS_Q_NO_UBLK_FIXED_FD) {
  467. /* Register only backing files starting from index 1, exclude ublk control device */
  468. if (dev->nr_fds > 1) {
  469. ret = io_uring_register_files(&t->ring, &dev->fds[1], dev->nr_fds - 1);
  470. } else {
  471. /* No backing files to register, skip file registration */
  472. ret = 0;
  473. }
  474. } else {
  475. ret = io_uring_register_files(&t->ring, dev->fds, dev->nr_fds);
  476. }
  477. if (ret) {
  478. ublk_err("ublk dev %d thread %d register files failed %d\n",
  479. t->dev->dev_info.dev_id, t->idx, ret);
  480. goto fail;
  481. }
  482. return 0;
  483. fail:
  484. ublk_thread_deinit(t);
  485. ublk_err("ublk dev %d thread %d init failed\n",
  486. dev->dev_info.dev_id, t->idx);
  487. return -ENOMEM;
  488. }
  489. #define WAIT_USEC 100000
  490. #define MAX_WAIT_USEC (3 * 1000000)
  491. static int ublk_dev_prep(const struct dev_ctx *ctx, struct ublk_dev *dev)
  492. {
  493. int dev_id = dev->dev_info.dev_id;
  494. unsigned int wait_usec = 0;
  495. int ret = 0, fd = -1;
  496. char buf[64];
  497. snprintf(buf, 64, "%s%d", UBLKC_DEV, dev_id);
  498. while (wait_usec < MAX_WAIT_USEC) {
  499. fd = open(buf, O_RDWR);
  500. if (fd >= 0)
  501. break;
  502. usleep(WAIT_USEC);
  503. wait_usec += WAIT_USEC;
  504. }
  505. if (fd < 0) {
  506. ublk_err("can't open %s %s\n", buf, strerror(errno));
  507. return -1;
  508. }
  509. dev->fds[0] = fd;
  510. if (dev->tgt.ops->init_tgt)
  511. ret = dev->tgt.ops->init_tgt(ctx, dev);
  512. if (ret)
  513. close(dev->fds[0]);
  514. return ret;
  515. }
  516. static void ublk_dev_unprep(struct ublk_dev *dev)
  517. {
  518. if (dev->tgt.ops->deinit_tgt)
  519. dev->tgt.ops->deinit_tgt(dev);
  520. close(dev->fds[0]);
  521. }
  522. static void ublk_set_auto_buf_reg(const struct ublk_thread *t,
  523. const struct ublk_queue *q,
  524. struct io_uring_sqe *sqe,
  525. unsigned short tag)
  526. {
  527. struct ublk_auto_buf_reg buf = {};
  528. if (q->tgt_ops->buf_index)
  529. buf.index = q->tgt_ops->buf_index(t, q, tag);
  530. else
  531. buf.index = ublk_io_buf_idx(t, q, tag);
  532. if (ublk_queue_auto_zc_fallback(q))
  533. buf.flags = UBLK_AUTO_BUF_REG_FALLBACK;
  534. sqe->addr = ublk_auto_buf_reg_to_sqe_addr(&buf);
  535. }
  536. /* Copy in pieces to test the buffer offset logic */
  537. #define UBLK_USER_COPY_LEN 2048
  538. static void ublk_user_copy(const struct ublk_io *io, __u8 match_ublk_op)
  539. {
  540. const struct ublk_queue *q = ublk_io_to_queue(io);
  541. const struct ublksrv_io_desc *iod = ublk_get_iod(q, io->tag);
  542. __u64 off = ublk_user_copy_offset(q->q_id, io->tag);
  543. __u8 ublk_op = ublksrv_get_op(iod);
  544. __u32 len = iod->nr_sectors << 9;
  545. void *addr = io->buf_addr;
  546. ssize_t copied;
  547. if (ublk_op != match_ublk_op)
  548. return;
  549. while (len) {
  550. __u32 copy_len = min(len, UBLK_USER_COPY_LEN);
  551. if (ublk_op == UBLK_IO_OP_WRITE)
  552. copied = pread(q->ublk_fd, addr, copy_len, off);
  553. else if (ublk_op == UBLK_IO_OP_READ)
  554. copied = pwrite(q->ublk_fd, addr, copy_len, off);
  555. else
  556. assert(0);
  557. assert(copied == (ssize_t)copy_len);
  558. addr += copy_len;
  559. off += copy_len;
  560. len -= copy_len;
  561. }
  562. if (!(iod->op_flags & UBLK_IO_F_INTEGRITY))
  563. return;
  564. len = ublk_integrity_len(q, iod->nr_sectors << 9);
  565. off = ublk_user_copy_offset(q->q_id, io->tag);
  566. off |= UBLKSRV_IO_INTEGRITY_FLAG;
  567. if (ublk_op == UBLK_IO_OP_WRITE)
  568. copied = pread(q->ublk_fd, io->integrity_buf, len, off);
  569. else if (ublk_op == UBLK_IO_OP_READ)
  570. copied = pwrite(q->ublk_fd, io->integrity_buf, len, off);
  571. else
  572. assert(0);
  573. assert(copied == (ssize_t)len);
  574. }
  575. int ublk_queue_io_cmd(struct ublk_thread *t, struct ublk_io *io)
  576. {
  577. struct ublk_queue *q = ublk_io_to_queue(io);
  578. struct ublksrv_io_cmd *cmd;
  579. struct io_uring_sqe *sqe[1];
  580. unsigned int cmd_op = 0;
  581. __u64 user_data;
  582. /* only freed io can be issued */
  583. if (!(io->flags & UBLKS_IO_FREE))
  584. return 0;
  585. /*
  586. * we issue because we need either fetching or committing or
  587. * getting data
  588. */
  589. if (!(io->flags &
  590. (UBLKS_IO_NEED_FETCH_RQ | UBLKS_IO_NEED_COMMIT_RQ_COMP | UBLKS_IO_NEED_GET_DATA)))
  591. return 0;
  592. if (io->flags & UBLKS_IO_NEED_GET_DATA)
  593. cmd_op = UBLK_U_IO_NEED_GET_DATA;
  594. else if (io->flags & UBLKS_IO_NEED_COMMIT_RQ_COMP) {
  595. if (ublk_queue_use_user_copy(q))
  596. ublk_user_copy(io, UBLK_IO_OP_READ);
  597. cmd_op = UBLK_U_IO_COMMIT_AND_FETCH_REQ;
  598. } else if (io->flags & UBLKS_IO_NEED_FETCH_RQ)
  599. cmd_op = UBLK_U_IO_FETCH_REQ;
  600. if (io_uring_sq_space_left(&t->ring) < 1)
  601. io_uring_submit(&t->ring);
  602. ublk_io_alloc_sqes(t, sqe, 1);
  603. if (!sqe[0]) {
  604. ublk_err("%s: run out of sqe. thread %u, tag %d\n",
  605. __func__, t->idx, io->tag);
  606. return -1;
  607. }
  608. cmd = (struct ublksrv_io_cmd *)ublk_get_sqe_cmd(sqe[0]);
  609. if (cmd_op == UBLK_U_IO_COMMIT_AND_FETCH_REQ)
  610. cmd->result = io->result;
  611. /* These fields should be written once, never change */
  612. ublk_set_sqe_cmd_op(sqe[0], cmd_op);
  613. sqe[0]->fd = ublk_get_registered_fd(q, 0); /* dev->fds[0] */
  614. sqe[0]->opcode = IORING_OP_URING_CMD;
  615. if (q->flags & UBLKS_Q_NO_UBLK_FIXED_FD)
  616. sqe[0]->flags = 0; /* Use raw FD, not fixed file */
  617. else
  618. sqe[0]->flags = IOSQE_FIXED_FILE;
  619. sqe[0]->rw_flags = 0;
  620. cmd->tag = io->tag;
  621. cmd->q_id = q->q_id;
  622. if (!ublk_queue_no_buf(q) && !ublk_queue_use_user_copy(q))
  623. cmd->addr = (__u64) (uintptr_t) io->buf_addr;
  624. else
  625. cmd->addr = 0;
  626. if (ublk_queue_use_auto_zc(q))
  627. ublk_set_auto_buf_reg(t, q, sqe[0], io->tag);
  628. user_data = build_user_data(io->tag, _IOC_NR(cmd_op), 0, q->q_id, 0);
  629. io_uring_sqe_set_data64(sqe[0], user_data);
  630. io->flags = 0;
  631. t->cmd_inflight += 1;
  632. ublk_dbg(UBLK_DBG_IO_CMD, "%s: (thread %u qid %d tag %u cmd_op %u) iof %x stopping %d\n",
  633. __func__, t->idx, q->q_id, io->tag, cmd_op,
  634. io->flags, !!(t->state & UBLKS_T_STOPPING));
  635. return 1;
  636. }
  637. static void ublk_submit_fetch_commands(struct ublk_thread *t)
  638. {
  639. struct ublk_queue *q;
  640. struct ublk_io *io;
  641. int i = 0, j = 0;
  642. if (t->dev->per_io_tasks) {
  643. /*
  644. * Lexicographically order all the (qid,tag) pairs, with
  645. * qid taking priority (so (1,0) > (0,1)). Then make
  646. * this thread the daemon for every Nth entry in this
  647. * list (N is the number of threads), starting at this
  648. * thread's index. This ensures that each queue is
  649. * handled by as many ublk server threads as possible,
  650. * so that load that is concentrated on one or a few
  651. * queues can make use of all ublk server threads.
  652. */
  653. const struct ublksrv_ctrl_dev_info *dinfo = &t->dev->dev_info;
  654. int nr_ios = dinfo->nr_hw_queues * dinfo->queue_depth;
  655. for (i = t->idx; i < nr_ios; i += t->dev->nthreads) {
  656. int q_id = i / dinfo->queue_depth;
  657. int tag = i % dinfo->queue_depth;
  658. q = &t->dev->q[q_id];
  659. io = &q->ios[tag];
  660. io->buf_index = j++;
  661. ublk_queue_io_cmd(t, io);
  662. }
  663. } else {
  664. /*
  665. * Service exclusively the queue whose q_id matches our
  666. * thread index.
  667. */
  668. struct ublk_queue *q = &t->dev->q[t->idx];
  669. for (i = 0; i < q->q_depth; i++) {
  670. io = &q->ios[i];
  671. io->buf_index = i;
  672. ublk_queue_io_cmd(t, io);
  673. }
  674. }
  675. }
  676. static int ublk_thread_is_idle(struct ublk_thread *t)
  677. {
  678. return !io_uring_sq_ready(&t->ring) && !t->io_inflight;
  679. }
  680. static int ublk_thread_is_done(struct ublk_thread *t)
  681. {
  682. return (t->state & UBLKS_T_STOPPING) && ublk_thread_is_idle(t) && !t->cmd_inflight;
  683. }
  684. static inline void ublksrv_handle_tgt_cqe(struct ublk_thread *t,
  685. struct ublk_queue *q,
  686. struct io_uring_cqe *cqe)
  687. {
  688. if (cqe->res < 0 && cqe->res != -EAGAIN)
  689. ublk_err("%s: failed tgt io: res %d qid %u tag %u, cmd_op %u\n",
  690. __func__, cqe->res, q->q_id,
  691. user_data_to_tag(cqe->user_data),
  692. user_data_to_op(cqe->user_data));
  693. if (q->tgt_ops->tgt_io_done)
  694. q->tgt_ops->tgt_io_done(t, q, cqe);
  695. }
  696. static void ublk_handle_uring_cmd(struct ublk_thread *t,
  697. struct ublk_queue *q,
  698. const struct io_uring_cqe *cqe)
  699. {
  700. int fetch = (cqe->res != UBLK_IO_RES_ABORT) &&
  701. !(t->state & UBLKS_T_STOPPING);
  702. unsigned tag = user_data_to_tag(cqe->user_data);
  703. struct ublk_io *io = &q->ios[tag];
  704. t->cmd_inflight--;
  705. if (!fetch) {
  706. t->state |= UBLKS_T_STOPPING;
  707. io->flags &= ~UBLKS_IO_NEED_FETCH_RQ;
  708. }
  709. if (cqe->res == UBLK_IO_RES_OK) {
  710. ublk_assert(tag < q->q_depth);
  711. if (ublk_queue_use_user_copy(q))
  712. ublk_user_copy(io, UBLK_IO_OP_WRITE);
  713. if (q->tgt_ops->queue_io)
  714. q->tgt_ops->queue_io(t, q, tag);
  715. } else if (cqe->res == UBLK_IO_RES_NEED_GET_DATA) {
  716. io->flags |= UBLKS_IO_NEED_GET_DATA | UBLKS_IO_FREE;
  717. ublk_queue_io_cmd(t, io);
  718. } else {
  719. /*
  720. * COMMIT_REQ will be completed immediately since no fetching
  721. * piggyback is required.
  722. *
  723. * Marking IO_FREE only, then this io won't be issued since
  724. * we only issue io with (UBLKS_IO_FREE | UBLKSRV_NEED_*)
  725. *
  726. * */
  727. io->flags = UBLKS_IO_FREE;
  728. }
  729. }
  730. static void ublk_handle_cqe(struct ublk_thread *t,
  731. struct io_uring_cqe *cqe, void *data)
  732. {
  733. struct ublk_dev *dev = t->dev;
  734. unsigned q_id = user_data_to_q_id(cqe->user_data);
  735. unsigned cmd_op = user_data_to_op(cqe->user_data);
  736. if (cqe->res < 0 && cqe->res != -ENODEV && cqe->res != -ENOBUFS)
  737. ublk_err("%s: res %d userdata %llx thread state %x\n", __func__,
  738. cqe->res, cqe->user_data, t->state);
  739. ublk_dbg(UBLK_DBG_IO_CMD, "%s: res %d (thread %d qid %d tag %u cmd_op %x "
  740. "data %lx target %d/%d) stopping %d\n",
  741. __func__, cqe->res, t->idx, q_id,
  742. user_data_to_tag(cqe->user_data),
  743. cmd_op, cqe->user_data, is_target_io(cqe->user_data),
  744. user_data_to_tgt_data(cqe->user_data),
  745. (t->state & UBLKS_T_STOPPING));
  746. /* Don't retrieve io in case of target io */
  747. if (is_target_io(cqe->user_data)) {
  748. ublksrv_handle_tgt_cqe(t, &dev->q[q_id], cqe);
  749. return;
  750. }
  751. if (ublk_thread_batch_io(t))
  752. ublk_batch_compl_cmd(t, cqe);
  753. else
  754. ublk_handle_uring_cmd(t, &dev->q[q_id], cqe);
  755. }
  756. static int ublk_reap_events_uring(struct ublk_thread *t)
  757. {
  758. struct io_uring_cqe *cqe;
  759. unsigned head;
  760. int count = 0;
  761. io_uring_for_each_cqe(&t->ring, head, cqe) {
  762. ublk_handle_cqe(t, cqe, NULL);
  763. count += 1;
  764. }
  765. io_uring_cq_advance(&t->ring, count);
  766. return count;
  767. }
  768. static int ublk_process_io(struct ublk_thread *t)
  769. {
  770. int ret, reapped;
  771. ublk_dbg(UBLK_DBG_THREAD, "dev%d-t%u: to_submit %d inflight cmd %u stopping %d\n",
  772. t->dev->dev_info.dev_id,
  773. t->idx, io_uring_sq_ready(&t->ring),
  774. t->cmd_inflight,
  775. (t->state & UBLKS_T_STOPPING));
  776. if (ublk_thread_is_done(t))
  777. return -ENODEV;
  778. ret = io_uring_submit_and_wait(&t->ring, 1);
  779. if (ublk_thread_batch_io(t)) {
  780. ublk_batch_prep_commit(t);
  781. reapped = ublk_reap_events_uring(t);
  782. ublk_batch_commit_io_cmds(t);
  783. } else {
  784. reapped = ublk_reap_events_uring(t);
  785. }
  786. ublk_dbg(UBLK_DBG_THREAD, "submit result %d, reapped %d stop %d idle %d\n",
  787. ret, reapped, (t->state & UBLKS_T_STOPPING),
  788. (t->state & UBLKS_T_IDLE));
  789. return reapped;
  790. }
  791. struct ublk_thread_info {
  792. struct ublk_dev *dev;
  793. pthread_t thread;
  794. unsigned idx;
  795. sem_t *ready;
  796. cpu_set_t *affinity;
  797. unsigned long long extra_flags;
  798. unsigned char (*q_thread_map)[UBLK_MAX_QUEUES];
  799. };
  800. static void ublk_thread_set_sched_affinity(const struct ublk_thread_info *info)
  801. {
  802. if (pthread_setaffinity_np(pthread_self(), sizeof(*info->affinity), info->affinity) < 0)
  803. ublk_err("ublk dev %u thread %u set affinity failed",
  804. info->dev->dev_info.dev_id, info->idx);
  805. }
  806. static void ublk_batch_setup_queues(struct ublk_thread *t)
  807. {
  808. int i;
  809. for (i = 0; i < t->dev->dev_info.nr_hw_queues; i++) {
  810. struct ublk_queue *q = &t->dev->q[i];
  811. int ret;
  812. /*
  813. * Only prepare io commands in the mapped thread context,
  814. * otherwise io command buffer index may not work as expected
  815. */
  816. if (t->q_map[i] == 0)
  817. continue;
  818. ret = ublk_batch_queue_prep_io_cmds(t, q);
  819. ublk_assert(ret >= 0);
  820. }
  821. }
  822. static __attribute__((noinline)) int __ublk_io_handler_fn(struct ublk_thread_info *info)
  823. {
  824. struct ublk_thread t = {
  825. .dev = info->dev,
  826. .idx = info->idx,
  827. };
  828. int dev_id = info->dev->dev_info.dev_id;
  829. int ret;
  830. /* Copy per-thread queue mapping into thread-local variable */
  831. if (info->q_thread_map)
  832. memcpy(t.q_map, info->q_thread_map[info->idx], sizeof(t.q_map));
  833. ret = ublk_thread_init(&t, info->extra_flags);
  834. if (ret) {
  835. ublk_err("ublk dev %d thread %u init failed\n",
  836. dev_id, t.idx);
  837. return ret;
  838. }
  839. sem_post(info->ready);
  840. ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %u started\n",
  841. gettid(), dev_id, t.idx);
  842. if (!ublk_thread_batch_io(&t)) {
  843. /* submit all io commands to ublk driver */
  844. ublk_submit_fetch_commands(&t);
  845. } else {
  846. ublk_batch_setup_queues(&t);
  847. ublk_batch_start_fetch(&t);
  848. }
  849. do {
  850. if (ublk_process_io(&t) < 0)
  851. break;
  852. } while (1);
  853. ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %d exiting\n",
  854. gettid(), dev_id, t.idx);
  855. ublk_thread_deinit(&t);
  856. return 0;
  857. }
  858. static void *ublk_io_handler_fn(void *data)
  859. {
  860. struct ublk_thread_info *info = data;
  861. /*
  862. * IO perf is sensitive with queue pthread affinity on NUMA machine
  863. *
  864. * Set sched_affinity at beginning, so following allocated memory/pages
  865. * could be CPU/NUMA aware.
  866. */
  867. if (info->affinity)
  868. ublk_thread_set_sched_affinity(info);
  869. __ublk_io_handler_fn(info);
  870. return NULL;
  871. }
  872. static void ublk_set_parameters(struct ublk_dev *dev)
  873. {
  874. int ret;
  875. ret = ublk_ctrl_set_params(dev, &dev->tgt.params);
  876. if (ret)
  877. ublk_err("dev %d set basic parameter failed %d\n",
  878. dev->dev_info.dev_id, ret);
  879. }
  880. static int ublk_send_dev_event(const struct dev_ctx *ctx, struct ublk_dev *dev, int dev_id)
  881. {
  882. uint64_t id;
  883. int evtfd = ctx->_evtfd;
  884. if (evtfd < 0)
  885. return -EBADF;
  886. if (dev_id >= 0)
  887. id = dev_id + 1;
  888. else
  889. id = ERROR_EVTFD_DEVID;
  890. if (dev && ctx->shadow_dev)
  891. memcpy(&ctx->shadow_dev->q, &dev->q, sizeof(dev->q));
  892. if (write(evtfd, &id, sizeof(id)) != sizeof(id))
  893. return -EINVAL;
  894. close(evtfd);
  895. shmdt(ctx->shadow_dev);
  896. return 0;
  897. }
  898. static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
  899. {
  900. const struct ublksrv_ctrl_dev_info *dinfo = &dev->dev_info;
  901. struct ublk_thread_info *tinfo;
  902. unsigned long long extra_flags = 0;
  903. cpu_set_t *affinity_buf;
  904. unsigned char (*q_thread_map)[UBLK_MAX_QUEUES] = NULL;
  905. void *thread_ret;
  906. sem_t ready;
  907. int ret, i;
  908. ublk_dbg(UBLK_DBG_DEV, "%s enter\n", __func__);
  909. tinfo = calloc(sizeof(struct ublk_thread_info), dev->nthreads);
  910. if (!tinfo)
  911. return -ENOMEM;
  912. sem_init(&ready, 0, 0);
  913. ret = ublk_dev_prep(ctx, dev);
  914. if (ret)
  915. return ret;
  916. ret = ublk_ctrl_get_affinity(dev, &affinity_buf);
  917. if (ret)
  918. return ret;
  919. if (ublk_dev_batch_io(dev)) {
  920. q_thread_map = calloc(dev->nthreads, sizeof(*q_thread_map));
  921. if (!q_thread_map) {
  922. ret = -ENOMEM;
  923. goto fail;
  924. }
  925. ublk_batch_setup_map(q_thread_map, dev->nthreads,
  926. dinfo->nr_hw_queues);
  927. }
  928. if (ctx->auto_zc_fallback)
  929. extra_flags = UBLKS_Q_AUTO_BUF_REG_FALLBACK;
  930. if (ctx->no_ublk_fixed_fd)
  931. extra_flags |= UBLKS_Q_NO_UBLK_FIXED_FD;
  932. for (i = 0; i < dinfo->nr_hw_queues; i++) {
  933. dev->q[i].dev = dev;
  934. dev->q[i].q_id = i;
  935. ret = ublk_queue_init(&dev->q[i], extra_flags,
  936. ctx->metadata_size);
  937. if (ret) {
  938. ublk_err("ublk dev %d queue %d init queue failed\n",
  939. dinfo->dev_id, i);
  940. goto fail;
  941. }
  942. }
  943. for (i = 0; i < dev->nthreads; i++) {
  944. tinfo[i].dev = dev;
  945. tinfo[i].idx = i;
  946. tinfo[i].ready = &ready;
  947. tinfo[i].extra_flags = extra_flags;
  948. tinfo[i].q_thread_map = q_thread_map;
  949. /*
  950. * If threads are not tied 1:1 to queues, setting thread
  951. * affinity based on queue affinity makes little sense.
  952. * However, thread CPU affinity has significant impact
  953. * on performance, so to compare fairly, we'll still set
  954. * thread CPU affinity based on queue affinity where
  955. * possible.
  956. */
  957. if (dev->nthreads == dinfo->nr_hw_queues)
  958. tinfo[i].affinity = &affinity_buf[i];
  959. pthread_create(&tinfo[i].thread, NULL,
  960. ublk_io_handler_fn,
  961. &tinfo[i]);
  962. }
  963. for (i = 0; i < dev->nthreads; i++)
  964. sem_wait(&ready);
  965. free(affinity_buf);
  966. free(q_thread_map);
  967. /* everything is fine now, start us */
  968. if (ctx->recovery)
  969. ret = ublk_ctrl_end_user_recovery(dev, getpid());
  970. else {
  971. ublk_set_parameters(dev);
  972. ret = ublk_ctrl_start_dev(dev, getpid());
  973. }
  974. if (ret < 0) {
  975. ublk_err("%s: ublk_ctrl_start_dev failed: %d\n", __func__, ret);
  976. /* stop device so that inflight uring_cmd can be cancelled */
  977. ublk_ctrl_stop_dev(dev);
  978. goto fail_start;
  979. }
  980. ublk_ctrl_get_info(dev);
  981. if (ctx->fg)
  982. ublk_ctrl_dump(dev);
  983. else
  984. ublk_send_dev_event(ctx, dev, dev->dev_info.dev_id);
  985. fail_start:
  986. /* wait until we are terminated */
  987. for (i = 0; i < dev->nthreads; i++)
  988. pthread_join(tinfo[i].thread, &thread_ret);
  989. free(tinfo);
  990. fail:
  991. for (i = 0; i < dinfo->nr_hw_queues; i++)
  992. ublk_queue_deinit(&dev->q[i]);
  993. ublk_dev_unprep(dev);
  994. ublk_dbg(UBLK_DBG_DEV, "%s exit\n", __func__);
  995. return ret;
  996. }
  997. static int wait_ublk_dev(const char *path, int evt_mask, unsigned timeout)
  998. {
  999. #define EV_SIZE (sizeof(struct inotify_event))
  1000. #define EV_BUF_LEN (128 * (EV_SIZE + 16))
  1001. struct pollfd pfd;
  1002. int fd, wd;
  1003. int ret = -EINVAL;
  1004. const char *dev_name = basename(path);
  1005. fd = inotify_init();
  1006. if (fd < 0) {
  1007. ublk_dbg(UBLK_DBG_DEV, "%s: inotify init failed\n", __func__);
  1008. return fd;
  1009. }
  1010. wd = inotify_add_watch(fd, "/dev", evt_mask);
  1011. if (wd == -1) {
  1012. ublk_dbg(UBLK_DBG_DEV, "%s: add watch for /dev failed\n", __func__);
  1013. goto fail;
  1014. }
  1015. pfd.fd = fd;
  1016. pfd.events = POLL_IN;
  1017. while (1) {
  1018. int i = 0;
  1019. char buffer[EV_BUF_LEN];
  1020. ret = poll(&pfd, 1, 1000 * timeout);
  1021. if (ret == -1) {
  1022. ublk_err("%s: poll inotify failed: %d\n", __func__, ret);
  1023. goto rm_watch;
  1024. } else if (ret == 0) {
  1025. ublk_err("%s: poll inotify timeout\n", __func__);
  1026. ret = -ETIMEDOUT;
  1027. goto rm_watch;
  1028. }
  1029. ret = read(fd, buffer, EV_BUF_LEN);
  1030. if (ret < 0) {
  1031. ublk_err("%s: read inotify fd failed\n", __func__);
  1032. goto rm_watch;
  1033. }
  1034. while (i < ret) {
  1035. struct inotify_event *event = (struct inotify_event *)&buffer[i];
  1036. ublk_dbg(UBLK_DBG_DEV, "%s: inotify event %x %s\n",
  1037. __func__, event->mask, event->name);
  1038. if (event->mask & evt_mask) {
  1039. if (!strcmp(event->name, dev_name)) {
  1040. ret = 0;
  1041. goto rm_watch;
  1042. }
  1043. }
  1044. i += EV_SIZE + event->len;
  1045. }
  1046. }
  1047. rm_watch:
  1048. inotify_rm_watch(fd, wd);
  1049. fail:
  1050. close(fd);
  1051. return ret;
  1052. }
  1053. static int ublk_stop_io_daemon(const struct ublk_dev *dev)
  1054. {
  1055. int daemon_pid = dev->dev_info.ublksrv_pid;
  1056. int dev_id = dev->dev_info.dev_id;
  1057. char ublkc[64];
  1058. int ret = 0;
  1059. if (daemon_pid < 0)
  1060. return 0;
  1061. /* daemon may be dead already */
  1062. if (kill(daemon_pid, 0) < 0)
  1063. goto wait;
  1064. snprintf(ublkc, sizeof(ublkc), "/dev/%s%d", "ublkc", dev_id);
  1065. /* ublk char device may be gone already */
  1066. if (access(ublkc, F_OK) != 0)
  1067. goto wait;
  1068. /* Wait until ublk char device is closed, when the daemon is shutdown */
  1069. ret = wait_ublk_dev(ublkc, IN_CLOSE, 10);
  1070. /* double check and since it may be closed before starting inotify */
  1071. if (ret == -ETIMEDOUT)
  1072. ret = kill(daemon_pid, 0) < 0;
  1073. wait:
  1074. waitpid(daemon_pid, NULL, 0);
  1075. ublk_dbg(UBLK_DBG_DEV, "%s: pid %d dev_id %d ret %d\n",
  1076. __func__, daemon_pid, dev_id, ret);
  1077. return ret;
  1078. }
  1079. static int __cmd_dev_add(const struct dev_ctx *ctx)
  1080. {
  1081. unsigned nthreads = ctx->nthreads;
  1082. unsigned nr_queues = ctx->nr_hw_queues;
  1083. const char *tgt_type = ctx->tgt_type;
  1084. unsigned depth = ctx->queue_depth;
  1085. __u64 features;
  1086. const struct ublk_tgt_ops *ops;
  1087. struct ublksrv_ctrl_dev_info *info;
  1088. struct ublk_dev *dev = NULL;
  1089. int dev_id = ctx->dev_id;
  1090. int ret, i;
  1091. ops = ublk_find_tgt(tgt_type);
  1092. if (!ops) {
  1093. ublk_err("%s: no such tgt type, type %s\n",
  1094. __func__, tgt_type);
  1095. ret = -ENODEV;
  1096. goto fail;
  1097. }
  1098. if (nr_queues > UBLK_MAX_QUEUES || depth > UBLK_QUEUE_DEPTH) {
  1099. ublk_err("%s: invalid nr_queues or depth queues %u depth %u\n",
  1100. __func__, nr_queues, depth);
  1101. ret = -EINVAL;
  1102. goto fail;
  1103. }
  1104. /* default to 1:1 threads:queues if nthreads is unspecified */
  1105. if (!nthreads)
  1106. nthreads = nr_queues;
  1107. if (nthreads > UBLK_MAX_THREADS) {
  1108. ublk_err("%s: %u is too many threads (max %u)\n",
  1109. __func__, nthreads, UBLK_MAX_THREADS);
  1110. ret = -EINVAL;
  1111. goto fail;
  1112. }
  1113. if (nthreads != nr_queues && (!ctx->per_io_tasks &&
  1114. !(ctx->flags & UBLK_F_BATCH_IO))) {
  1115. ublk_err("%s: threads %u must be same as queues %u if "
  1116. "not using per_io_tasks\n",
  1117. __func__, nthreads, nr_queues);
  1118. ret = -EINVAL;
  1119. goto fail;
  1120. }
  1121. dev = ublk_ctrl_init();
  1122. if (!dev) {
  1123. ublk_err("%s: can't alloc dev id %d, type %s\n",
  1124. __func__, dev_id, tgt_type);
  1125. ret = -ENOMEM;
  1126. goto fail;
  1127. }
  1128. /* kernel doesn't support get_features */
  1129. ret = ublk_ctrl_get_features(dev, &features);
  1130. if (ret < 0) {
  1131. ret = -EINVAL;
  1132. goto fail;
  1133. }
  1134. if (!(features & UBLK_F_CMD_IOCTL_ENCODE)) {
  1135. ret = -ENOTSUP;
  1136. goto fail;
  1137. }
  1138. info = &dev->dev_info;
  1139. info->dev_id = ctx->dev_id;
  1140. info->nr_hw_queues = nr_queues;
  1141. info->queue_depth = depth;
  1142. info->flags = ctx->flags;
  1143. if ((features & UBLK_F_QUIESCE) &&
  1144. (info->flags & UBLK_F_USER_RECOVERY))
  1145. info->flags |= UBLK_F_QUIESCE;
  1146. dev->nthreads = nthreads;
  1147. dev->per_io_tasks = ctx->per_io_tasks;
  1148. dev->tgt.ops = ops;
  1149. dev->tgt.sq_depth = depth;
  1150. dev->tgt.cq_depth = depth;
  1151. for (i = 0; i < MAX_BACK_FILES; i++) {
  1152. if (ctx->files[i]) {
  1153. strcpy(dev->tgt.backing_file[i], ctx->files[i]);
  1154. dev->tgt.nr_backing_files++;
  1155. }
  1156. }
  1157. if (ctx->recovery)
  1158. ret = ublk_ctrl_start_user_recovery(dev);
  1159. else
  1160. ret = ublk_ctrl_add_dev(dev);
  1161. if (ret < 0) {
  1162. ublk_err("%s: can't add dev id %d, type %s ret %d\n",
  1163. __func__, dev_id, tgt_type, ret);
  1164. goto fail;
  1165. }
  1166. ret = ublk_start_daemon(ctx, dev);
  1167. ublk_dbg(UBLK_DBG_DEV, "%s: daemon exit %d\n", __func__, ret);
  1168. if (ret < 0)
  1169. ublk_ctrl_del_dev(dev);
  1170. fail:
  1171. if (ret < 0)
  1172. ublk_send_dev_event(ctx, dev, -1);
  1173. if (dev)
  1174. ublk_ctrl_deinit(dev);
  1175. return ret;
  1176. }
  1177. static int __cmd_dev_list(struct dev_ctx *ctx);
  1178. static int cmd_dev_add(struct dev_ctx *ctx)
  1179. {
  1180. int res;
  1181. if (ctx->fg)
  1182. goto run;
  1183. ctx->_shmid = shmget(IPC_PRIVATE, sizeof(struct ublk_dev), IPC_CREAT | 0666);
  1184. if (ctx->_shmid < 0) {
  1185. ublk_err("%s: failed to shmget %s\n", __func__, strerror(errno));
  1186. exit(-1);
  1187. }
  1188. ctx->shadow_dev = (struct ublk_dev *)shmat(ctx->_shmid, NULL, 0);
  1189. if (ctx->shadow_dev == (struct ublk_dev *)-1) {
  1190. ublk_err("%s: failed to shmat %s\n", __func__, strerror(errno));
  1191. exit(-1);
  1192. }
  1193. ctx->_evtfd = eventfd(0, 0);
  1194. if (ctx->_evtfd < 0) {
  1195. ublk_err("%s: failed to create eventfd %s\n", __func__, strerror(errno));
  1196. exit(-1);
  1197. }
  1198. res = fork();
  1199. if (res == 0) {
  1200. int res2;
  1201. setsid();
  1202. res2 = fork();
  1203. if (res2 == 0) {
  1204. /* prepare for detaching */
  1205. close(STDIN_FILENO);
  1206. close(STDOUT_FILENO);
  1207. close(STDERR_FILENO);
  1208. run:
  1209. res = __cmd_dev_add(ctx);
  1210. return res;
  1211. } else {
  1212. /* detached from the foreground task */
  1213. exit(EXIT_SUCCESS);
  1214. }
  1215. } else if (res > 0) {
  1216. uint64_t id;
  1217. int exit_code = EXIT_FAILURE;
  1218. res = read(ctx->_evtfd, &id, sizeof(id));
  1219. close(ctx->_evtfd);
  1220. if (res == sizeof(id) && id != ERROR_EVTFD_DEVID) {
  1221. ctx->dev_id = id - 1;
  1222. if (__cmd_dev_list(ctx) >= 0)
  1223. exit_code = EXIT_SUCCESS;
  1224. }
  1225. shmdt(ctx->shadow_dev);
  1226. shmctl(ctx->_shmid, IPC_RMID, NULL);
  1227. /* wait for child and detach from it */
  1228. wait(NULL);
  1229. if (exit_code == EXIT_FAILURE)
  1230. ublk_err("%s: command failed\n", __func__);
  1231. exit(exit_code);
  1232. } else {
  1233. exit(EXIT_FAILURE);
  1234. }
  1235. }
  1236. static int __cmd_dev_del(struct dev_ctx *ctx)
  1237. {
  1238. int number = ctx->dev_id;
  1239. struct ublk_dev *dev;
  1240. int ret;
  1241. dev = ublk_ctrl_init();
  1242. dev->dev_info.dev_id = number;
  1243. ret = ublk_ctrl_get_info(dev);
  1244. if (ret < 0)
  1245. goto fail;
  1246. ret = ublk_ctrl_stop_dev(dev);
  1247. if (ret < 0)
  1248. ublk_err("%s: stop dev %d failed ret %d\n", __func__, number, ret);
  1249. ret = ublk_stop_io_daemon(dev);
  1250. if (ret < 0)
  1251. ublk_err("%s: stop daemon id %d dev %d, ret %d\n",
  1252. __func__, dev->dev_info.ublksrv_pid, number, ret);
  1253. ublk_ctrl_del_dev(dev);
  1254. fail:
  1255. ublk_ctrl_deinit(dev);
  1256. return (ret >= 0) ? 0 : ret;
  1257. }
  1258. static int cmd_dev_del(struct dev_ctx *ctx)
  1259. {
  1260. int i;
  1261. if (ctx->dev_id >= 0 || !ctx->all)
  1262. return __cmd_dev_del(ctx);
  1263. for (i = 0; i < 255; i++) {
  1264. ctx->dev_id = i;
  1265. __cmd_dev_del(ctx);
  1266. }
  1267. return 0;
  1268. }
  1269. static int cmd_dev_stop(struct dev_ctx *ctx)
  1270. {
  1271. int number = ctx->dev_id;
  1272. struct ublk_dev *dev;
  1273. int ret;
  1274. if (number < 0) {
  1275. ublk_err("%s: device id is required\n", __func__);
  1276. return -EINVAL;
  1277. }
  1278. dev = ublk_ctrl_init();
  1279. dev->dev_info.dev_id = number;
  1280. ret = ublk_ctrl_get_info(dev);
  1281. if (ret < 0)
  1282. goto fail;
  1283. if (ctx->safe_stop) {
  1284. ret = ublk_ctrl_try_stop_dev(dev);
  1285. if (ret < 0)
  1286. ublk_err("%s: try_stop dev %d failed ret %d\n",
  1287. __func__, number, ret);
  1288. } else {
  1289. ret = ublk_ctrl_stop_dev(dev);
  1290. if (ret < 0)
  1291. ublk_err("%s: stop dev %d failed ret %d\n",
  1292. __func__, number, ret);
  1293. }
  1294. fail:
  1295. ublk_ctrl_deinit(dev);
  1296. return ret;
  1297. }
  1298. static int __cmd_dev_list(struct dev_ctx *ctx)
  1299. {
  1300. struct ublk_dev *dev = ublk_ctrl_init();
  1301. int ret;
  1302. if (!dev)
  1303. return -ENODEV;
  1304. dev->dev_info.dev_id = ctx->dev_id;
  1305. ret = ublk_ctrl_get_info(dev);
  1306. if (ret < 0) {
  1307. if (ctx->logging)
  1308. ublk_err("%s: can't get dev info from %d: %d\n",
  1309. __func__, ctx->dev_id, ret);
  1310. } else {
  1311. if (ctx->shadow_dev)
  1312. memcpy(&dev->q, ctx->shadow_dev->q, sizeof(dev->q));
  1313. ublk_ctrl_dump(dev);
  1314. }
  1315. ublk_ctrl_deinit(dev);
  1316. return ret;
  1317. }
  1318. static int cmd_dev_list(struct dev_ctx *ctx)
  1319. {
  1320. int i;
  1321. if (ctx->dev_id >= 0 || !ctx->all)
  1322. return __cmd_dev_list(ctx);
  1323. ctx->logging = false;
  1324. for (i = 0; i < 255; i++) {
  1325. ctx->dev_id = i;
  1326. __cmd_dev_list(ctx);
  1327. }
  1328. return 0;
  1329. }
  1330. static int cmd_dev_get_features(void)
  1331. {
  1332. #define const_ilog2(x) (63 - __builtin_clzll(x))
  1333. #define FEAT_NAME(f) [const_ilog2(f)] = #f
  1334. static const char *feat_map[] = {
  1335. FEAT_NAME(UBLK_F_SUPPORT_ZERO_COPY),
  1336. FEAT_NAME(UBLK_F_URING_CMD_COMP_IN_TASK),
  1337. FEAT_NAME(UBLK_F_NEED_GET_DATA),
  1338. FEAT_NAME(UBLK_F_USER_RECOVERY),
  1339. FEAT_NAME(UBLK_F_USER_RECOVERY_REISSUE),
  1340. FEAT_NAME(UBLK_F_UNPRIVILEGED_DEV),
  1341. FEAT_NAME(UBLK_F_CMD_IOCTL_ENCODE),
  1342. FEAT_NAME(UBLK_F_USER_COPY),
  1343. FEAT_NAME(UBLK_F_ZONED),
  1344. FEAT_NAME(UBLK_F_USER_RECOVERY_FAIL_IO),
  1345. FEAT_NAME(UBLK_F_UPDATE_SIZE),
  1346. FEAT_NAME(UBLK_F_AUTO_BUF_REG),
  1347. FEAT_NAME(UBLK_F_QUIESCE),
  1348. FEAT_NAME(UBLK_F_PER_IO_DAEMON),
  1349. FEAT_NAME(UBLK_F_BUF_REG_OFF_DAEMON),
  1350. FEAT_NAME(UBLK_F_INTEGRITY),
  1351. FEAT_NAME(UBLK_F_SAFE_STOP_DEV),
  1352. FEAT_NAME(UBLK_F_BATCH_IO),
  1353. FEAT_NAME(UBLK_F_NO_AUTO_PART_SCAN),
  1354. };
  1355. struct ublk_dev *dev;
  1356. __u64 features = 0;
  1357. int ret;
  1358. dev = ublk_ctrl_init();
  1359. if (!dev) {
  1360. fprintf(stderr, "ublksrv_ctrl_init failed id\n");
  1361. return -EOPNOTSUPP;
  1362. }
  1363. ret = ublk_ctrl_get_features(dev, &features);
  1364. if (!ret) {
  1365. int i;
  1366. printf("ublk_drv features: 0x%llx\n", features);
  1367. for (i = 0; i < sizeof(features) * 8; i++) {
  1368. const char *feat;
  1369. if (!((1ULL << i) & features))
  1370. continue;
  1371. if (i < ARRAY_SIZE(feat_map))
  1372. feat = feat_map[i];
  1373. else
  1374. feat = "unknown";
  1375. printf("0x%-16llx: %s\n", 1ULL << i, feat);
  1376. }
  1377. }
  1378. return ret;
  1379. }
  1380. static int cmd_dev_update_size(struct dev_ctx *ctx)
  1381. {
  1382. struct ublk_dev *dev = ublk_ctrl_init();
  1383. struct ublk_params p;
  1384. int ret = -EINVAL;
  1385. if (!dev)
  1386. return -ENODEV;
  1387. if (ctx->dev_id < 0) {
  1388. fprintf(stderr, "device id isn't provided\n");
  1389. goto out;
  1390. }
  1391. dev->dev_info.dev_id = ctx->dev_id;
  1392. ret = ublk_ctrl_get_params(dev, &p);
  1393. if (ret < 0) {
  1394. ublk_err("failed to get params %d %s\n", ret, strerror(-ret));
  1395. goto out;
  1396. }
  1397. if (ctx->size & ((1 << p.basic.logical_bs_shift) - 1)) {
  1398. ublk_err("size isn't aligned with logical block size\n");
  1399. ret = -EINVAL;
  1400. goto out;
  1401. }
  1402. ret = ublk_ctrl_update_size(dev, ctx->size >> 9);
  1403. out:
  1404. ublk_ctrl_deinit(dev);
  1405. return ret;
  1406. }
  1407. static int cmd_dev_quiesce(struct dev_ctx *ctx)
  1408. {
  1409. struct ublk_dev *dev = ublk_ctrl_init();
  1410. int ret = -EINVAL;
  1411. if (!dev)
  1412. return -ENODEV;
  1413. if (ctx->dev_id < 0) {
  1414. fprintf(stderr, "device id isn't provided for quiesce\n");
  1415. goto out;
  1416. }
  1417. dev->dev_info.dev_id = ctx->dev_id;
  1418. ret = ublk_ctrl_quiesce_dev(dev, 10000);
  1419. out:
  1420. ublk_ctrl_deinit(dev);
  1421. return ret;
  1422. }
  1423. static void __cmd_create_help(char *exe, bool recovery)
  1424. {
  1425. int i;
  1426. printf("%s %s -t [null|loop|stripe|fault_inject] [-q nr_queues] [-d depth] [-n dev_id]\n",
  1427. exe, recovery ? "recover" : "add");
  1428. printf("\t[--foreground] [--quiet] [-z] [--auto_zc] [--auto_zc_fallback] [--debug_mask mask] [-r 0|1] [-g] [-u]\n");
  1429. printf("\t[-e 0|1 ] [-i 0|1] [--no_ublk_fixed_fd]\n");
  1430. printf("\t[--nthreads threads] [--per_io_tasks]\n");
  1431. printf("\t[--integrity_capable] [--integrity_reftag] [--metadata_size SIZE] "
  1432. "[--pi_offset OFFSET] [--csum_type ip|t10dif|nvme] [--tag_size SIZE]\n");
  1433. printf("\t[--batch|-b] [--no_auto_part_scan]\n");
  1434. printf("\t[target options] [backfile1] [backfile2] ...\n");
  1435. printf("\tdefault: nr_queues=2(max 32), depth=128(max 1024), dev_id=-1(auto allocation)\n");
  1436. printf("\tdefault: nthreads=nr_queues");
  1437. for (i = 0; i < ARRAY_SIZE(tgt_ops_list); i++) {
  1438. const struct ublk_tgt_ops *ops = tgt_ops_list[i];
  1439. if (ops->usage)
  1440. ops->usage(ops);
  1441. }
  1442. }
  1443. static void cmd_add_help(char *exe)
  1444. {
  1445. __cmd_create_help(exe, false);
  1446. printf("\n");
  1447. }
  1448. static void cmd_recover_help(char *exe)
  1449. {
  1450. __cmd_create_help(exe, true);
  1451. printf("\tPlease provide exact command line for creating this device with real dev_id\n");
  1452. printf("\n");
  1453. }
  1454. static int cmd_dev_help(char *exe)
  1455. {
  1456. cmd_add_help(exe);
  1457. cmd_recover_help(exe);
  1458. printf("%s del [-n dev_id] -a \n", exe);
  1459. printf("\t -a delete all devices -n delete specified device\n\n");
  1460. printf("%s stop -n dev_id [--safe]\n", exe);
  1461. printf("\t --safe only stop if device has no active openers\n\n");
  1462. printf("%s list [-n dev_id] -a \n", exe);
  1463. printf("\t -a list all devices, -n list specified device, default -a \n\n");
  1464. printf("%s features\n", exe);
  1465. printf("%s update_size -n dev_id -s|--size size_in_bytes \n", exe);
  1466. printf("%s quiesce -n dev_id\n", exe);
  1467. return 0;
  1468. }
  1469. int main(int argc, char *argv[])
  1470. {
  1471. static const struct option longopts[] = {
  1472. { "all", 0, NULL, 'a' },
  1473. { "type", 1, NULL, 't' },
  1474. { "number", 1, NULL, 'n' },
  1475. { "queues", 1, NULL, 'q' },
  1476. { "depth", 1, NULL, 'd' },
  1477. { "debug_mask", 1, NULL, 0 },
  1478. { "quiet", 0, NULL, 0 },
  1479. { "zero_copy", 0, NULL, 'z' },
  1480. { "foreground", 0, NULL, 0 },
  1481. { "recovery", 1, NULL, 'r' },
  1482. { "recovery_fail_io", 1, NULL, 'e'},
  1483. { "recovery_reissue", 1, NULL, 'i'},
  1484. { "get_data", 1, NULL, 'g'},
  1485. { "auto_zc", 0, NULL, 0 },
  1486. { "auto_zc_fallback", 0, NULL, 0 },
  1487. { "user_copy", 0, NULL, 'u'},
  1488. { "size", 1, NULL, 's'},
  1489. { "nthreads", 1, NULL, 0 },
  1490. { "per_io_tasks", 0, NULL, 0 },
  1491. { "no_ublk_fixed_fd", 0, NULL, 0 },
  1492. { "integrity_capable", 0, NULL, 0 },
  1493. { "integrity_reftag", 0, NULL, 0 },
  1494. { "metadata_size", 1, NULL, 0 },
  1495. { "pi_offset", 1, NULL, 0 },
  1496. { "csum_type", 1, NULL, 0 },
  1497. { "tag_size", 1, NULL, 0 },
  1498. { "safe", 0, NULL, 0 },
  1499. { "batch", 0, NULL, 'b'},
  1500. { "no_auto_part_scan", 0, NULL, 0 },
  1501. { 0, 0, 0, 0 }
  1502. };
  1503. const struct ublk_tgt_ops *ops = NULL;
  1504. int option_idx, opt;
  1505. const char *cmd = argv[1];
  1506. struct dev_ctx ctx = {
  1507. ._evtfd = -1,
  1508. .queue_depth = 128,
  1509. .nr_hw_queues = 2,
  1510. .dev_id = -1,
  1511. .tgt_type = "unknown",
  1512. .csum_type = LBMD_PI_CSUM_NONE,
  1513. };
  1514. int ret = -EINVAL, i;
  1515. int tgt_argc = 1;
  1516. char *tgt_argv[MAX_NR_TGT_ARG] = { NULL };
  1517. int value;
  1518. if (argc == 1)
  1519. return ret;
  1520. opterr = 0;
  1521. optind = 2;
  1522. while ((opt = getopt_long(argc, argv, "t:n:d:q:r:e:i:s:gazub",
  1523. longopts, &option_idx)) != -1) {
  1524. switch (opt) {
  1525. case 'a':
  1526. ctx.all = 1;
  1527. break;
  1528. case 'b':
  1529. ctx.flags |= UBLK_F_BATCH_IO;
  1530. break;
  1531. case 'n':
  1532. ctx.dev_id = strtol(optarg, NULL, 10);
  1533. break;
  1534. case 't':
  1535. if (strlen(optarg) < sizeof(ctx.tgt_type))
  1536. strcpy(ctx.tgt_type, optarg);
  1537. break;
  1538. case 'q':
  1539. ctx.nr_hw_queues = strtol(optarg, NULL, 10);
  1540. break;
  1541. case 'd':
  1542. ctx.queue_depth = strtol(optarg, NULL, 10);
  1543. break;
  1544. case 'z':
  1545. ctx.flags |= UBLK_F_SUPPORT_ZERO_COPY;
  1546. break;
  1547. case 'r':
  1548. value = strtol(optarg, NULL, 10);
  1549. if (value)
  1550. ctx.flags |= UBLK_F_USER_RECOVERY;
  1551. break;
  1552. case 'e':
  1553. value = strtol(optarg, NULL, 10);
  1554. if (value)
  1555. ctx.flags |= UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_FAIL_IO;
  1556. break;
  1557. case 'i':
  1558. value = strtol(optarg, NULL, 10);
  1559. if (value)
  1560. ctx.flags |= UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_REISSUE;
  1561. break;
  1562. case 'g':
  1563. ctx.flags |= UBLK_F_NEED_GET_DATA;
  1564. break;
  1565. case 'u':
  1566. ctx.flags |= UBLK_F_USER_COPY;
  1567. break;
  1568. case 's':
  1569. ctx.size = strtoull(optarg, NULL, 10);
  1570. break;
  1571. case 0:
  1572. if (!strcmp(longopts[option_idx].name, "debug_mask"))
  1573. ublk_dbg_mask = strtol(optarg, NULL, 16);
  1574. if (!strcmp(longopts[option_idx].name, "quiet"))
  1575. ublk_dbg_mask = 0;
  1576. if (!strcmp(longopts[option_idx].name, "foreground"))
  1577. ctx.fg = 1;
  1578. if (!strcmp(longopts[option_idx].name, "auto_zc"))
  1579. ctx.flags |= UBLK_F_AUTO_BUF_REG;
  1580. if (!strcmp(longopts[option_idx].name, "auto_zc_fallback"))
  1581. ctx.auto_zc_fallback = 1;
  1582. if (!strcmp(longopts[option_idx].name, "nthreads"))
  1583. ctx.nthreads = strtol(optarg, NULL, 10);
  1584. if (!strcmp(longopts[option_idx].name, "per_io_tasks"))
  1585. ctx.per_io_tasks = 1;
  1586. if (!strcmp(longopts[option_idx].name, "no_ublk_fixed_fd"))
  1587. ctx.no_ublk_fixed_fd = 1;
  1588. if (!strcmp(longopts[option_idx].name, "integrity_capable"))
  1589. ctx.integrity_flags |= LBMD_PI_CAP_INTEGRITY;
  1590. if (!strcmp(longopts[option_idx].name, "integrity_reftag"))
  1591. ctx.integrity_flags |= LBMD_PI_CAP_REFTAG;
  1592. if (!strcmp(longopts[option_idx].name, "metadata_size"))
  1593. ctx.metadata_size = strtoul(optarg, NULL, 0);
  1594. if (!strcmp(longopts[option_idx].name, "pi_offset"))
  1595. ctx.pi_offset = strtoul(optarg, NULL, 0);
  1596. if (!strcmp(longopts[option_idx].name, "csum_type")) {
  1597. if (!strcmp(optarg, "ip")) {
  1598. ctx.csum_type = LBMD_PI_CSUM_IP;
  1599. } else if (!strcmp(optarg, "t10dif")) {
  1600. ctx.csum_type = LBMD_PI_CSUM_CRC16_T10DIF;
  1601. } else if (!strcmp(optarg, "nvme")) {
  1602. ctx.csum_type = LBMD_PI_CSUM_CRC64_NVME;
  1603. } else {
  1604. ublk_err("invalid csum_type: %s\n", optarg);
  1605. return -EINVAL;
  1606. }
  1607. }
  1608. if (!strcmp(longopts[option_idx].name, "tag_size"))
  1609. ctx.tag_size = strtoul(optarg, NULL, 0);
  1610. if (!strcmp(longopts[option_idx].name, "safe"))
  1611. ctx.safe_stop = 1;
  1612. if (!strcmp(longopts[option_idx].name, "no_auto_part_scan"))
  1613. ctx.flags |= UBLK_F_NO_AUTO_PART_SCAN;
  1614. break;
  1615. case '?':
  1616. /*
  1617. * target requires every option must have argument
  1618. */
  1619. if (argv[optind][0] == '-' || argv[optind - 1][0] != '-') {
  1620. fprintf(stderr, "every target option requires argument: %s %s\n",
  1621. argv[optind - 1], argv[optind]);
  1622. exit(EXIT_FAILURE);
  1623. }
  1624. if (tgt_argc < (MAX_NR_TGT_ARG - 1) / 2) {
  1625. tgt_argv[tgt_argc++] = argv[optind - 1];
  1626. tgt_argv[tgt_argc++] = argv[optind];
  1627. } else {
  1628. fprintf(stderr, "too many target options\n");
  1629. exit(EXIT_FAILURE);
  1630. }
  1631. optind += 1;
  1632. break;
  1633. }
  1634. }
  1635. if (ctx.per_io_tasks && (ctx.flags & UBLK_F_BATCH_IO)) {
  1636. ublk_err("per_io_task and F_BATCH_IO conflict\n");
  1637. return -EINVAL;
  1638. }
  1639. /* auto_zc_fallback depends on F_AUTO_BUF_REG & F_SUPPORT_ZERO_COPY */
  1640. if (ctx.auto_zc_fallback &&
  1641. !((ctx.flags & UBLK_F_AUTO_BUF_REG) &&
  1642. (ctx.flags & UBLK_F_SUPPORT_ZERO_COPY))) {
  1643. ublk_err("%s: auto_zc_fallback is set but neither "
  1644. "F_AUTO_BUF_REG nor F_SUPPORT_ZERO_COPY is enabled\n",
  1645. __func__);
  1646. return -EINVAL;
  1647. }
  1648. if (!!(ctx.flags & UBLK_F_NEED_GET_DATA) +
  1649. !!(ctx.flags & UBLK_F_USER_COPY) +
  1650. (ctx.flags & UBLK_F_SUPPORT_ZERO_COPY && !ctx.auto_zc_fallback) +
  1651. (ctx.flags & UBLK_F_AUTO_BUF_REG && !ctx.auto_zc_fallback) +
  1652. ctx.auto_zc_fallback > 1) {
  1653. fprintf(stderr, "too many data copy modes specified\n");
  1654. return -EINVAL;
  1655. }
  1656. if (ctx.metadata_size) {
  1657. if (!(ctx.flags & UBLK_F_USER_COPY)) {
  1658. ublk_err("integrity requires user_copy\n");
  1659. return -EINVAL;
  1660. }
  1661. ctx.flags |= UBLK_F_INTEGRITY;
  1662. } else if (ctx.integrity_flags ||
  1663. ctx.pi_offset ||
  1664. ctx.csum_type != LBMD_PI_CSUM_NONE ||
  1665. ctx.tag_size) {
  1666. ublk_err("integrity parameters require metadata_size\n");
  1667. return -EINVAL;
  1668. }
  1669. if ((ctx.flags & UBLK_F_AUTO_BUF_REG) &&
  1670. (ctx.flags & UBLK_F_BATCH_IO) &&
  1671. (ctx.nthreads > ctx.nr_hw_queues)) {
  1672. ublk_err("too many threads for F_AUTO_BUF_REG & F_BATCH_IO\n");
  1673. return -EINVAL;
  1674. }
  1675. i = optind;
  1676. while (i < argc && ctx.nr_files < MAX_BACK_FILES) {
  1677. ctx.files[ctx.nr_files++] = argv[i++];
  1678. }
  1679. ops = ublk_find_tgt(ctx.tgt_type);
  1680. if (ops && ops->parse_cmd_line) {
  1681. optind = 0;
  1682. tgt_argv[0] = ctx.tgt_type;
  1683. ops->parse_cmd_line(&ctx, tgt_argc, tgt_argv);
  1684. }
  1685. if (!strcmp(cmd, "add"))
  1686. ret = cmd_dev_add(&ctx);
  1687. else if (!strcmp(cmd, "recover")) {
  1688. if (ctx.dev_id < 0) {
  1689. fprintf(stderr, "device id isn't provided for recovering\n");
  1690. ret = -EINVAL;
  1691. } else {
  1692. ctx.recovery = 1;
  1693. ret = cmd_dev_add(&ctx);
  1694. }
  1695. } else if (!strcmp(cmd, "del"))
  1696. ret = cmd_dev_del(&ctx);
  1697. else if (!strcmp(cmd, "stop"))
  1698. ret = cmd_dev_stop(&ctx);
  1699. else if (!strcmp(cmd, "list")) {
  1700. ctx.all = 1;
  1701. ret = cmd_dev_list(&ctx);
  1702. } else if (!strcmp(cmd, "help"))
  1703. ret = cmd_dev_help(argv[0]);
  1704. else if (!strcmp(cmd, "features"))
  1705. ret = cmd_dev_get_features();
  1706. else if (!strcmp(cmd, "update_size"))
  1707. ret = cmd_dev_update_size(&ctx);
  1708. else if (!strcmp(cmd, "quiesce"))
  1709. ret = cmd_dev_quiesce(&ctx);
  1710. else
  1711. cmd_dev_help(argv[0]);
  1712. return ret;
  1713. }