napi.c 9.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396
  1. // SPDX-License-Identifier: GPL-2.0
  2. #include "io_uring.h"
  3. #include "napi.h"
  4. #ifdef CONFIG_NET_RX_BUSY_POLL
  5. /* Timeout for cleanout of stale entries. */
  6. #define NAPI_TIMEOUT (60 * SEC_CONVERSION)
  7. struct io_napi_entry {
  8. unsigned int napi_id;
  9. struct list_head list;
  10. unsigned long timeout;
  11. struct hlist_node node;
  12. struct rcu_head rcu;
  13. };
  14. static struct io_napi_entry *io_napi_hash_find(struct hlist_head *hash_list,
  15. unsigned int napi_id)
  16. {
  17. struct io_napi_entry *e;
  18. hlist_for_each_entry_rcu(e, hash_list, node) {
  19. if (e->napi_id != napi_id)
  20. continue;
  21. return e;
  22. }
  23. return NULL;
  24. }
  25. static inline ktime_t net_to_ktime(unsigned long t)
  26. {
  27. /* napi approximating usecs, reverse busy_loop_current_time */
  28. return ns_to_ktime(t << 10);
  29. }
  30. int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id)
  31. {
  32. struct hlist_head *hash_list;
  33. struct io_napi_entry *e;
  34. /* Non-NAPI IDs can be rejected. */
  35. if (!napi_id_valid(napi_id))
  36. return -EINVAL;
  37. hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))];
  38. scoped_guard(rcu) {
  39. e = io_napi_hash_find(hash_list, napi_id);
  40. if (e) {
  41. WRITE_ONCE(e->timeout, jiffies + NAPI_TIMEOUT);
  42. return -EEXIST;
  43. }
  44. }
  45. e = kmalloc(sizeof(*e), GFP_NOWAIT);
  46. if (!e)
  47. return -ENOMEM;
  48. e->napi_id = napi_id;
  49. e->timeout = jiffies + NAPI_TIMEOUT;
  50. /*
  51. * guard(spinlock) is not used to manually unlock it before calling
  52. * kfree()
  53. */
  54. spin_lock(&ctx->napi_lock);
  55. if (unlikely(io_napi_hash_find(hash_list, napi_id))) {
  56. spin_unlock(&ctx->napi_lock);
  57. kfree(e);
  58. return -EEXIST;
  59. }
  60. hlist_add_tail_rcu(&e->node, hash_list);
  61. list_add_tail_rcu(&e->list, &ctx->napi_list);
  62. spin_unlock(&ctx->napi_lock);
  63. return 0;
  64. }
  65. static int __io_napi_del_id(struct io_ring_ctx *ctx, unsigned int napi_id)
  66. {
  67. struct hlist_head *hash_list;
  68. struct io_napi_entry *e;
  69. /* Non-NAPI IDs can be rejected. */
  70. if (!napi_id_valid(napi_id))
  71. return -EINVAL;
  72. hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))];
  73. guard(spinlock)(&ctx->napi_lock);
  74. e = io_napi_hash_find(hash_list, napi_id);
  75. if (!e)
  76. return -ENOENT;
  77. list_del_rcu(&e->list);
  78. hash_del_rcu(&e->node);
  79. kfree_rcu(e, rcu);
  80. return 0;
  81. }
  82. static void __io_napi_remove_stale(struct io_ring_ctx *ctx)
  83. {
  84. struct io_napi_entry *e;
  85. guard(spinlock)(&ctx->napi_lock);
  86. /*
  87. * list_for_each_entry_safe() is not required as long as:
  88. * 1. list_del_rcu() does not reset the deleted node next pointer
  89. * 2. kfree_rcu() delays the memory freeing until the next quiescent
  90. * state
  91. */
  92. list_for_each_entry(e, &ctx->napi_list, list) {
  93. if (time_after(jiffies, READ_ONCE(e->timeout))) {
  94. list_del_rcu(&e->list);
  95. hash_del_rcu(&e->node);
  96. kfree_rcu(e, rcu);
  97. }
  98. }
  99. }
  100. static inline void io_napi_remove_stale(struct io_ring_ctx *ctx, bool is_stale)
  101. {
  102. if (is_stale)
  103. __io_napi_remove_stale(ctx);
  104. }
  105. static inline bool io_napi_busy_loop_timeout(ktime_t start_time,
  106. ktime_t bp)
  107. {
  108. if (bp) {
  109. ktime_t end_time = ktime_add(start_time, bp);
  110. ktime_t now = net_to_ktime(busy_loop_current_time());
  111. return ktime_after(now, end_time);
  112. }
  113. return true;
  114. }
  115. static bool io_napi_busy_loop_should_end(void *data,
  116. unsigned long start_time)
  117. {
  118. struct io_wait_queue *iowq = data;
  119. if (signal_pending(current))
  120. return true;
  121. if (io_should_wake(iowq) || io_has_work(iowq->ctx))
  122. return true;
  123. if (io_napi_busy_loop_timeout(net_to_ktime(start_time),
  124. iowq->napi_busy_poll_dt))
  125. return true;
  126. return false;
  127. }
  128. /*
  129. * never report stale entries
  130. */
  131. static bool static_tracking_do_busy_loop(struct io_ring_ctx *ctx,
  132. bool (*loop_end)(void *, unsigned long),
  133. void *loop_end_arg)
  134. {
  135. struct io_napi_entry *e;
  136. list_for_each_entry_rcu(e, &ctx->napi_list, list)
  137. napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg,
  138. ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET);
  139. return false;
  140. }
  141. static bool
  142. dynamic_tracking_do_busy_loop(struct io_ring_ctx *ctx,
  143. bool (*loop_end)(void *, unsigned long),
  144. void *loop_end_arg)
  145. {
  146. struct io_napi_entry *e;
  147. bool is_stale = false;
  148. list_for_each_entry_rcu(e, &ctx->napi_list, list) {
  149. napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg,
  150. ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET);
  151. if (time_after(jiffies, READ_ONCE(e->timeout)))
  152. is_stale = true;
  153. }
  154. return is_stale;
  155. }
  156. static inline bool
  157. __io_napi_do_busy_loop(struct io_ring_ctx *ctx,
  158. bool (*loop_end)(void *, unsigned long),
  159. void *loop_end_arg)
  160. {
  161. if (READ_ONCE(ctx->napi_track_mode) == IO_URING_NAPI_TRACKING_STATIC)
  162. return static_tracking_do_busy_loop(ctx, loop_end, loop_end_arg);
  163. return dynamic_tracking_do_busy_loop(ctx, loop_end, loop_end_arg);
  164. }
  165. static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx,
  166. struct io_wait_queue *iowq)
  167. {
  168. unsigned long start_time = busy_loop_current_time();
  169. bool (*loop_end)(void *, unsigned long) = NULL;
  170. void *loop_end_arg = NULL;
  171. bool is_stale = false;
  172. /* Singular lists use a different napi loop end check function and are
  173. * only executed once.
  174. */
  175. if (list_is_singular(&ctx->napi_list)) {
  176. loop_end = io_napi_busy_loop_should_end;
  177. loop_end_arg = iowq;
  178. }
  179. scoped_guard(rcu) {
  180. do {
  181. is_stale = __io_napi_do_busy_loop(ctx, loop_end,
  182. loop_end_arg);
  183. } while (!io_napi_busy_loop_should_end(iowq, start_time) &&
  184. !loop_end_arg);
  185. }
  186. io_napi_remove_stale(ctx, is_stale);
  187. }
  188. /*
  189. * io_napi_init() - Init napi settings
  190. * @ctx: pointer to io-uring context structure
  191. *
  192. * Init napi settings in the io-uring context.
  193. */
  194. void io_napi_init(struct io_ring_ctx *ctx)
  195. {
  196. u64 sys_dt = READ_ONCE(sysctl_net_busy_poll) * NSEC_PER_USEC;
  197. INIT_LIST_HEAD(&ctx->napi_list);
  198. spin_lock_init(&ctx->napi_lock);
  199. ctx->napi_prefer_busy_poll = false;
  200. ctx->napi_busy_poll_dt = ns_to_ktime(sys_dt);
  201. ctx->napi_track_mode = IO_URING_NAPI_TRACKING_INACTIVE;
  202. }
  203. /*
  204. * io_napi_free() - Deallocate napi
  205. * @ctx: pointer to io-uring context structure
  206. *
  207. * Free the napi list and the hash table in the io-uring context.
  208. */
  209. void io_napi_free(struct io_ring_ctx *ctx)
  210. {
  211. struct io_napi_entry *e;
  212. guard(spinlock)(&ctx->napi_lock);
  213. list_for_each_entry(e, &ctx->napi_list, list) {
  214. hash_del_rcu(&e->node);
  215. kfree_rcu(e, rcu);
  216. }
  217. INIT_LIST_HEAD_RCU(&ctx->napi_list);
  218. }
  219. static int io_napi_register_napi(struct io_ring_ctx *ctx,
  220. struct io_uring_napi *napi)
  221. {
  222. switch (napi->op_param) {
  223. case IO_URING_NAPI_TRACKING_DYNAMIC:
  224. case IO_URING_NAPI_TRACKING_STATIC:
  225. break;
  226. default:
  227. return -EINVAL;
  228. }
  229. /* clean the napi list for new settings */
  230. io_napi_free(ctx);
  231. WRITE_ONCE(ctx->napi_track_mode, napi->op_param);
  232. WRITE_ONCE(ctx->napi_busy_poll_dt, napi->busy_poll_to * NSEC_PER_USEC);
  233. WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi->prefer_busy_poll);
  234. return 0;
  235. }
  236. /*
  237. * io_napi_register() - Register napi with io-uring
  238. * @ctx: pointer to io-uring context structure
  239. * @arg: pointer to io_uring_napi structure
  240. *
  241. * Register napi in the io-uring context.
  242. */
  243. int io_register_napi(struct io_ring_ctx *ctx, void __user *arg)
  244. {
  245. const struct io_uring_napi curr = {
  246. .busy_poll_to = ktime_to_us(ctx->napi_busy_poll_dt),
  247. .prefer_busy_poll = ctx->napi_prefer_busy_poll,
  248. .op_param = ctx->napi_track_mode
  249. };
  250. struct io_uring_napi napi;
  251. if (ctx->flags & IORING_SETUP_IOPOLL)
  252. return -EINVAL;
  253. if (copy_from_user(&napi, arg, sizeof(napi)))
  254. return -EFAULT;
  255. if (napi.pad[0] || napi.pad[1] || napi.resv)
  256. return -EINVAL;
  257. if (copy_to_user(arg, &curr, sizeof(curr)))
  258. return -EFAULT;
  259. switch (napi.opcode) {
  260. case IO_URING_NAPI_REGISTER_OP:
  261. return io_napi_register_napi(ctx, &napi);
  262. case IO_URING_NAPI_STATIC_ADD_ID:
  263. if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC)
  264. return -EINVAL;
  265. return __io_napi_add_id(ctx, napi.op_param);
  266. case IO_URING_NAPI_STATIC_DEL_ID:
  267. if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC)
  268. return -EINVAL;
  269. return __io_napi_del_id(ctx, napi.op_param);
  270. default:
  271. return -EINVAL;
  272. }
  273. }
  274. /*
  275. * io_napi_unregister() - Unregister napi with io-uring
  276. * @ctx: pointer to io-uring context structure
  277. * @arg: pointer to io_uring_napi structure
  278. *
  279. * Unregister napi. If arg has been specified copy the busy poll timeout and
  280. * prefer busy poll setting to the passed in structure.
  281. */
  282. int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg)
  283. {
  284. const struct io_uring_napi curr = {
  285. .busy_poll_to = ktime_to_us(ctx->napi_busy_poll_dt),
  286. .prefer_busy_poll = ctx->napi_prefer_busy_poll
  287. };
  288. if (arg && copy_to_user(arg, &curr, sizeof(curr)))
  289. return -EFAULT;
  290. WRITE_ONCE(ctx->napi_busy_poll_dt, 0);
  291. WRITE_ONCE(ctx->napi_prefer_busy_poll, false);
  292. WRITE_ONCE(ctx->napi_track_mode, IO_URING_NAPI_TRACKING_INACTIVE);
  293. return 0;
  294. }
  295. /*
  296. * __io_napi_busy_loop() - execute busy poll loop
  297. * @ctx: pointer to io-uring context structure
  298. * @iowq: pointer to io wait queue
  299. *
  300. * Execute the busy poll loop and merge the spliced off list.
  301. */
  302. void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq)
  303. {
  304. if (ctx->flags & IORING_SETUP_SQPOLL)
  305. return;
  306. iowq->napi_busy_poll_dt = READ_ONCE(ctx->napi_busy_poll_dt);
  307. if (iowq->timeout != KTIME_MAX) {
  308. ktime_t dt = ktime_sub(iowq->timeout, io_get_time(ctx));
  309. iowq->napi_busy_poll_dt = min_t(u64, iowq->napi_busy_poll_dt, dt);
  310. }
  311. iowq->napi_prefer_busy_poll = READ_ONCE(ctx->napi_prefer_busy_poll);
  312. io_napi_blocking_busy_loop(ctx, iowq);
  313. }
  314. /*
  315. * io_napi_sqpoll_busy_poll() - busy poll loop for sqpoll
  316. * @ctx: pointer to io-uring context structure
  317. *
  318. * Splice of the napi list and execute the napi busy poll loop.
  319. */
  320. int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx)
  321. {
  322. bool is_stale = false;
  323. if (!READ_ONCE(ctx->napi_busy_poll_dt))
  324. return 0;
  325. if (list_empty_careful(&ctx->napi_list))
  326. return 0;
  327. scoped_guard(rcu) {
  328. is_stale = __io_napi_do_busy_loop(ctx, NULL, NULL);
  329. }
  330. io_napi_remove_stale(ctx, is_stale);
  331. return 1;
  332. }
  333. #endif