xsk.c 44 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972
  1. // SPDX-License-Identifier: GPL-2.0
  2. /* XDP sockets
  3. *
  4. * AF_XDP sockets allow a channel between XDP programs and userspace
  5. * applications.
  6. * Copyright(c) 2018 Intel Corporation.
  7. *
  8. * Author(s): Björn Töpel <bjorn.topel@intel.com>
  9. * Magnus Karlsson <magnus.karlsson@intel.com>
  10. */
  11. #define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__
  12. #include <linux/if_xdp.h>
  13. #include <linux/init.h>
  14. #include <linux/sched/mm.h>
  15. #include <linux/sched/signal.h>
  16. #include <linux/sched/task.h>
  17. #include <linux/socket.h>
  18. #include <linux/file.h>
  19. #include <linux/uaccess.h>
  20. #include <linux/net.h>
  21. #include <linux/netdevice.h>
  22. #include <linux/rculist.h>
  23. #include <linux/vmalloc.h>
  24. #include <net/xdp_sock_drv.h>
  25. #include <net/busy_poll.h>
  26. #include <net/netdev_lock.h>
  27. #include <net/netdev_rx_queue.h>
  28. #include <net/xdp.h>
  29. #include "xsk_queue.h"
  30. #include "xdp_umem.h"
  31. #include "xsk.h"
  /* NOTE(review): TX_BATCH_SIZE has no user in the portion of the file
   * reviewed here - confirm whether it is still referenced elsewhere.
   */
  32. #define TX_BATCH_SIZE 32
  /* Upper bound for xs->tx_budget_spent: xsk_tx_peek_desc() skips a socket
   * that has reached it until the budgets of all sockets are reset.
   */
  33. #define MAX_PER_SOCKET_BUDGET 32
  /* Per-skb record of the umem addresses of a multi-descriptor Tx packet.
   * It is allocated from xsk_tx_generic_cache when a second descriptor is
   * attached to an skb and is stored in skb_shinfo(skb)->destructor_arg.
   */
  34. struct xsk_addrs {
  /* number of valid entries in addrs[] */
  35. u32 num_descs;
  /* descriptor addresses, in the order the descriptors were attached */
  36. u64 addrs[MAX_SKB_FRAGS + 1];
  37. };
  /* kmem cache used for the struct xsk_addrs allocations in this file. */
  38. static struct kmem_cache *xsk_tx_generic_cache;
  39. void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
  40. {
  41. if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
  42. return;
  43. pool->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
  44. pool->cached_need_wakeup |= XDP_WAKEUP_RX;
  45. }
  46. EXPORT_SYMBOL(xsk_set_rx_need_wakeup);
  47. void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool)
  48. {
  49. struct xdp_sock *xs;
  50. if (pool->cached_need_wakeup & XDP_WAKEUP_TX)
  51. return;
  52. rcu_read_lock();
  53. list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
  54. xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
  55. }
  56. rcu_read_unlock();
  57. pool->cached_need_wakeup |= XDP_WAKEUP_TX;
  58. }
  59. EXPORT_SYMBOL(xsk_set_tx_need_wakeup);
  60. void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool)
  61. {
  62. if (!(pool->cached_need_wakeup & XDP_WAKEUP_RX))
  63. return;
  64. pool->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
  65. pool->cached_need_wakeup &= ~XDP_WAKEUP_RX;
  66. }
  67. EXPORT_SYMBOL(xsk_clear_rx_need_wakeup);
  68. void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool)
  69. {
  70. struct xdp_sock *xs;
  71. if (!(pool->cached_need_wakeup & XDP_WAKEUP_TX))
  72. return;
  73. rcu_read_lock();
  74. list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
  75. xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP;
  76. }
  77. rcu_read_unlock();
  78. pool->cached_need_wakeup &= ~XDP_WAKEUP_TX;
  79. }
  80. EXPORT_SYMBOL(xsk_clear_tx_need_wakeup);
  81. bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool)
  82. {
  83. return pool->uses_need_wakeup;
  84. }
  85. EXPORT_SYMBOL(xsk_uses_need_wakeup);
  86. struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev,
  87. u16 queue_id)
  88. {
  89. if (queue_id < dev->real_num_rx_queues)
  90. return dev->_rx[queue_id].pool;
  91. if (queue_id < dev->real_num_tx_queues)
  92. return dev->_tx[queue_id].pool;
  93. return NULL;
  94. }
  95. EXPORT_SYMBOL(xsk_get_pool_from_qid);
  96. void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id)
  97. {
  98. if (queue_id < dev->num_rx_queues)
  99. dev->_rx[queue_id].pool = NULL;
  100. if (queue_id < dev->num_tx_queues)
  101. dev->_tx[queue_id].pool = NULL;
  102. }
  103. /* The buffer pool is stored both in the _rx struct and the _tx struct as we do
  104. * not know if the device has more tx queues than rx, or the opposite.
  105. * This might also change during run time.
  106. */
  107. int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
  108. u16 queue_id)
  109. {
  110. if (queue_id >= max_t(unsigned int,
  111. dev->real_num_rx_queues,
  112. dev->real_num_tx_queues))
  113. return -EINVAL;
  114. if (queue_id < dev->real_num_rx_queues)
  115. dev->_rx[queue_id].pool = pool;
  116. if (queue_id < dev->real_num_tx_queues)
  117. dev->_tx[queue_id].pool = pool;
  118. return 0;
  119. }
  120. static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, u32 len,
  121. u32 flags)
  122. {
  123. u64 addr;
  124. int err;
  125. addr = xp_get_handle(xskb, xskb->pool);
  126. err = xskq_prod_reserve_desc(xs->rx, addr, len, flags);
  127. if (err) {
  128. xs->rx_queue_full++;
  129. return err;
  130. }
  131. xp_release(xskb);
  132. return 0;
  133. }
  134. static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
  135. {
  136. struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
  137. u32 frags = xdp_buff_has_frags(xdp);
  138. struct xdp_buff_xsk *pos, *tmp;
  139. struct list_head *xskb_list;
  140. u32 contd = 0;
  141. u32 num_desc;
  142. int err;
  143. if (likely(!frags)) {
  144. err = __xsk_rcv_zc(xs, xskb, len, contd);
  145. if (err)
  146. goto err;
  147. return 0;
  148. }
  149. contd = XDP_PKT_CONTD;
  150. num_desc = xdp_get_shared_info_from_buff(xdp)->nr_frags + 1;
  151. if (xskq_prod_nb_free(xs->rx, num_desc) < num_desc) {
  152. xs->rx_queue_full++;
  153. err = -ENOBUFS;
  154. goto err;
  155. }
  156. __xsk_rcv_zc(xs, xskb, len, contd);
  157. xskb_list = &xskb->pool->xskb_list;
  158. list_for_each_entry_safe(pos, tmp, xskb_list, list_node) {
  159. if (list_is_singular(xskb_list))
  160. contd = 0;
  161. len = pos->xdp.data_end - pos->xdp.data;
  162. __xsk_rcv_zc(xs, pos, len, contd);
  163. list_del_init(&pos->list_node);
  164. }
  165. return 0;
  166. err:
  167. xsk_buff_free(xdp);
  168. return err;
  169. }
  170. static void *xsk_copy_xdp_start(struct xdp_buff *from)
  171. {
  172. if (unlikely(xdp_data_meta_unsupported(from)))
  173. return from->data;
  174. else
  175. return from->data_meta;
  176. }
  177. static u32 xsk_copy_xdp(void *to, void **from, u32 to_len,
  178. u32 *from_len, skb_frag_t **frag, u32 rem)
  179. {
  180. u32 copied = 0;
  181. while (1) {
  182. u32 copy_len = min_t(u32, *from_len, to_len);
  183. memcpy(to, *from, copy_len);
  184. copied += copy_len;
  185. if (rem == copied)
  186. return copied;
  187. if (*from_len == copy_len) {
  188. *from = skb_frag_address(*frag);
  189. *from_len = skb_frag_size((*frag)++);
  190. } else {
  191. *from += copy_len;
  192. *from_len -= copy_len;
  193. }
  194. if (to_len == copy_len)
  195. return copied;
  196. to_len -= copy_len;
  197. to += copy_len;
  198. }
  199. }
  200. static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
  201. {
  202. u32 frame_size = __xsk_pool_get_rx_frame_size(xs->pool);
  203. void *copy_from = xsk_copy_xdp_start(xdp), *copy_to;
  204. u32 from_len, meta_len, rem, num_desc;
  205. struct xdp_buff_xsk *xskb;
  206. struct xdp_buff *xsk_xdp;
  207. skb_frag_t *frag;
  208. from_len = xdp->data_end - copy_from;
  209. meta_len = xdp->data - copy_from;
  210. rem = len + meta_len;
  211. if (len <= frame_size && !xdp_buff_has_frags(xdp)) {
  212. int err;
  213. xsk_xdp = xsk_buff_alloc(xs->pool);
  214. if (!xsk_xdp) {
  215. xs->rx_dropped++;
  216. return -ENOMEM;
  217. }
  218. memcpy(xsk_xdp->data - meta_len, copy_from, rem);
  219. xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp);
  220. err = __xsk_rcv_zc(xs, xskb, len, 0);
  221. if (err) {
  222. xsk_buff_free(xsk_xdp);
  223. return err;
  224. }
  225. return 0;
  226. }
  227. num_desc = (len - 1) / frame_size + 1;
  228. if (!xsk_buff_can_alloc(xs->pool, num_desc)) {
  229. xs->rx_dropped++;
  230. return -ENOMEM;
  231. }
  232. if (xskq_prod_nb_free(xs->rx, num_desc) < num_desc) {
  233. xs->rx_queue_full++;
  234. return -ENOBUFS;
  235. }
  236. if (xdp_buff_has_frags(xdp)) {
  237. struct skb_shared_info *sinfo;
  238. sinfo = xdp_get_shared_info_from_buff(xdp);
  239. frag = &sinfo->frags[0];
  240. }
  241. do {
  242. u32 to_len = frame_size + meta_len;
  243. u32 copied;
  244. xsk_xdp = xsk_buff_alloc(xs->pool);
  245. copy_to = xsk_xdp->data - meta_len;
  246. copied = xsk_copy_xdp(copy_to, &copy_from, to_len, &from_len, &frag, rem);
  247. rem -= copied;
  248. xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp);
  249. __xsk_rcv_zc(xs, xskb, copied - meta_len, rem ? XDP_PKT_CONTD : 0);
  250. meta_len = 0;
  251. } while (rem);
  252. return 0;
  253. }
  254. static bool xsk_tx_writeable(struct xdp_sock *xs)
  255. {
  256. if (xskq_cons_present_entries(xs->tx) > xs->tx->nentries / 2)
  257. return false;
  258. return true;
  259. }
  260. static void __xsk_tx_release(struct xdp_sock *xs)
  261. {
  262. __xskq_cons_release(xs->tx);
  263. if (xsk_tx_writeable(xs))
  264. xs->sk.sk_write_space(&xs->sk);
  265. }
  266. static bool xsk_is_bound(struct xdp_sock *xs)
  267. {
  268. if (READ_ONCE(xs->state) == XSK_BOUND) {
  269. /* Matches smp_wmb() in bind(). */
  270. smp_rmb();
  271. return true;
  272. }
  273. return false;
  274. }
  275. static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
  276. {
  277. if (!xsk_is_bound(xs))
  278. return -ENXIO;
  279. if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
  280. return -EINVAL;
  281. if (len > __xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) {
  282. xs->rx_dropped++;
  283. return -ENOSPC;
  284. }
  285. return 0;
  286. }
  287. static void xsk_flush(struct xdp_sock *xs)
  288. {
  289. xskq_prod_submit(xs->rx);
  290. __xskq_cons_release(xs->pool->fq);
  291. sock_def_readable(&xs->sk);
  292. }
  293. int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
  294. {
  295. u32 len = xdp_get_buff_len(xdp);
  296. int err;
  297. err = xsk_rcv_check(xs, xdp, len);
  298. if (!err) {
  299. spin_lock_bh(&xs->pool->rx_lock);
  300. err = __xsk_rcv(xs, xdp, len);
  301. xsk_flush(xs);
  302. spin_unlock_bh(&xs->pool->rx_lock);
  303. }
  304. return err;
  305. }
  306. static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
  307. {
  308. u32 len = xdp_get_buff_len(xdp);
  309. int err;
  310. err = xsk_rcv_check(xs, xdp, len);
  311. if (err)
  312. return err;
  313. if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) {
  314. len = xdp->data_end - xdp->data;
  315. return xsk_rcv_zc(xs, xdp, len);
  316. }
  317. err = __xsk_rcv(xs, xdp, len);
  318. if (!err)
  319. xdp_return_buff(xdp);
  320. return err;
  321. }
  322. int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
  323. {
  324. int err;
  325. err = xsk_rcv(xs, xdp);
  326. if (err)
  327. return err;
  328. if (!xs->flush_node.prev) {
  329. struct list_head *flush_list = bpf_net_ctx_get_xskmap_flush_list();
  330. list_add(&xs->flush_node, flush_list);
  331. }
  332. return 0;
  333. }
  334. void __xsk_map_flush(struct list_head *flush_list)
  335. {
  336. struct xdp_sock *xs, *tmp;
  337. list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
  338. xsk_flush(xs);
  339. __list_del_clearprev(&xs->flush_node);
  340. }
  341. }
  342. void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries)
  343. {
  344. xskq_prod_submit_n(pool->cq, nb_entries);
  345. }
  346. EXPORT_SYMBOL(xsk_tx_completed);
  347. void xsk_tx_release(struct xsk_buff_pool *pool)
  348. {
  349. struct xdp_sock *xs;
  350. rcu_read_lock();
  351. list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list)
  352. __xsk_tx_release(xs);
  353. rcu_read_unlock();
  354. }
  355. EXPORT_SYMBOL(xsk_tx_release);
  356. bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc)
  357. {
  358. bool budget_exhausted = false;
  359. struct xdp_sock *xs;
  360. rcu_read_lock();
  361. again:
  362. list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
  363. if (xs->tx_budget_spent >= MAX_PER_SOCKET_BUDGET) {
  364. budget_exhausted = true;
  365. continue;
  366. }
  367. if (!xskq_cons_peek_desc(xs->tx, desc, pool)) {
  368. if (xskq_has_descs(xs->tx))
  369. xskq_cons_release(xs->tx);
  370. continue;
  371. }
  372. xs->tx_budget_spent++;
  373. /* This is the backpressure mechanism for the Tx path.
  374. * Reserve space in the completion queue and only proceed
  375. * if there is space in it. This avoids having to implement
  376. * any buffering in the Tx path.
  377. */
  378. if (xskq_prod_reserve_addr(pool->cq, desc->addr))
  379. goto out;
  380. xskq_cons_release(xs->tx);
  381. rcu_read_unlock();
  382. return true;
  383. }
  384. if (budget_exhausted) {
  385. list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list)
  386. xs->tx_budget_spent = 0;
  387. budget_exhausted = false;
  388. goto again;
  389. }
  390. out:
  391. rcu_read_unlock();
  392. return false;
  393. }
  394. EXPORT_SYMBOL(xsk_tx_peek_desc);
  395. static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, u32 max_entries)
  396. {
  397. struct xdp_desc *descs = pool->tx_descs;
  398. u32 nb_pkts = 0;
  399. while (nb_pkts < max_entries && xsk_tx_peek_desc(pool, &descs[nb_pkts]))
  400. nb_pkts++;
  401. xsk_tx_release(pool);
  402. return nb_pkts;
  403. }
  404. u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 nb_pkts)
  405. {
  406. struct xdp_sock *xs;
  407. rcu_read_lock();
  408. if (!list_is_singular(&pool->xsk_tx_list)) {
  409. /* Fallback to the non-batched version */
  410. rcu_read_unlock();
  411. return xsk_tx_peek_release_fallback(pool, nb_pkts);
  412. }
  413. xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list);
  414. if (!xs) {
  415. nb_pkts = 0;
  416. goto out;
  417. }
  418. nb_pkts = xskq_cons_nb_entries(xs->tx, nb_pkts);
  419. /* This is the backpressure mechanism for the Tx path. Try to
  420. * reserve space in the completion queue for all packets, but
  421. * if there are fewer slots available, just process that many
  422. * packets. This avoids having to implement any buffering in
  423. * the Tx path.
  424. */
  425. nb_pkts = xskq_prod_nb_free(pool->cq, nb_pkts);
  426. if (!nb_pkts)
  427. goto out;
  428. nb_pkts = xskq_cons_read_desc_batch(xs->tx, pool, nb_pkts);
  429. if (!nb_pkts) {
  430. xs->tx->queue_empty_descs++;
  431. goto out;
  432. }
  433. __xskq_cons_release(xs->tx);
  434. xskq_prod_write_addr_batch(pool->cq, pool->tx_descs, nb_pkts);
  435. xs->sk.sk_write_space(&xs->sk);
  436. out:
  437. rcu_read_unlock();
  438. return nb_pkts;
  439. }
  440. EXPORT_SYMBOL(xsk_tx_peek_release_desc_batch);
  441. static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
  442. {
  443. struct net_device *dev = xs->dev;
  444. return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
  445. }
  446. static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool)
  447. {
  448. int ret;
  449. spin_lock(&pool->cq->cq_cached_prod_lock);
  450. ret = xskq_prod_reserve(pool->cq);
  451. spin_unlock(&pool->cq->cq_cached_prod_lock);
  452. return ret;
  453. }
  454. static bool xsk_skb_destructor_is_addr(struct sk_buff *skb)
  455. {
  456. return (uintptr_t)skb_shinfo(skb)->destructor_arg & 0x1UL;
  457. }
  458. static u64 xsk_skb_destructor_get_addr(struct sk_buff *skb)
  459. {
  460. return (u64)((uintptr_t)skb_shinfo(skb)->destructor_arg & ~0x1UL);
  461. }
  462. static void xsk_skb_destructor_set_addr(struct sk_buff *skb, u64 addr)
  463. {
  464. skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t)addr | 0x1UL);
  465. }
  466. static void xsk_inc_num_desc(struct sk_buff *skb)
  467. {
  468. struct xsk_addrs *xsk_addr;
  469. if (!xsk_skb_destructor_is_addr(skb)) {
  470. xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
  471. xsk_addr->num_descs++;
  472. }
  473. }
  474. static u32 xsk_get_num_desc(struct sk_buff *skb)
  475. {
  476. struct xsk_addrs *xsk_addr;
  477. if (xsk_skb_destructor_is_addr(skb))
  478. return 1;
  479. xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
  480. return xsk_addr->num_descs;
  481. }
  482. static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool,
  483. struct sk_buff *skb)
  484. {
  485. u32 num_descs = xsk_get_num_desc(skb);
  486. struct xsk_addrs *xsk_addr;
  487. u32 descs_processed = 0;
  488. unsigned long flags;
  489. u32 idx, i;
  490. spin_lock_irqsave(&pool->cq_prod_lock, flags);
  491. idx = xskq_get_prod(pool->cq);
  492. if (unlikely(num_descs > 1)) {
  493. xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
  494. for (i = 0; i < num_descs; i++) {
  495. xskq_prod_write_addr(pool->cq, idx + descs_processed,
  496. xsk_addr->addrs[i]);
  497. descs_processed++;
  498. }
  499. kmem_cache_free(xsk_tx_generic_cache, xsk_addr);
  500. } else {
  501. xskq_prod_write_addr(pool->cq, idx,
  502. xsk_skb_destructor_get_addr(skb));
  503. descs_processed++;
  504. }
  505. xskq_prod_submit_n(pool->cq, descs_processed);
  506. spin_unlock_irqrestore(&pool->cq_prod_lock, flags);
  507. }
  508. static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
  509. {
  510. spin_lock(&pool->cq->cq_cached_prod_lock);
  511. xskq_prod_cancel_n(pool->cq, n);
  512. spin_unlock(&pool->cq->cq_cached_prod_lock);
  513. }
  514. INDIRECT_CALLABLE_SCOPE
  515. void xsk_destruct_skb(struct sk_buff *skb)
  516. {
  517. struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta;
  518. if (compl->tx_timestamp) {
  519. /* sw completion timestamp, not a real one */
  520. *compl->tx_timestamp = ktime_get_tai_fast_ns();
  521. }
  522. xsk_cq_submit_addr_locked(xdp_sk(skb->sk)->pool, skb);
  523. sock_wfree(skb);
  524. }
  525. static void xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs,
  526. u64 addr)
  527. {
  528. skb->dev = xs->dev;
  529. skb->priority = READ_ONCE(xs->sk.sk_priority);
  530. skb->mark = READ_ONCE(xs->sk.sk_mark);
  531. skb->destructor = xsk_destruct_skb;
  532. xsk_skb_destructor_set_addr(skb, addr);
  533. }
  534. static void xsk_consume_skb(struct sk_buff *skb)
  535. {
  536. struct xdp_sock *xs = xdp_sk(skb->sk);
  537. u32 num_descs = xsk_get_num_desc(skb);
  538. struct xsk_addrs *xsk_addr;
  539. if (unlikely(num_descs > 1)) {
  540. xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
  541. kmem_cache_free(xsk_tx_generic_cache, xsk_addr);
  542. }
  543. skb->destructor = sock_wfree;
  544. xsk_cq_cancel_locked(xs->pool, num_descs);
  545. /* Free skb without triggering the perf drop trace */
  546. consume_skb(skb);
  547. xs->skb = NULL;
  548. }
  549. static void xsk_drop_skb(struct sk_buff *skb)
  550. {
  551. xdp_sk(skb->sk)->tx->invalid_descs += xsk_get_num_desc(skb);
  552. xsk_consume_skb(skb);
  553. }
  554. static int xsk_skb_metadata(struct sk_buff *skb, void *buffer,
  555. struct xdp_desc *desc, struct xsk_buff_pool *pool,
  556. u32 hr)
  557. {
  558. struct xsk_tx_metadata *meta = NULL;
  559. if (unlikely(pool->tx_metadata_len == 0))
  560. return -EINVAL;
  561. meta = buffer - pool->tx_metadata_len;
  562. if (unlikely(!xsk_buff_valid_tx_metadata(meta)))
  563. return -EINVAL;
  564. if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM) {
  565. if (unlikely(meta->request.csum_start +
  566. meta->request.csum_offset +
  567. sizeof(__sum16) > desc->len))
  568. return -EINVAL;
  569. skb->csum_start = hr + meta->request.csum_start;
  570. skb->csum_offset = meta->request.csum_offset;
  571. skb->ip_summed = CHECKSUM_PARTIAL;
  572. if (unlikely(pool->tx_sw_csum)) {
  573. int err;
  574. err = skb_checksum_help(skb);
  575. if (err)
  576. return err;
  577. }
  578. }
  579. if (meta->flags & XDP_TXMD_FLAGS_LAUNCH_TIME)
  580. skb->skb_mstamp_ns = meta->request.launch_time;
  581. xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta);
  582. return 0;
  583. }
  /* Build or extend a Tx skb whose page frags point straight at the umem
   * pages backing @desc, i.e. without copying the payload.
   *
   * When xs->skb is NULL a new skb with only headroom is allocated and
   * initialised (including Tx metadata if XDP_TX_METADATA is set in
   * desc->options). Otherwise the descriptor is appended to the skb in
   * xs->skb and its address is recorded in the skb's struct xsk_addrs,
   * which is allocated on the second descriptor of a packet.
   *
   * This function stores the address at addrs[num_descs] but does not bump
   * num_descs; xsk_build_skb() does that through xsk_inc_num_desc().
   *
   * Return: the skb on success, or an ERR_PTR(): the error from
   * sock_alloc_send_skb() or xsk_skb_metadata(), -ENOMEM when the
   * struct xsk_addrs cannot be allocated, or -EOVERFLOW when the frag index
   * reaches MAX_SKB_FRAGS.
   *
   * NOTE(review): when xsk_skb_metadata() fails, the skb allocated just
   * above is not freed here, and xsk_build_skb() sets its own skb pointer
   * to NULL on error - this looks like a leak of that skb; confirm.
   */
  584. static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
  585. struct xdp_desc *desc)
  586. {
  587. struct xsk_buff_pool *pool = xs->pool;
  588. u32 hr, len, ts, offset, copy, copied;
  589. struct sk_buff *skb = xs->skb;
  590. struct page *page;
  591. void *buffer;
  592. int err, i;
  593. u64 addr;
  594. addr = desc->addr;
  595. buffer = xsk_buff_raw_get_data(pool, addr);
  /* First descriptor of a packet: allocate the skb (headroom only). */
  596. if (!skb) {
  597. hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom));
  598. skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err);
  599. if (unlikely(!skb))
  600. return ERR_PTR(err);
  601. skb_reserve(skb, hr);
  602. xsk_skb_init_misc(skb, xs, desc->addr);
  603. if (desc->options & XDP_TX_METADATA) {
  604. err = xsk_skb_metadata(skb, buffer, desc, pool, hr);
  605. if (unlikely(err))
  606. return ERR_PTR(err);
  607. }
  608. } else {
  609. struct xsk_addrs *xsk_addr;
  /* Second descriptor: replace the tagged address kept in destructor_arg
   * by a struct xsk_addrs seeded with that first address.
   */
  610. if (xsk_skb_destructor_is_addr(skb)) {
  611. xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache,
  612. GFP_KERNEL);
  613. if (!xsk_addr)
  614. return ERR_PTR(-ENOMEM);
  615. xsk_addr->num_descs = 1;
  616. xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb);
  617. skb_shinfo(skb)->destructor_arg = (void *)xsk_addr;
  618. } else {
  619. xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
  620. }
  621. /* in case of -EOVERFLOW that could happen below,
  622. * xsk_consume_skb() will free this xsk_addrs as the whole skb
  623. * would be dropped
  624. */
  625. xsk_addr->addrs[xsk_addr->num_descs] = desc->addr;
  626. }
  627. len = desc->len;
  /* Truesize charged for this descriptor: its length in unaligned mode,
   * otherwise a whole chunk.
   */
  628. ts = pool->unaligned ? len : pool->chunk_size;
  629. offset = offset_in_page(buffer);
  /* From here on addr is the byte offset of the data inside pool->addrs,
   * used to index the umem page array.
   */
  630. addr = buffer - pool->addrs;
  /* Attach one frag per umem page touched by the descriptor; each page gets
   * an extra reference. Only the first frag starts at a non-zero offset.
   */
  631. for (copied = 0, i = skb_shinfo(skb)->nr_frags; copied < len; i++) {
  632. if (unlikely(i >= MAX_SKB_FRAGS))
  633. return ERR_PTR(-EOVERFLOW);
  634. page = pool->umem->pgs[addr >> PAGE_SHIFT];
  635. get_page(page);
  636. copy = min_t(u32, PAGE_SIZE - offset, len - copied);
  637. skb_fill_page_desc(skb, i, page, offset, copy);
  638. copied += copy;
  639. addr += copy;
  640. offset = 0;
  641. }
  642. skb->len += len;
  643. skb->data_len += len;
  644. skb->truesize += ts;
  /* Charge the same amount to the socket's write memory. */
  645. refcount_add(ts, &xs->sk.sk_wmem_alloc);
  646. return skb;
  647. }
  /* Build or extend the Tx skb for one descriptor in the generic Tx path.
   *
   * Devices with IFF_TX_SKB_NO_LINEAR use xsk_build_skb_zerocopy().
   * Otherwise the payload is copied: the first descriptor of a packet goes
   * into the linear area of a freshly allocated skb, every further
   * descriptor is copied into a newly allocated page that is attached as a
   * frag, and its address is recorded in the skb's struct xsk_addrs.
   *
   * On success the descriptor is accounted with xsk_inc_num_desc().
   *
   * Return: the skb, or an ERR_PTR(). For -EOVERFLOW the packet collected
   * in xs->skb has already been dropped and the Tx descriptor released.
   * For every other error one completion-queue reservation is cancelled so
   * that the application can retry.
   *
   * NOTE(review): in the free_err path "skb && !nr_frags" is also true when
   * skb is xs->skb and only its linear part is populated (failure while
   * adding the second descriptor). kfree_skb() would then free the skb
   * that xs->skb still points to - confirm whether this can happen.
   */
  648. static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
  649. struct xdp_desc *desc)
  650. {
  651. struct net_device *dev = xs->dev;
  652. struct sk_buff *skb = xs->skb;
  653. int err;
  654. if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
  655. skb = xsk_build_skb_zerocopy(xs, desc);
  656. if (IS_ERR(skb)) {
  657. err = PTR_ERR(skb);
  /* skb holds an error pointer here; clear it so free_err does not touch it. */
  658. skb = NULL;
  659. goto free_err;
  660. }
  661. } else {
  662. u32 hr, tr, len;
  663. void *buffer;
  664. buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
  665. len = desc->len;
  /* First descriptor: copy the payload into a new linear skb. */
  666. if (!skb) {
  667. hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom));
  668. tr = dev->needed_tailroom;
  669. skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err);
  670. if (unlikely(!skb))
  671. goto free_err;
  672. skb_reserve(skb, hr);
  673. skb_put(skb, len);
  674. err = skb_store_bits(skb, 0, buffer, len);
  675. if (unlikely(err))
  676. goto free_err;
  677. xsk_skb_init_misc(skb, xs, desc->addr);
  678. if (desc->options & XDP_TX_METADATA) {
  679. err = xsk_skb_metadata(skb, buffer, desc,
  680. xs->pool, hr);
  681. if (unlikely(err))
  682. goto free_err;
  683. }
  684. } else {
  /* Further descriptor of the packet: copy it into its own page frag. */
  685. int nr_frags = skb_shinfo(skb)->nr_frags;
  686. struct xsk_addrs *xsk_addr;
  687. struct page *page;
  688. u8 *vaddr;
  /* Second descriptor: switch destructor_arg from the tagged address to a
   * struct xsk_addrs seeded with that first address.
   */
  689. if (xsk_skb_destructor_is_addr(skb)) {
  690. xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache,
  691. GFP_KERNEL);
  692. if (!xsk_addr) {
  693. err = -ENOMEM;
  694. goto free_err;
  695. }
  696. xsk_addr->num_descs = 1;
  697. xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb);
  698. skb_shinfo(skb)->destructor_arg = (void *)xsk_addr;
  699. } else {
  700. xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
  701. }
  /* Refuse a descriptor that fills the last-but-one frag slot while
   * xp_mb_desc() says more descriptors follow.
   */
  702. if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) {
  703. err = -EOVERFLOW;
  704. goto free_err;
  705. }
  706. page = alloc_page(xs->sk.sk_allocation);
  707. if (unlikely(!page)) {
  708. err = -EAGAIN;
  709. goto free_err;
  710. }
  711. vaddr = kmap_local_page(page);
  712. memcpy(vaddr, buffer, len);
  713. kunmap_local(vaddr);
  714. skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE);
  /* Charge a full page to the socket's write memory. */
  715. refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc);
  716. xsk_addr->addrs[xsk_addr->num_descs] = desc->addr;
  717. }
  718. }
  719. xsk_inc_num_desc(skb);
  720. return skb;
  721. free_err:
  722. if (skb && !skb_shinfo(skb)->nr_frags)
  723. kfree_skb(skb);
  724. if (err == -EOVERFLOW) {
  725. /* Drop the packet */
  /* Count the offending descriptor as well before dropping xs->skb. */
  726. xsk_inc_num_desc(xs->skb);
  727. xsk_drop_skb(xs->skb);
  728. xskq_cons_release(xs->tx);
  729. } else {
  730. /* Let application retry */
  731. xsk_cq_cancel_locked(xs->pool, 1);
  732. }
  733. return ERR_PTR(err);
  734. }
  /* Generic (skb based) Tx path: drain the socket's Tx ring while holding
   * xs->mutex.
   *
   * For each descriptor peeked from xs->tx, a completion-queue entry is
   * reserved first and the skb is then built or extended with
   * xsk_build_skb(). While xp_mb_desc() is true for a descriptor, the skb
   * is parked in xs->skb and the next descriptor is processed; otherwise
   * the skb is handed to __dev_direct_xmit().
   *
   * Return:
   *  -ENXIO   the socket is not bound
   *  -EAGAIN  the budget read from xs->max_tx_budget is used up, no
   *           completion-queue entry could be reserved, or the device
   *           returned NETDEV_TX_BUSY
   *  -EBUSY   the device returned NET_XMIT_DROP
   *  other    the error from xsk_build_skb(), except -EOVERFLOW, which is
   *           turned into 0 and processing continues
   *  otherwise the value left in err by the last step, which is 0 when
   *  nothing was sent (including the case where xs->queue_id is not below
   *  real_num_tx_queues).
   */
  735. static int __xsk_generic_xmit(struct sock *sk)
  736. {
  737. struct xdp_sock *xs = xdp_sk(sk);
  738. bool sent_frame = false;
  739. struct xdp_desc desc;
  740. struct sk_buff *skb;
  741. u32 max_batch;
  742. int err = 0;
  743. mutex_lock(&xs->mutex);
  744. /* Since we dropped the RCU read lock, the socket state might have changed. */
  745. if (unlikely(!xsk_is_bound(xs))) {
  746. err = -ENXIO;
  747. goto out;
  748. }
  /* Queue not available for Tx: leave with err == 0. */
  749. if (xs->queue_id >= xs->dev->real_num_tx_queues)
  750. goto out;
  751. max_batch = READ_ONCE(xs->max_tx_budget);
  752. while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
  753. if (max_batch-- == 0) {
  754. err = -EAGAIN;
  755. goto out;
  756. }
  757. /* This is the backpressure mechanism for the Tx path.
  758. * Reserve space in the completion queue and only proceed
  759. * if there is space in it. This avoids having to implement
  760. * any buffering in the Tx path.
  761. */
  762. err = xsk_cq_reserve_locked(xs->pool);
  763. if (err) {
  764. err = -EAGAIN;
  765. goto out;
  766. }
  767. skb = xsk_build_skb(xs, &desc);
  768. if (IS_ERR(skb)) {
  769. err = PTR_ERR(skb);
  /* For -EOVERFLOW xsk_build_skb() has already dropped the packet, so carry on. */
  770. if (err != -EOVERFLOW)
  771. goto out;
  772. err = 0;
  773. continue;
  774. }
  775. xskq_cons_release(xs->tx);
  /* More descriptors belong to this packet: keep collecting in xs->skb. */
  776. if (xp_mb_desc(&desc)) {
  777. xs->skb = skb;
  778. continue;
  779. }
  780. err = __dev_direct_xmit(skb, xs->queue_id);
  781. if (err == NETDEV_TX_BUSY) {
  782. /* Tell user-space to retry the send */
  783. xskq_cons_cancel_n(xs->tx, xsk_get_num_desc(skb));
  784. xsk_consume_skb(skb);
  785. err = -EAGAIN;
  786. goto out;
  787. }
  788. /* Ignore NET_XMIT_CN as packet might have been sent */
  789. if (err == NET_XMIT_DROP) {
  790. /* SKB completed but not sent */
  791. err = -EBUSY;
  792. xs->skb = NULL;
  793. goto out;
  794. }
  795. sent_frame = true;
  796. xs->skb = NULL;
  797. }
  /* The peek loop ended although xskq_has_descs() still reports
   * descriptors: drop any partially collected packet and release the ring.
   * NOTE(review): presumably this is the invalid-descriptor case - confirm
   * against xskq_cons_peek_desc().
   */
  798. if (xskq_has_descs(xs->tx)) {
  799. if (xs->skb)
  800. xsk_drop_skb(xs->skb);
  801. xskq_cons_release(xs->tx);
  802. }
  803. out:
  /* Only release the Tx ring and signal writers if something was sent. */
  804. if (sent_frame)
  805. __xsk_tx_release(xs);
  806. mutex_unlock(&xs->mutex);
  807. return err;
  808. }
  809. static int xsk_generic_xmit(struct sock *sk)
  810. {
  811. int ret;
  812. /* Drop the RCU lock since the SKB path might sleep. */
  813. rcu_read_unlock();
  814. ret = __xsk_generic_xmit(sk);
  815. /* Reaquire RCU lock before going into common code. */
  816. rcu_read_lock();
  817. return ret;
  818. }
  819. static bool xsk_no_wakeup(struct sock *sk)
  820. {
  821. #ifdef CONFIG_NET_RX_BUSY_POLL
  822. /* Prefer busy-polling, skip the wakeup. */
  823. return READ_ONCE(sk->sk_prefer_busy_poll) && READ_ONCE(sk->sk_ll_usec) &&
  824. napi_id_valid(READ_ONCE(sk->sk_napi_id));
  825. #else
  826. return false;
  827. #endif
  828. }
  829. static int xsk_check_common(struct xdp_sock *xs)
  830. {
  831. if (unlikely(!xsk_is_bound(xs)))
  832. return -ENXIO;
  833. if (unlikely(!(xs->dev->flags & IFF_UP)))
  834. return -ENETDOWN;
  835. return 0;
  836. }
  837. static int __xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
  838. {
  839. bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
  840. struct sock *sk = sock->sk;
  841. struct xdp_sock *xs = xdp_sk(sk);
  842. struct xsk_buff_pool *pool;
  843. int err;
  844. err = xsk_check_common(xs);
  845. if (err)
  846. return err;
  847. if (unlikely(need_wait))
  848. return -EOPNOTSUPP;
  849. if (unlikely(!xs->tx))
  850. return -ENOBUFS;
  851. if (sk_can_busy_loop(sk))
  852. sk_busy_loop(sk, 1); /* only support non-blocking sockets */
  853. if (xs->zc && xsk_no_wakeup(sk))
  854. return 0;
  855. pool = xs->pool;
  856. if (pool->cached_need_wakeup & XDP_WAKEUP_TX) {
  857. if (xs->zc)
  858. return xsk_wakeup(xs, XDP_WAKEUP_TX);
  859. return xsk_generic_xmit(sk);
  860. }
  861. return 0;
  862. }
  863. static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
  864. {
  865. int ret;
  866. rcu_read_lock();
  867. ret = __xsk_sendmsg(sock, m, total_len);
  868. rcu_read_unlock();
  869. return ret;
  870. }
  871. static int __xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags)
  872. {
  873. bool need_wait = !(flags & MSG_DONTWAIT);
  874. struct sock *sk = sock->sk;
  875. struct xdp_sock *xs = xdp_sk(sk);
  876. int err;
  877. err = xsk_check_common(xs);
  878. if (err)
  879. return err;
  880. if (unlikely(!xs->rx))
  881. return -ENOBUFS;
  882. if (unlikely(need_wait))
  883. return -EOPNOTSUPP;
  884. if (sk_can_busy_loop(sk))
  885. sk_busy_loop(sk, 1); /* only support non-blocking sockets */
  886. if (xsk_no_wakeup(sk))
  887. return 0;
  888. if (xs->pool->cached_need_wakeup & XDP_WAKEUP_RX && xs->zc)
  889. return xsk_wakeup(xs, XDP_WAKEUP_RX);
  890. return 0;
  891. }
  892. static int xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags)
  893. {
  894. int ret;
  895. rcu_read_lock();
  896. ret = __xsk_recvmsg(sock, m, len, flags);
  897. rcu_read_unlock();
  898. return ret;
  899. }
  900. static __poll_t xsk_poll(struct file *file, struct socket *sock,
  901. struct poll_table_struct *wait)
  902. {
  903. __poll_t mask = 0;
  904. struct sock *sk = sock->sk;
  905. struct xdp_sock *xs = xdp_sk(sk);
  906. struct xsk_buff_pool *pool;
  907. sock_poll_wait(file, sock, wait);
  908. rcu_read_lock();
  909. if (xsk_check_common(xs))
  910. goto out;
  911. pool = xs->pool;
  912. if (pool->cached_need_wakeup) {
  913. if (xs->zc)
  914. xsk_wakeup(xs, pool->cached_need_wakeup);
  915. else if (xs->tx)
  916. /* Poll needs to drive Tx also in copy mode */
  917. xsk_generic_xmit(sk);
  918. }
  919. if (xs->rx && !xskq_prod_is_empty(xs->rx))
  920. mask |= EPOLLIN | EPOLLRDNORM;
  921. if (xs->tx && xsk_tx_writeable(xs))
  922. mask |= EPOLLOUT | EPOLLWRNORM;
  923. out:
  924. rcu_read_unlock();
  925. return mask;
  926. }
  927. static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
  928. bool umem_queue)
  929. {
  930. struct xsk_queue *q;
  931. if (entries == 0 || *queue || !is_power_of_2(entries))
  932. return -EINVAL;
  933. q = xskq_create(entries, umem_queue);
  934. if (!q)
  935. return -ENOMEM;
  936. /* Make sure queue is ready before it can be seen by others */
  937. smp_wmb();
  938. WRITE_ONCE(*queue, q);
  939. return 0;
  940. }
  941. static void xsk_unbind_dev(struct xdp_sock *xs)
  942. {
  943. struct net_device *dev = xs->dev;
  944. if (xs->state != XSK_BOUND)
  945. return;
  946. WRITE_ONCE(xs->state, XSK_UNBOUND);
  947. /* Wait for driver to stop using the xdp socket. */
  948. xp_del_xsk(xs->pool, xs);
  949. synchronize_net();
  950. dev_put(dev);
  951. }
  952. static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
  953. struct xdp_sock __rcu ***map_entry)
  954. {
  955. struct xsk_map *map = NULL;
  956. struct xsk_map_node *node;
  957. *map_entry = NULL;
  958. spin_lock_bh(&xs->map_list_lock);
  959. node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node,
  960. node);
  961. if (node) {
  962. bpf_map_inc(&node->map->map);
  963. map = node->map;
  964. *map_entry = node->map_entry;
  965. }
  966. spin_unlock_bh(&xs->map_list_lock);
  967. return map;
  968. }
  969. static void xsk_delete_from_maps(struct xdp_sock *xs)
  970. {
  971. /* This function removes the current XDP socket from all the
  972. * maps it resides in. We need to take extra care here, due to
  973. * the two locks involved. Each map has a lock synchronizing
  974. * updates to the entries, and each socket has a lock that
  975. * synchronizes access to the list of maps (map_list). For
  976. * deadlock avoidance the locks need to be taken in the order
  977. * "map lock"->"socket map list lock". We start off by
  978. * accessing the socket map list, and take a reference to the
  979. * map to guarantee existence between the
  980. * xsk_get_map_list_entry() and xsk_map_try_sock_delete()
  981. * calls. Then we ask the map to remove the socket, which
  982. * tries to remove the socket from the map. Note that there
  983. * might be updates to the map between
  984. * xsk_get_map_list_entry() and xsk_map_try_sock_delete().
  985. */
  986. struct xdp_sock __rcu **map_entry = NULL;
  987. struct xsk_map *map;
  988. while ((map = xsk_get_map_list_entry(xs, &map_entry))) {
  989. xsk_map_try_sock_delete(map, xs, map_entry);
  990. bpf_map_put(&map->map);
  991. }
  992. }
  993. static int xsk_release(struct socket *sock)
  994. {
  995. struct sock *sk = sock->sk;
  996. struct xdp_sock *xs = xdp_sk(sk);
  997. struct net *net;
  998. if (!sk)
  999. return 0;
  1000. net = sock_net(sk);
  1001. if (xs->skb)
  1002. xsk_drop_skb(xs->skb);
  1003. mutex_lock(&net->xdp.lock);
  1004. sk_del_node_init_rcu(sk);
  1005. mutex_unlock(&net->xdp.lock);
  1006. sock_prot_inuse_add(net, sk->sk_prot, -1);
  1007. xsk_delete_from_maps(xs);
  1008. mutex_lock(&xs->mutex);
  1009. xsk_unbind_dev(xs);
  1010. mutex_unlock(&xs->mutex);
  1011. xskq_destroy(xs->rx);
  1012. xskq_destroy(xs->tx);
  1013. xskq_destroy(xs->fq_tmp);
  1014. xskq_destroy(xs->cq_tmp);
  1015. sock_orphan(sk);
  1016. sock->sk = NULL;
  1017. sock_put(sk);
  1018. return 0;
  1019. }
  1020. static struct socket *xsk_lookup_xsk_from_fd(int fd)
  1021. {
  1022. struct socket *sock;
  1023. int err;
  1024. sock = sockfd_lookup(fd, &err);
  1025. if (!sock)
  1026. return ERR_PTR(-ENOTSOCK);
  1027. if (sock->sk->sk_family != PF_XDP) {
  1028. sockfd_put(sock);
  1029. return ERR_PTR(-ENOPROTOOPT);
  1030. }
  1031. return sock;
  1032. }
  1033. static bool xsk_validate_queues(struct xdp_sock *xs)
  1034. {
  1035. return xs->fq_tmp && xs->cq_tmp;
  1036. }
  1037. static int xsk_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len)
  1038. {
  1039. struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
  1040. struct sock *sk = sock->sk;
  1041. struct xdp_sock *xs = xdp_sk(sk);
  1042. struct net_device *dev;
  1043. int bound_dev_if;
  1044. u32 flags, qid;
  1045. int err = 0;
  1046. if (addr_len < sizeof(struct sockaddr_xdp))
  1047. return -EINVAL;
  1048. if (sxdp->sxdp_family != AF_XDP)
  1049. return -EINVAL;
  1050. flags = sxdp->sxdp_flags;
  1051. if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
  1052. XDP_USE_NEED_WAKEUP | XDP_USE_SG))
  1053. return -EINVAL;
  1054. bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
  1055. if (bound_dev_if && bound_dev_if != sxdp->sxdp_ifindex)
  1056. return -EINVAL;
  1057. rtnl_lock();
  1058. mutex_lock(&xs->mutex);
  1059. if (xs->state != XSK_READY) {
  1060. err = -EBUSY;
  1061. goto out_release;
  1062. }
  1063. dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
  1064. if (!dev) {
  1065. err = -ENODEV;
  1066. goto out_release;
  1067. }
  1068. netdev_lock_ops(dev);
  1069. if (!xs->rx && !xs->tx) {
  1070. err = -EINVAL;
  1071. goto out_unlock;
  1072. }
  1073. qid = sxdp->sxdp_queue_id;
  1074. if (flags & XDP_SHARED_UMEM) {
  1075. struct xdp_sock *umem_xs;
  1076. struct socket *sock;
  1077. if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
  1078. (flags & XDP_USE_NEED_WAKEUP) || (flags & XDP_USE_SG)) {
  1079. /* Cannot specify flags for shared sockets. */
  1080. err = -EINVAL;
  1081. goto out_unlock;
  1082. }
  1083. if (xs->umem) {
  1084. /* We have already our own. */
  1085. err = -EINVAL;
  1086. goto out_unlock;
  1087. }
  1088. sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
  1089. if (IS_ERR(sock)) {
  1090. err = PTR_ERR(sock);
  1091. goto out_unlock;
  1092. }
  1093. umem_xs = xdp_sk(sock->sk);
  1094. if (!xsk_is_bound(umem_xs)) {
  1095. err = -EBADF;
  1096. sockfd_put(sock);
  1097. goto out_unlock;
  1098. }
  1099. if (umem_xs->queue_id != qid || umem_xs->dev != dev) {
  1100. /* One fill and completion ring required for each queue id. */
  1101. if (!xsk_validate_queues(xs)) {
  1102. err = -EINVAL;
  1103. sockfd_put(sock);
  1104. goto out_unlock;
  1105. }
  1106. /* Share the umem with another socket on another qid
  1107. * and/or device.
  1108. */
  1109. xs->pool = xp_create_and_assign_umem(xs,
  1110. umem_xs->umem);
  1111. if (!xs->pool) {
  1112. err = -ENOMEM;
  1113. sockfd_put(sock);
  1114. goto out_unlock;
  1115. }
  1116. err = xp_assign_dev_shared(xs->pool, umem_xs, dev,
  1117. qid);
  1118. if (err) {
  1119. xp_destroy(xs->pool);
  1120. xs->pool = NULL;
  1121. sockfd_put(sock);
  1122. goto out_unlock;
  1123. }
  1124. } else {
  1125. /* Share the buffer pool with the other socket. */
  1126. if (xs->fq_tmp || xs->cq_tmp) {
  1127. /* Do not allow setting your own fq or cq. */
  1128. err = -EINVAL;
  1129. sockfd_put(sock);
  1130. goto out_unlock;
  1131. }
  1132. xp_get_pool(umem_xs->pool);
  1133. xs->pool = umem_xs->pool;
  1134. /* If underlying shared umem was created without Tx
  1135. * ring, allocate Tx descs array that Tx batching API
  1136. * utilizes
  1137. */
  1138. if (xs->tx && !xs->pool->tx_descs) {
  1139. err = xp_alloc_tx_descs(xs->pool, xs);
  1140. if (err) {
  1141. xp_put_pool(xs->pool);
  1142. xs->pool = NULL;
  1143. sockfd_put(sock);
  1144. goto out_unlock;
  1145. }
  1146. }
  1147. }
  1148. xdp_get_umem(umem_xs->umem);
  1149. WRITE_ONCE(xs->umem, umem_xs->umem);
  1150. sockfd_put(sock);
  1151. } else if (!xs->umem || !xsk_validate_queues(xs)) {
  1152. err = -EINVAL;
  1153. goto out_unlock;
  1154. } else {
  1155. /* This xsk has its own umem. */
  1156. xs->pool = xp_create_and_assign_umem(xs, xs->umem);
  1157. if (!xs->pool) {
  1158. err = -ENOMEM;
  1159. goto out_unlock;
  1160. }
  1161. err = xp_assign_dev(xs->pool, dev, qid, flags);
  1162. if (err) {
  1163. xp_destroy(xs->pool);
  1164. xs->pool = NULL;
  1165. goto out_unlock;
  1166. }
  1167. }
  1168. /* FQ and CQ are now owned by the buffer pool and cleaned up with it. */
  1169. xs->fq_tmp = NULL;
  1170. xs->cq_tmp = NULL;
  1171. xs->dev = dev;
  1172. xs->zc = xs->umem->zc;
  1173. xs->sg = !!(xs->umem->flags & XDP_UMEM_SG_FLAG);
  1174. xs->queue_id = qid;
  1175. xp_add_xsk(xs->pool, xs);
  1176. if (qid < dev->real_num_rx_queues) {
  1177. struct netdev_rx_queue *rxq;
  1178. rxq = __netif_get_rx_queue(dev, qid);
  1179. if (rxq->napi)
  1180. __sk_mark_napi_id_once(sk, rxq->napi->napi_id);
  1181. }
  1182. out_unlock:
  1183. if (err) {
  1184. dev_put(dev);
  1185. } else {
  1186. /* Matches smp_rmb() in bind() for shared umem
  1187. * sockets, and xsk_is_bound().
  1188. */
  1189. smp_wmb();
  1190. WRITE_ONCE(xs->state, XSK_BOUND);
  1191. }
  1192. netdev_unlock_ops(dev);
  1193. out_release:
  1194. mutex_unlock(&xs->mutex);
  1195. rtnl_unlock();
  1196. return err;
  1197. }
  1198. struct xdp_umem_reg_v1 {
  1199. __u64 addr; /* Start of packet data area */
  1200. __u64 len; /* Length of packet data area */
  1201. __u32 chunk_size;
  1202. __u32 headroom;
  1203. };
  1204. static int xsk_setsockopt(struct socket *sock, int level, int optname,
  1205. sockptr_t optval, unsigned int optlen)
  1206. {
  1207. struct sock *sk = sock->sk;
  1208. struct xdp_sock *xs = xdp_sk(sk);
  1209. int err;
  1210. if (level != SOL_XDP)
  1211. return -ENOPROTOOPT;
  1212. switch (optname) {
  1213. case XDP_RX_RING:
  1214. case XDP_TX_RING:
  1215. {
  1216. struct xsk_queue **q;
  1217. int entries;
  1218. if (optlen < sizeof(entries))
  1219. return -EINVAL;
  1220. if (copy_from_sockptr(&entries, optval, sizeof(entries)))
  1221. return -EFAULT;
  1222. mutex_lock(&xs->mutex);
  1223. if (xs->state != XSK_READY) {
  1224. mutex_unlock(&xs->mutex);
  1225. return -EBUSY;
  1226. }
  1227. q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
  1228. err = xsk_init_queue(entries, q, false);
  1229. if (!err && optname == XDP_TX_RING)
  1230. /* Tx needs to be explicitly woken up the first time */
  1231. xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
  1232. mutex_unlock(&xs->mutex);
  1233. return err;
  1234. }
  1235. case XDP_UMEM_REG:
  1236. {
  1237. size_t mr_size = sizeof(struct xdp_umem_reg);
  1238. struct xdp_umem_reg mr = {};
  1239. struct xdp_umem *umem;
  1240. if (optlen < sizeof(struct xdp_umem_reg_v1))
  1241. return -EINVAL;
  1242. else if (optlen < sizeof(mr))
  1243. mr_size = sizeof(struct xdp_umem_reg_v1);
  1244. BUILD_BUG_ON(sizeof(struct xdp_umem_reg_v1) >= sizeof(struct xdp_umem_reg));
  1245. /* Make sure the last field of the struct doesn't have
  1246. * uninitialized padding. All padding has to be explicit
  1247. * and has to be set to zero by the userspace to make
  1248. * struct xdp_umem_reg extensible in the future.
  1249. */
  1250. BUILD_BUG_ON(offsetof(struct xdp_umem_reg, tx_metadata_len) +
  1251. sizeof_field(struct xdp_umem_reg, tx_metadata_len) !=
  1252. sizeof(struct xdp_umem_reg));
  1253. if (copy_from_sockptr(&mr, optval, mr_size))
  1254. return -EFAULT;
  1255. mutex_lock(&xs->mutex);
  1256. if (xs->state != XSK_READY || xs->umem) {
  1257. mutex_unlock(&xs->mutex);
  1258. return -EBUSY;
  1259. }
  1260. umem = xdp_umem_create(&mr);
  1261. if (IS_ERR(umem)) {
  1262. mutex_unlock(&xs->mutex);
  1263. return PTR_ERR(umem);
  1264. }
  1265. /* Make sure umem is ready before it can be seen by others */
  1266. smp_wmb();
  1267. WRITE_ONCE(xs->umem, umem);
  1268. mutex_unlock(&xs->mutex);
  1269. return 0;
  1270. }
  1271. case XDP_UMEM_FILL_RING:
  1272. case XDP_UMEM_COMPLETION_RING:
  1273. {
  1274. struct xsk_queue **q;
  1275. int entries;
  1276. if (optlen < sizeof(entries))
  1277. return -EINVAL;
  1278. if (copy_from_sockptr(&entries, optval, sizeof(entries)))
  1279. return -EFAULT;
  1280. mutex_lock(&xs->mutex);
  1281. if (xs->state != XSK_READY) {
  1282. mutex_unlock(&xs->mutex);
  1283. return -EBUSY;
  1284. }
  1285. q = (optname == XDP_UMEM_FILL_RING) ? &xs->fq_tmp :
  1286. &xs->cq_tmp;
  1287. err = xsk_init_queue(entries, q, true);
  1288. mutex_unlock(&xs->mutex);
  1289. return err;
  1290. }
  1291. case XDP_MAX_TX_SKB_BUDGET:
  1292. {
  1293. unsigned int budget;
  1294. if (optlen != sizeof(budget))
  1295. return -EINVAL;
  1296. if (copy_from_sockptr(&budget, optval, sizeof(budget)))
  1297. return -EFAULT;
  1298. if (!xs->tx ||
  1299. budget < TX_BATCH_SIZE || budget > xs->tx->nentries)
  1300. return -EACCES;
  1301. WRITE_ONCE(xs->max_tx_budget, budget);
  1302. return 0;
  1303. }
  1304. default:
  1305. break;
  1306. }
  1307. return -ENOPROTOOPT;
  1308. }
  1309. static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring)
  1310. {
  1311. ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
  1312. ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
  1313. ring->desc = offsetof(struct xdp_rxtx_ring, desc);
  1314. }
  1315. static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring)
  1316. {
  1317. ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer);
  1318. ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
  1319. ring->desc = offsetof(struct xdp_umem_ring, desc);
  1320. }
  1321. struct xdp_statistics_v1 {
  1322. __u64 rx_dropped;
  1323. __u64 rx_invalid_descs;
  1324. __u64 tx_invalid_descs;
  1325. };
  1326. static int xsk_getsockopt(struct socket *sock, int level, int optname,
  1327. char __user *optval, int __user *optlen)
  1328. {
  1329. struct sock *sk = sock->sk;
  1330. struct xdp_sock *xs = xdp_sk(sk);
  1331. int len;
  1332. if (level != SOL_XDP)
  1333. return -ENOPROTOOPT;
  1334. if (get_user(len, optlen))
  1335. return -EFAULT;
  1336. if (len < 0)
  1337. return -EINVAL;
  1338. switch (optname) {
  1339. case XDP_STATISTICS:
  1340. {
  1341. struct xdp_statistics stats = {};
  1342. bool extra_stats = true;
  1343. size_t stats_size;
  1344. if (len < sizeof(struct xdp_statistics_v1)) {
  1345. return -EINVAL;
  1346. } else if (len < sizeof(stats)) {
  1347. extra_stats = false;
  1348. stats_size = sizeof(struct xdp_statistics_v1);
  1349. } else {
  1350. stats_size = sizeof(stats);
  1351. }
  1352. mutex_lock(&xs->mutex);
  1353. stats.rx_dropped = xs->rx_dropped;
  1354. if (extra_stats) {
  1355. stats.rx_ring_full = xs->rx_queue_full;
  1356. stats.rx_fill_ring_empty_descs =
  1357. xs->pool ? xskq_nb_queue_empty_descs(xs->pool->fq) : 0;
  1358. stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx);
  1359. } else {
  1360. stats.rx_dropped += xs->rx_queue_full;
  1361. }
  1362. stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
  1363. stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
  1364. mutex_unlock(&xs->mutex);
  1365. if (copy_to_user(optval, &stats, stats_size))
  1366. return -EFAULT;
  1367. if (put_user(stats_size, optlen))
  1368. return -EFAULT;
  1369. return 0;
  1370. }
  1371. case XDP_MMAP_OFFSETS:
  1372. {
  1373. struct xdp_mmap_offsets off;
  1374. struct xdp_mmap_offsets_v1 off_v1;
  1375. bool flags_supported = true;
  1376. void *to_copy;
  1377. if (len < sizeof(off_v1))
  1378. return -EINVAL;
  1379. else if (len < sizeof(off))
  1380. flags_supported = false;
  1381. if (flags_supported) {
  1382. /* xdp_ring_offset is identical to xdp_ring_offset_v1
  1383. * except for the flags field added to the end.
  1384. */
  1385. xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
  1386. &off.rx);
  1387. xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
  1388. &off.tx);
  1389. xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
  1390. &off.fr);
  1391. xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
  1392. &off.cr);
  1393. off.rx.flags = offsetof(struct xdp_rxtx_ring,
  1394. ptrs.flags);
  1395. off.tx.flags = offsetof(struct xdp_rxtx_ring,
  1396. ptrs.flags);
  1397. off.fr.flags = offsetof(struct xdp_umem_ring,
  1398. ptrs.flags);
  1399. off.cr.flags = offsetof(struct xdp_umem_ring,
  1400. ptrs.flags);
  1401. len = sizeof(off);
  1402. to_copy = &off;
  1403. } else {
  1404. xsk_enter_rxtx_offsets(&off_v1.rx);
  1405. xsk_enter_rxtx_offsets(&off_v1.tx);
  1406. xsk_enter_umem_offsets(&off_v1.fr);
  1407. xsk_enter_umem_offsets(&off_v1.cr);
  1408. len = sizeof(off_v1);
  1409. to_copy = &off_v1;
  1410. }
  1411. if (copy_to_user(optval, to_copy, len))
  1412. return -EFAULT;
  1413. if (put_user(len, optlen))
  1414. return -EFAULT;
  1415. return 0;
  1416. }
  1417. case XDP_OPTIONS:
  1418. {
  1419. struct xdp_options opts = {};
  1420. if (len < sizeof(opts))
  1421. return -EINVAL;
  1422. mutex_lock(&xs->mutex);
  1423. if (xs->zc)
  1424. opts.flags |= XDP_OPTIONS_ZEROCOPY;
  1425. mutex_unlock(&xs->mutex);
  1426. len = sizeof(opts);
  1427. if (copy_to_user(optval, &opts, len))
  1428. return -EFAULT;
  1429. if (put_user(len, optlen))
  1430. return -EFAULT;
  1431. return 0;
  1432. }
  1433. default:
  1434. break;
  1435. }
  1436. return -EOPNOTSUPP;
  1437. }
  1438. static int xsk_mmap(struct file *file, struct socket *sock,
  1439. struct vm_area_struct *vma)
  1440. {
  1441. loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
  1442. unsigned long size = vma->vm_end - vma->vm_start;
  1443. struct xdp_sock *xs = xdp_sk(sock->sk);
  1444. int state = READ_ONCE(xs->state);
  1445. struct xsk_queue *q = NULL;
  1446. if (state != XSK_READY && state != XSK_BOUND)
  1447. return -EBUSY;
  1448. if (offset == XDP_PGOFF_RX_RING) {
  1449. q = READ_ONCE(xs->rx);
  1450. } else if (offset == XDP_PGOFF_TX_RING) {
  1451. q = READ_ONCE(xs->tx);
  1452. } else {
  1453. /* Matches the smp_wmb() in XDP_UMEM_REG */
  1454. smp_rmb();
  1455. if (offset == XDP_UMEM_PGOFF_FILL_RING)
  1456. q = state == XSK_READY ? READ_ONCE(xs->fq_tmp) :
  1457. READ_ONCE(xs->pool->fq);
  1458. else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
  1459. q = state == XSK_READY ? READ_ONCE(xs->cq_tmp) :
  1460. READ_ONCE(xs->pool->cq);
  1461. }
  1462. if (!q)
  1463. return -EINVAL;
  1464. /* Matches the smp_wmb() in xsk_init_queue */
  1465. smp_rmb();
  1466. if (size > q->ring_vmalloc_size)
  1467. return -EINVAL;
  1468. return remap_vmalloc_range(vma, q->ring, 0);
  1469. }
  1470. static int xsk_notifier(struct notifier_block *this,
  1471. unsigned long msg, void *ptr)
  1472. {
  1473. struct net_device *dev = netdev_notifier_info_to_dev(ptr);
  1474. struct net *net = dev_net(dev);
  1475. struct sock *sk;
  1476. switch (msg) {
  1477. case NETDEV_UNREGISTER:
  1478. mutex_lock(&net->xdp.lock);
  1479. sk_for_each(sk, &net->xdp.list) {
  1480. struct xdp_sock *xs = xdp_sk(sk);
  1481. mutex_lock(&xs->mutex);
  1482. if (xs->dev == dev) {
  1483. sk->sk_err = ENETDOWN;
  1484. if (!sock_flag(sk, SOCK_DEAD))
  1485. sk_error_report(sk);
  1486. xsk_unbind_dev(xs);
  1487. /* Clear device references. */
  1488. xp_clear_dev(xs->pool);
  1489. }
  1490. mutex_unlock(&xs->mutex);
  1491. }
  1492. mutex_unlock(&net->xdp.lock);
  1493. break;
  1494. }
  1495. return NOTIFY_DONE;
  1496. }
  1497. static struct proto xsk_proto = {
  1498. .name = "XDP",
  1499. .owner = THIS_MODULE,
  1500. .obj_size = sizeof(struct xdp_sock),
  1501. };
  1502. static const struct proto_ops xsk_proto_ops = {
  1503. .family = PF_XDP,
  1504. .owner = THIS_MODULE,
  1505. .release = xsk_release,
  1506. .bind = xsk_bind,
  1507. .connect = sock_no_connect,
  1508. .socketpair = sock_no_socketpair,
  1509. .accept = sock_no_accept,
  1510. .getname = sock_no_getname,
  1511. .poll = xsk_poll,
  1512. .ioctl = sock_no_ioctl,
  1513. .listen = sock_no_listen,
  1514. .shutdown = sock_no_shutdown,
  1515. .setsockopt = xsk_setsockopt,
  1516. .getsockopt = xsk_getsockopt,
  1517. .sendmsg = xsk_sendmsg,
  1518. .recvmsg = xsk_recvmsg,
  1519. .mmap = xsk_mmap,
  1520. };
  1521. static void xsk_destruct(struct sock *sk)
  1522. {
  1523. struct xdp_sock *xs = xdp_sk(sk);
  1524. if (!sock_flag(sk, SOCK_DEAD))
  1525. return;
  1526. if (!xp_put_pool(xs->pool))
  1527. xdp_put_umem(xs->umem, !xs->pool);
  1528. }
  1529. static int xsk_create(struct net *net, struct socket *sock, int protocol,
  1530. int kern)
  1531. {
  1532. struct xdp_sock *xs;
  1533. struct sock *sk;
  1534. if (!ns_capable(net->user_ns, CAP_NET_RAW))
  1535. return -EPERM;
  1536. if (sock->type != SOCK_RAW)
  1537. return -ESOCKTNOSUPPORT;
  1538. if (protocol)
  1539. return -EPROTONOSUPPORT;
  1540. sock->state = SS_UNCONNECTED;
  1541. sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
  1542. if (!sk)
  1543. return -ENOBUFS;
  1544. sock->ops = &xsk_proto_ops;
  1545. sock_init_data(sock, sk);
  1546. sk->sk_family = PF_XDP;
  1547. sk->sk_destruct = xsk_destruct;
  1548. sock_set_flag(sk, SOCK_RCU_FREE);
  1549. xs = xdp_sk(sk);
  1550. xs->state = XSK_READY;
  1551. xs->max_tx_budget = TX_BATCH_SIZE;
  1552. mutex_init(&xs->mutex);
  1553. INIT_LIST_HEAD(&xs->map_list);
  1554. spin_lock_init(&xs->map_list_lock);
  1555. mutex_lock(&net->xdp.lock);
  1556. sk_add_node_rcu(sk, &net->xdp.list);
  1557. mutex_unlock(&net->xdp.lock);
  1558. sock_prot_inuse_add(net, &xsk_proto, 1);
  1559. return 0;
  1560. }
  1561. static const struct net_proto_family xsk_family_ops = {
  1562. .family = PF_XDP,
  1563. .create = xsk_create,
  1564. .owner = THIS_MODULE,
  1565. };
  1566. static struct notifier_block xsk_netdev_notifier = {
  1567. .notifier_call = xsk_notifier,
  1568. };
  1569. static int __net_init xsk_net_init(struct net *net)
  1570. {
  1571. mutex_init(&net->xdp.lock);
  1572. INIT_HLIST_HEAD(&net->xdp.list);
  1573. return 0;
  1574. }
  1575. static void __net_exit xsk_net_exit(struct net *net)
  1576. {
  1577. WARN_ON_ONCE(!hlist_empty(&net->xdp.list));
  1578. }
  1579. static struct pernet_operations xsk_net_ops = {
  1580. .init = xsk_net_init,
  1581. .exit = xsk_net_exit,
  1582. };
  1583. static int __init xsk_init(void)
  1584. {
  1585. int err;
  1586. err = proto_register(&xsk_proto, 0 /* no slab */);
  1587. if (err)
  1588. goto out;
  1589. err = sock_register(&xsk_family_ops);
  1590. if (err)
  1591. goto out_proto;
  1592. err = register_pernet_subsys(&xsk_net_ops);
  1593. if (err)
  1594. goto out_sk;
  1595. err = register_netdevice_notifier(&xsk_netdev_notifier);
  1596. if (err)
  1597. goto out_pernet;
  1598. xsk_tx_generic_cache = kmem_cache_create("xsk_generic_xmit_cache",
  1599. sizeof(struct xsk_addrs),
  1600. 0, SLAB_HWCACHE_ALIGN, NULL);
  1601. if (!xsk_tx_generic_cache) {
  1602. err = -ENOMEM;
  1603. goto out_unreg_notif;
  1604. }
  1605. return 0;
  1606. out_unreg_notif:
  1607. unregister_netdevice_notifier(&xsk_netdev_notifier);
  1608. out_pernet:
  1609. unregister_pernet_subsys(&xsk_net_ops);
  1610. out_sk:
  1611. sock_unregister(PF_XDP);
  1612. out_proto:
  1613. proto_unregister(&xsk_proto);
  1614. out:
  1615. return err;
  1616. }
  1617. fs_initcall(xsk_init);