kcmsock.c 43 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * Kernel Connection Multiplexor
  4. *
  5. * Copyright (c) 2016 Tom Herbert <tom@herbertland.com>
  6. */
  7. #include <linux/bpf.h>
  8. #include <linux/errno.h>
  9. #include <linux/errqueue.h>
  10. #include <linux/file.h>
  11. #include <linux/filter.h>
  12. #include <linux/in.h>
  13. #include <linux/kernel.h>
  14. #include <linux/module.h>
  15. #include <linux/net.h>
  16. #include <linux/netdevice.h>
  17. #include <linux/poll.h>
  18. #include <linux/rculist.h>
  19. #include <linux/skbuff.h>
  20. #include <linux/socket.h>
  21. #include <linux/splice.h>
  22. #include <linux/uaccess.h>
  23. #include <linux/workqueue.h>
  24. #include <linux/syscalls.h>
  25. #include <linux/sched/signal.h>
  26. #include <net/kcm.h>
  27. #include <net/netns/generic.h>
  28. #include <net/sock.h>
  29. #include <uapi/linux/kcm.h>
  30. #include <trace/events/sock.h>
  31. unsigned int kcm_net_id;
  32. static struct kmem_cache *kcm_psockp __read_mostly;
  33. static struct kmem_cache *kcm_muxp __read_mostly;
  34. static struct workqueue_struct *kcm_wq;
  35. static inline struct kcm_sock *kcm_sk(const struct sock *sk)
  36. {
  37. return (struct kcm_sock *)sk;
  38. }
  39. static inline struct kcm_tx_msg *kcm_tx_msg(struct sk_buff *skb)
  40. {
  41. return (struct kcm_tx_msg *)skb->cb;
  42. }
  43. static void report_csk_error(struct sock *csk, int err)
  44. {
  45. csk->sk_err = EPIPE;
  46. sk_error_report(csk);
  47. }
  48. static void kcm_abort_tx_psock(struct kcm_psock *psock, int err,
  49. bool wakeup_kcm)
  50. {
  51. struct sock *csk = psock->sk;
  52. struct kcm_mux *mux = psock->mux;
  53. /* Unrecoverable error in transmit */
  54. spin_lock_bh(&mux->lock);
  55. if (psock->tx_stopped) {
  56. spin_unlock_bh(&mux->lock);
  57. return;
  58. }
  59. psock->tx_stopped = 1;
  60. KCM_STATS_INCR(psock->stats.tx_aborts);
  61. if (!psock->tx_kcm) {
  62. /* Take off psocks_avail list */
  63. list_del(&psock->psock_avail_list);
  64. } else if (wakeup_kcm) {
  65. /* In this case psock is being aborted while outside of
  66. * write_msgs and psock is reserved. Schedule tx_work
  67. * to handle the failure there. Need to commit tx_stopped
  68. * before queuing work.
  69. */
  70. smp_mb();
  71. queue_work(kcm_wq, &psock->tx_kcm->tx_work);
  72. }
  73. spin_unlock_bh(&mux->lock);
  74. /* Report error on lower socket */
  75. report_csk_error(csk, err);
  76. }
  77. /* RX mux lock held. */
  78. static void kcm_update_rx_mux_stats(struct kcm_mux *mux,
  79. struct kcm_psock *psock)
  80. {
  81. STRP_STATS_ADD(mux->stats.rx_bytes,
  82. psock->strp.stats.bytes -
  83. psock->saved_rx_bytes);
  84. mux->stats.rx_msgs +=
  85. psock->strp.stats.msgs - psock->saved_rx_msgs;
  86. psock->saved_rx_msgs = psock->strp.stats.msgs;
  87. psock->saved_rx_bytes = psock->strp.stats.bytes;
  88. }
  89. static void kcm_update_tx_mux_stats(struct kcm_mux *mux,
  90. struct kcm_psock *psock)
  91. {
  92. KCM_STATS_ADD(mux->stats.tx_bytes,
  93. psock->stats.tx_bytes - psock->saved_tx_bytes);
  94. mux->stats.tx_msgs +=
  95. psock->stats.tx_msgs - psock->saved_tx_msgs;
  96. psock->saved_tx_msgs = psock->stats.tx_msgs;
  97. psock->saved_tx_bytes = psock->stats.tx_bytes;
  98. }
  99. static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
  100. /* KCM is ready to receive messages on its queue-- either the KCM is new or
  101. * has become unblocked after being blocked on full socket buffer. Queue any
  102. * pending ready messages on a psock. RX mux lock held.
  103. */
  104. static void kcm_rcv_ready(struct kcm_sock *kcm)
  105. {
  106. struct kcm_mux *mux = kcm->mux;
  107. struct kcm_psock *psock;
  108. struct sk_buff *skb;
  109. if (unlikely(kcm->rx_wait || kcm->rx_psock || kcm->rx_disabled))
  110. return;
  111. while (unlikely((skb = __skb_dequeue(&mux->rx_hold_queue)))) {
  112. if (kcm_queue_rcv_skb(&kcm->sk, skb)) {
  113. /* Assuming buffer limit has been reached */
  114. skb_queue_head(&mux->rx_hold_queue, skb);
  115. WARN_ON(!sk_rmem_alloc_get(&kcm->sk));
  116. return;
  117. }
  118. }
  119. while (!list_empty(&mux->psocks_ready)) {
  120. psock = list_first_entry(&mux->psocks_ready, struct kcm_psock,
  121. psock_ready_list);
  122. if (kcm_queue_rcv_skb(&kcm->sk, psock->ready_rx_msg)) {
  123. /* Assuming buffer limit has been reached */
  124. WARN_ON(!sk_rmem_alloc_get(&kcm->sk));
  125. return;
  126. }
  127. /* Consumed the ready message on the psock. Schedule rx_work to
  128. * get more messages.
  129. */
  130. list_del(&psock->psock_ready_list);
  131. psock->ready_rx_msg = NULL;
  132. /* Commit clearing of ready_rx_msg for queuing work */
  133. smp_mb();
  134. strp_unpause(&psock->strp);
  135. strp_check_rcv(&psock->strp);
  136. }
  137. /* Buffer limit is okay now, add to ready list */
  138. list_add_tail(&kcm->wait_rx_list,
  139. &kcm->mux->kcm_rx_waiters);
  140. /* paired with lockless reads in kcm_rfree() */
  141. WRITE_ONCE(kcm->rx_wait, true);
  142. }
  143. static void kcm_rfree(struct sk_buff *skb)
  144. {
  145. struct sock *sk = skb->sk;
  146. struct kcm_sock *kcm = kcm_sk(sk);
  147. struct kcm_mux *mux = kcm->mux;
  148. unsigned int len = skb->truesize;
  149. sk_mem_uncharge(sk, len);
  150. atomic_sub(len, &sk->sk_rmem_alloc);
  151. /* For reading rx_wait and rx_psock without holding lock */
  152. smp_mb__after_atomic();
  153. if (!READ_ONCE(kcm->rx_wait) && !READ_ONCE(kcm->rx_psock) &&
  154. sk_rmem_alloc_get(sk) < sk->sk_rcvlowat) {
  155. spin_lock_bh(&mux->rx_lock);
  156. kcm_rcv_ready(kcm);
  157. spin_unlock_bh(&mux->rx_lock);
  158. }
  159. }
  160. static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
  161. {
  162. struct sk_buff_head *list = &sk->sk_receive_queue;
  163. if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
  164. return -ENOMEM;
  165. if (!sk_rmem_schedule(sk, skb, skb->truesize))
  166. return -ENOBUFS;
  167. skb->dev = NULL;
  168. skb_orphan(skb);
  169. skb->sk = sk;
  170. skb->destructor = kcm_rfree;
  171. atomic_add(skb->truesize, &sk->sk_rmem_alloc);
  172. sk_mem_charge(sk, skb->truesize);
  173. skb_queue_tail(list, skb);
  174. if (!sock_flag(sk, SOCK_DEAD))
  175. sk->sk_data_ready(sk);
  176. return 0;
  177. }
  178. /* Requeue received messages for a kcm socket to other kcm sockets. This is
  179. * called with a kcm socket is receive disabled.
  180. * RX mux lock held.
  181. */
  182. static void requeue_rx_msgs(struct kcm_mux *mux, struct sk_buff_head *head)
  183. {
  184. struct sk_buff *skb;
  185. struct kcm_sock *kcm;
  186. while ((skb = skb_dequeue(head))) {
  187. /* Reset destructor to avoid calling kcm_rcv_ready */
  188. skb->destructor = sock_rfree;
  189. skb_orphan(skb);
  190. try_again:
  191. if (list_empty(&mux->kcm_rx_waiters)) {
  192. skb_queue_tail(&mux->rx_hold_queue, skb);
  193. continue;
  194. }
  195. kcm = list_first_entry(&mux->kcm_rx_waiters,
  196. struct kcm_sock, wait_rx_list);
  197. if (kcm_queue_rcv_skb(&kcm->sk, skb)) {
  198. /* Should mean socket buffer full */
  199. list_del(&kcm->wait_rx_list);
  200. /* paired with lockless reads in kcm_rfree() */
  201. WRITE_ONCE(kcm->rx_wait, false);
  202. /* Commit rx_wait to read in kcm_free */
  203. smp_wmb();
  204. goto try_again;
  205. }
  206. }
  207. }
  208. /* Lower sock lock held */
  209. static struct kcm_sock *reserve_rx_kcm(struct kcm_psock *psock,
  210. struct sk_buff *head)
  211. {
  212. struct kcm_mux *mux = psock->mux;
  213. struct kcm_sock *kcm;
  214. WARN_ON(psock->ready_rx_msg);
  215. if (psock->rx_kcm)
  216. return psock->rx_kcm;
  217. spin_lock_bh(&mux->rx_lock);
  218. if (psock->rx_kcm) {
  219. spin_unlock_bh(&mux->rx_lock);
  220. return psock->rx_kcm;
  221. }
  222. kcm_update_rx_mux_stats(mux, psock);
  223. if (list_empty(&mux->kcm_rx_waiters)) {
  224. psock->ready_rx_msg = head;
  225. strp_pause(&psock->strp);
  226. list_add_tail(&psock->psock_ready_list,
  227. &mux->psocks_ready);
  228. spin_unlock_bh(&mux->rx_lock);
  229. return NULL;
  230. }
  231. kcm = list_first_entry(&mux->kcm_rx_waiters,
  232. struct kcm_sock, wait_rx_list);
  233. list_del(&kcm->wait_rx_list);
  234. /* paired with lockless reads in kcm_rfree() */
  235. WRITE_ONCE(kcm->rx_wait, false);
  236. psock->rx_kcm = kcm;
  237. /* paired with lockless reads in kcm_rfree() */
  238. WRITE_ONCE(kcm->rx_psock, psock);
  239. spin_unlock_bh(&mux->rx_lock);
  240. return kcm;
  241. }
  242. static void kcm_done(struct kcm_sock *kcm);
  243. static void kcm_done_work(struct work_struct *w)
  244. {
  245. kcm_done(container_of(w, struct kcm_sock, done_work));
  246. }
  247. /* Lower sock held */
  248. static void unreserve_rx_kcm(struct kcm_psock *psock,
  249. bool rcv_ready)
  250. {
  251. struct kcm_sock *kcm = psock->rx_kcm;
  252. struct kcm_mux *mux = psock->mux;
  253. if (!kcm)
  254. return;
  255. spin_lock_bh(&mux->rx_lock);
  256. psock->rx_kcm = NULL;
  257. /* paired with lockless reads in kcm_rfree() */
  258. WRITE_ONCE(kcm->rx_psock, NULL);
  259. /* Commit kcm->rx_psock before sk_rmem_alloc_get to sync with
  260. * kcm_rfree
  261. */
  262. smp_mb();
  263. if (unlikely(kcm->done)) {
  264. spin_unlock_bh(&mux->rx_lock);
  265. /* Need to run kcm_done in a task since we need to qcquire
  266. * callback locks which may already be held here.
  267. */
  268. INIT_WORK(&kcm->done_work, kcm_done_work);
  269. schedule_work(&kcm->done_work);
  270. return;
  271. }
  272. if (unlikely(kcm->rx_disabled)) {
  273. requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue);
  274. } else if (rcv_ready || unlikely(!sk_rmem_alloc_get(&kcm->sk))) {
  275. /* Check for degenerative race with rx_wait that all
  276. * data was dequeued (accounted for in kcm_rfree).
  277. */
  278. kcm_rcv_ready(kcm);
  279. }
  280. spin_unlock_bh(&mux->rx_lock);
  281. }
  282. /* Lower sock lock held */
  283. static void psock_data_ready(struct sock *sk)
  284. {
  285. struct kcm_psock *psock;
  286. trace_sk_data_ready(sk);
  287. read_lock_bh(&sk->sk_callback_lock);
  288. psock = (struct kcm_psock *)sk->sk_user_data;
  289. if (likely(psock))
  290. strp_data_ready(&psock->strp);
  291. read_unlock_bh(&sk->sk_callback_lock);
  292. }
  293. /* Called with lower sock held */
  294. static void kcm_rcv_strparser(struct strparser *strp, struct sk_buff *skb)
  295. {
  296. struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp);
  297. struct kcm_sock *kcm;
  298. try_queue:
  299. kcm = reserve_rx_kcm(psock, skb);
  300. if (!kcm) {
  301. /* Unable to reserve a KCM, message is held in psock and strp
  302. * is paused.
  303. */
  304. return;
  305. }
  306. if (kcm_queue_rcv_skb(&kcm->sk, skb)) {
  307. /* Should mean socket buffer full */
  308. unreserve_rx_kcm(psock, false);
  309. goto try_queue;
  310. }
  311. }
  312. static int kcm_parse_func_strparser(struct strparser *strp, struct sk_buff *skb)
  313. {
  314. struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp);
  315. struct bpf_prog *prog = psock->bpf_prog;
  316. int res;
  317. res = bpf_prog_run_pin_on_cpu(prog, skb);
  318. return res;
  319. }
  320. static int kcm_read_sock_done(struct strparser *strp, int err)
  321. {
  322. struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp);
  323. unreserve_rx_kcm(psock, true);
  324. return err;
  325. }
  326. static void psock_state_change(struct sock *sk)
  327. {
  328. /* TCP only does a EPOLLIN for a half close. Do a EPOLLHUP here
  329. * since application will normally not poll with EPOLLIN
  330. * on the TCP sockets.
  331. */
  332. report_csk_error(sk, EPIPE);
  333. }
  334. static void psock_write_space(struct sock *sk)
  335. {
  336. struct kcm_psock *psock;
  337. struct kcm_mux *mux;
  338. struct kcm_sock *kcm;
  339. read_lock_bh(&sk->sk_callback_lock);
  340. psock = (struct kcm_psock *)sk->sk_user_data;
  341. if (unlikely(!psock))
  342. goto out;
  343. mux = psock->mux;
  344. spin_lock_bh(&mux->lock);
  345. /* Check if the socket is reserved so someone is waiting for sending. */
  346. kcm = psock->tx_kcm;
  347. if (kcm)
  348. queue_work(kcm_wq, &kcm->tx_work);
  349. spin_unlock_bh(&mux->lock);
  350. out:
  351. read_unlock_bh(&sk->sk_callback_lock);
  352. }
  353. static void unreserve_psock(struct kcm_sock *kcm);
  354. /* kcm sock is locked. */
  355. static struct kcm_psock *reserve_psock(struct kcm_sock *kcm)
  356. {
  357. struct kcm_mux *mux = kcm->mux;
  358. struct kcm_psock *psock;
  359. psock = kcm->tx_psock;
  360. smp_rmb(); /* Must read tx_psock before tx_wait */
  361. if (psock) {
  362. WARN_ON(kcm->tx_wait);
  363. if (unlikely(psock->tx_stopped))
  364. unreserve_psock(kcm);
  365. else
  366. return kcm->tx_psock;
  367. }
  368. spin_lock_bh(&mux->lock);
  369. /* Check again under lock to see if psock was reserved for this
  370. * psock via psock_unreserve.
  371. */
  372. psock = kcm->tx_psock;
  373. if (unlikely(psock)) {
  374. WARN_ON(kcm->tx_wait);
  375. spin_unlock_bh(&mux->lock);
  376. return kcm->tx_psock;
  377. }
  378. if (!list_empty(&mux->psocks_avail)) {
  379. psock = list_first_entry(&mux->psocks_avail,
  380. struct kcm_psock,
  381. psock_avail_list);
  382. list_del(&psock->psock_avail_list);
  383. if (kcm->tx_wait) {
  384. list_del(&kcm->wait_psock_list);
  385. kcm->tx_wait = false;
  386. }
  387. kcm->tx_psock = psock;
  388. psock->tx_kcm = kcm;
  389. KCM_STATS_INCR(psock->stats.reserved);
  390. } else if (!kcm->tx_wait) {
  391. list_add_tail(&kcm->wait_psock_list,
  392. &mux->kcm_tx_waiters);
  393. kcm->tx_wait = true;
  394. }
  395. spin_unlock_bh(&mux->lock);
  396. return psock;
  397. }
  398. /* mux lock held */
  399. static void psock_now_avail(struct kcm_psock *psock)
  400. {
  401. struct kcm_mux *mux = psock->mux;
  402. struct kcm_sock *kcm;
  403. if (list_empty(&mux->kcm_tx_waiters)) {
  404. list_add_tail(&psock->psock_avail_list,
  405. &mux->psocks_avail);
  406. } else {
  407. kcm = list_first_entry(&mux->kcm_tx_waiters,
  408. struct kcm_sock,
  409. wait_psock_list);
  410. list_del(&kcm->wait_psock_list);
  411. kcm->tx_wait = false;
  412. psock->tx_kcm = kcm;
  413. /* Commit before changing tx_psock since that is read in
  414. * reserve_psock before queuing work.
  415. */
  416. smp_mb();
  417. kcm->tx_psock = psock;
  418. KCM_STATS_INCR(psock->stats.reserved);
  419. queue_work(kcm_wq, &kcm->tx_work);
  420. }
  421. }
  422. /* kcm sock is locked. */
  423. static void unreserve_psock(struct kcm_sock *kcm)
  424. {
  425. struct kcm_psock *psock;
  426. struct kcm_mux *mux = kcm->mux;
  427. spin_lock_bh(&mux->lock);
  428. psock = kcm->tx_psock;
  429. if (WARN_ON(!psock)) {
  430. spin_unlock_bh(&mux->lock);
  431. return;
  432. }
  433. smp_rmb(); /* Read tx_psock before tx_wait */
  434. kcm_update_tx_mux_stats(mux, psock);
  435. WARN_ON(kcm->tx_wait);
  436. kcm->tx_psock = NULL;
  437. psock->tx_kcm = NULL;
  438. KCM_STATS_INCR(psock->stats.unreserved);
  439. if (unlikely(psock->tx_stopped)) {
  440. if (psock->done) {
  441. /* Deferred free */
  442. list_del(&psock->psock_list);
  443. mux->psocks_cnt--;
  444. sock_put(psock->sk);
  445. fput(psock->sk->sk_socket->file);
  446. kmem_cache_free(kcm_psockp, psock);
  447. }
  448. /* Don't put back on available list */
  449. spin_unlock_bh(&mux->lock);
  450. return;
  451. }
  452. psock_now_avail(psock);
  453. spin_unlock_bh(&mux->lock);
  454. }
  455. static void kcm_report_tx_retry(struct kcm_sock *kcm)
  456. {
  457. struct kcm_mux *mux = kcm->mux;
  458. spin_lock_bh(&mux->lock);
  459. KCM_STATS_INCR(mux->stats.tx_retries);
  460. spin_unlock_bh(&mux->lock);
  461. }
  462. /* Write any messages ready on the kcm socket. Called with kcm sock lock
  463. * held. Return bytes actually sent or error.
  464. */
  465. static int kcm_write_msgs(struct kcm_sock *kcm)
  466. {
  467. unsigned int total_sent = 0;
  468. struct sock *sk = &kcm->sk;
  469. struct kcm_psock *psock;
  470. struct sk_buff *head;
  471. int ret = 0;
  472. kcm->tx_wait_more = false;
  473. psock = kcm->tx_psock;
  474. if (unlikely(psock && psock->tx_stopped)) {
  475. /* A reserved psock was aborted asynchronously. Unreserve
  476. * it and we'll retry the message.
  477. */
  478. unreserve_psock(kcm);
  479. kcm_report_tx_retry(kcm);
  480. if (skb_queue_empty(&sk->sk_write_queue))
  481. return 0;
  482. kcm_tx_msg(skb_peek(&sk->sk_write_queue))->started_tx = false;
  483. }
  484. retry:
  485. while ((head = skb_peek(&sk->sk_write_queue))) {
  486. struct msghdr msg = {
  487. .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES,
  488. };
  489. struct kcm_tx_msg *txm = kcm_tx_msg(head);
  490. struct sk_buff *skb;
  491. unsigned int msize;
  492. int i;
  493. if (!txm->started_tx) {
  494. psock = reserve_psock(kcm);
  495. if (!psock)
  496. goto out;
  497. skb = head;
  498. txm->frag_offset = 0;
  499. txm->sent = 0;
  500. txm->started_tx = true;
  501. } else {
  502. if (WARN_ON(!psock)) {
  503. ret = -EINVAL;
  504. goto out;
  505. }
  506. skb = txm->frag_skb;
  507. }
  508. if (WARN_ON_ONCE(!skb_shinfo(skb)->nr_frags) ||
  509. WARN_ON_ONCE(!skb_frag_page(&skb_shinfo(skb)->frags[0]))) {
  510. ret = -EINVAL;
  511. goto out;
  512. }
  513. msize = 0;
  514. for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
  515. msize += skb_frag_size(&skb_shinfo(skb)->frags[i]);
  516. iov_iter_bvec(&msg.msg_iter, ITER_SOURCE,
  517. (const struct bio_vec *)skb_shinfo(skb)->frags,
  518. skb_shinfo(skb)->nr_frags, msize);
  519. iov_iter_advance(&msg.msg_iter, txm->frag_offset);
  520. do {
  521. ret = sock_sendmsg(psock->sk->sk_socket, &msg);
  522. if (ret <= 0) {
  523. if (ret == -EAGAIN) {
  524. /* Save state to try again when there's
  525. * write space on the socket
  526. */
  527. txm->frag_skb = skb;
  528. ret = 0;
  529. goto out;
  530. }
  531. /* Hard failure in sending message, abort this
  532. * psock since it has lost framing
  533. * synchronization and retry sending the
  534. * message from the beginning.
  535. */
  536. kcm_abort_tx_psock(psock, ret ? -ret : EPIPE,
  537. true);
  538. unreserve_psock(kcm);
  539. psock = NULL;
  540. txm->started_tx = false;
  541. kcm_report_tx_retry(kcm);
  542. ret = 0;
  543. goto retry;
  544. }
  545. txm->sent += ret;
  546. txm->frag_offset += ret;
  547. KCM_STATS_ADD(psock->stats.tx_bytes, ret);
  548. } while (msg.msg_iter.count > 0);
  549. if (skb == head) {
  550. if (skb_has_frag_list(skb)) {
  551. txm->frag_skb = skb_shinfo(skb)->frag_list;
  552. txm->frag_offset = 0;
  553. continue;
  554. }
  555. } else if (skb->next) {
  556. txm->frag_skb = skb->next;
  557. txm->frag_offset = 0;
  558. continue;
  559. }
  560. /* Successfully sent the whole packet, account for it. */
  561. sk->sk_wmem_queued -= txm->sent;
  562. total_sent += txm->sent;
  563. skb_dequeue(&sk->sk_write_queue);
  564. kfree_skb(head);
  565. KCM_STATS_INCR(psock->stats.tx_msgs);
  566. }
  567. out:
  568. if (!head) {
  569. /* Done with all queued messages. */
  570. WARN_ON(!skb_queue_empty(&sk->sk_write_queue));
  571. if (psock)
  572. unreserve_psock(kcm);
  573. }
  574. /* Check if write space is available */
  575. sk->sk_write_space(sk);
  576. return total_sent ? : ret;
  577. }
  578. static void kcm_tx_work(struct work_struct *w)
  579. {
  580. struct kcm_sock *kcm = container_of(w, struct kcm_sock, tx_work);
  581. struct sock *sk = &kcm->sk;
  582. int err;
  583. lock_sock(sk);
  584. /* Primarily for SOCK_DGRAM sockets, also handle asynchronous tx
  585. * aborts
  586. */
  587. err = kcm_write_msgs(kcm);
  588. if (err < 0) {
  589. /* Hard failure in write, report error on KCM socket */
  590. pr_warn("KCM: Hard failure on kcm_write_msgs %d\n", err);
  591. report_csk_error(&kcm->sk, -err);
  592. goto out;
  593. }
  594. /* Primarily for SOCK_SEQPACKET sockets */
  595. if (likely(sk->sk_socket) &&
  596. test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
  597. clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
  598. sk->sk_write_space(sk);
  599. }
  600. out:
  601. release_sock(sk);
  602. }
  603. static void kcm_push(struct kcm_sock *kcm)
  604. {
  605. if (kcm->tx_wait_more)
  606. kcm_write_msgs(kcm);
  607. }
  608. static int kcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
  609. {
  610. struct sock *sk = sock->sk;
  611. struct kcm_sock *kcm = kcm_sk(sk);
  612. struct sk_buff *skb = NULL, *head = NULL, *frag_prev = NULL;
  613. size_t copy, copied = 0;
  614. long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
  615. int eor = (sock->type == SOCK_DGRAM) ?
  616. !(msg->msg_flags & MSG_MORE) : !!(msg->msg_flags & MSG_EOR);
  617. int err = -EPIPE;
  618. mutex_lock(&kcm->tx_mutex);
  619. lock_sock(sk);
  620. /* Per tcp_sendmsg this should be in poll */
  621. sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
  622. if (sk->sk_err)
  623. goto out_error;
  624. if (kcm->seq_skb) {
  625. /* Previously opened message */
  626. head = kcm->seq_skb;
  627. skb = kcm_tx_msg(head)->last_skb;
  628. goto start;
  629. }
  630. /* Call the sk_stream functions to manage the sndbuf mem. */
  631. if (!sk_stream_memory_free(sk)) {
  632. kcm_push(kcm);
  633. set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
  634. err = sk_stream_wait_memory(sk, &timeo);
  635. if (err)
  636. goto out_error;
  637. }
  638. if (msg_data_left(msg)) {
  639. /* New message, alloc head skb */
  640. head = alloc_skb(0, sk->sk_allocation);
  641. while (!head) {
  642. kcm_push(kcm);
  643. err = sk_stream_wait_memory(sk, &timeo);
  644. if (err)
  645. goto out_error;
  646. head = alloc_skb(0, sk->sk_allocation);
  647. }
  648. skb = head;
  649. /* Set ip_summed to CHECKSUM_UNNECESSARY to avoid calling
  650. * csum_and_copy_from_iter from skb_do_copy_data_nocache.
  651. */
  652. skb->ip_summed = CHECKSUM_UNNECESSARY;
  653. }
  654. start:
  655. while (msg_data_left(msg)) {
  656. bool merge = true;
  657. int i = skb_shinfo(skb)->nr_frags;
  658. struct page_frag *pfrag = sk_page_frag(sk);
  659. if (!sk_page_frag_refill(sk, pfrag))
  660. goto wait_for_memory;
  661. if (!skb_can_coalesce(skb, i, pfrag->page,
  662. pfrag->offset)) {
  663. if (i == MAX_SKB_FRAGS) {
  664. struct sk_buff *tskb;
  665. tskb = alloc_skb(0, sk->sk_allocation);
  666. if (!tskb)
  667. goto wait_for_memory;
  668. if (head == skb)
  669. skb_shinfo(head)->frag_list = tskb;
  670. else
  671. skb->next = tskb;
  672. frag_prev = skb;
  673. skb = tskb;
  674. skb->ip_summed = CHECKSUM_UNNECESSARY;
  675. continue;
  676. }
  677. merge = false;
  678. }
  679. if (msg->msg_flags & MSG_SPLICE_PAGES) {
  680. copy = msg_data_left(msg);
  681. if (!sk_wmem_schedule(sk, copy))
  682. goto wait_for_memory;
  683. err = skb_splice_from_iter(skb, &msg->msg_iter, copy);
  684. if (err < 0) {
  685. if (err == -EMSGSIZE)
  686. goto wait_for_memory;
  687. goto out_error;
  688. }
  689. copy = err;
  690. skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG;
  691. sk_wmem_queued_add(sk, copy);
  692. sk_mem_charge(sk, copy);
  693. if (head != skb)
  694. head->truesize += copy;
  695. } else {
  696. copy = min_t(int, msg_data_left(msg),
  697. pfrag->size - pfrag->offset);
  698. if (!sk_wmem_schedule(sk, copy))
  699. goto wait_for_memory;
  700. err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
  701. pfrag->page,
  702. pfrag->offset,
  703. copy);
  704. if (err)
  705. goto out_error;
  706. /* Update the skb. */
  707. if (merge) {
  708. skb_frag_size_add(
  709. &skb_shinfo(skb)->frags[i - 1], copy);
  710. } else {
  711. skb_fill_page_desc(skb, i, pfrag->page,
  712. pfrag->offset, copy);
  713. get_page(pfrag->page);
  714. }
  715. pfrag->offset += copy;
  716. }
  717. copied += copy;
  718. if (head != skb) {
  719. head->len += copy;
  720. head->data_len += copy;
  721. }
  722. continue;
  723. wait_for_memory:
  724. kcm_push(kcm);
  725. err = sk_stream_wait_memory(sk, &timeo);
  726. if (err)
  727. goto out_error;
  728. }
  729. if (eor) {
  730. bool not_busy = skb_queue_empty(&sk->sk_write_queue);
  731. if (head) {
  732. /* Message complete, queue it on send buffer */
  733. __skb_queue_tail(&sk->sk_write_queue, head);
  734. kcm->seq_skb = NULL;
  735. KCM_STATS_INCR(kcm->stats.tx_msgs);
  736. }
  737. if (msg->msg_flags & MSG_BATCH) {
  738. kcm->tx_wait_more = true;
  739. } else if (kcm->tx_wait_more || not_busy) {
  740. err = kcm_write_msgs(kcm);
  741. if (err < 0) {
  742. /* We got a hard error in write_msgs but have
  743. * already queued this message. Report an error
  744. * in the socket, but don't affect return value
  745. * from sendmsg
  746. */
  747. pr_warn("KCM: Hard failure on kcm_write_msgs\n");
  748. report_csk_error(&kcm->sk, -err);
  749. }
  750. }
  751. } else {
  752. /* Message not complete, save state */
  753. partial_message:
  754. if (head) {
  755. kcm->seq_skb = head;
  756. kcm_tx_msg(head)->last_skb = skb;
  757. }
  758. }
  759. KCM_STATS_ADD(kcm->stats.tx_bytes, copied);
  760. release_sock(sk);
  761. mutex_unlock(&kcm->tx_mutex);
  762. return copied;
  763. out_error:
  764. kcm_push(kcm);
  765. /* When MAX_SKB_FRAGS was reached, a new skb was allocated and
  766. * linked into the frag_list before data copy. If the copy
  767. * subsequently failed, this skb has zero frags. Remove it from
  768. * the frag_list to prevent kcm_write_msgs from later hitting
  769. * WARN_ON(!skb_shinfo(skb)->nr_frags).
  770. */
  771. if (frag_prev && !skb_shinfo(skb)->nr_frags) {
  772. if (head == frag_prev)
  773. skb_shinfo(head)->frag_list = NULL;
  774. else
  775. frag_prev->next = NULL;
  776. kfree_skb(skb);
  777. /* Update skb as it may be saved in partial_message via goto */
  778. skb = frag_prev;
  779. }
  780. if (sock->type == SOCK_SEQPACKET) {
  781. /* Wrote some bytes before encountering an
  782. * error, return partial success.
  783. */
  784. if (copied)
  785. goto partial_message;
  786. if (head != kcm->seq_skb)
  787. kfree_skb(head);
  788. } else {
  789. kfree_skb(head);
  790. kcm->seq_skb = NULL;
  791. }
  792. err = sk_stream_error(sk, msg->msg_flags, err);
  793. /* make sure we wake any epoll edge trigger waiter */
  794. if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN))
  795. sk->sk_write_space(sk);
  796. release_sock(sk);
  797. mutex_unlock(&kcm->tx_mutex);
  798. return err;
  799. }
  800. static void kcm_splice_eof(struct socket *sock)
  801. {
  802. struct sock *sk = sock->sk;
  803. struct kcm_sock *kcm = kcm_sk(sk);
  804. if (skb_queue_empty_lockless(&sk->sk_write_queue))
  805. return;
  806. lock_sock(sk);
  807. kcm_write_msgs(kcm);
  808. release_sock(sk);
  809. }
  810. static int kcm_recvmsg(struct socket *sock, struct msghdr *msg,
  811. size_t len, int flags)
  812. {
  813. struct sock *sk = sock->sk;
  814. struct kcm_sock *kcm = kcm_sk(sk);
  815. int err = 0;
  816. struct strp_msg *stm;
  817. int copied = 0;
  818. struct sk_buff *skb;
  819. skb = skb_recv_datagram(sk, flags, &err);
  820. if (!skb)
  821. goto out;
  822. /* Okay, have a message on the receive queue */
  823. stm = strp_msg(skb);
  824. if (len > stm->full_len)
  825. len = stm->full_len;
  826. err = skb_copy_datagram_msg(skb, stm->offset, msg, len);
  827. if (err < 0)
  828. goto out;
  829. copied = len;
  830. if (likely(!(flags & MSG_PEEK))) {
  831. KCM_STATS_ADD(kcm->stats.rx_bytes, copied);
  832. if (copied < stm->full_len) {
  833. if (sock->type == SOCK_DGRAM) {
  834. /* Truncated message */
  835. msg->msg_flags |= MSG_TRUNC;
  836. goto msg_finished;
  837. }
  838. stm->offset += copied;
  839. stm->full_len -= copied;
  840. } else {
  841. msg_finished:
  842. /* Finished with message */
  843. msg->msg_flags |= MSG_EOR;
  844. KCM_STATS_INCR(kcm->stats.rx_msgs);
  845. }
  846. }
  847. out:
  848. skb_free_datagram(sk, skb);
  849. return copied ? : err;
  850. }
  851. static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos,
  852. struct pipe_inode_info *pipe, size_t len,
  853. unsigned int flags)
  854. {
  855. struct sock *sk = sock->sk;
  856. struct kcm_sock *kcm = kcm_sk(sk);
  857. struct strp_msg *stm;
  858. int err = 0;
  859. ssize_t copied;
  860. struct sk_buff *skb;
  861. if (sock->file->f_flags & O_NONBLOCK || flags & SPLICE_F_NONBLOCK)
  862. flags = MSG_DONTWAIT;
  863. else
  864. flags = 0;
  865. /* Only support splice for SOCKSEQPACKET */
  866. skb = skb_recv_datagram(sk, flags, &err);
  867. if (!skb)
  868. goto err_out;
  869. /* Okay, have a message on the receive queue */
  870. stm = strp_msg(skb);
  871. if (len > stm->full_len)
  872. len = stm->full_len;
  873. copied = skb_splice_bits(skb, sk, stm->offset, pipe, len, flags);
  874. if (copied < 0) {
  875. err = copied;
  876. goto err_out;
  877. }
  878. KCM_STATS_ADD(kcm->stats.rx_bytes, copied);
  879. stm->offset += copied;
  880. stm->full_len -= copied;
  881. /* We have no way to return MSG_EOR. If all the bytes have been
  882. * read we still leave the message in the receive socket buffer.
  883. * A subsequent recvmsg needs to be done to return MSG_EOR and
  884. * finish reading the message.
  885. */
  886. skb_free_datagram(sk, skb);
  887. return copied;
  888. err_out:
  889. skb_free_datagram(sk, skb);
  890. return err;
  891. }
  892. /* kcm sock lock held */
  893. static void kcm_recv_disable(struct kcm_sock *kcm)
  894. {
  895. struct kcm_mux *mux = kcm->mux;
  896. if (kcm->rx_disabled)
  897. return;
  898. spin_lock_bh(&mux->rx_lock);
  899. kcm->rx_disabled = 1;
  900. /* If a psock is reserved we'll do cleanup in unreserve */
  901. if (!kcm->rx_psock) {
  902. if (kcm->rx_wait) {
  903. list_del(&kcm->wait_rx_list);
  904. /* paired with lockless reads in kcm_rfree() */
  905. WRITE_ONCE(kcm->rx_wait, false);
  906. }
  907. requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue);
  908. }
  909. spin_unlock_bh(&mux->rx_lock);
  910. }
  911. /* kcm sock lock held */
  912. static void kcm_recv_enable(struct kcm_sock *kcm)
  913. {
  914. struct kcm_mux *mux = kcm->mux;
  915. if (!kcm->rx_disabled)
  916. return;
  917. spin_lock_bh(&mux->rx_lock);
  918. kcm->rx_disabled = 0;
  919. kcm_rcv_ready(kcm);
  920. spin_unlock_bh(&mux->rx_lock);
  921. }
  922. static int kcm_setsockopt(struct socket *sock, int level, int optname,
  923. sockptr_t optval, unsigned int optlen)
  924. {
  925. struct kcm_sock *kcm = kcm_sk(sock->sk);
  926. int val, valbool;
  927. int err = 0;
  928. if (level != SOL_KCM)
  929. return -ENOPROTOOPT;
  930. if (optlen < sizeof(int))
  931. return -EINVAL;
  932. if (copy_from_sockptr(&val, optval, sizeof(int)))
  933. return -EFAULT;
  934. valbool = val ? 1 : 0;
  935. switch (optname) {
  936. case KCM_RECV_DISABLE:
  937. lock_sock(&kcm->sk);
  938. if (valbool)
  939. kcm_recv_disable(kcm);
  940. else
  941. kcm_recv_enable(kcm);
  942. release_sock(&kcm->sk);
  943. break;
  944. default:
  945. err = -ENOPROTOOPT;
  946. }
  947. return err;
  948. }
  949. static int kcm_getsockopt(struct socket *sock, int level, int optname,
  950. char __user *optval, int __user *optlen)
  951. {
  952. struct kcm_sock *kcm = kcm_sk(sock->sk);
  953. int val, len;
  954. if (level != SOL_KCM)
  955. return -ENOPROTOOPT;
  956. if (get_user(len, optlen))
  957. return -EFAULT;
  958. if (len < 0)
  959. return -EINVAL;
  960. len = min_t(unsigned int, len, sizeof(int));
  961. switch (optname) {
  962. case KCM_RECV_DISABLE:
  963. val = kcm->rx_disabled;
  964. break;
  965. default:
  966. return -ENOPROTOOPT;
  967. }
  968. if (put_user(len, optlen))
  969. return -EFAULT;
  970. if (copy_to_user(optval, &val, len))
  971. return -EFAULT;
  972. return 0;
  973. }
  974. static void init_kcm_sock(struct kcm_sock *kcm, struct kcm_mux *mux)
  975. {
  976. struct kcm_sock *tkcm;
  977. struct list_head *head;
  978. int index = 0;
  979. /* For SOCK_SEQPACKET sock type, datagram_poll checks the sk_state, so
  980. * we set sk_state, otherwise epoll_wait always returns right away with
  981. * EPOLLHUP
  982. */
  983. kcm->sk.sk_state = TCP_ESTABLISHED;
  984. /* Add to mux's kcm sockets list */
  985. kcm->mux = mux;
  986. spin_lock_bh(&mux->lock);
  987. head = &mux->kcm_socks;
  988. list_for_each_entry(tkcm, &mux->kcm_socks, kcm_sock_list) {
  989. if (tkcm->index != index)
  990. break;
  991. head = &tkcm->kcm_sock_list;
  992. index++;
  993. }
  994. list_add(&kcm->kcm_sock_list, head);
  995. kcm->index = index;
  996. mux->kcm_socks_cnt++;
  997. spin_unlock_bh(&mux->lock);
  998. INIT_WORK(&kcm->tx_work, kcm_tx_work);
  999. mutex_init(&kcm->tx_mutex);
  1000. spin_lock_bh(&mux->rx_lock);
  1001. kcm_rcv_ready(kcm);
  1002. spin_unlock_bh(&mux->rx_lock);
  1003. }
  1004. static int kcm_attach(struct socket *sock, struct socket *csock,
  1005. struct bpf_prog *prog)
  1006. {
  1007. struct kcm_sock *kcm = kcm_sk(sock->sk);
  1008. struct kcm_mux *mux = kcm->mux;
  1009. struct sock *csk;
  1010. struct kcm_psock *psock = NULL, *tpsock;
  1011. struct list_head *head;
  1012. int index = 0;
  1013. static const struct strp_callbacks cb = {
  1014. .rcv_msg = kcm_rcv_strparser,
  1015. .parse_msg = kcm_parse_func_strparser,
  1016. .read_sock_done = kcm_read_sock_done,
  1017. };
  1018. int err = 0;
  1019. csk = csock->sk;
  1020. if (!csk)
  1021. return -EINVAL;
  1022. lock_sock(csk);
  1023. /* Only allow TCP sockets to be attached for now */
  1024. if ((csk->sk_family != AF_INET && csk->sk_family != AF_INET6) ||
  1025. csk->sk_protocol != IPPROTO_TCP) {
  1026. err = -EOPNOTSUPP;
  1027. goto out;
  1028. }
  1029. /* Don't allow listeners or closed sockets */
  1030. if (csk->sk_state == TCP_LISTEN || csk->sk_state == TCP_CLOSE) {
  1031. err = -EOPNOTSUPP;
  1032. goto out;
  1033. }
  1034. psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL);
  1035. if (!psock) {
  1036. err = -ENOMEM;
  1037. goto out;
  1038. }
  1039. psock->mux = mux;
  1040. psock->sk = csk;
  1041. psock->bpf_prog = prog;
  1042. write_lock_bh(&csk->sk_callback_lock);
  1043. /* Check if sk_user_data is already by KCM or someone else.
  1044. * Must be done under lock to prevent race conditions.
  1045. */
  1046. if (csk->sk_user_data) {
  1047. write_unlock_bh(&csk->sk_callback_lock);
  1048. kmem_cache_free(kcm_psockp, psock);
  1049. err = -EALREADY;
  1050. goto out;
  1051. }
  1052. err = strp_init(&psock->strp, csk, &cb);
  1053. if (err) {
  1054. write_unlock_bh(&csk->sk_callback_lock);
  1055. kmem_cache_free(kcm_psockp, psock);
  1056. goto out;
  1057. }
  1058. psock->save_data_ready = csk->sk_data_ready;
  1059. psock->save_write_space = csk->sk_write_space;
  1060. psock->save_state_change = csk->sk_state_change;
  1061. csk->sk_user_data = psock;
  1062. csk->sk_data_ready = psock_data_ready;
  1063. csk->sk_write_space = psock_write_space;
  1064. csk->sk_state_change = psock_state_change;
  1065. write_unlock_bh(&csk->sk_callback_lock);
  1066. sock_hold(csk);
  1067. /* Finished initialization, now add the psock to the MUX. */
  1068. spin_lock_bh(&mux->lock);
  1069. head = &mux->psocks;
  1070. list_for_each_entry(tpsock, &mux->psocks, psock_list) {
  1071. if (tpsock->index != index)
  1072. break;
  1073. head = &tpsock->psock_list;
  1074. index++;
  1075. }
  1076. list_add(&psock->psock_list, head);
  1077. psock->index = index;
  1078. KCM_STATS_INCR(mux->stats.psock_attach);
  1079. mux->psocks_cnt++;
  1080. psock_now_avail(psock);
  1081. spin_unlock_bh(&mux->lock);
  1082. /* Schedule RX work in case there are already bytes queued */
  1083. strp_check_rcv(&psock->strp);
  1084. out:
  1085. release_sock(csk);
  1086. return err;
  1087. }
  1088. static int kcm_attach_ioctl(struct socket *sock, struct kcm_attach *info)
  1089. {
  1090. struct socket *csock;
  1091. struct bpf_prog *prog;
  1092. int err;
  1093. csock = sockfd_lookup(info->fd, &err);
  1094. if (!csock)
  1095. return -ENOENT;
  1096. prog = bpf_prog_get_type(info->bpf_fd, BPF_PROG_TYPE_SOCKET_FILTER);
  1097. if (IS_ERR(prog)) {
  1098. err = PTR_ERR(prog);
  1099. goto out;
  1100. }
  1101. err = kcm_attach(sock, csock, prog);
  1102. if (err) {
  1103. bpf_prog_put(prog);
  1104. goto out;
  1105. }
  1106. /* Keep reference on file also */
  1107. return 0;
  1108. out:
  1109. sockfd_put(csock);
  1110. return err;
  1111. }
  1112. static void kcm_unattach(struct kcm_psock *psock)
  1113. {
  1114. struct sock *csk = psock->sk;
  1115. struct kcm_mux *mux = psock->mux;
  1116. lock_sock(csk);
  1117. /* Stop getting callbacks from TCP socket. After this there should
  1118. * be no way to reserve a kcm for this psock.
  1119. */
  1120. write_lock_bh(&csk->sk_callback_lock);
  1121. csk->sk_user_data = NULL;
  1122. csk->sk_data_ready = psock->save_data_ready;
  1123. csk->sk_write_space = psock->save_write_space;
  1124. csk->sk_state_change = psock->save_state_change;
  1125. strp_stop(&psock->strp);
  1126. if (WARN_ON(psock->rx_kcm)) {
  1127. write_unlock_bh(&csk->sk_callback_lock);
  1128. release_sock(csk);
  1129. return;
  1130. }
  1131. spin_lock_bh(&mux->rx_lock);
  1132. /* Stop receiver activities. After this point psock should not be
  1133. * able to get onto ready list either through callbacks or work.
  1134. */
  1135. if (psock->ready_rx_msg) {
  1136. list_del(&psock->psock_ready_list);
  1137. kfree_skb(psock->ready_rx_msg);
  1138. psock->ready_rx_msg = NULL;
  1139. KCM_STATS_INCR(mux->stats.rx_ready_drops);
  1140. }
  1141. spin_unlock_bh(&mux->rx_lock);
  1142. write_unlock_bh(&csk->sk_callback_lock);
  1143. /* Call strp_done without sock lock */
  1144. release_sock(csk);
  1145. strp_done(&psock->strp);
  1146. lock_sock(csk);
  1147. bpf_prog_put(psock->bpf_prog);
  1148. spin_lock_bh(&mux->lock);
  1149. aggregate_psock_stats(&psock->stats, &mux->aggregate_psock_stats);
  1150. save_strp_stats(&psock->strp, &mux->aggregate_strp_stats);
  1151. KCM_STATS_INCR(mux->stats.psock_unattach);
  1152. if (psock->tx_kcm) {
  1153. /* psock was reserved. Just mark it finished and we will clean
  1154. * up in the kcm paths, we need kcm lock which can not be
  1155. * acquired here.
  1156. */
  1157. KCM_STATS_INCR(mux->stats.psock_unattach_rsvd);
  1158. spin_unlock_bh(&mux->lock);
  1159. /* We are unattaching a socket that is reserved. Abort the
  1160. * socket since we may be out of sync in sending on it. We need
  1161. * to do this without the mux lock.
  1162. */
  1163. kcm_abort_tx_psock(psock, EPIPE, false);
  1164. spin_lock_bh(&mux->lock);
  1165. if (!psock->tx_kcm) {
  1166. /* psock now unreserved in window mux was unlocked */
  1167. goto no_reserved;
  1168. }
  1169. psock->done = 1;
  1170. /* Commit done before queuing work to process it */
  1171. smp_mb();
  1172. /* Queue tx work to make sure psock->done is handled */
  1173. queue_work(kcm_wq, &psock->tx_kcm->tx_work);
  1174. spin_unlock_bh(&mux->lock);
  1175. } else {
  1176. no_reserved:
  1177. if (!psock->tx_stopped)
  1178. list_del(&psock->psock_avail_list);
  1179. list_del(&psock->psock_list);
  1180. mux->psocks_cnt--;
  1181. spin_unlock_bh(&mux->lock);
  1182. sock_put(csk);
  1183. fput(csk->sk_socket->file);
  1184. kmem_cache_free(kcm_psockp, psock);
  1185. }
  1186. release_sock(csk);
  1187. }
  1188. static int kcm_unattach_ioctl(struct socket *sock, struct kcm_unattach *info)
  1189. {
  1190. struct kcm_sock *kcm = kcm_sk(sock->sk);
  1191. struct kcm_mux *mux = kcm->mux;
  1192. struct kcm_psock *psock;
  1193. struct socket *csock;
  1194. struct sock *csk;
  1195. int err;
  1196. csock = sockfd_lookup(info->fd, &err);
  1197. if (!csock)
  1198. return -ENOENT;
  1199. csk = csock->sk;
  1200. if (!csk) {
  1201. err = -EINVAL;
  1202. goto out;
  1203. }
  1204. err = -ENOENT;
  1205. spin_lock_bh(&mux->lock);
  1206. list_for_each_entry(psock, &mux->psocks, psock_list) {
  1207. if (psock->sk != csk)
  1208. continue;
  1209. /* Found the matching psock */
  1210. if (psock->unattaching || WARN_ON(psock->done)) {
  1211. err = -EALREADY;
  1212. break;
  1213. }
  1214. psock->unattaching = 1;
  1215. spin_unlock_bh(&mux->lock);
  1216. /* Lower socket lock should already be held */
  1217. kcm_unattach(psock);
  1218. err = 0;
  1219. goto out;
  1220. }
  1221. spin_unlock_bh(&mux->lock);
  1222. out:
  1223. sockfd_put(csock);
  1224. return err;
  1225. }
  1226. static struct proto kcm_proto = {
  1227. .name = "KCM",
  1228. .owner = THIS_MODULE,
  1229. .obj_size = sizeof(struct kcm_sock),
  1230. };
  1231. /* Clone a kcm socket. */
  1232. static struct file *kcm_clone(struct socket *osock)
  1233. {
  1234. struct socket *newsock;
  1235. struct sock *newsk;
  1236. newsock = sock_alloc();
  1237. if (!newsock)
  1238. return ERR_PTR(-ENFILE);
  1239. newsock->type = osock->type;
  1240. newsock->ops = osock->ops;
  1241. __module_get(newsock->ops->owner);
  1242. newsk = sk_alloc(sock_net(osock->sk), PF_KCM, GFP_KERNEL,
  1243. &kcm_proto, false);
  1244. if (!newsk) {
  1245. sock_release(newsock);
  1246. return ERR_PTR(-ENOMEM);
  1247. }
  1248. sock_init_data(newsock, newsk);
  1249. init_kcm_sock(kcm_sk(newsk), kcm_sk(osock->sk)->mux);
  1250. return sock_alloc_file(newsock, 0, osock->sk->sk_prot_creator->name);
  1251. }
  1252. static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
  1253. {
  1254. int err;
  1255. switch (cmd) {
  1256. case SIOCKCMATTACH: {
  1257. struct kcm_attach info;
  1258. if (copy_from_user(&info, (void __user *)arg, sizeof(info)))
  1259. return -EFAULT;
  1260. err = kcm_attach_ioctl(sock, &info);
  1261. break;
  1262. }
  1263. case SIOCKCMUNATTACH: {
  1264. struct kcm_unattach info;
  1265. if (copy_from_user(&info, (void __user *)arg, sizeof(info)))
  1266. return -EFAULT;
  1267. err = kcm_unattach_ioctl(sock, &info);
  1268. break;
  1269. }
  1270. case SIOCKCMCLONE: {
  1271. struct kcm_clone info;
  1272. FD_PREPARE(fdf, 0, kcm_clone(sock));
  1273. if (fdf.err)
  1274. return fdf.err;
  1275. info.fd = fd_prepare_fd(fdf);
  1276. if (copy_to_user((void __user *)arg, &info, sizeof(info)))
  1277. return -EFAULT;
  1278. fd_publish(fdf);
  1279. err = 0;
  1280. break;
  1281. }
  1282. default:
  1283. err = -ENOIOCTLCMD;
  1284. break;
  1285. }
  1286. return err;
  1287. }
  1288. static void release_mux(struct kcm_mux *mux)
  1289. {
  1290. struct kcm_net *knet = mux->knet;
  1291. struct kcm_psock *psock, *tmp_psock;
  1292. /* Release psocks */
  1293. list_for_each_entry_safe(psock, tmp_psock,
  1294. &mux->psocks, psock_list) {
  1295. if (!WARN_ON(psock->unattaching))
  1296. kcm_unattach(psock);
  1297. }
  1298. if (WARN_ON(mux->psocks_cnt))
  1299. return;
  1300. __skb_queue_purge(&mux->rx_hold_queue);
  1301. mutex_lock(&knet->mutex);
  1302. aggregate_mux_stats(&mux->stats, &knet->aggregate_mux_stats);
  1303. aggregate_psock_stats(&mux->aggregate_psock_stats,
  1304. &knet->aggregate_psock_stats);
  1305. aggregate_strp_stats(&mux->aggregate_strp_stats,
  1306. &knet->aggregate_strp_stats);
  1307. list_del_rcu(&mux->kcm_mux_list);
  1308. knet->count--;
  1309. mutex_unlock(&knet->mutex);
  1310. kfree_rcu(mux, rcu);
  1311. }
  1312. static void kcm_done(struct kcm_sock *kcm)
  1313. {
  1314. struct kcm_mux *mux = kcm->mux;
  1315. struct sock *sk = &kcm->sk;
  1316. int socks_cnt;
  1317. spin_lock_bh(&mux->rx_lock);
  1318. if (kcm->rx_psock) {
  1319. /* Cleanup in unreserve_rx_kcm */
  1320. WARN_ON(kcm->done);
  1321. kcm->rx_disabled = 1;
  1322. kcm->done = 1;
  1323. spin_unlock_bh(&mux->rx_lock);
  1324. return;
  1325. }
  1326. if (kcm->rx_wait) {
  1327. list_del(&kcm->wait_rx_list);
  1328. /* paired with lockless reads in kcm_rfree() */
  1329. WRITE_ONCE(kcm->rx_wait, false);
  1330. }
  1331. /* Move any pending receive messages to other kcm sockets */
  1332. requeue_rx_msgs(mux, &sk->sk_receive_queue);
  1333. spin_unlock_bh(&mux->rx_lock);
  1334. if (WARN_ON(sk_rmem_alloc_get(sk)))
  1335. return;
  1336. /* Detach from MUX */
  1337. spin_lock_bh(&mux->lock);
  1338. list_del(&kcm->kcm_sock_list);
  1339. mux->kcm_socks_cnt--;
  1340. socks_cnt = mux->kcm_socks_cnt;
  1341. spin_unlock_bh(&mux->lock);
  1342. if (!socks_cnt) {
  1343. /* We are done with the mux now. */
  1344. release_mux(mux);
  1345. }
  1346. WARN_ON(kcm->rx_wait);
  1347. sock_put(&kcm->sk);
  1348. }
  1349. /* Called by kcm_release to close a KCM socket.
  1350. * If this is the last KCM socket on the MUX, destroy the MUX.
  1351. */
  1352. static int kcm_release(struct socket *sock)
  1353. {
  1354. struct sock *sk = sock->sk;
  1355. struct kcm_sock *kcm;
  1356. struct kcm_mux *mux;
  1357. struct kcm_psock *psock;
  1358. if (!sk)
  1359. return 0;
  1360. kcm = kcm_sk(sk);
  1361. mux = kcm->mux;
  1362. lock_sock(sk);
  1363. sock_orphan(sk);
  1364. kfree_skb(kcm->seq_skb);
  1365. /* Purge queue under lock to avoid race condition with tx_work trying
  1366. * to act when queue is nonempty. If tx_work runs after this point
  1367. * it will just return.
  1368. */
  1369. __skb_queue_purge(&sk->sk_write_queue);
  1370. release_sock(sk);
  1371. spin_lock_bh(&mux->lock);
  1372. if (kcm->tx_wait) {
  1373. /* Take of tx_wait list, after this point there should be no way
  1374. * that a psock will be assigned to this kcm.
  1375. */
  1376. list_del(&kcm->wait_psock_list);
  1377. kcm->tx_wait = false;
  1378. }
  1379. spin_unlock_bh(&mux->lock);
  1380. /* Cancel work. After this point there should be no outside references
  1381. * to the kcm socket.
  1382. */
  1383. disable_work_sync(&kcm->tx_work);
  1384. lock_sock(sk);
  1385. psock = kcm->tx_psock;
  1386. if (psock) {
  1387. /* A psock was reserved, so we need to kill it since it
  1388. * may already have some bytes queued from a message. We
  1389. * need to do this after removing kcm from tx_wait list.
  1390. */
  1391. kcm_abort_tx_psock(psock, EPIPE, false);
  1392. unreserve_psock(kcm);
  1393. }
  1394. release_sock(sk);
  1395. WARN_ON(kcm->tx_wait);
  1396. WARN_ON(kcm->tx_psock);
  1397. sock->sk = NULL;
  1398. kcm_done(kcm);
  1399. return 0;
  1400. }
  1401. static const struct proto_ops kcm_dgram_ops = {
  1402. .family = PF_KCM,
  1403. .owner = THIS_MODULE,
  1404. .release = kcm_release,
  1405. .bind = sock_no_bind,
  1406. .connect = sock_no_connect,
  1407. .socketpair = sock_no_socketpair,
  1408. .accept = sock_no_accept,
  1409. .getname = sock_no_getname,
  1410. .poll = datagram_poll,
  1411. .ioctl = kcm_ioctl,
  1412. .listen = sock_no_listen,
  1413. .shutdown = sock_no_shutdown,
  1414. .setsockopt = kcm_setsockopt,
  1415. .getsockopt = kcm_getsockopt,
  1416. .sendmsg = kcm_sendmsg,
  1417. .recvmsg = kcm_recvmsg,
  1418. .mmap = sock_no_mmap,
  1419. .splice_eof = kcm_splice_eof,
  1420. };
  1421. static const struct proto_ops kcm_seqpacket_ops = {
  1422. .family = PF_KCM,
  1423. .owner = THIS_MODULE,
  1424. .release = kcm_release,
  1425. .bind = sock_no_bind,
  1426. .connect = sock_no_connect,
  1427. .socketpair = sock_no_socketpair,
  1428. .accept = sock_no_accept,
  1429. .getname = sock_no_getname,
  1430. .poll = datagram_poll,
  1431. .ioctl = kcm_ioctl,
  1432. .listen = sock_no_listen,
  1433. .shutdown = sock_no_shutdown,
  1434. .setsockopt = kcm_setsockopt,
  1435. .getsockopt = kcm_getsockopt,
  1436. .sendmsg = kcm_sendmsg,
  1437. .recvmsg = kcm_recvmsg,
  1438. .mmap = sock_no_mmap,
  1439. .splice_eof = kcm_splice_eof,
  1440. .splice_read = kcm_splice_read,
  1441. };
  1442. /* Create proto operation for kcm sockets */
  1443. static int kcm_create(struct net *net, struct socket *sock,
  1444. int protocol, int kern)
  1445. {
  1446. struct kcm_net *knet = net_generic(net, kcm_net_id);
  1447. struct sock *sk;
  1448. struct kcm_mux *mux;
  1449. switch (sock->type) {
  1450. case SOCK_DGRAM:
  1451. sock->ops = &kcm_dgram_ops;
  1452. break;
  1453. case SOCK_SEQPACKET:
  1454. sock->ops = &kcm_seqpacket_ops;
  1455. break;
  1456. default:
  1457. return -ESOCKTNOSUPPORT;
  1458. }
  1459. if (protocol != KCMPROTO_CONNECTED)
  1460. return -EPROTONOSUPPORT;
  1461. sk = sk_alloc(net, PF_KCM, GFP_KERNEL, &kcm_proto, kern);
  1462. if (!sk)
  1463. return -ENOMEM;
  1464. /* Allocate a kcm mux, shared between KCM sockets */
  1465. mux = kmem_cache_zalloc(kcm_muxp, GFP_KERNEL);
  1466. if (!mux) {
  1467. sk_free(sk);
  1468. return -ENOMEM;
  1469. }
  1470. spin_lock_init(&mux->lock);
  1471. spin_lock_init(&mux->rx_lock);
  1472. INIT_LIST_HEAD(&mux->kcm_socks);
  1473. INIT_LIST_HEAD(&mux->kcm_rx_waiters);
  1474. INIT_LIST_HEAD(&mux->kcm_tx_waiters);
  1475. INIT_LIST_HEAD(&mux->psocks);
  1476. INIT_LIST_HEAD(&mux->psocks_ready);
  1477. INIT_LIST_HEAD(&mux->psocks_avail);
  1478. mux->knet = knet;
  1479. /* Add new MUX to list */
  1480. mutex_lock(&knet->mutex);
  1481. list_add_rcu(&mux->kcm_mux_list, &knet->mux_list);
  1482. knet->count++;
  1483. mutex_unlock(&knet->mutex);
  1484. skb_queue_head_init(&mux->rx_hold_queue);
  1485. /* Init KCM socket */
  1486. sock_init_data(sock, sk);
  1487. init_kcm_sock(kcm_sk(sk), mux);
  1488. return 0;
  1489. }
  1490. static const struct net_proto_family kcm_family_ops = {
  1491. .family = PF_KCM,
  1492. .create = kcm_create,
  1493. .owner = THIS_MODULE,
  1494. };
  1495. static __net_init int kcm_init_net(struct net *net)
  1496. {
  1497. struct kcm_net *knet = net_generic(net, kcm_net_id);
  1498. INIT_LIST_HEAD_RCU(&knet->mux_list);
  1499. mutex_init(&knet->mutex);
  1500. return 0;
  1501. }
  1502. static __net_exit void kcm_exit_net(struct net *net)
  1503. {
  1504. struct kcm_net *knet = net_generic(net, kcm_net_id);
  1505. /* All KCM sockets should be closed at this point, which should mean
  1506. * that all multiplexors and psocks have been destroyed.
  1507. */
  1508. WARN_ON(!list_empty(&knet->mux_list));
  1509. mutex_destroy(&knet->mutex);
  1510. }
  1511. static struct pernet_operations kcm_net_ops = {
  1512. .init = kcm_init_net,
  1513. .exit = kcm_exit_net,
  1514. .id = &kcm_net_id,
  1515. .size = sizeof(struct kcm_net),
  1516. };
  1517. static int __init kcm_init(void)
  1518. {
  1519. int err = -ENOMEM;
  1520. kcm_muxp = KMEM_CACHE(kcm_mux, SLAB_HWCACHE_ALIGN);
  1521. if (!kcm_muxp)
  1522. goto fail;
  1523. kcm_psockp = KMEM_CACHE(kcm_psock, SLAB_HWCACHE_ALIGN);
  1524. if (!kcm_psockp)
  1525. goto fail;
  1526. kcm_wq = create_singlethread_workqueue("kkcmd");
  1527. if (!kcm_wq)
  1528. goto fail;
  1529. err = proto_register(&kcm_proto, 1);
  1530. if (err)
  1531. goto fail;
  1532. err = register_pernet_device(&kcm_net_ops);
  1533. if (err)
  1534. goto net_ops_fail;
  1535. err = sock_register(&kcm_family_ops);
  1536. if (err)
  1537. goto sock_register_fail;
  1538. err = kcm_proc_init();
  1539. if (err)
  1540. goto proc_init_fail;
  1541. return 0;
  1542. proc_init_fail:
  1543. sock_unregister(PF_KCM);
  1544. sock_register_fail:
  1545. unregister_pernet_device(&kcm_net_ops);
  1546. net_ops_fail:
  1547. proto_unregister(&kcm_proto);
  1548. fail:
  1549. kmem_cache_destroy(kcm_muxp);
  1550. kmem_cache_destroy(kcm_psockp);
  1551. if (kcm_wq)
  1552. destroy_workqueue(kcm_wq);
  1553. return err;
  1554. }
  1555. static void __exit kcm_exit(void)
  1556. {
  1557. kcm_proc_exit();
  1558. sock_unregister(PF_KCM);
  1559. unregister_pernet_device(&kcm_net_ops);
  1560. proto_unregister(&kcm_proto);
  1561. destroy_workqueue(kcm_wq);
  1562. kmem_cache_destroy(kcm_muxp);
  1563. kmem_cache_destroy(kcm_psockp);
  1564. }
  1565. module_init(kcm_init);
  1566. module_exit(kcm_exit);
  1567. MODULE_LICENSE("GPL");
  1568. MODULE_DESCRIPTION("KCM (Kernel Connection Multiplexor) sockets");
  1569. MODULE_ALIAS_NETPROTO(PF_KCM);