eventpoll.c 69 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623
  1. // SPDX-License-Identifier: GPL-2.0-or-later
  2. /*
  3. * fs/eventpoll.c (Efficient event retrieval implementation)
  4. * Copyright (C) 2001,...,2009 Davide Libenzi
  5. *
  6. * Davide Libenzi <davidel@xmailserver.org>
  7. */
  8. #include <linux/init.h>
  9. #include <linux/kernel.h>
  10. #include <linux/sched/signal.h>
  11. #include <linux/fs.h>
  12. #include <linux/file.h>
  13. #include <linux/signal.h>
  14. #include <linux/errno.h>
  15. #include <linux/mm.h>
  16. #include <linux/slab.h>
  17. #include <linux/poll.h>
  18. #include <linux/string.h>
  19. #include <linux/list.h>
  20. #include <linux/hash.h>
  21. #include <linux/spinlock.h>
  22. #include <linux/syscalls.h>
  23. #include <linux/rbtree.h>
  24. #include <linux/wait.h>
  25. #include <linux/eventpoll.h>
  26. #include <linux/mount.h>
  27. #include <linux/bitops.h>
  28. #include <linux/mutex.h>
  29. #include <linux/anon_inodes.h>
  30. #include <linux/device.h>
  31. #include <linux/uaccess.h>
  32. #include <asm/io.h>
  33. #include <asm/mman.h>
  34. #include <linux/atomic.h>
  35. #include <linux/proc_fs.h>
  36. #include <linux/seq_file.h>
  37. #include <linux/compat.h>
  38. #include <linux/rculist.h>
  39. #include <linux/capability.h>
  40. #include <net/busy_poll.h>
/*
 * LOCKING:
 * There are three levels of locking required by epoll:
 *
 * 1) epnested_mutex (mutex)
 * 2) ep->mtx (mutex)
 * 3) ep->lock (spinlock)
 *
 * The acquire order is the one listed above, from 1 to 3.
 * We need a spinlock (ep->lock) because we manipulate objects
 * from inside the poll callback, that might be triggered from
 * a wake_up() that in turn might be called from IRQ context.
 * So we can't sleep inside the poll callback and hence we need
 * a spinlock. During the event transfer loop (from kernel to
 * user space) we could end up sleeping due to a copy_to_user(), so
 * we need a lock that will allow us to sleep. This lock is a
 * mutex (ep->mtx). It is acquired during the event transfer loop,
 * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file().
 * The epnested_mutex is acquired when inserting an epoll fd onto another
 * epoll fd. We do this so that we walk the epoll tree and ensure that this
 * insertion does not create a cycle of epoll file descriptors, which
 * could lead to deadlock. We need a global mutex to prevent two
 * simultaneous inserts (A into B and B into A) from racing and
 * constructing a cycle without either insert observing that it is
 * going to.
 * It is necessary to acquire multiple "ep->mtx"es at once in the
 * case when one epoll fd is added to another. In this case, we
 * always acquire the locks in the order of nesting (i.e. after
 * epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired
 * before e2->mtx). Since we disallow cycles of epoll file
 * descriptors, this ensures that the mutexes are well-ordered. In
 * order to communicate this nesting to lockdep, when walking a tree
 * of epoll file descriptors, we use the current recursion depth as
 * the lockdep subkey.
 * It is possible to drop the "ep->mtx" and to use the global
 * mutex "epnested_mutex" (together with "ep->lock") to have it working,
 * but having "ep->mtx" will make the interface more scalable.
 * Events that require holding "epnested_mutex" are very rare, while for
 * normal operations the epoll private "ep->mtx" will guarantee
 * better scalability.
 */
  82. /* Epoll private bits inside the event mask */
  83. #define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE)
  84. #define EPOLLINOUT_BITS (EPOLLIN | EPOLLOUT)
  85. #define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | EPOLLERR | EPOLLHUP | \
  86. EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE)
  87. /* Maximum number of nesting allowed inside epoll sets */
  88. #define EP_MAX_NESTS 4
  89. #define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))
  90. #define EP_UNACTIVE_PTR ((void *) -1L)
  91. #define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))
  92. struct epoll_filefd {
  93. struct file *file;
  94. int fd;
  95. } __packed;
  96. /* Wait structure used by the poll hooks */
  97. struct eppoll_entry {
  98. /* List header used to link this structure to the "struct epitem" */
  99. struct eppoll_entry *next;
  100. /* The "base" pointer is set to the container "struct epitem" */
  101. struct epitem *base;
  102. /*
  103. * Wait queue item that will be linked to the target file wait
  104. * queue head.
  105. */
  106. wait_queue_entry_t wait;
  107. /* The wait queue head that linked the "wait" wait queue item */
  108. wait_queue_head_t *whead;
  109. };
  110. /*
  111. * Each file descriptor added to the eventpoll interface will
  112. * have an entry of this type linked to the "rbr" RB tree.
  113. * Avoid increasing the size of this struct, there can be many thousands
  114. * of these on a server and we do not want this to take another cache line.
  115. */
  116. struct epitem {
  117. union {
  118. /* RB tree node links this structure to the eventpoll RB tree */
  119. struct rb_node rbn;
  120. /* Used to free the struct epitem */
  121. struct rcu_head rcu;
  122. };
  123. /* List header used to link this structure to the eventpoll ready list */
  124. struct list_head rdllink;
  125. /*
  126. * Works together "struct eventpoll"->ovflist in keeping the
  127. * single linked chain of items.
  128. */
  129. struct epitem *next;
  130. /* The file descriptor information this item refers to */
  131. struct epoll_filefd ffd;
  132. /*
  133. * Protected by file->f_lock, true for to-be-released epitem already
  134. * removed from the "struct file" items list; together with
  135. * eventpoll->refcount orchestrates "struct eventpoll" disposal
  136. */
  137. bool dying;
  138. /* List containing poll wait queues */
  139. struct eppoll_entry *pwqlist;
  140. /* The "container" of this item */
  141. struct eventpoll *ep;
  142. /* List header used to link this item to the "struct file" items list */
  143. struct hlist_node fllink;
  144. /* wakeup_source used when EPOLLWAKEUP is set */
  145. struct wakeup_source __rcu *ws;
  146. /* The structure that describe the interested events and the source fd */
  147. struct epoll_event event;
  148. };
  149. /*
  150. * This structure is stored inside the "private_data" member of the file
  151. * structure and represents the main data structure for the eventpoll
  152. * interface.
  153. */
  154. struct eventpoll {
  155. /*
  156. * This mutex is used to ensure that files are not removed
  157. * while epoll is using them. This is held during the event
  158. * collection loop, the file cleanup path, the epoll file exit
  159. * code and the ctl operations.
  160. */
  161. struct mutex mtx;
  162. /* Wait queue used by sys_epoll_wait() */
  163. wait_queue_head_t wq;
  164. /* Wait queue used by file->poll() */
  165. wait_queue_head_t poll_wait;
  166. /* List of ready file descriptors */
  167. struct list_head rdllist;
  168. /* Lock which protects rdllist and ovflist */
  169. spinlock_t lock;
  170. /* RB tree root used to store monitored fd structs */
  171. struct rb_root_cached rbr;
  172. /*
  173. * This is a single linked list that chains all the "struct epitem" that
  174. * happened while transferring ready events to userspace w/out
  175. * holding ->lock.
  176. */
  177. struct epitem *ovflist;
  178. /* wakeup_source used when ep_send_events or __ep_eventpoll_poll is running */
  179. struct wakeup_source *ws;
  180. /* The user that created the eventpoll descriptor */
  181. struct user_struct *user;
  182. struct file *file;
  183. /* used to optimize loop detection check */
  184. u64 gen;
  185. struct hlist_head refs;
  186. u8 loop_check_depth;
  187. /*
  188. * usage count, used together with epitem->dying to
  189. * orchestrate the disposal of this struct
  190. */
  191. refcount_t refcount;
  192. /* used to defer freeing past ep_get_upwards_depth_proc() RCU walk */
  193. struct rcu_head rcu;
  194. #ifdef CONFIG_NET_RX_BUSY_POLL
  195. /* used to track busy poll napi_id */
  196. unsigned int napi_id;
  197. /* busy poll timeout */
  198. u32 busy_poll_usecs;
  199. /* busy poll packet budget */
  200. u16 busy_poll_budget;
  201. bool prefer_busy_poll;
  202. #endif
  203. #ifdef CONFIG_DEBUG_LOCK_ALLOC
  204. /* tracks wakeup nests for lockdep validation */
  205. u8 nests;
  206. #endif
  207. };
  208. /* Wrapper struct used by poll queueing */
  209. struct ep_pqueue {
  210. poll_table pt;
  211. struct epitem *epi;
  212. };
  213. /*
  214. * Configuration options available inside /proc/sys/fs/epoll/
  215. */
  216. /* Maximum number of epoll watched descriptors, per user */
  217. static long max_user_watches __read_mostly;
  218. /* Used for cycles detection */
  219. static DEFINE_MUTEX(epnested_mutex);
  220. static u64 loop_check_gen = 0;
  221. /* Used to check for epoll file descriptor inclusion loops */
  222. static struct eventpoll *inserting_into;
  223. /* Slab cache used to allocate "struct epitem" */
  224. static struct kmem_cache *epi_cache __ro_after_init;
  225. /* Slab cache used to allocate "struct eppoll_entry" */
  226. static struct kmem_cache *pwq_cache __ro_after_init;
  227. /*
  228. * List of files with newly added links, where we may need to limit the number
  229. * of emanating paths. Protected by the epnested_mutex.
  230. */
  231. struct epitems_head {
  232. struct hlist_head epitems;
  233. struct epitems_head *next;
  234. };
  235. static struct epitems_head *tfile_check_list = EP_UNACTIVE_PTR;
  236. static struct kmem_cache *ephead_cache __ro_after_init;
  237. static inline void free_ephead(struct epitems_head *head)
  238. {
  239. if (head)
  240. kmem_cache_free(ephead_cache, head);
  241. }
  242. static void list_file(struct file *file)
  243. {
  244. struct epitems_head *head;
  245. head = container_of(file->f_ep, struct epitems_head, epitems);
  246. if (!head->next) {
  247. head->next = tfile_check_list;
  248. tfile_check_list = head;
  249. }
  250. }
  251. static void unlist_file(struct epitems_head *head)
  252. {
  253. struct epitems_head *to_free = head;
  254. struct hlist_node *p = rcu_dereference(hlist_first_rcu(&head->epitems));
  255. if (p) {
  256. struct epitem *epi= container_of(p, struct epitem, fllink);
  257. spin_lock(&epi->ffd.file->f_lock);
  258. if (!hlist_empty(&head->epitems))
  259. to_free = NULL;
  260. head->next = NULL;
  261. spin_unlock(&epi->ffd.file->f_lock);
  262. }
  263. free_ephead(to_free);
  264. }
  265. #ifdef CONFIG_SYSCTL
  266. #include <linux/sysctl.h>
  267. static long long_zero;
  268. static long long_max = LONG_MAX;
  269. static const struct ctl_table epoll_table[] = {
  270. {
  271. .procname = "max_user_watches",
  272. .data = &max_user_watches,
  273. .maxlen = sizeof(max_user_watches),
  274. .mode = 0644,
  275. .proc_handler = proc_doulongvec_minmax,
  276. .extra1 = &long_zero,
  277. .extra2 = &long_max,
  278. },
  279. };
  280. static void __init epoll_sysctls_init(void)
  281. {
  282. register_sysctl("fs/epoll", epoll_table);
  283. }
  284. #else
  285. #define epoll_sysctls_init() do { } while (0)
  286. #endif /* CONFIG_SYSCTL */
  287. static const struct file_operations eventpoll_fops;
  288. static inline int is_file_epoll(struct file *f)
  289. {
  290. return f->f_op == &eventpoll_fops;
  291. }
  292. /* Setup the structure that is used as key for the RB tree */
  293. static inline void ep_set_ffd(struct epoll_filefd *ffd,
  294. struct file *file, int fd)
  295. {
  296. ffd->file = file;
  297. ffd->fd = fd;
  298. }
  299. /* Compare RB tree keys */
  300. static inline int ep_cmp_ffd(struct epoll_filefd *p1,
  301. struct epoll_filefd *p2)
  302. {
  303. return (p1->file > p2->file ? +1:
  304. (p1->file < p2->file ? -1 : p1->fd - p2->fd));
  305. }
  306. /* Tells us if the item is currently linked */
  307. static inline int ep_is_linked(struct epitem *epi)
  308. {
  309. return !list_empty(&epi->rdllink);
  310. }
  311. static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p)
  312. {
  313. return container_of(p, struct eppoll_entry, wait);
  314. }
  315. /* Get the "struct epitem" from a wait queue pointer */
  316. static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p)
  317. {
  318. return container_of(p, struct eppoll_entry, wait)->base;
  319. }
  320. /**
  321. * ep_events_available - Checks if ready events might be available.
  322. *
  323. * @ep: Pointer to the eventpoll context.
  324. *
  325. * Return: a value different than %zero if ready events are available,
  326. * or %zero otherwise.
  327. */
  328. static inline int ep_events_available(struct eventpoll *ep)
  329. {
  330. return !list_empty_careful(&ep->rdllist) ||
  331. READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR;
  332. }
  333. #ifdef CONFIG_NET_RX_BUSY_POLL
  334. /**
  335. * busy_loop_ep_timeout - check if busy poll has timed out. The timeout value
  336. * from the epoll instance ep is preferred, but if it is not set fallback to
  337. * the system-wide global via busy_loop_timeout.
  338. *
  339. * @start_time: The start time used to compute the remaining time until timeout.
  340. * @ep: Pointer to the eventpoll context.
  341. *
  342. * Return: true if the timeout has expired, false otherwise.
  343. */
  344. static bool busy_loop_ep_timeout(unsigned long start_time,
  345. struct eventpoll *ep)
  346. {
  347. unsigned long bp_usec = READ_ONCE(ep->busy_poll_usecs);
  348. if (bp_usec) {
  349. unsigned long end_time = start_time + bp_usec;
  350. unsigned long now = busy_loop_current_time();
  351. return time_after(now, end_time);
  352. } else {
  353. return busy_loop_timeout(start_time);
  354. }
  355. }
  356. static bool ep_busy_loop_on(struct eventpoll *ep)
  357. {
  358. return !!READ_ONCE(ep->busy_poll_usecs) ||
  359. READ_ONCE(ep->prefer_busy_poll) ||
  360. net_busy_loop_on();
  361. }
  362. static bool ep_busy_loop_end(void *p, unsigned long start_time)
  363. {
  364. struct eventpoll *ep = p;
  365. return ep_events_available(ep) || busy_loop_ep_timeout(start_time, ep);
  366. }
  367. /*
  368. * Busy poll if globally on and supporting sockets found && no events,
  369. * busy loop will return if need_resched or ep_events_available.
  370. *
  371. * we must do our busy polling with irqs enabled
  372. */
  373. static bool ep_busy_loop(struct eventpoll *ep)
  374. {
  375. unsigned int napi_id = READ_ONCE(ep->napi_id);
  376. u16 budget = READ_ONCE(ep->busy_poll_budget);
  377. bool prefer_busy_poll = READ_ONCE(ep->prefer_busy_poll);
  378. if (!budget)
  379. budget = BUSY_POLL_BUDGET;
  380. if (napi_id_valid(napi_id) && ep_busy_loop_on(ep)) {
  381. napi_busy_loop(napi_id, ep_busy_loop_end,
  382. ep, prefer_busy_poll, budget);
  383. if (ep_events_available(ep))
  384. return true;
  385. /*
  386. * Busy poll timed out. Drop NAPI ID for now, we can add
  387. * it back in when we have moved a socket with a valid NAPI
  388. * ID onto the ready list.
  389. */
  390. if (prefer_busy_poll)
  391. napi_resume_irqs(napi_id);
  392. ep->napi_id = 0;
  393. return false;
  394. }
  395. return false;
  396. }
  397. /*
  398. * Set epoll busy poll NAPI ID from sk.
  399. */
  400. static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
  401. {
  402. struct eventpoll *ep = epi->ep;
  403. unsigned int napi_id;
  404. struct socket *sock;
  405. struct sock *sk;
  406. if (!ep_busy_loop_on(ep))
  407. return;
  408. sock = sock_from_file(epi->ffd.file);
  409. if (!sock)
  410. return;
  411. sk = sock->sk;
  412. if (!sk)
  413. return;
  414. napi_id = READ_ONCE(sk->sk_napi_id);
  415. /* Non-NAPI IDs can be rejected
  416. * or
  417. * Nothing to do if we already have this ID
  418. */
  419. if (!napi_id_valid(napi_id) || napi_id == ep->napi_id)
  420. return;
  421. /* record NAPI ID for use in next busy poll */
  422. ep->napi_id = napi_id;
  423. }
  424. static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd,
  425. unsigned long arg)
  426. {
  427. struct eventpoll *ep = file->private_data;
  428. void __user *uarg = (void __user *)arg;
  429. struct epoll_params epoll_params;
  430. switch (cmd) {
  431. case EPIOCSPARAMS:
  432. if (copy_from_user(&epoll_params, uarg, sizeof(epoll_params)))
  433. return -EFAULT;
  434. /* pad byte must be zero */
  435. if (epoll_params.__pad)
  436. return -EINVAL;
  437. if (epoll_params.busy_poll_usecs > S32_MAX)
  438. return -EINVAL;
  439. if (epoll_params.prefer_busy_poll > 1)
  440. return -EINVAL;
  441. if (epoll_params.busy_poll_budget > NAPI_POLL_WEIGHT &&
  442. !capable(CAP_NET_ADMIN))
  443. return -EPERM;
  444. WRITE_ONCE(ep->busy_poll_usecs, epoll_params.busy_poll_usecs);
  445. WRITE_ONCE(ep->busy_poll_budget, epoll_params.busy_poll_budget);
  446. WRITE_ONCE(ep->prefer_busy_poll, epoll_params.prefer_busy_poll);
  447. return 0;
  448. case EPIOCGPARAMS:
  449. memset(&epoll_params, 0, sizeof(epoll_params));
  450. epoll_params.busy_poll_usecs = READ_ONCE(ep->busy_poll_usecs);
  451. epoll_params.busy_poll_budget = READ_ONCE(ep->busy_poll_budget);
  452. epoll_params.prefer_busy_poll = READ_ONCE(ep->prefer_busy_poll);
  453. if (copy_to_user(uarg, &epoll_params, sizeof(epoll_params)))
  454. return -EFAULT;
  455. return 0;
  456. default:
  457. return -ENOIOCTLCMD;
  458. }
  459. }
  460. static void ep_suspend_napi_irqs(struct eventpoll *ep)
  461. {
  462. unsigned int napi_id = READ_ONCE(ep->napi_id);
  463. if (napi_id_valid(napi_id) && READ_ONCE(ep->prefer_busy_poll))
  464. napi_suspend_irqs(napi_id);
  465. }
  466. static void ep_resume_napi_irqs(struct eventpoll *ep)
  467. {
  468. unsigned int napi_id = READ_ONCE(ep->napi_id);
  469. if (napi_id_valid(napi_id) && READ_ONCE(ep->prefer_busy_poll))
  470. napi_resume_irqs(napi_id);
  471. }
  472. #else
  473. static inline bool ep_busy_loop(struct eventpoll *ep)
  474. {
  475. return false;
  476. }
  477. static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
  478. {
  479. }
  480. static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd,
  481. unsigned long arg)
  482. {
  483. return -EOPNOTSUPP;
  484. }
  485. static void ep_suspend_napi_irqs(struct eventpoll *ep)
  486. {
  487. }
  488. static void ep_resume_napi_irqs(struct eventpoll *ep)
  489. {
  490. }
  491. #endif /* CONFIG_NET_RX_BUSY_POLL */
  492. /*
  493. * As described in commit 0ccf831cb lockdep: annotate epoll
  494. * the use of wait queues used by epoll is done in a very controlled
  495. * manner. Wake ups can nest inside each other, but are never done
  496. * with the same locking. For example:
  497. *
  498. * dfd = socket(...);
  499. * efd1 = epoll_create();
  500. * efd2 = epoll_create();
  501. * epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, ...);
  502. * epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...);
  503. *
  504. * When a packet arrives to the device underneath "dfd", the net code will
  505. * issue a wake_up() on its poll wake list. Epoll (efd1) has installed a
  506. * callback wakeup entry on that queue, and the wake_up() performed by the
  507. * "dfd" net code will end up in ep_poll_callback(). At this point epoll
  508. * (efd1) notices that it may have some event ready, so it needs to wake up
  509. * the waiters on its poll wait list (efd2). So it calls ep_poll_safewake()
  510. * that ends up in another wake_up(), after having checked about the
  511. * recursion constraints. That are, no more than EP_MAX_NESTS, to avoid
  512. * stack blasting.
  513. *
  514. * When CONFIG_DEBUG_LOCK_ALLOC is enabled, make sure lockdep can handle
  515. * this special case of epoll.
  516. */
  517. #ifdef CONFIG_DEBUG_LOCK_ALLOC
  518. static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi,
  519. unsigned pollflags)
  520. {
  521. struct eventpoll *ep_src;
  522. unsigned long flags;
  523. u8 nests = 0;
  524. /*
  525. * To set the subclass or nesting level for spin_lock_irqsave_nested()
  526. * it might be natural to create a per-cpu nest count. However, since
  527. * we can recurse on ep->poll_wait.lock, and a non-raw spinlock can
  528. * schedule() in the -rt kernel, the per-cpu variable are no longer
  529. * protected. Thus, we are introducing a per eventpoll nest field.
  530. * If we are not being call from ep_poll_callback(), epi is NULL and
  531. * we are at the first level of nesting, 0. Otherwise, we are being
  532. * called from ep_poll_callback() and if a previous wakeup source is
  533. * not an epoll file itself, we are at depth 1 since the wakeup source
  534. * is depth 0. If the wakeup source is a previous epoll file in the
  535. * wakeup chain then we use its nests value and record ours as
  536. * nests + 1. The previous epoll file nests value is stable since its
  537. * already holding its own poll_wait.lock.
  538. */
  539. if (epi) {
  540. if ((is_file_epoll(epi->ffd.file))) {
  541. ep_src = epi->ffd.file->private_data;
  542. nests = ep_src->nests;
  543. } else {
  544. nests = 1;
  545. }
  546. }
  547. spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests);
  548. ep->nests = nests + 1;
  549. wake_up_locked_poll(&ep->poll_wait, EPOLLIN | pollflags);
  550. ep->nests = 0;
  551. spin_unlock_irqrestore(&ep->poll_wait.lock, flags);
  552. }
  553. #else
  554. static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi,
  555. __poll_t pollflags)
  556. {
  557. wake_up_poll(&ep->poll_wait, EPOLLIN | pollflags);
  558. }
  559. #endif
  560. static void ep_remove_wait_queue(struct eppoll_entry *pwq)
  561. {
  562. wait_queue_head_t *whead;
  563. rcu_read_lock();
  564. /*
  565. * If it is cleared by POLLFREE, it should be rcu-safe.
  566. * If we read NULL we need a barrier paired with
  567. * smp_store_release() in ep_poll_callback(), otherwise
  568. * we rely on whead->lock.
  569. */
  570. whead = smp_load_acquire(&pwq->whead);
  571. if (whead)
  572. remove_wait_queue(whead, &pwq->wait);
  573. rcu_read_unlock();
  574. }
  575. /*
  576. * This function unregisters poll callbacks from the associated file
  577. * descriptor. Must be called with "mtx" held.
  578. */
  579. static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
  580. {
  581. struct eppoll_entry **p = &epi->pwqlist;
  582. struct eppoll_entry *pwq;
  583. while ((pwq = *p) != NULL) {
  584. *p = pwq->next;
  585. ep_remove_wait_queue(pwq);
  586. kmem_cache_free(pwq_cache, pwq);
  587. }
  588. }
  589. /* call only when ep->mtx is held */
  590. static inline struct wakeup_source *ep_wakeup_source(struct epitem *epi)
  591. {
  592. return rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx));
  593. }
  594. /* call only when ep->mtx is held */
  595. static inline void ep_pm_stay_awake(struct epitem *epi)
  596. {
  597. struct wakeup_source *ws = ep_wakeup_source(epi);
  598. if (ws)
  599. __pm_stay_awake(ws);
  600. }
  601. static inline bool ep_has_wakeup_source(struct epitem *epi)
  602. {
  603. return rcu_access_pointer(epi->ws) ? true : false;
  604. }
  605. /* call when ep->mtx cannot be held (ep_poll_callback) */
  606. static inline void ep_pm_stay_awake_rcu(struct epitem *epi)
  607. {
  608. struct wakeup_source *ws;
  609. rcu_read_lock();
  610. ws = rcu_dereference(epi->ws);
  611. if (ws)
  612. __pm_stay_awake(ws);
  613. rcu_read_unlock();
  614. }
  615. /*
  616. * ep->mutex needs to be held because we could be hit by
  617. * eventpoll_release_file() and epoll_ctl().
  618. */
  619. static void ep_start_scan(struct eventpoll *ep, struct list_head *txlist)
  620. {
  621. /*
  622. * Steal the ready list, and re-init the original one to the
  623. * empty list. Also, set ep->ovflist to NULL so that events
  624. * happening while looping w/out locks, are not lost. We cannot
  625. * have the poll callback to queue directly on ep->rdllist,
  626. * because we want the "sproc" callback to be able to do it
  627. * in a lockless way.
  628. */
  629. lockdep_assert_irqs_enabled();
  630. spin_lock_irq(&ep->lock);
  631. list_splice_init(&ep->rdllist, txlist);
  632. WRITE_ONCE(ep->ovflist, NULL);
  633. spin_unlock_irq(&ep->lock);
  634. }
  635. static void ep_done_scan(struct eventpoll *ep,
  636. struct list_head *txlist)
  637. {
  638. struct epitem *epi, *nepi;
  639. spin_lock_irq(&ep->lock);
  640. /*
  641. * During the time we spent inside the "sproc" callback, some
  642. * other events might have been queued by the poll callback.
  643. * We re-insert them inside the main ready-list here.
  644. */
  645. for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL;
  646. nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
  647. /*
  648. * We need to check if the item is already in the list.
  649. * During the "sproc" callback execution time, items are
  650. * queued into ->ovflist but the "txlist" might already
  651. * contain them, and the list_splice() below takes care of them.
  652. */
  653. if (!ep_is_linked(epi)) {
  654. /*
  655. * ->ovflist is LIFO, so we have to reverse it in order
  656. * to keep in FIFO.
  657. */
  658. list_add(&epi->rdllink, &ep->rdllist);
  659. ep_pm_stay_awake(epi);
  660. }
  661. }
  662. /*
  663. * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
  664. * releasing the lock, events will be queued in the normal way inside
  665. * ep->rdllist.
  666. */
  667. WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR);
  668. /*
  669. * Quickly re-inject items left on "txlist".
  670. */
  671. list_splice(txlist, &ep->rdllist);
  672. __pm_relax(ep->ws);
  673. if (!list_empty(&ep->rdllist)) {
  674. if (waitqueue_active(&ep->wq))
  675. wake_up(&ep->wq);
  676. }
  677. spin_unlock_irq(&ep->lock);
  678. }
  679. static void ep_get(struct eventpoll *ep)
  680. {
  681. refcount_inc(&ep->refcount);
  682. }
  683. /*
  684. * Returns true if the event poll can be disposed
  685. */
  686. static bool ep_refcount_dec_and_test(struct eventpoll *ep)
  687. {
  688. if (!refcount_dec_and_test(&ep->refcount))
  689. return false;
  690. WARN_ON_ONCE(!RB_EMPTY_ROOT(&ep->rbr.rb_root));
  691. return true;
  692. }
  693. static void ep_free(struct eventpoll *ep)
  694. {
  695. ep_resume_napi_irqs(ep);
  696. mutex_destroy(&ep->mtx);
  697. free_uid(ep->user);
  698. wakeup_source_unregister(ep->ws);
  699. /* ep_get_upwards_depth_proc() may still hold epi->ep under RCU */
  700. kfree_rcu(ep, rcu);
  701. }
  702. /*
  703. * Removes a "struct epitem" from the eventpoll RB tree and deallocates
  704. * all the associated resources. Must be called with "mtx" held.
  705. * If the dying flag is set, do the removal only if force is true.
  706. * This prevents ep_clear_and_put() from dropping all the ep references
  707. * while running concurrently with eventpoll_release_file().
  708. * Returns true if the eventpoll can be disposed.
  709. */
  710. static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
  711. {
  712. struct file *file = epi->ffd.file;
  713. struct epitems_head *to_free;
  714. struct hlist_head *head;
  715. lockdep_assert_irqs_enabled();
  716. /*
  717. * Removes poll wait queue hooks.
  718. */
  719. ep_unregister_pollwait(ep, epi);
  720. /* Remove the current item from the list of epoll hooks */
  721. spin_lock(&file->f_lock);
  722. if (epi->dying && !force) {
  723. spin_unlock(&file->f_lock);
  724. return false;
  725. }
  726. to_free = NULL;
  727. head = file->f_ep;
  728. if (head->first == &epi->fllink && !epi->fllink.next) {
  729. /* See eventpoll_release() for details. */
  730. WRITE_ONCE(file->f_ep, NULL);
  731. if (!is_file_epoll(file)) {
  732. struct epitems_head *v;
  733. v = container_of(head, struct epitems_head, epitems);
  734. if (!smp_load_acquire(&v->next))
  735. to_free = v;
  736. }
  737. }
  738. hlist_del_rcu(&epi->fllink);
  739. spin_unlock(&file->f_lock);
  740. free_ephead(to_free);
  741. rb_erase_cached(&epi->rbn, &ep->rbr);
  742. spin_lock_irq(&ep->lock);
  743. if (ep_is_linked(epi))
  744. list_del_init(&epi->rdllink);
  745. spin_unlock_irq(&ep->lock);
  746. wakeup_source_unregister(ep_wakeup_source(epi));
  747. /*
  748. * At this point it is safe to free the eventpoll item. Use the union
  749. * field epi->rcu, since we are trying to minimize the size of
  750. * 'struct epitem'. The 'rbn' field is no longer in use. Protected by
  751. * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make
  752. * use of the rbn field.
  753. */
  754. kfree_rcu(epi, rcu);
  755. percpu_counter_dec(&ep->user->epoll_watches);
  756. return true;
  757. }
  758. /*
  759. * ep_remove variant for callers owing an additional reference to the ep
  760. */
  761. static void ep_remove_safe(struct eventpoll *ep, struct epitem *epi)
  762. {
  763. if (__ep_remove(ep, epi, false))
  764. WARN_ON_ONCE(ep_refcount_dec_and_test(ep));
  765. }
  766. static void ep_clear_and_put(struct eventpoll *ep)
  767. {
  768. struct rb_node *rbp, *next;
  769. struct epitem *epi;
  770. /* We need to release all tasks waiting for these file */
  771. if (waitqueue_active(&ep->poll_wait))
  772. ep_poll_safewake(ep, NULL, 0);
  773. mutex_lock(&ep->mtx);
  774. /*
  775. * Walks through the whole tree by unregistering poll callbacks.
  776. */
  777. for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
  778. epi = rb_entry(rbp, struct epitem, rbn);
  779. ep_unregister_pollwait(ep, epi);
  780. cond_resched();
  781. }
  782. /*
  783. * Walks through the whole tree and try to free each "struct epitem".
  784. * Note that ep_remove_safe() will not remove the epitem in case of a
  785. * racing eventpoll_release_file(); the latter will do the removal.
  786. * At this point we are sure no poll callbacks will be lingering around.
  787. * Since we still own a reference to the eventpoll struct, the loop can't
  788. * dispose it.
  789. */
  790. for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = next) {
  791. next = rb_next(rbp);
  792. epi = rb_entry(rbp, struct epitem, rbn);
  793. ep_remove_safe(ep, epi);
  794. cond_resched();
  795. }
  796. mutex_unlock(&ep->mtx);
  797. if (ep_refcount_dec_and_test(ep))
  798. ep_free(ep);
  799. }
  800. static long ep_eventpoll_ioctl(struct file *file, unsigned int cmd,
  801. unsigned long arg)
  802. {
  803. int ret;
  804. if (!is_file_epoll(file))
  805. return -EINVAL;
  806. switch (cmd) {
  807. case EPIOCSPARAMS:
  808. case EPIOCGPARAMS:
  809. ret = ep_eventpoll_bp_ioctl(file, cmd, arg);
  810. break;
  811. default:
  812. ret = -EINVAL;
  813. break;
  814. }
  815. return ret;
  816. }
  817. static int ep_eventpoll_release(struct inode *inode, struct file *file)
  818. {
  819. struct eventpoll *ep = file->private_data;
  820. if (ep)
  821. ep_clear_and_put(ep);
  822. return 0;
  823. }
  824. static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, int depth);
  825. static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int depth)
  826. {
  827. struct eventpoll *ep = file->private_data;
  828. LIST_HEAD(txlist);
  829. struct epitem *epi, *tmp;
  830. poll_table pt;
  831. __poll_t res = 0;
  832. init_poll_funcptr(&pt, NULL);
  833. /* Insert inside our poll wait queue */
  834. poll_wait(file, &ep->poll_wait, wait);
  835. /*
  836. * Proceed to find out if wanted events are really available inside
  837. * the ready list.
  838. */
  839. mutex_lock_nested(&ep->mtx, depth);
  840. ep_start_scan(ep, &txlist);
  841. list_for_each_entry_safe(epi, tmp, &txlist, rdllink) {
  842. if (ep_item_poll(epi, &pt, depth + 1)) {
  843. res = EPOLLIN | EPOLLRDNORM;
  844. break;
  845. } else {
  846. /*
  847. * Item has been dropped into the ready list by the poll
  848. * callback, but it's not actually ready, as far as
  849. * caller requested events goes. We can remove it here.
  850. */
  851. __pm_relax(ep_wakeup_source(epi));
  852. list_del_init(&epi->rdllink);
  853. }
  854. }
  855. ep_done_scan(ep, &txlist);
  856. mutex_unlock(&ep->mtx);
  857. return res;
  858. }
  859. /*
  860. * The ffd.file pointer may be in the process of being torn down due to
  861. * being closed, but we may not have finished eventpoll_release() yet.
  862. *
  863. * Normally, even with the atomic_long_inc_not_zero, the file may have
  864. * been free'd and then gotten re-allocated to something else (since
  865. * files are not RCU-delayed, they are SLAB_TYPESAFE_BY_RCU).
  866. *
  867. * But for epoll, users hold the ep->mtx mutex, and as such any file in
  868. * the process of being free'd will block in eventpoll_release_file()
  869. * and thus the underlying file allocation will not be free'd, and the
  870. * file re-use cannot happen.
  871. *
  872. * For the same reason we can avoid a rcu_read_lock() around the
  873. * operation - 'ffd.file' cannot go away even if the refcount has
  874. * reached zero (but we must still not call out to ->poll() functions
  875. * etc).
  876. */
  877. static struct file *epi_fget(const struct epitem *epi)
  878. {
  879. struct file *file;
  880. file = epi->ffd.file;
  881. if (!file_ref_get(&file->f_ref))
  882. file = NULL;
  883. return file;
  884. }
  885. /*
  886. * Differs from ep_eventpoll_poll() in that internal callers already have
  887. * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested()
  888. * is correctly annotated.
  889. */
  890. static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
  891. int depth)
  892. {
  893. struct file *file = epi_fget(epi);
  894. __poll_t res;
  895. /*
  896. * We could return EPOLLERR | EPOLLHUP or something, but let's
  897. * treat this more as "file doesn't exist, poll didn't happen".
  898. */
  899. if (!file)
  900. return 0;
  901. pt->_key = epi->event.events;
  902. if (!is_file_epoll(file))
  903. res = vfs_poll(file, pt);
  904. else
  905. res = __ep_eventpoll_poll(file, pt, depth);
  906. fput(file);
  907. return res & epi->event.events;
  908. }
  909. static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait)
  910. {
  911. return __ep_eventpoll_poll(file, wait, 0);
  912. }
  913. #ifdef CONFIG_PROC_FS
  914. static void ep_show_fdinfo(struct seq_file *m, struct file *f)
  915. {
  916. struct eventpoll *ep = f->private_data;
  917. struct rb_node *rbp;
  918. mutex_lock(&ep->mtx);
  919. for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
  920. struct epitem *epi = rb_entry(rbp, struct epitem, rbn);
  921. struct inode *inode = file_inode(epi->ffd.file);
  922. seq_printf(m, "tfd: %8d events: %8x data: %16llx "
  923. " pos:%lli ino:%lx sdev:%x\n",
  924. epi->ffd.fd, epi->event.events,
  925. (long long)epi->event.data,
  926. (long long)epi->ffd.file->f_pos,
  927. inode->i_ino, inode->i_sb->s_dev);
  928. if (seq_has_overflowed(m))
  929. break;
  930. }
  931. mutex_unlock(&ep->mtx);
  932. }
  933. #endif
  934. /* File callbacks that implement the eventpoll file behaviour */
  935. static const struct file_operations eventpoll_fops = {
  936. #ifdef CONFIG_PROC_FS
  937. .show_fdinfo = ep_show_fdinfo,
  938. #endif
  939. .release = ep_eventpoll_release,
  940. .poll = ep_eventpoll_poll,
  941. .llseek = noop_llseek,
  942. .unlocked_ioctl = ep_eventpoll_ioctl,
  943. .compat_ioctl = compat_ptr_ioctl,
  944. };
  945. /*
  946. * This is called from eventpoll_release() to unlink files from the eventpoll
  947. * interface. We need to have this facility to cleanup correctly files that are
  948. * closed without being removed from the eventpoll interface.
  949. */
  950. void eventpoll_release_file(struct file *file)
  951. {
  952. struct eventpoll *ep;
  953. struct epitem *epi;
  954. bool dispose;
  955. /*
  956. * Use the 'dying' flag to prevent a concurrent ep_clear_and_put() from
  957. * touching the epitems list before eventpoll_release_file() can access
  958. * the ep->mtx.
  959. */
  960. again:
  961. spin_lock(&file->f_lock);
  962. if (file->f_ep && file->f_ep->first) {
  963. epi = hlist_entry(file->f_ep->first, struct epitem, fllink);
  964. epi->dying = true;
  965. spin_unlock(&file->f_lock);
  966. /*
  967. * ep access is safe as we still own a reference to the ep
  968. * struct
  969. */
  970. ep = epi->ep;
  971. mutex_lock(&ep->mtx);
  972. dispose = __ep_remove(ep, epi, true);
  973. mutex_unlock(&ep->mtx);
  974. if (dispose && ep_refcount_dec_and_test(ep))
  975. ep_free(ep);
  976. goto again;
  977. }
  978. spin_unlock(&file->f_lock);
  979. }
  980. static int ep_alloc(struct eventpoll **pep)
  981. {
  982. struct eventpoll *ep;
  983. ep = kzalloc_obj(*ep);
  984. if (unlikely(!ep))
  985. return -ENOMEM;
  986. mutex_init(&ep->mtx);
  987. spin_lock_init(&ep->lock);
  988. init_waitqueue_head(&ep->wq);
  989. init_waitqueue_head(&ep->poll_wait);
  990. INIT_LIST_HEAD(&ep->rdllist);
  991. ep->rbr = RB_ROOT_CACHED;
  992. ep->ovflist = EP_UNACTIVE_PTR;
  993. ep->user = get_current_user();
  994. refcount_set(&ep->refcount, 1);
  995. *pep = ep;
  996. return 0;
  997. }
  998. /*
  999. * Search the file inside the eventpoll tree. The RB tree operations
  1000. * are protected by the "mtx" mutex, and ep_find() must be called with
  1001. * "mtx" held.
  1002. */
  1003. static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
  1004. {
  1005. int kcmp;
  1006. struct rb_node *rbp;
  1007. struct epitem *epi, *epir = NULL;
  1008. struct epoll_filefd ffd;
  1009. ep_set_ffd(&ffd, file, fd);
  1010. for (rbp = ep->rbr.rb_root.rb_node; rbp; ) {
  1011. epi = rb_entry(rbp, struct epitem, rbn);
  1012. kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
  1013. if (kcmp > 0)
  1014. rbp = rbp->rb_right;
  1015. else if (kcmp < 0)
  1016. rbp = rbp->rb_left;
  1017. else {
  1018. epir = epi;
  1019. break;
  1020. }
  1021. }
  1022. return epir;
  1023. }
  1024. #ifdef CONFIG_KCMP
  1025. static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff)
  1026. {
  1027. struct rb_node *rbp;
  1028. struct epitem *epi;
  1029. for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
  1030. epi = rb_entry(rbp, struct epitem, rbn);
  1031. if (epi->ffd.fd == tfd) {
  1032. if (toff == 0)
  1033. return epi;
  1034. else
  1035. toff--;
  1036. }
  1037. cond_resched();
  1038. }
  1039. return NULL;
  1040. }
  1041. struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
  1042. unsigned long toff)
  1043. {
  1044. struct file *file_raw;
  1045. struct eventpoll *ep;
  1046. struct epitem *epi;
  1047. if (!is_file_epoll(file))
  1048. return ERR_PTR(-EINVAL);
  1049. ep = file->private_data;
  1050. mutex_lock(&ep->mtx);
  1051. epi = ep_find_tfd(ep, tfd, toff);
  1052. if (epi)
  1053. file_raw = epi->ffd.file;
  1054. else
  1055. file_raw = ERR_PTR(-ENOENT);
  1056. mutex_unlock(&ep->mtx);
  1057. return file_raw;
  1058. }
  1059. #endif /* CONFIG_KCMP */
  1060. /*
  1061. * This is the callback that is passed to the wait queue wakeup
  1062. * mechanism. It is called by the stored file descriptors when they
  1063. * have events to report.
  1064. */
  1065. static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
  1066. {
  1067. int pwake = 0;
  1068. struct epitem *epi = ep_item_from_wait(wait);
  1069. struct eventpoll *ep = epi->ep;
  1070. __poll_t pollflags = key_to_poll(key);
  1071. unsigned long flags;
  1072. int ewake = 0;
  1073. spin_lock_irqsave(&ep->lock, flags);
  1074. ep_set_busy_poll_napi_id(epi);
  1075. /*
  1076. * If the event mask does not contain any poll(2) event, we consider the
  1077. * descriptor to be disabled. This condition is likely the effect of the
  1078. * EPOLLONESHOT bit that disables the descriptor when an event is received,
  1079. * until the next EPOLL_CTL_MOD will be issued.
  1080. */
  1081. if (!(epi->event.events & ~EP_PRIVATE_BITS))
  1082. goto out_unlock;
  1083. /*
  1084. * Check the events coming with the callback. At this stage, not
  1085. * every device reports the events in the "key" parameter of the
  1086. * callback. We need to be able to handle both cases here, hence the
  1087. * test for "key" != NULL before the event match test.
  1088. */
  1089. if (pollflags && !(pollflags & epi->event.events))
  1090. goto out_unlock;
  1091. /*
  1092. * If we are transferring events to userspace, we can hold no locks
  1093. * (because we're accessing user memory, and because of linux f_op->poll()
  1094. * semantics). All the events that happen during that period of time are
  1095. * chained in ep->ovflist and requeued later on.
  1096. */
  1097. if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
  1098. if (epi->next == EP_UNACTIVE_PTR) {
  1099. epi->next = READ_ONCE(ep->ovflist);
  1100. WRITE_ONCE(ep->ovflist, epi);
  1101. ep_pm_stay_awake_rcu(epi);
  1102. }
  1103. } else if (!ep_is_linked(epi)) {
  1104. /* In the usual case, add event to ready list. */
  1105. list_add_tail(&epi->rdllink, &ep->rdllist);
  1106. ep_pm_stay_awake_rcu(epi);
  1107. }
  1108. /*
  1109. * Wake up ( if active ) both the eventpoll wait list and the ->poll()
  1110. * wait list.
  1111. */
  1112. if (waitqueue_active(&ep->wq)) {
  1113. if ((epi->event.events & EPOLLEXCLUSIVE) &&
  1114. !(pollflags & POLLFREE)) {
  1115. switch (pollflags & EPOLLINOUT_BITS) {
  1116. case EPOLLIN:
  1117. if (epi->event.events & EPOLLIN)
  1118. ewake = 1;
  1119. break;
  1120. case EPOLLOUT:
  1121. if (epi->event.events & EPOLLOUT)
  1122. ewake = 1;
  1123. break;
  1124. case 0:
  1125. ewake = 1;
  1126. break;
  1127. }
  1128. }
  1129. if (sync)
  1130. wake_up_sync(&ep->wq);
  1131. else
  1132. wake_up(&ep->wq);
  1133. }
  1134. if (waitqueue_active(&ep->poll_wait))
  1135. pwake++;
  1136. out_unlock:
  1137. spin_unlock_irqrestore(&ep->lock, flags);
  1138. /* We have to call this outside the lock */
  1139. if (pwake)
  1140. ep_poll_safewake(ep, epi, pollflags & EPOLL_URING_WAKE);
  1141. if (!(epi->event.events & EPOLLEXCLUSIVE))
  1142. ewake = 1;
  1143. if (pollflags & POLLFREE) {
  1144. /*
  1145. * If we race with ep_remove_wait_queue() it can miss
  1146. * ->whead = NULL and do another remove_wait_queue() after
  1147. * us, so we can't use __remove_wait_queue().
  1148. */
  1149. list_del_init(&wait->entry);
  1150. /*
  1151. * ->whead != NULL protects us from the race with
  1152. * ep_clear_and_put() or ep_remove(), ep_remove_wait_queue()
  1153. * takes whead->lock held by the caller. Once we nullify it,
  1154. * nothing protects ep/epi or even wait.
  1155. */
  1156. smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL);
  1157. }
  1158. return ewake;
  1159. }
  1160. /*
  1161. * This is the callback that is used to add our wait queue to the
  1162. * target file wakeup lists.
  1163. */
  1164. static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
  1165. poll_table *pt)
  1166. {
  1167. struct ep_pqueue *epq = container_of(pt, struct ep_pqueue, pt);
  1168. struct epitem *epi = epq->epi;
  1169. struct eppoll_entry *pwq;
  1170. if (unlikely(!epi)) // an earlier allocation has failed
  1171. return;
  1172. pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL);
  1173. if (unlikely(!pwq)) {
  1174. epq->epi = NULL;
  1175. return;
  1176. }
  1177. init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
  1178. pwq->whead = whead;
  1179. pwq->base = epi;
  1180. if (epi->event.events & EPOLLEXCLUSIVE)
  1181. add_wait_queue_exclusive(whead, &pwq->wait);
  1182. else
  1183. add_wait_queue(whead, &pwq->wait);
  1184. pwq->next = epi->pwqlist;
  1185. epi->pwqlist = pwq;
  1186. }
  1187. static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
  1188. {
  1189. int kcmp;
  1190. struct rb_node **p = &ep->rbr.rb_root.rb_node, *parent = NULL;
  1191. struct epitem *epic;
  1192. bool leftmost = true;
  1193. while (*p) {
  1194. parent = *p;
  1195. epic = rb_entry(parent, struct epitem, rbn);
  1196. kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd);
  1197. if (kcmp > 0) {
  1198. p = &parent->rb_right;
  1199. leftmost = false;
  1200. } else
  1201. p = &parent->rb_left;
  1202. }
  1203. rb_link_node(&epi->rbn, parent, p);
  1204. rb_insert_color_cached(&epi->rbn, &ep->rbr, leftmost);
  1205. }
  1206. #define PATH_ARR_SIZE 5
  1207. /*
  1208. * These are the number paths of length 1 to 5, that we are allowing to emanate
  1209. * from a single file of interest. For example, we allow 1000 paths of length
  1210. * 1, to emanate from each file of interest. This essentially represents the
  1211. * potential wakeup paths, which need to be limited in order to avoid massive
  1212. * uncontrolled wakeup storms. The common use case should be a single ep which
  1213. * is connected to n file sources. In this case each file source has 1 path
  1214. * of length 1. Thus, the numbers below should be more than sufficient. These
  1215. * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify
  1216. * and delete can't add additional paths. Protected by the epnested_mutex.
  1217. */
  1218. static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
  1219. static int path_count[PATH_ARR_SIZE];
  1220. static int path_count_inc(int nests)
  1221. {
  1222. /* Allow an arbitrary number of depth 1 paths */
  1223. if (nests == 0)
  1224. return 0;
  1225. if (++path_count[nests] > path_limits[nests])
  1226. return -1;
  1227. return 0;
  1228. }
  1229. static void path_count_init(void)
  1230. {
  1231. int i;
  1232. for (i = 0; i < PATH_ARR_SIZE; i++)
  1233. path_count[i] = 0;
  1234. }
  1235. static int reverse_path_check_proc(struct hlist_head *refs, int depth)
  1236. {
  1237. int error = 0;
  1238. struct epitem *epi;
  1239. if (depth > EP_MAX_NESTS) /* too deep nesting */
  1240. return -1;
  1241. /* CTL_DEL can remove links here, but that can't increase our count */
  1242. hlist_for_each_entry_rcu(epi, refs, fllink) {
  1243. struct hlist_head *refs = &epi->ep->refs;
  1244. if (hlist_empty(refs))
  1245. error = path_count_inc(depth);
  1246. else
  1247. error = reverse_path_check_proc(refs, depth + 1);
  1248. if (error != 0)
  1249. break;
  1250. }
  1251. return error;
  1252. }
  1253. /**
  1254. * reverse_path_check - The tfile_check_list is list of epitem_head, which have
  1255. * links that are proposed to be newly added. We need to
  1256. * make sure that those added links don't add too many
  1257. * paths such that we will spend all our time waking up
  1258. * eventpoll objects.
  1259. *
  1260. * Return: %zero if the proposed links don't create too many paths,
  1261. * %-1 otherwise.
  1262. */
  1263. static int reverse_path_check(void)
  1264. {
  1265. struct epitems_head *p;
  1266. for (p = tfile_check_list; p != EP_UNACTIVE_PTR; p = p->next) {
  1267. int error;
  1268. path_count_init();
  1269. rcu_read_lock();
  1270. error = reverse_path_check_proc(&p->epitems, 0);
  1271. rcu_read_unlock();
  1272. if (error)
  1273. return error;
  1274. }
  1275. return 0;
  1276. }
  1277. static int ep_create_wakeup_source(struct epitem *epi)
  1278. {
  1279. struct name_snapshot n;
  1280. struct wakeup_source *ws;
  1281. if (!epi->ep->ws) {
  1282. epi->ep->ws = wakeup_source_register(NULL, "eventpoll");
  1283. if (!epi->ep->ws)
  1284. return -ENOMEM;
  1285. }
  1286. take_dentry_name_snapshot(&n, epi->ffd.file->f_path.dentry);
  1287. ws = wakeup_source_register(NULL, n.name.name);
  1288. release_dentry_name_snapshot(&n);
  1289. if (!ws)
  1290. return -ENOMEM;
  1291. rcu_assign_pointer(epi->ws, ws);
  1292. return 0;
  1293. }
  1294. /* rare code path, only used when EPOLL_CTL_MOD removes a wakeup source */
  1295. static noinline void ep_destroy_wakeup_source(struct epitem *epi)
  1296. {
  1297. struct wakeup_source *ws = ep_wakeup_source(epi);
  1298. RCU_INIT_POINTER(epi->ws, NULL);
  1299. /*
  1300. * wait for ep_pm_stay_awake_rcu to finish, synchronize_rcu is
  1301. * used internally by wakeup_source_remove, too (called by
  1302. * wakeup_source_unregister), so we cannot use call_rcu
  1303. */
  1304. synchronize_rcu();
  1305. wakeup_source_unregister(ws);
  1306. }
  1307. static int attach_epitem(struct file *file, struct epitem *epi)
  1308. {
  1309. struct epitems_head *to_free = NULL;
  1310. struct hlist_head *head = NULL;
  1311. struct eventpoll *ep = NULL;
  1312. if (is_file_epoll(file))
  1313. ep = file->private_data;
  1314. if (ep) {
  1315. head = &ep->refs;
  1316. } else if (!READ_ONCE(file->f_ep)) {
  1317. allocate:
  1318. to_free = kmem_cache_zalloc(ephead_cache, GFP_KERNEL);
  1319. if (!to_free)
  1320. return -ENOMEM;
  1321. head = &to_free->epitems;
  1322. }
  1323. spin_lock(&file->f_lock);
  1324. if (!file->f_ep) {
  1325. if (unlikely(!head)) {
  1326. spin_unlock(&file->f_lock);
  1327. goto allocate;
  1328. }
  1329. /* See eventpoll_release() for details. */
  1330. WRITE_ONCE(file->f_ep, head);
  1331. to_free = NULL;
  1332. }
  1333. hlist_add_head_rcu(&epi->fllink, file->f_ep);
  1334. spin_unlock(&file->f_lock);
  1335. free_ephead(to_free);
  1336. return 0;
  1337. }
  1338. /*
  1339. * Must be called with "mtx" held.
  1340. */
  1341. static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
  1342. struct file *tfile, int fd, int full_check)
  1343. {
  1344. int error, pwake = 0;
  1345. __poll_t revents;
  1346. struct epitem *epi;
  1347. struct ep_pqueue epq;
  1348. struct eventpoll *tep = NULL;
  1349. if (is_file_epoll(tfile))
  1350. tep = tfile->private_data;
  1351. lockdep_assert_irqs_enabled();
  1352. if (unlikely(percpu_counter_compare(&ep->user->epoll_watches,
  1353. max_user_watches) >= 0))
  1354. return -ENOSPC;
  1355. percpu_counter_inc(&ep->user->epoll_watches);
  1356. if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL))) {
  1357. percpu_counter_dec(&ep->user->epoll_watches);
  1358. return -ENOMEM;
  1359. }
  1360. /* Item initialization follow here ... */
  1361. INIT_LIST_HEAD(&epi->rdllink);
  1362. epi->ep = ep;
  1363. ep_set_ffd(&epi->ffd, tfile, fd);
  1364. epi->event = *event;
  1365. epi->next = EP_UNACTIVE_PTR;
  1366. if (tep)
  1367. mutex_lock_nested(&tep->mtx, 1);
  1368. /* Add the current item to the list of active epoll hook for this file */
  1369. if (unlikely(attach_epitem(tfile, epi) < 0)) {
  1370. if (tep)
  1371. mutex_unlock(&tep->mtx);
  1372. kmem_cache_free(epi_cache, epi);
  1373. percpu_counter_dec(&ep->user->epoll_watches);
  1374. return -ENOMEM;
  1375. }
  1376. if (full_check && !tep)
  1377. list_file(tfile);
  1378. /*
  1379. * Add the current item to the RB tree. All RB tree operations are
  1380. * protected by "mtx", and ep_insert() is called with "mtx" held.
  1381. */
  1382. ep_rbtree_insert(ep, epi);
  1383. if (tep)
  1384. mutex_unlock(&tep->mtx);
  1385. /*
  1386. * ep_remove_safe() calls in the later error paths can't lead to
  1387. * ep_free() as the ep file itself still holds an ep reference.
  1388. */
  1389. ep_get(ep);
  1390. /* now check if we've created too many backpaths */
  1391. if (unlikely(full_check && reverse_path_check())) {
  1392. ep_remove_safe(ep, epi);
  1393. return -EINVAL;
  1394. }
  1395. if (epi->event.events & EPOLLWAKEUP) {
  1396. error = ep_create_wakeup_source(epi);
  1397. if (error) {
  1398. ep_remove_safe(ep, epi);
  1399. return error;
  1400. }
  1401. }
  1402. /* Initialize the poll table using the queue callback */
  1403. epq.epi = epi;
  1404. init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
  1405. /*
  1406. * Attach the item to the poll hooks and get current event bits.
  1407. * We can safely use the file* here because its usage count has
  1408. * been increased by the caller of this function. Note that after
  1409. * this operation completes, the poll callback can start hitting
  1410. * the new item.
  1411. */
  1412. revents = ep_item_poll(epi, &epq.pt, 1);
  1413. /*
  1414. * We have to check if something went wrong during the poll wait queue
  1415. * install process. Namely an allocation for a wait queue failed due
  1416. * high memory pressure.
  1417. */
  1418. if (unlikely(!epq.epi)) {
  1419. ep_remove_safe(ep, epi);
  1420. return -ENOMEM;
  1421. }
  1422. /* We have to drop the new item inside our item list to keep track of it */
  1423. spin_lock_irq(&ep->lock);
  1424. /* record NAPI ID of new item if present */
  1425. ep_set_busy_poll_napi_id(epi);
  1426. /* If the file is already "ready" we drop it inside the ready list */
  1427. if (revents && !ep_is_linked(epi)) {
  1428. list_add_tail(&epi->rdllink, &ep->rdllist);
  1429. ep_pm_stay_awake(epi);
  1430. /* Notify waiting tasks that events are available */
  1431. if (waitqueue_active(&ep->wq))
  1432. wake_up(&ep->wq);
  1433. if (waitqueue_active(&ep->poll_wait))
  1434. pwake++;
  1435. }
  1436. spin_unlock_irq(&ep->lock);
  1437. /* We have to call this outside the lock */
  1438. if (pwake)
  1439. ep_poll_safewake(ep, NULL, 0);
  1440. return 0;
  1441. }
  1442. /*
  1443. * Modify the interest event mask by dropping an event if the new mask
  1444. * has a match in the current file status. Must be called with "mtx" held.
  1445. */
  1446. static int ep_modify(struct eventpoll *ep, struct epitem *epi,
  1447. const struct epoll_event *event)
  1448. {
  1449. int pwake = 0;
  1450. poll_table pt;
  1451. lockdep_assert_irqs_enabled();
  1452. init_poll_funcptr(&pt, NULL);
  1453. /*
  1454. * Set the new event interest mask before calling f_op->poll();
  1455. * otherwise we might miss an event that happens between the
  1456. * f_op->poll() call and the new event set registering.
  1457. */
  1458. epi->event.events = event->events; /* need barrier below */
  1459. epi->event.data = event->data; /* protected by mtx */
  1460. if (epi->event.events & EPOLLWAKEUP) {
  1461. if (!ep_has_wakeup_source(epi))
  1462. ep_create_wakeup_source(epi);
  1463. } else if (ep_has_wakeup_source(epi)) {
  1464. ep_destroy_wakeup_source(epi);
  1465. }
  1466. /*
  1467. * The following barrier has two effects:
  1468. *
  1469. * 1) Flush epi changes above to other CPUs. This ensures
  1470. * we do not miss events from ep_poll_callback if an
  1471. * event occurs immediately after we call f_op->poll().
  1472. * We need this because we did not take ep->lock while
  1473. * changing epi above (but ep_poll_callback does take
  1474. * ep->lock).
  1475. *
  1476. * 2) We also need to ensure we do not miss _past_ events
  1477. * when calling f_op->poll(). This barrier also
  1478. * pairs with the barrier in wq_has_sleeper (see
  1479. * comments for wq_has_sleeper).
  1480. *
  1481. * This barrier will now guarantee ep_poll_callback or f_op->poll
  1482. * (or both) will notice the readiness of an item.
  1483. */
  1484. smp_mb();
  1485. /*
  1486. * Get current event bits. We can safely use the file* here because
  1487. * its usage count has been increased by the caller of this function.
  1488. * If the item is "hot" and it is not registered inside the ready
  1489. * list, push it inside.
  1490. */
  1491. if (ep_item_poll(epi, &pt, 1)) {
  1492. spin_lock_irq(&ep->lock);
  1493. if (!ep_is_linked(epi)) {
  1494. list_add_tail(&epi->rdllink, &ep->rdllist);
  1495. ep_pm_stay_awake(epi);
  1496. /* Notify waiting tasks that events are available */
  1497. if (waitqueue_active(&ep->wq))
  1498. wake_up(&ep->wq);
  1499. if (waitqueue_active(&ep->poll_wait))
  1500. pwake++;
  1501. }
  1502. spin_unlock_irq(&ep->lock);
  1503. }
  1504. /* We have to call this outside the lock */
  1505. if (pwake)
  1506. ep_poll_safewake(ep, NULL, 0);
  1507. return 0;
  1508. }
  1509. static int ep_send_events(struct eventpoll *ep,
  1510. struct epoll_event __user *events, int maxevents)
  1511. {
  1512. struct epitem *epi, *tmp;
  1513. LIST_HEAD(txlist);
  1514. poll_table pt;
  1515. int res = 0;
  1516. /*
  1517. * Always short-circuit for fatal signals to allow threads to make a
  1518. * timely exit without the chance of finding more events available and
  1519. * fetching repeatedly.
  1520. */
  1521. if (fatal_signal_pending(current))
  1522. return -EINTR;
  1523. init_poll_funcptr(&pt, NULL);
  1524. mutex_lock(&ep->mtx);
  1525. ep_start_scan(ep, &txlist);
  1526. /*
  1527. * We can loop without lock because we are passed a task private list.
  1528. * Items cannot vanish during the loop we are holding ep->mtx.
  1529. */
  1530. list_for_each_entry_safe(epi, tmp, &txlist, rdllink) {
  1531. struct wakeup_source *ws;
  1532. __poll_t revents;
  1533. if (res >= maxevents)
  1534. break;
  1535. /*
  1536. * Activate ep->ws before deactivating epi->ws to prevent
  1537. * triggering auto-suspend here (in case we reactive epi->ws
  1538. * below).
  1539. *
  1540. * This could be rearranged to delay the deactivation of epi->ws
  1541. * instead, but then epi->ws would temporarily be out of sync
  1542. * with ep_is_linked().
  1543. */
  1544. ws = ep_wakeup_source(epi);
  1545. if (ws) {
  1546. if (ws->active)
  1547. __pm_stay_awake(ep->ws);
  1548. __pm_relax(ws);
  1549. }
  1550. list_del_init(&epi->rdllink);
  1551. /*
  1552. * If the event mask intersect the caller-requested one,
  1553. * deliver the event to userspace. Again, we are holding ep->mtx,
  1554. * so no operations coming from userspace can change the item.
  1555. */
  1556. revents = ep_item_poll(epi, &pt, 1);
  1557. if (!revents)
  1558. continue;
  1559. events = epoll_put_uevent(revents, epi->event.data, events);
  1560. if (!events) {
  1561. list_add(&epi->rdllink, &txlist);
  1562. ep_pm_stay_awake(epi);
  1563. if (!res)
  1564. res = -EFAULT;
  1565. break;
  1566. }
  1567. res++;
  1568. if (epi->event.events & EPOLLONESHOT)
  1569. epi->event.events &= EP_PRIVATE_BITS;
  1570. else if (!(epi->event.events & EPOLLET)) {
  1571. /*
  1572. * If this file has been added with Level
  1573. * Trigger mode, we need to insert back inside
  1574. * the ready list, so that the next call to
  1575. * epoll_wait() will check again the events
  1576. * availability. At this point, no one can insert
  1577. * into ep->rdllist besides us. The epoll_ctl()
  1578. * callers are locked out by
  1579. * ep_send_events() holding "mtx" and the
  1580. * poll callback will queue them in ep->ovflist.
  1581. */
  1582. list_add_tail(&epi->rdllink, &ep->rdllist);
  1583. ep_pm_stay_awake(epi);
  1584. }
  1585. }
  1586. ep_done_scan(ep, &txlist);
  1587. mutex_unlock(&ep->mtx);
  1588. return res;
  1589. }
  1590. static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms)
  1591. {
  1592. struct timespec64 now;
  1593. if (ms < 0)
  1594. return NULL;
  1595. if (!ms) {
  1596. to->tv_sec = 0;
  1597. to->tv_nsec = 0;
  1598. return to;
  1599. }
  1600. to->tv_sec = ms / MSEC_PER_SEC;
  1601. to->tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC);
  1602. ktime_get_ts64(&now);
  1603. *to = timespec64_add_safe(now, *to);
  1604. return to;
  1605. }
  1606. /*
  1607. * autoremove_wake_function, but remove even on failure to wake up, because we
  1608. * know that default_wake_function/ttwu will only fail if the thread is already
  1609. * woken, and in that case the ep_poll loop will remove the entry anyways, not
  1610. * try to reuse it.
  1611. */
  1612. static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry,
  1613. unsigned int mode, int sync, void *key)
  1614. {
  1615. int ret = default_wake_function(wq_entry, mode, sync, key);
  1616. /*
  1617. * Pairs with list_empty_careful in ep_poll, and ensures future loop
  1618. * iterations see the cause of this wakeup.
  1619. */
  1620. list_del_init_careful(&wq_entry->entry);
  1621. return ret;
  1622. }
  1623. static int ep_try_send_events(struct eventpoll *ep,
  1624. struct epoll_event __user *events, int maxevents)
  1625. {
  1626. int res;
  1627. /*
  1628. * Try to transfer events to user space. In case we get 0 events and
  1629. * there's still timeout left over, we go trying again in search of
  1630. * more luck.
  1631. */
  1632. res = ep_send_events(ep, events, maxevents);
  1633. if (res > 0)
  1634. ep_suspend_napi_irqs(ep);
  1635. return res;
  1636. }
  1637. static int ep_schedule_timeout(ktime_t *to)
  1638. {
  1639. if (to)
  1640. return ktime_after(*to, ktime_get());
  1641. else
  1642. return 1;
  1643. }
  1644. /**
  1645. * ep_poll - Retrieves ready events, and delivers them to the caller-supplied
  1646. * event buffer.
  1647. *
  1648. * @ep: Pointer to the eventpoll context.
  1649. * @events: Pointer to the userspace buffer where the ready events should be
  1650. * stored.
  1651. * @maxevents: Size (in terms of number of events) of the caller event buffer.
  1652. * @timeout: Maximum timeout for the ready events fetch operation, in
  1653. * timespec. If the timeout is zero, the function will not block,
  1654. * while if the @timeout ptr is NULL, the function will block
  1655. * until at least one event has been retrieved (or an error
  1656. * occurred).
  1657. *
  1658. * Return: the number of ready events which have been fetched, or an
  1659. * error code, in case of error.
  1660. */
  1661. static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
  1662. int maxevents, struct timespec64 *timeout)
  1663. {
  1664. int res, eavail, timed_out = 0;
  1665. u64 slack = 0;
  1666. wait_queue_entry_t wait;
  1667. ktime_t expires, *to = NULL;
  1668. lockdep_assert_irqs_enabled();
  1669. if (timeout && (timeout->tv_sec | timeout->tv_nsec)) {
  1670. slack = select_estimate_accuracy(timeout);
  1671. to = &expires;
  1672. *to = timespec64_to_ktime(*timeout);
  1673. } else if (timeout) {
  1674. /*
  1675. * Avoid the unnecessary trip to the wait queue loop, if the
  1676. * caller specified a non blocking operation.
  1677. */
  1678. timed_out = 1;
  1679. }
  1680. /*
  1681. * This call is racy: We may or may not see events that are being added
  1682. * to the ready list under the lock (e.g., in IRQ callbacks). For cases
  1683. * with a non-zero timeout, this thread will check the ready list under
  1684. * lock and will add to the wait queue. For cases with a zero
  1685. * timeout, the user by definition should not care and will have to
  1686. * recheck again.
  1687. */
  1688. eavail = ep_events_available(ep);
  1689. while (1) {
  1690. if (eavail) {
  1691. res = ep_try_send_events(ep, events, maxevents);
  1692. if (res)
  1693. return res;
  1694. }
  1695. if (timed_out)
  1696. return 0;
  1697. eavail = ep_busy_loop(ep);
  1698. if (eavail)
  1699. continue;
  1700. if (signal_pending(current))
  1701. return -EINTR;
  1702. /*
  1703. * Internally init_wait() uses autoremove_wake_function(),
  1704. * thus wait entry is removed from the wait queue on each
  1705. * wakeup. Why it is important? In case of several waiters
  1706. * each new wakeup will hit the next waiter, giving it the
  1707. * chance to harvest new event. Otherwise wakeup can be
  1708. * lost. This is also good performance-wise, because on
  1709. * normal wakeup path no need to call __remove_wait_queue()
  1710. * explicitly, thus ep->lock is not taken, which halts the
  1711. * event delivery.
  1712. *
  1713. * In fact, we now use an even more aggressive function that
  1714. * unconditionally removes, because we don't reuse the wait
  1715. * entry between loop iterations. This lets us also avoid the
  1716. * performance issue if a process is killed, causing all of its
  1717. * threads to wake up without being removed normally.
  1718. */
  1719. init_wait(&wait);
  1720. wait.func = ep_autoremove_wake_function;
  1721. spin_lock_irq(&ep->lock);
  1722. /*
  1723. * Barrierless variant, waitqueue_active() is called under
  1724. * the same lock on wakeup ep_poll_callback() side, so it
  1725. * is safe to avoid an explicit barrier.
  1726. */
  1727. __set_current_state(TASK_INTERRUPTIBLE);
  1728. /*
  1729. * Do the final check under the lock. ep_start/done_scan()
  1730. * plays with two lists (->rdllist and ->ovflist) and there
  1731. * is always a race when both lists are empty for short
  1732. * period of time although events are pending, so lock is
  1733. * important.
  1734. */
  1735. eavail = ep_events_available(ep);
  1736. if (!eavail)
  1737. __add_wait_queue_exclusive(&ep->wq, &wait);
  1738. spin_unlock_irq(&ep->lock);
  1739. if (!eavail)
  1740. timed_out = !ep_schedule_timeout(to) ||
  1741. !schedule_hrtimeout_range(to, slack,
  1742. HRTIMER_MODE_ABS);
  1743. __set_current_state(TASK_RUNNING);
  1744. /*
  1745. * We were woken up, thus go and try to harvest some events.
  1746. * If timed out and still on the wait queue, recheck eavail
  1747. * carefully under lock, below.
  1748. */
  1749. eavail = 1;
  1750. if (!list_empty_careful(&wait.entry)) {
  1751. spin_lock_irq(&ep->lock);
  1752. /*
  1753. * If the thread timed out and is not on the wait queue,
  1754. * it means that the thread was woken up after its
  1755. * timeout expired before it could reacquire the lock.
  1756. * Thus, when wait.entry is empty, it needs to harvest
  1757. * events.
  1758. */
  1759. if (timed_out)
  1760. eavail = list_empty(&wait.entry);
  1761. __remove_wait_queue(&ep->wq, &wait);
  1762. spin_unlock_irq(&ep->lock);
  1763. }
  1764. }
  1765. }
  1766. /**
  1767. * ep_loop_check_proc - verify that adding an epoll file @ep inside another
  1768. * epoll file does not create closed loops, and
  1769. * determine the depth of the subtree starting at @ep
  1770. *
  1771. * @ep: the &struct eventpoll to be currently checked.
  1772. * @depth: Current depth of the path being checked.
  1773. *
  1774. * Return: depth of the subtree, or a value bigger than EP_MAX_NESTS if we found
  1775. * a loop or went too deep.
  1776. */
  1777. static int ep_loop_check_proc(struct eventpoll *ep, int depth)
  1778. {
  1779. int result = 0;
  1780. struct rb_node *rbp;
  1781. struct epitem *epi;
  1782. if (ep->gen == loop_check_gen)
  1783. return ep->loop_check_depth;
  1784. mutex_lock_nested(&ep->mtx, depth + 1);
  1785. ep->gen = loop_check_gen;
  1786. for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
  1787. epi = rb_entry(rbp, struct epitem, rbn);
  1788. if (unlikely(is_file_epoll(epi->ffd.file))) {
  1789. struct eventpoll *ep_tovisit;
  1790. ep_tovisit = epi->ffd.file->private_data;
  1791. if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS)
  1792. result = EP_MAX_NESTS+1;
  1793. else
  1794. result = max(result, ep_loop_check_proc(ep_tovisit, depth + 1) + 1);
  1795. if (result > EP_MAX_NESTS)
  1796. break;
  1797. } else {
  1798. /*
  1799. * If we've reached a file that is not associated with
  1800. * an ep, then we need to check if the newly added
  1801. * links are going to add too many wakeup paths. We do
  1802. * this by adding it to the tfile_check_list, if it's
  1803. * not already there, and calling reverse_path_check()
  1804. * during ep_insert().
  1805. */
  1806. list_file(epi->ffd.file);
  1807. }
  1808. }
  1809. ep->loop_check_depth = result;
  1810. mutex_unlock(&ep->mtx);
  1811. return result;
  1812. }
  1813. /* ep_get_upwards_depth_proc - determine depth of @ep when traversed upwards */
  1814. static int ep_get_upwards_depth_proc(struct eventpoll *ep, int depth)
  1815. {
  1816. int result = 0;
  1817. struct epitem *epi;
  1818. if (ep->gen == loop_check_gen)
  1819. return ep->loop_check_depth;
  1820. hlist_for_each_entry_rcu(epi, &ep->refs, fllink)
  1821. result = max(result, ep_get_upwards_depth_proc(epi->ep, depth + 1) + 1);
  1822. ep->gen = loop_check_gen;
  1823. ep->loop_check_depth = result;
  1824. return result;
  1825. }
  1826. /**
  1827. * ep_loop_check - Performs a check to verify that adding an epoll file (@to)
  1828. * into another epoll file (represented by @ep) does not create
  1829. * closed loops or too deep chains.
  1830. *
  1831. * @ep: Pointer to the epoll we are inserting into.
  1832. * @to: Pointer to the epoll to be inserted.
  1833. *
  1834. * Return: %zero if adding the epoll @to inside the epoll @from
  1835. * does not violate the constraints, or %-1 otherwise.
  1836. */
  1837. static int ep_loop_check(struct eventpoll *ep, struct eventpoll *to)
  1838. {
  1839. int depth, upwards_depth;
  1840. inserting_into = ep;
  1841. /*
  1842. * Check how deep down we can get from @to, and whether it is possible
  1843. * to loop up to @ep.
  1844. */
  1845. depth = ep_loop_check_proc(to, 0);
  1846. if (depth > EP_MAX_NESTS)
  1847. return -1;
  1848. /* Check how far up we can go from @ep. */
  1849. rcu_read_lock();
  1850. upwards_depth = ep_get_upwards_depth_proc(ep, 0);
  1851. rcu_read_unlock();
  1852. return (depth+1+upwards_depth > EP_MAX_NESTS) ? -1 : 0;
  1853. }
  1854. static void clear_tfile_check_list(void)
  1855. {
  1856. rcu_read_lock();
  1857. while (tfile_check_list != EP_UNACTIVE_PTR) {
  1858. struct epitems_head *head = tfile_check_list;
  1859. tfile_check_list = head->next;
  1860. unlist_file(head);
  1861. }
  1862. rcu_read_unlock();
  1863. }
  1864. /*
  1865. * Open an eventpoll file descriptor.
  1866. */
  1867. static int do_epoll_create(int flags)
  1868. {
  1869. int error;
  1870. struct eventpoll *ep;
  1871. /* Check the EPOLL_* constant for consistency. */
  1872. BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
  1873. if (flags & ~EPOLL_CLOEXEC)
  1874. return -EINVAL;
  1875. /*
  1876. * Create the internal data structure ("struct eventpoll").
  1877. */
  1878. error = ep_alloc(&ep);
  1879. if (error < 0)
  1880. return error;
  1881. /*
  1882. * Creates all the items needed to setup an eventpoll file. That is,
  1883. * a file structure and a free file descriptor.
  1884. */
  1885. FD_PREPARE(fdf, O_RDWR | (flags & O_CLOEXEC),
  1886. anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
  1887. O_RDWR | (flags & O_CLOEXEC)));
  1888. if (fdf.err) {
  1889. ep_clear_and_put(ep);
  1890. return fdf.err;
  1891. }
  1892. ep->file = fd_prepare_file(fdf);
  1893. return fd_publish(fdf);
  1894. }
  1895. SYSCALL_DEFINE1(epoll_create1, int, flags)
  1896. {
  1897. return do_epoll_create(flags);
  1898. }
  1899. SYSCALL_DEFINE1(epoll_create, int, size)
  1900. {
  1901. if (size <= 0)
  1902. return -EINVAL;
  1903. return do_epoll_create(0);
  1904. }
  1905. #ifdef CONFIG_PM_SLEEP
  1906. static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev)
  1907. {
  1908. if ((epev->events & EPOLLWAKEUP) && !capable(CAP_BLOCK_SUSPEND))
  1909. epev->events &= ~EPOLLWAKEUP;
  1910. }
  1911. #else
  1912. static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev)
  1913. {
  1914. epev->events &= ~EPOLLWAKEUP;
  1915. }
  1916. #endif
  1917. static inline int epoll_mutex_lock(struct mutex *mutex, int depth,
  1918. bool nonblock)
  1919. {
  1920. if (!nonblock) {
  1921. mutex_lock_nested(mutex, depth);
  1922. return 0;
  1923. }
  1924. if (mutex_trylock(mutex))
  1925. return 0;
  1926. return -EAGAIN;
  1927. }
  1928. int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
  1929. bool nonblock)
  1930. {
  1931. int error;
  1932. int full_check = 0;
  1933. struct eventpoll *ep;
  1934. struct epitem *epi;
  1935. struct eventpoll *tep = NULL;
  1936. CLASS(fd, f)(epfd);
  1937. if (fd_empty(f))
  1938. return -EBADF;
  1939. /* Get the "struct file *" for the target file */
  1940. CLASS(fd, tf)(fd);
  1941. if (fd_empty(tf))
  1942. return -EBADF;
  1943. /* The target file descriptor must support poll */
  1944. if (!file_can_poll(fd_file(tf)))
  1945. return -EPERM;
  1946. /* Check if EPOLLWAKEUP is allowed */
  1947. if (ep_op_has_event(op))
  1948. ep_take_care_of_epollwakeup(epds);
  1949. /*
  1950. * We have to check that the file structure underneath the file descriptor
  1951. * the user passed to us _is_ an eventpoll file. And also we do not permit
  1952. * adding an epoll file descriptor inside itself.
  1953. */
  1954. error = -EINVAL;
  1955. if (fd_file(f) == fd_file(tf) || !is_file_epoll(fd_file(f)))
  1956. goto error_tgt_fput;
  1957. /*
  1958. * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only,
  1959. * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation.
  1960. * Also, we do not currently supported nested exclusive wakeups.
  1961. */
  1962. if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
  1963. if (op == EPOLL_CTL_MOD)
  1964. goto error_tgt_fput;
  1965. if (op == EPOLL_CTL_ADD && (is_file_epoll(fd_file(tf)) ||
  1966. (epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
  1967. goto error_tgt_fput;
  1968. }
  1969. /*
  1970. * At this point it is safe to assume that the "private_data" contains
  1971. * our own data structure.
  1972. */
  1973. ep = fd_file(f)->private_data;
  1974. /*
  1975. * When we insert an epoll file descriptor inside another epoll file
  1976. * descriptor, there is the chance of creating closed loops, which are
  1977. * better be handled here, than in more critical paths. While we are
  1978. * checking for loops we also determine the list of files reachable
  1979. * and hang them on the tfile_check_list, so we can check that we
  1980. * haven't created too many possible wakeup paths.
  1981. *
  1982. * We do not need to take the global 'epumutex' on EPOLL_CTL_ADD when
  1983. * the epoll file descriptor is attaching directly to a wakeup source,
  1984. * unless the epoll file descriptor is nested. The purpose of taking the
  1985. * 'epnested_mutex' on add is to prevent complex toplogies such as loops and
  1986. * deep wakeup paths from forming in parallel through multiple
  1987. * EPOLL_CTL_ADD operations.
  1988. */
  1989. error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
  1990. if (error)
  1991. goto error_tgt_fput;
  1992. if (op == EPOLL_CTL_ADD) {
  1993. if (READ_ONCE(fd_file(f)->f_ep) || ep->gen == loop_check_gen ||
  1994. is_file_epoll(fd_file(tf))) {
  1995. mutex_unlock(&ep->mtx);
  1996. error = epoll_mutex_lock(&epnested_mutex, 0, nonblock);
  1997. if (error)
  1998. goto error_tgt_fput;
  1999. loop_check_gen++;
  2000. full_check = 1;
  2001. if (is_file_epoll(fd_file(tf))) {
  2002. tep = fd_file(tf)->private_data;
  2003. error = -ELOOP;
  2004. if (ep_loop_check(ep, tep) != 0)
  2005. goto error_tgt_fput;
  2006. }
  2007. error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
  2008. if (error)
  2009. goto error_tgt_fput;
  2010. }
  2011. }
  2012. /*
  2013. * Try to lookup the file inside our RB tree. Since we grabbed "mtx"
  2014. * above, we can be sure to be able to use the item looked up by
  2015. * ep_find() till we release the mutex.
  2016. */
  2017. epi = ep_find(ep, fd_file(tf), fd);
  2018. error = -EINVAL;
  2019. switch (op) {
  2020. case EPOLL_CTL_ADD:
  2021. if (!epi) {
  2022. epds->events |= EPOLLERR | EPOLLHUP;
  2023. error = ep_insert(ep, epds, fd_file(tf), fd, full_check);
  2024. } else
  2025. error = -EEXIST;
  2026. break;
  2027. case EPOLL_CTL_DEL:
  2028. if (epi) {
  2029. /*
  2030. * The eventpoll itself is still alive: the refcount
  2031. * can't go to zero here.
  2032. */
  2033. ep_remove_safe(ep, epi);
  2034. error = 0;
  2035. } else {
  2036. error = -ENOENT;
  2037. }
  2038. break;
  2039. case EPOLL_CTL_MOD:
  2040. if (epi) {
  2041. if (!(epi->event.events & EPOLLEXCLUSIVE)) {
  2042. epds->events |= EPOLLERR | EPOLLHUP;
  2043. error = ep_modify(ep, epi, epds);
  2044. }
  2045. } else
  2046. error = -ENOENT;
  2047. break;
  2048. }
  2049. mutex_unlock(&ep->mtx);
  2050. error_tgt_fput:
  2051. if (full_check) {
  2052. clear_tfile_check_list();
  2053. loop_check_gen++;
  2054. mutex_unlock(&epnested_mutex);
  2055. }
  2056. return error;
  2057. }
  2058. /*
  2059. * The following function implements the controller interface for
  2060. * the eventpoll file that enables the insertion/removal/change of
  2061. * file descriptors inside the interest set.
  2062. */
  2063. SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
  2064. struct epoll_event __user *, event)
  2065. {
  2066. struct epoll_event epds;
  2067. if (ep_op_has_event(op) &&
  2068. copy_from_user(&epds, event, sizeof(struct epoll_event)))
  2069. return -EFAULT;
  2070. return do_epoll_ctl(epfd, op, fd, &epds, false);
  2071. }
  2072. static int ep_check_params(struct file *file, struct epoll_event __user *evs,
  2073. int maxevents)
  2074. {
  2075. /* The maximum number of event must be greater than zero */
  2076. if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
  2077. return -EINVAL;
  2078. /* Verify that the area passed by the user is writeable */
  2079. if (!access_ok(evs, maxevents * sizeof(struct epoll_event)))
  2080. return -EFAULT;
  2081. /*
  2082. * We have to check that the file structure underneath the fd
  2083. * the user passed to us _is_ an eventpoll file.
  2084. */
  2085. if (!is_file_epoll(file))
  2086. return -EINVAL;
  2087. return 0;
  2088. }
  2089. int epoll_sendevents(struct file *file, struct epoll_event __user *events,
  2090. int maxevents)
  2091. {
  2092. struct eventpoll *ep;
  2093. int ret;
  2094. ret = ep_check_params(file, events, maxevents);
  2095. if (unlikely(ret))
  2096. return ret;
  2097. ep = file->private_data;
  2098. /*
  2099. * Racy call, but that's ok - it should get retried based on
  2100. * poll readiness anyway.
  2101. */
  2102. if (ep_events_available(ep))
  2103. return ep_try_send_events(ep, events, maxevents);
  2104. return 0;
  2105. }
  2106. /*
  2107. * Implement the event wait interface for the eventpoll file. It is the kernel
  2108. * part of the user space epoll_wait(2).
  2109. */
  2110. static int do_epoll_wait(int epfd, struct epoll_event __user *events,
  2111. int maxevents, struct timespec64 *to)
  2112. {
  2113. struct eventpoll *ep;
  2114. int ret;
  2115. /* Get the "struct file *" for the eventpoll file */
  2116. CLASS(fd, f)(epfd);
  2117. if (fd_empty(f))
  2118. return -EBADF;
  2119. ret = ep_check_params(fd_file(f), events, maxevents);
  2120. if (unlikely(ret))
  2121. return ret;
  2122. /*
  2123. * At this point it is safe to assume that the "private_data" contains
  2124. * our own data structure.
  2125. */
  2126. ep = fd_file(f)->private_data;
  2127. /* Time to fish for events ... */
  2128. return ep_poll(ep, events, maxevents, to);
  2129. }
  2130. SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
  2131. int, maxevents, int, timeout)
  2132. {
  2133. struct timespec64 to;
  2134. return do_epoll_wait(epfd, events, maxevents,
  2135. ep_timeout_to_timespec(&to, timeout));
  2136. }
  2137. /*
  2138. * Implement the event wait interface for the eventpoll file. It is the kernel
  2139. * part of the user space epoll_pwait(2).
  2140. */
  2141. static int do_epoll_pwait(int epfd, struct epoll_event __user *events,
  2142. int maxevents, struct timespec64 *to,
  2143. const sigset_t __user *sigmask, size_t sigsetsize)
  2144. {
  2145. int error;
  2146. /*
  2147. * If the caller wants a certain signal mask to be set during the wait,
  2148. * we apply it here.
  2149. */
  2150. error = set_user_sigmask(sigmask, sigsetsize);
  2151. if (error)
  2152. return error;
  2153. error = do_epoll_wait(epfd, events, maxevents, to);
  2154. restore_saved_sigmask_unless(error == -EINTR);
  2155. return error;
  2156. }
  2157. SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
  2158. int, maxevents, int, timeout, const sigset_t __user *, sigmask,
  2159. size_t, sigsetsize)
  2160. {
  2161. struct timespec64 to;
  2162. return do_epoll_pwait(epfd, events, maxevents,
  2163. ep_timeout_to_timespec(&to, timeout),
  2164. sigmask, sigsetsize);
  2165. }
  2166. SYSCALL_DEFINE6(epoll_pwait2, int, epfd, struct epoll_event __user *, events,
  2167. int, maxevents, const struct __kernel_timespec __user *, timeout,
  2168. const sigset_t __user *, sigmask, size_t, sigsetsize)
  2169. {
  2170. struct timespec64 ts, *to = NULL;
  2171. if (timeout) {
  2172. if (get_timespec64(&ts, timeout))
  2173. return -EFAULT;
  2174. to = &ts;
  2175. if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
  2176. return -EINVAL;
  2177. }
  2178. return do_epoll_pwait(epfd, events, maxevents, to,
  2179. sigmask, sigsetsize);
  2180. }
  2181. #ifdef CONFIG_COMPAT
  2182. static int do_compat_epoll_pwait(int epfd, struct epoll_event __user *events,
  2183. int maxevents, struct timespec64 *timeout,
  2184. const compat_sigset_t __user *sigmask,
  2185. compat_size_t sigsetsize)
  2186. {
  2187. long err;
  2188. /*
  2189. * If the caller wants a certain signal mask to be set during the wait,
  2190. * we apply it here.
  2191. */
  2192. err = set_compat_user_sigmask(sigmask, sigsetsize);
  2193. if (err)
  2194. return err;
  2195. err = do_epoll_wait(epfd, events, maxevents, timeout);
  2196. restore_saved_sigmask_unless(err == -EINTR);
  2197. return err;
  2198. }
  2199. COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
  2200. struct epoll_event __user *, events,
  2201. int, maxevents, int, timeout,
  2202. const compat_sigset_t __user *, sigmask,
  2203. compat_size_t, sigsetsize)
  2204. {
  2205. struct timespec64 to;
  2206. return do_compat_epoll_pwait(epfd, events, maxevents,
  2207. ep_timeout_to_timespec(&to, timeout),
  2208. sigmask, sigsetsize);
  2209. }
  2210. COMPAT_SYSCALL_DEFINE6(epoll_pwait2, int, epfd,
  2211. struct epoll_event __user *, events,
  2212. int, maxevents,
  2213. const struct __kernel_timespec __user *, timeout,
  2214. const compat_sigset_t __user *, sigmask,
  2215. compat_size_t, sigsetsize)
  2216. {
  2217. struct timespec64 ts, *to = NULL;
  2218. if (timeout) {
  2219. if (get_timespec64(&ts, timeout))
  2220. return -EFAULT;
  2221. to = &ts;
  2222. if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
  2223. return -EINVAL;
  2224. }
  2225. return do_compat_epoll_pwait(epfd, events, maxevents, to,
  2226. sigmask, sigsetsize);
  2227. }
  2228. #endif
  2229. static int __init eventpoll_init(void)
  2230. {
  2231. struct sysinfo si;
  2232. si_meminfo(&si);
  2233. /*
  2234. * Allows top 4% of lomem to be allocated for epoll watches (per user).
  2235. */
  2236. max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
  2237. EP_ITEM_COST;
  2238. BUG_ON(max_user_watches < 0);
  2239. /*
  2240. * We can have many thousands of epitems, so prevent this from
  2241. * using an extra cache line on 64-bit (and smaller) CPUs
  2242. */
  2243. BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128);
  2244. /* Allocates slab cache used to allocate "struct epitem" items */
  2245. epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
  2246. 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
  2247. /* Allocates slab cache used to allocate "struct eppoll_entry" */
  2248. pwq_cache = kmem_cache_create("eventpoll_pwq",
  2249. sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);
  2250. epoll_sysctls_init();
  2251. ephead_cache = kmem_cache_create("ep_head",
  2252. sizeof(struct epitems_head), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);
  2253. return 0;
  2254. }
  2255. fs_initcall(eventpoll_init);