mqueue.c 43 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * POSIX message queues filesystem for Linux.
  4. *
  5. * Copyright (C) 2003,2004 Krzysztof Benedyczak (golbi@mat.uni.torun.pl)
  6. * Michal Wronski (michal.wronski@gmail.com)
  7. *
  8. * Spinlocks: Mohamed Abbas (abbas.mohamed@intel.com)
  9. * Lockless receive & send, fd based notify:
  10. * Manfred Spraul (manfred@colorfullife.com)
  11. *
  12. * Audit: George Wilson (ltcgcw@us.ibm.com)
  13. */
  14. #include <linux/capability.h>
  15. #include <linux/init.h>
  16. #include <linux/pagemap.h>
  17. #include <linux/file.h>
  18. #include <linux/mount.h>
  19. #include <linux/fs_context.h>
  20. #include <linux/namei.h>
  21. #include <linux/sysctl.h>
  22. #include <linux/poll.h>
  23. #include <linux/mqueue.h>
  24. #include <linux/msg.h>
  25. #include <linux/skbuff.h>
  26. #include <linux/vmalloc.h>
  27. #include <linux/netlink.h>
  28. #include <linux/syscalls.h>
  29. #include <linux/audit.h>
  30. #include <linux/signal.h>
  31. #include <linux/mutex.h>
  32. #include <linux/nsproxy.h>
  33. #include <linux/pid.h>
  34. #include <linux/ipc_namespace.h>
  35. #include <linux/user_namespace.h>
  36. #include <linux/slab.h>
  37. #include <linux/sched/wake_q.h>
  38. #include <linux/sched/signal.h>
  39. #include <linux/sched/user.h>
  40. #include <net/sock.h>
  41. #include "util.h"
  42. struct mqueue_fs_context {
  43. struct ipc_namespace *ipc_ns;
  44. bool newns; /* Set if newly created ipc namespace */
  45. };
  /* Value stored in sb->s_magic by mqueue_fill_super() */
  #define MQUEUE_MAGIC	0x19800202

  /* Size accounted in a directory's i_size per entry */
  #define DIRENT_SIZE	20
  /* i_size of a queue file and size of the buffer used by mqueue_read_file() */
  #define FILENT_SIZE	80

  /* Index into mqueue_inode_info::e_wait_q[] (the "sr" argument) */
  #define SEND		0
  #define RECV		1

  /* Values of ext_wait_queue::state */
  #define STATE_NONE	0
  #define STATE_READY	1
  53. struct posix_msg_tree_node {
  54. struct rb_node rb_node;
  55. struct list_head msg_list;
  56. int priority;
  57. };
  /*
   * Locking:
   *
   * Accesses to a message queue are synchronized by acquiring info->lock.
   *
   * There are two notable exceptions:
   * - The actual wakeup of a sleeping task is performed using the wake_q
   *   framework. info->lock is already released when wake_up_q is called.
   * - The exit codepaths after sleeping check ext_wait_queue->state without
   *   any locks. If it is STATE_READY, then the syscall is completed without
   *   acquiring info->lock.
   *
   * MQ_BARRIER:
   * To achieve proper release/acquire memory barrier pairing, the state is set to
   * STATE_READY with smp_store_release(), and it is read with READ_ONCE followed
   * by smp_acquire__after_ctrl_dep(). In addition, wake_q_add_safe() is used.
   *
   * This prevents the following races:
   *
   * 1) With the simple wake_q_add(), the task could be gone already before
   *    the increase of the reference happens
   *    Thread A
   *                                    Thread B
   *    WRITE_ONCE(wait.state, STATE_NONE);
   *    schedule_hrtimeout()
   *                                    wake_q_add(A)
   *                                    if (cmpxchg()) // success
   *                                       ->state = STATE_READY (reordered)
   *    <timeout returns>
   *    if (wait.state == STATE_READY) return;
   *    sysret to user space
   *    sys_exit()
   *                                    get_task_struct() // UaF
   *
   *    Solution: Use wake_q_add_safe() and perform the get_task_struct() before
   *    the smp_store_release() that does ->state = STATE_READY.
   *
   * 2) Without proper _release/_acquire barriers, the woken up task
   *    could read stale data
   *
   *    Thread A
   *                                    Thread B
   *    do_mq_timedreceive
   *    WRITE_ONCE(wait.state, STATE_NONE);
   *    schedule_hrtimeout()
   *                                    state = STATE_READY;
   *    <timeout returns>
   *    if (wait.state == STATE_READY) return;
   *    msg_ptr = wait.msg;             // Access to stale data!
   *                                    receiver->msg = message; (reordered)
   *
   *    Solution: use _release and _acquire barriers.
   *
   * 3) There is intentionally no barrier when setting current->state
   *    to TASK_INTERRUPTIBLE: spin_unlock(&info->lock) provides the
   *    release memory barrier, and the wakeup is triggered when holding
   *    info->lock, i.e. spin_lock(&info->lock) provided a pairing
   *    acquire memory barrier.
   */
  117. struct ext_wait_queue { /* queue of sleeping tasks */
  118. struct task_struct *task;
  119. struct list_head list;
  120. struct msg_msg *msg; /* ptr of loaded message */
  121. int state; /* one of STATE_* values */
  122. };
  123. struct mqueue_inode_info {
  124. spinlock_t lock;
  125. struct inode vfs_inode;
  126. wait_queue_head_t wait_q;
  127. struct rb_root msg_tree;
  128. struct rb_node *msg_tree_rightmost;
  129. struct posix_msg_tree_node *node_cache;
  130. struct mq_attr attr;
  131. struct sigevent notify;
  132. struct pid *notify_owner;
  133. u32 notify_self_exec_id;
  134. struct user_namespace *notify_user_ns;
  135. struct ucounts *ucounts; /* user who created, for accounting */
  136. struct sock *notify_sock;
  137. struct sk_buff *notify_cookie;
  138. /* for tasks waiting for free space and messages, respectively */
  139. struct ext_wait_queue e_wait_q[2];
  140. unsigned long qsize; /* size of queue in memory (sum of all msgs) */
  141. };
  142. static struct file_system_type mqueue_fs_type;
  143. static const struct inode_operations mqueue_dir_inode_operations;
  144. static const struct file_operations mqueue_file_operations;
  145. static const struct super_operations mqueue_super_ops;
  146. static const struct fs_context_operations mqueue_fs_context_ops;
  147. static void remove_notification(struct mqueue_inode_info *info);
  148. static struct kmem_cache *mqueue_inode_cachep;
  149. static inline struct mqueue_inode_info *MQUEUE_I(struct inode *inode)
  150. {
  151. return container_of(inode, struct mqueue_inode_info, vfs_inode);
  152. }
  153. /*
  154. * This routine should be called with the mq_lock held.
  155. */
  156. static inline struct ipc_namespace *__get_ns_from_inode(struct inode *inode)
  157. {
  158. return get_ipc_ns(inode->i_sb->s_fs_info);
  159. }
  160. static struct ipc_namespace *get_ns_from_inode(struct inode *inode)
  161. {
  162. struct ipc_namespace *ns;
  163. spin_lock(&mq_lock);
  164. ns = __get_ns_from_inode(inode);
  165. spin_unlock(&mq_lock);
  166. return ns;
  167. }
  168. /* Auxiliary functions to manipulate messages' list */
  169. static int msg_insert(struct msg_msg *msg, struct mqueue_inode_info *info)
  170. {
  171. struct rb_node **p, *parent = NULL;
  172. struct posix_msg_tree_node *leaf;
  173. bool rightmost = true;
  174. p = &info->msg_tree.rb_node;
  175. while (*p) {
  176. parent = *p;
  177. leaf = rb_entry(parent, struct posix_msg_tree_node, rb_node);
  178. if (likely(leaf->priority == msg->m_type))
  179. goto insert_msg;
  180. else if (msg->m_type < leaf->priority) {
  181. p = &(*p)->rb_left;
  182. rightmost = false;
  183. } else
  184. p = &(*p)->rb_right;
  185. }
  186. if (info->node_cache) {
  187. leaf = info->node_cache;
  188. info->node_cache = NULL;
  189. } else {
  190. leaf = kmalloc_obj(*leaf, GFP_ATOMIC);
  191. if (!leaf)
  192. return -ENOMEM;
  193. INIT_LIST_HEAD(&leaf->msg_list);
  194. }
  195. leaf->priority = msg->m_type;
  196. if (rightmost)
  197. info->msg_tree_rightmost = &leaf->rb_node;
  198. rb_link_node(&leaf->rb_node, parent, p);
  199. rb_insert_color(&leaf->rb_node, &info->msg_tree);
  200. insert_msg:
  201. info->attr.mq_curmsgs++;
  202. info->qsize += msg->m_ts;
  203. list_add_tail(&msg->m_list, &leaf->msg_list);
  204. return 0;
  205. }
  206. static inline void msg_tree_erase(struct posix_msg_tree_node *leaf,
  207. struct mqueue_inode_info *info)
  208. {
  209. struct rb_node *node = &leaf->rb_node;
  210. if (info->msg_tree_rightmost == node)
  211. info->msg_tree_rightmost = rb_prev(node);
  212. rb_erase(node, &info->msg_tree);
  213. if (info->node_cache)
  214. kfree(leaf);
  215. else
  216. info->node_cache = leaf;
  217. }
  218. static inline struct msg_msg *msg_get(struct mqueue_inode_info *info)
  219. {
  220. struct rb_node *parent = NULL;
  221. struct posix_msg_tree_node *leaf;
  222. struct msg_msg *msg;
  223. try_again:
  224. /*
  225. * During insert, low priorities go to the left and high to the
  226. * right. On receive, we want the highest priorities first, so
  227. * walk all the way to the right.
  228. */
  229. parent = info->msg_tree_rightmost;
  230. if (!parent) {
  231. if (info->attr.mq_curmsgs) {
  232. pr_warn_once("Inconsistency in POSIX message queue, "
  233. "no tree element, but supposedly messages "
  234. "should exist!\n");
  235. info->attr.mq_curmsgs = 0;
  236. }
  237. return NULL;
  238. }
  239. leaf = rb_entry(parent, struct posix_msg_tree_node, rb_node);
  240. if (unlikely(list_empty(&leaf->msg_list))) {
  241. pr_warn_once("Inconsistency in POSIX message queue, "
  242. "empty leaf node but we haven't implemented "
  243. "lazy leaf delete!\n");
  244. msg_tree_erase(leaf, info);
  245. goto try_again;
  246. } else {
  247. msg = list_first_entry(&leaf->msg_list,
  248. struct msg_msg, m_list);
  249. list_del(&msg->m_list);
  250. if (list_empty(&leaf->msg_list)) {
  251. msg_tree_erase(leaf, info);
  252. }
  253. }
  254. info->attr.mq_curmsgs--;
  255. info->qsize -= msg->m_ts;
  256. return msg;
  257. }
  258. static struct inode *mqueue_get_inode(struct super_block *sb,
  259. struct ipc_namespace *ipc_ns, umode_t mode,
  260. struct mq_attr *attr)
  261. {
  262. struct inode *inode;
  263. int ret = -ENOMEM;
  264. inode = new_inode(sb);
  265. if (!inode)
  266. goto err;
  267. inode->i_ino = get_next_ino();
  268. inode->i_mode = mode;
  269. inode->i_uid = current_fsuid();
  270. inode->i_gid = current_fsgid();
  271. simple_inode_init_ts(inode);
  272. if (S_ISREG(mode)) {
  273. struct mqueue_inode_info *info;
  274. unsigned long mq_bytes, mq_treesize;
  275. inode->i_fop = &mqueue_file_operations;
  276. inode->i_size = FILENT_SIZE;
  277. /* mqueue specific info */
  278. info = MQUEUE_I(inode);
  279. spin_lock_init(&info->lock);
  280. init_waitqueue_head(&info->wait_q);
  281. INIT_LIST_HEAD(&info->e_wait_q[0].list);
  282. INIT_LIST_HEAD(&info->e_wait_q[1].list);
  283. info->notify_owner = NULL;
  284. info->notify_user_ns = NULL;
  285. info->qsize = 0;
  286. info->ucounts = NULL; /* set when all is ok */
  287. info->msg_tree = RB_ROOT;
  288. info->msg_tree_rightmost = NULL;
  289. info->node_cache = NULL;
  290. memset(&info->attr, 0, sizeof(info->attr));
  291. info->attr.mq_maxmsg = min(ipc_ns->mq_msg_max,
  292. ipc_ns->mq_msg_default);
  293. info->attr.mq_msgsize = min(ipc_ns->mq_msgsize_max,
  294. ipc_ns->mq_msgsize_default);
  295. if (attr) {
  296. info->attr.mq_maxmsg = attr->mq_maxmsg;
  297. info->attr.mq_msgsize = attr->mq_msgsize;
  298. }
  299. /*
  300. * We used to allocate a static array of pointers and account
  301. * the size of that array as well as one msg_msg struct per
  302. * possible message into the queue size. That's no longer
  303. * accurate as the queue is now an rbtree and will grow and
  304. * shrink depending on usage patterns. We can, however, still
  305. * account one msg_msg struct per message, but the nodes are
  306. * allocated depending on priority usage, and most programs
  307. * only use one, or a handful, of priorities. However, since
  308. * this is pinned memory, we need to assume worst case, so
  309. * that means the min(mq_maxmsg, max_priorities) * struct
  310. * posix_msg_tree_node.
  311. */
  312. ret = -EINVAL;
  313. if (info->attr.mq_maxmsg <= 0 || info->attr.mq_msgsize <= 0)
  314. goto out_inode;
  315. if (capable(CAP_SYS_RESOURCE)) {
  316. if (info->attr.mq_maxmsg > HARD_MSGMAX ||
  317. info->attr.mq_msgsize > HARD_MSGSIZEMAX)
  318. goto out_inode;
  319. } else {
  320. if (info->attr.mq_maxmsg > ipc_ns->mq_msg_max ||
  321. info->attr.mq_msgsize > ipc_ns->mq_msgsize_max)
  322. goto out_inode;
  323. }
  324. ret = -EOVERFLOW;
  325. /* check for overflow */
  326. if (info->attr.mq_msgsize > ULONG_MAX/info->attr.mq_maxmsg)
  327. goto out_inode;
  328. mq_treesize = info->attr.mq_maxmsg * sizeof(struct msg_msg) +
  329. min_t(unsigned int, info->attr.mq_maxmsg, MQ_PRIO_MAX) *
  330. sizeof(struct posix_msg_tree_node);
  331. mq_bytes = info->attr.mq_maxmsg * info->attr.mq_msgsize;
  332. if (mq_bytes + mq_treesize < mq_bytes)
  333. goto out_inode;
  334. mq_bytes += mq_treesize;
  335. info->ucounts = get_ucounts(current_ucounts());
  336. if (info->ucounts) {
  337. long msgqueue;
  338. spin_lock(&mq_lock);
  339. msgqueue = inc_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes);
  340. if (msgqueue == LONG_MAX || msgqueue > rlimit(RLIMIT_MSGQUEUE)) {
  341. dec_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes);
  342. spin_unlock(&mq_lock);
  343. put_ucounts(info->ucounts);
  344. info->ucounts = NULL;
  345. /* mqueue_evict_inode() releases info->messages */
  346. ret = -EMFILE;
  347. goto out_inode;
  348. }
  349. spin_unlock(&mq_lock);
  350. }
  351. } else if (S_ISDIR(mode)) {
  352. inc_nlink(inode);
  353. /* Some things misbehave if size == 0 on a directory */
  354. inode->i_size = 2 * DIRENT_SIZE;
  355. inode->i_op = &mqueue_dir_inode_operations;
  356. inode->i_fop = &simple_dir_operations;
  357. }
  358. return inode;
  359. out_inode:
  360. iput(inode);
  361. err:
  362. return ERR_PTR(ret);
  363. }
  364. static int mqueue_fill_super(struct super_block *sb, struct fs_context *fc)
  365. {
  366. struct inode *inode;
  367. struct ipc_namespace *ns = sb->s_fs_info;
  368. sb->s_iflags |= SB_I_NOEXEC | SB_I_NODEV;
  369. sb->s_blocksize = PAGE_SIZE;
  370. sb->s_blocksize_bits = PAGE_SHIFT;
  371. sb->s_magic = MQUEUE_MAGIC;
  372. sb->s_op = &mqueue_super_ops;
  373. sb->s_d_flags = DCACHE_DONTCACHE;
  374. inode = mqueue_get_inode(sb, ns, S_IFDIR | S_ISVTX | S_IRWXUGO, NULL);
  375. if (IS_ERR(inode))
  376. return PTR_ERR(inode);
  377. sb->s_root = d_make_root(inode);
  378. if (!sb->s_root)
  379. return -ENOMEM;
  380. return 0;
  381. }
  382. static int mqueue_get_tree(struct fs_context *fc)
  383. {
  384. struct mqueue_fs_context *ctx = fc->fs_private;
  385. /*
  386. * With a newly created ipc namespace, we don't need to do a search
  387. * for an ipc namespace match, but we still need to set s_fs_info.
  388. */
  389. if (ctx->newns) {
  390. fc->s_fs_info = ctx->ipc_ns;
  391. return get_tree_nodev(fc, mqueue_fill_super);
  392. }
  393. return get_tree_keyed(fc, mqueue_fill_super, ctx->ipc_ns);
  394. }
  395. static void mqueue_fs_context_free(struct fs_context *fc)
  396. {
  397. struct mqueue_fs_context *ctx = fc->fs_private;
  398. put_ipc_ns(ctx->ipc_ns);
  399. kfree(ctx);
  400. }
  401. static int mqueue_init_fs_context(struct fs_context *fc)
  402. {
  403. struct mqueue_fs_context *ctx;
  404. ctx = kzalloc_obj(struct mqueue_fs_context);
  405. if (!ctx)
  406. return -ENOMEM;
  407. ctx->ipc_ns = get_ipc_ns(current->nsproxy->ipc_ns);
  408. put_user_ns(fc->user_ns);
  409. fc->user_ns = get_user_ns(ctx->ipc_ns->user_ns);
  410. fc->fs_private = ctx;
  411. fc->ops = &mqueue_fs_context_ops;
  412. return 0;
  413. }
  414. /*
  415. * mq_init_ns() is currently the only caller of mq_create_mount().
  416. * So the ns parameter is always a newly created ipc namespace.
  417. */
  418. static struct vfsmount *mq_create_mount(struct ipc_namespace *ns)
  419. {
  420. struct mqueue_fs_context *ctx;
  421. struct fs_context *fc;
  422. struct vfsmount *mnt;
  423. fc = fs_context_for_mount(&mqueue_fs_type, SB_KERNMOUNT);
  424. if (IS_ERR(fc))
  425. return ERR_CAST(fc);
  426. ctx = fc->fs_private;
  427. ctx->newns = true;
  428. put_ipc_ns(ctx->ipc_ns);
  429. ctx->ipc_ns = get_ipc_ns(ns);
  430. put_user_ns(fc->user_ns);
  431. fc->user_ns = get_user_ns(ctx->ipc_ns->user_ns);
  432. mnt = fc_mount_longterm(fc);
  433. put_fs_context(fc);
  434. return mnt;
  435. }
  436. static void init_once(void *foo)
  437. {
  438. struct mqueue_inode_info *p = foo;
  439. inode_init_once(&p->vfs_inode);
  440. }
  441. static struct inode *mqueue_alloc_inode(struct super_block *sb)
  442. {
  443. struct mqueue_inode_info *ei;
  444. ei = alloc_inode_sb(sb, mqueue_inode_cachep, GFP_KERNEL);
  445. if (!ei)
  446. return NULL;
  447. return &ei->vfs_inode;
  448. }
  449. static void mqueue_free_inode(struct inode *inode)
  450. {
  451. kmem_cache_free(mqueue_inode_cachep, MQUEUE_I(inode));
  452. }
  453. static void mqueue_evict_inode(struct inode *inode)
  454. {
  455. struct mqueue_inode_info *info;
  456. struct ipc_namespace *ipc_ns;
  457. struct msg_msg *msg, *nmsg;
  458. LIST_HEAD(tmp_msg);
  459. clear_inode(inode);
  460. if (S_ISDIR(inode->i_mode))
  461. return;
  462. ipc_ns = get_ns_from_inode(inode);
  463. info = MQUEUE_I(inode);
  464. spin_lock(&info->lock);
  465. while ((msg = msg_get(info)) != NULL)
  466. list_add_tail(&msg->m_list, &tmp_msg);
  467. kfree(info->node_cache);
  468. spin_unlock(&info->lock);
  469. list_for_each_entry_safe(msg, nmsg, &tmp_msg, m_list) {
  470. list_del(&msg->m_list);
  471. free_msg(msg);
  472. }
  473. if (info->ucounts) {
  474. unsigned long mq_bytes, mq_treesize;
  475. /* Total amount of bytes accounted for the mqueue */
  476. mq_treesize = info->attr.mq_maxmsg * sizeof(struct msg_msg) +
  477. min_t(unsigned int, info->attr.mq_maxmsg, MQ_PRIO_MAX) *
  478. sizeof(struct posix_msg_tree_node);
  479. mq_bytes = mq_treesize + (info->attr.mq_maxmsg *
  480. info->attr.mq_msgsize);
  481. spin_lock(&mq_lock);
  482. dec_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes);
  483. /*
  484. * get_ns_from_inode() ensures that the
  485. * (ipc_ns = sb->s_fs_info) is either a valid ipc_ns
  486. * to which we now hold a reference, or it is NULL.
  487. * We can't put it here under mq_lock, though.
  488. */
  489. if (ipc_ns)
  490. ipc_ns->mq_queues_count--;
  491. spin_unlock(&mq_lock);
  492. put_ucounts(info->ucounts);
  493. info->ucounts = NULL;
  494. }
  495. if (ipc_ns)
  496. put_ipc_ns(ipc_ns);
  497. }
  498. static int mqueue_create_attr(struct dentry *dentry, umode_t mode, void *arg)
  499. {
  500. struct inode *dir = dentry->d_parent->d_inode;
  501. struct inode *inode;
  502. struct mq_attr *attr = arg;
  503. int error;
  504. struct ipc_namespace *ipc_ns;
  505. spin_lock(&mq_lock);
  506. ipc_ns = __get_ns_from_inode(dir);
  507. if (!ipc_ns) {
  508. error = -EACCES;
  509. goto out_unlock;
  510. }
  511. if (ipc_ns->mq_queues_count >= ipc_ns->mq_queues_max &&
  512. !capable(CAP_SYS_RESOURCE)) {
  513. error = -ENOSPC;
  514. goto out_unlock;
  515. }
  516. ipc_ns->mq_queues_count++;
  517. spin_unlock(&mq_lock);
  518. inode = mqueue_get_inode(dir->i_sb, ipc_ns, mode, attr);
  519. if (IS_ERR(inode)) {
  520. error = PTR_ERR(inode);
  521. spin_lock(&mq_lock);
  522. ipc_ns->mq_queues_count--;
  523. goto out_unlock;
  524. }
  525. put_ipc_ns(ipc_ns);
  526. dir->i_size += DIRENT_SIZE;
  527. simple_inode_init_ts(dir);
  528. d_make_persistent(dentry, inode);
  529. return 0;
  530. out_unlock:
  531. spin_unlock(&mq_lock);
  532. if (ipc_ns)
  533. put_ipc_ns(ipc_ns);
  534. return error;
  535. }
  536. static int mqueue_create(struct mnt_idmap *idmap, struct inode *dir,
  537. struct dentry *dentry, umode_t mode, bool excl)
  538. {
  539. return mqueue_create_attr(dentry, mode, NULL);
  540. }
  541. static int mqueue_unlink(struct inode *dir, struct dentry *dentry)
  542. {
  543. dir->i_size -= DIRENT_SIZE;
  544. return simple_unlink(dir, dentry);
  545. }
  546. /*
  547. * This is routine for system read from queue file.
  548. * To avoid mess with doing here some sort of mq_receive we allow
  549. * to read only queue size & notification info (the only values
  550. * that are interesting from user point of view and aren't accessible
  551. * through std routines)
  552. */
  553. static ssize_t mqueue_read_file(struct file *filp, char __user *u_data,
  554. size_t count, loff_t *off)
  555. {
  556. struct inode *inode = file_inode(filp);
  557. struct mqueue_inode_info *info = MQUEUE_I(inode);
  558. char buffer[FILENT_SIZE];
  559. ssize_t ret;
  560. spin_lock(&info->lock);
  561. snprintf(buffer, sizeof(buffer),
  562. "QSIZE:%-10lu NOTIFY:%-5d SIGNO:%-5d NOTIFY_PID:%-6d\n",
  563. info->qsize,
  564. info->notify_owner ? info->notify.sigev_notify : 0,
  565. (info->notify_owner &&
  566. info->notify.sigev_notify == SIGEV_SIGNAL) ?
  567. info->notify.sigev_signo : 0,
  568. pid_vnr(info->notify_owner));
  569. spin_unlock(&info->lock);
  570. buffer[sizeof(buffer)-1] = '\0';
  571. ret = simple_read_from_buffer(u_data, count, off, buffer,
  572. strlen(buffer));
  573. if (ret <= 0)
  574. return ret;
  575. inode_set_atime_to_ts(inode, inode_set_ctime_current(inode));
  576. return ret;
  577. }
  578. static int mqueue_flush_file(struct file *filp, fl_owner_t id)
  579. {
  580. struct mqueue_inode_info *info = MQUEUE_I(file_inode(filp));
  581. spin_lock(&info->lock);
  582. if (task_tgid(current) == info->notify_owner)
  583. remove_notification(info);
  584. spin_unlock(&info->lock);
  585. return 0;
  586. }
  587. static __poll_t mqueue_poll_file(struct file *filp, struct poll_table_struct *poll_tab)
  588. {
  589. struct mqueue_inode_info *info = MQUEUE_I(file_inode(filp));
  590. __poll_t retval = 0;
  591. poll_wait(filp, &info->wait_q, poll_tab);
  592. spin_lock(&info->lock);
  593. if (info->attr.mq_curmsgs)
  594. retval = EPOLLIN | EPOLLRDNORM;
  595. if (info->attr.mq_curmsgs < info->attr.mq_maxmsg)
  596. retval |= EPOLLOUT | EPOLLWRNORM;
  597. spin_unlock(&info->lock);
  598. return retval;
  599. }
  600. /* Adds current to info->e_wait_q[sr] before element with smaller prio */
  601. static void wq_add(struct mqueue_inode_info *info, int sr,
  602. struct ext_wait_queue *ewp)
  603. {
  604. struct ext_wait_queue *walk;
  605. list_for_each_entry(walk, &info->e_wait_q[sr].list, list) {
  606. if (walk->task->prio <= current->prio) {
  607. list_add_tail(&ewp->list, &walk->list);
  608. return;
  609. }
  610. }
  611. list_add_tail(&ewp->list, &info->e_wait_q[sr].list);
  612. }
  613. /*
  614. * Puts current task to sleep. Caller must hold queue lock. After return
  615. * lock isn't held.
  616. * sr: SEND or RECV
  617. */
  618. static int wq_sleep(struct mqueue_inode_info *info, int sr,
  619. ktime_t *timeout, struct ext_wait_queue *ewp)
  620. __releases(&info->lock)
  621. {
  622. int retval;
  623. signed long time;
  624. wq_add(info, sr, ewp);
  625. for (;;) {
  626. /* memory barrier not required, we hold info->lock */
  627. __set_current_state(TASK_INTERRUPTIBLE);
  628. spin_unlock(&info->lock);
  629. time = schedule_hrtimeout_range_clock(timeout, 0,
  630. HRTIMER_MODE_ABS, CLOCK_REALTIME);
  631. if (READ_ONCE(ewp->state) == STATE_READY) {
  632. /* see MQ_BARRIER for purpose/pairing */
  633. smp_acquire__after_ctrl_dep();
  634. retval = 0;
  635. goto out;
  636. }
  637. spin_lock(&info->lock);
  638. /* we hold info->lock, so no memory barrier required */
  639. if (READ_ONCE(ewp->state) == STATE_READY) {
  640. retval = 0;
  641. goto out_unlock;
  642. }
  643. if (signal_pending(current)) {
  644. retval = -ERESTARTSYS;
  645. break;
  646. }
  647. if (time == 0) {
  648. retval = -ETIMEDOUT;
  649. break;
  650. }
  651. }
  652. list_del(&ewp->list);
  653. out_unlock:
  654. spin_unlock(&info->lock);
  655. out:
  656. return retval;
  657. }
  658. /*
  659. * Returns waiting task that should be serviced first or NULL if none exists
  660. */
  661. static struct ext_wait_queue *wq_get_first_waiter(
  662. struct mqueue_inode_info *info, int sr)
  663. {
  664. struct list_head *ptr;
  665. ptr = info->e_wait_q[sr].list.prev;
  666. if (ptr == &info->e_wait_q[sr].list)
  667. return NULL;
  668. return list_entry(ptr, struct ext_wait_queue, list);
  669. }
  670. static inline void set_cookie(struct sk_buff *skb, char code)
  671. {
  672. ((char *)skb->data)[NOTIFY_COOKIE_LEN-1] = code;
  673. }
  674. /*
  675. * The next function is only to split too long sys_mq_timedsend
  676. */
  677. static void __do_notify(struct mqueue_inode_info *info)
  678. {
  679. /* notification
  680. * invoked when there is registered process and there isn't process
  681. * waiting synchronously for message AND state of queue changed from
  682. * empty to not empty. Here we are sure that no one is waiting
  683. * synchronously. */
  684. if (info->notify_owner &&
  685. info->attr.mq_curmsgs == 1) {
  686. switch (info->notify.sigev_notify) {
  687. case SIGEV_NONE:
  688. break;
  689. case SIGEV_SIGNAL: {
  690. struct kernel_siginfo sig_i;
  691. struct task_struct *task;
  692. /* do_mq_notify() accepts sigev_signo == 0, why?? */
  693. if (!info->notify.sigev_signo)
  694. break;
  695. clear_siginfo(&sig_i);
  696. sig_i.si_signo = info->notify.sigev_signo;
  697. sig_i.si_errno = 0;
  698. sig_i.si_code = SI_MESGQ;
  699. sig_i.si_value = info->notify.sigev_value;
  700. rcu_read_lock();
  701. /* map current pid/uid into info->owner's namespaces */
  702. sig_i.si_pid = task_tgid_nr_ns(current,
  703. ns_of_pid(info->notify_owner));
  704. sig_i.si_uid = from_kuid_munged(info->notify_user_ns,
  705. current_uid());
  706. /*
  707. * We can't use kill_pid_info(), this signal should
  708. * bypass check_kill_permission(). It is from kernel
  709. * but si_fromuser() can't know this.
  710. * We do check the self_exec_id, to avoid sending
  711. * signals to programs that don't expect them.
  712. */
  713. task = pid_task(info->notify_owner, PIDTYPE_TGID);
  714. if (task && task->self_exec_id ==
  715. info->notify_self_exec_id) {
  716. do_send_sig_info(info->notify.sigev_signo,
  717. &sig_i, task, PIDTYPE_TGID);
  718. }
  719. rcu_read_unlock();
  720. break;
  721. }
  722. case SIGEV_THREAD:
  723. set_cookie(info->notify_cookie, NOTIFY_WOKENUP);
  724. netlink_sendskb(info->notify_sock, info->notify_cookie);
  725. break;
  726. }
  727. /* after notification unregisters process */
  728. put_pid(info->notify_owner);
  729. put_user_ns(info->notify_user_ns);
  730. info->notify_owner = NULL;
  731. info->notify_user_ns = NULL;
  732. }
  733. wake_up(&info->wait_q);
  734. }
  735. static int prepare_timeout(const struct __kernel_timespec __user *u_abs_timeout,
  736. struct timespec64 *ts)
  737. {
  738. if (get_timespec64(ts, u_abs_timeout))
  739. return -EFAULT;
  740. if (!timespec64_valid(ts))
  741. return -EINVAL;
  742. return 0;
  743. }
  744. static void remove_notification(struct mqueue_inode_info *info)
  745. {
  746. if (info->notify_owner != NULL &&
  747. info->notify.sigev_notify == SIGEV_THREAD) {
  748. set_cookie(info->notify_cookie, NOTIFY_REMOVED);
  749. netlink_sendskb(info->notify_sock, info->notify_cookie);
  750. }
  751. put_pid(info->notify_owner);
  752. put_user_ns(info->notify_user_ns);
  753. info->notify_owner = NULL;
  754. info->notify_user_ns = NULL;
  755. }
  756. static int prepare_open(struct dentry *dentry, int oflag, int ro,
  757. umode_t mode, struct filename *name,
  758. struct mq_attr *attr)
  759. {
  760. static const int oflag2acc[O_ACCMODE] = { MAY_READ, MAY_WRITE,
  761. MAY_READ | MAY_WRITE };
  762. int acc;
  763. if (d_really_is_negative(dentry)) {
  764. if (!(oflag & O_CREAT))
  765. return -ENOENT;
  766. if (ro)
  767. return ro;
  768. audit_inode_parent_hidden(name, dentry->d_parent);
  769. return vfs_mkobj(dentry, mode & ~current_umask(),
  770. mqueue_create_attr, attr);
  771. }
  772. /* it already existed */
  773. audit_inode(name, dentry, 0);
  774. if ((oflag & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL))
  775. return -EEXIST;
  776. if ((oflag & O_ACCMODE) == (O_RDWR | O_WRONLY))
  777. return -EINVAL;
  778. acc = oflag2acc[oflag & O_ACCMODE];
  779. return inode_permission(&nop_mnt_idmap, d_inode(dentry), acc);
  780. }
  781. static struct file *mqueue_file_open(struct filename *name,
  782. struct vfsmount *mnt, int oflag, int ro,
  783. umode_t mode, struct mq_attr *attr)
  784. {
  785. struct dentry *dentry;
  786. struct file *file;
  787. int ret;
  788. dentry = start_creating_noperm(mnt->mnt_root, &QSTR(name->name));
  789. if (IS_ERR(dentry))
  790. return ERR_CAST(dentry);
  791. ret = prepare_open(dentry, oflag, ro, mode, name, attr);
  792. file = ERR_PTR(ret);
  793. if (!ret) {
  794. const struct path path = { .mnt = mnt, .dentry = dentry };
  795. file = dentry_open(&path, oflag, current_cred());
  796. }
  797. end_creating(dentry);
  798. return file;
  799. }
  800. static int do_mq_open(const char __user *u_name, int oflag, umode_t mode,
  801. struct mq_attr *attr)
  802. {
  803. struct vfsmount *mnt = current->nsproxy->ipc_ns->mq_mnt;
  804. int fd, ro;
  805. audit_mq_open(oflag, mode, attr);
  806. CLASS(filename, name)(u_name);
  807. if (IS_ERR(name))
  808. return PTR_ERR(name);
  809. ro = mnt_want_write(mnt); /* we'll drop it in any case */
  810. fd = FD_ADD(O_CLOEXEC, mqueue_file_open(name, mnt, oflag, ro, mode, attr));
  811. if (!ro)
  812. mnt_drop_write(mnt);
  813. return fd;
  814. }
  815. SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, umode_t, mode,
  816. struct mq_attr __user *, u_attr)
  817. {
  818. struct mq_attr attr;
  819. if (u_attr && copy_from_user(&attr, u_attr, sizeof(struct mq_attr)))
  820. return -EFAULT;
  821. return do_mq_open(u_name, oflag, mode, u_attr ? &attr : NULL);
  822. }
  823. SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name)
  824. {
  825. int err;
  826. struct dentry *dentry;
  827. struct inode *inode;
  828. struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
  829. struct vfsmount *mnt = ipc_ns->mq_mnt;
  830. CLASS(filename, name)(u_name);
  831. if (IS_ERR(name))
  832. return PTR_ERR(name);
  833. audit_inode_parent_hidden(name, mnt->mnt_root);
  834. err = mnt_want_write(mnt);
  835. if (err)
  836. return err;
  837. dentry = start_removing_noperm(mnt->mnt_root, &QSTR(name->name));
  838. if (IS_ERR(dentry)) {
  839. err = PTR_ERR(dentry);
  840. goto out_drop_write;
  841. }
  842. inode = d_inode(dentry);
  843. ihold(inode);
  844. err = vfs_unlink(&nop_mnt_idmap, d_inode(mnt->mnt_root),
  845. dentry, NULL);
  846. end_removing(dentry);
  847. iput(inode);
  848. out_drop_write:
  849. mnt_drop_write(mnt);
  850. return err;
  851. }
  852. /* Pipelined send and receive functions.
  853. *
  854. * If a receiver finds no waiting message, then it registers itself in the
  855. * list of waiting receivers. A sender checks that list before adding the new
  856. * message into the message array. If there is a waiting receiver, then it
  857. * bypasses the message array and directly hands the message over to the
  858. * receiver. The receiver accepts the message and returns without grabbing the
  859. * queue spinlock:
  860. *
  861. * - Set pointer to message.
  862. * - Queue the receiver task for later wakeup (without the info->lock).
  863. * - Update its state to STATE_READY. Now the receiver can continue.
  864. * - Wake up the process after the lock is dropped. Should the process wake up
  865. * before this wakeup (due to a timeout or a signal) it will either see
  866. * STATE_READY and continue or acquire the lock to check the state again.
  867. *
  868. * The same algorithm is used for senders.
  869. */
  870. static inline void __pipelined_op(struct wake_q_head *wake_q,
  871. struct mqueue_inode_info *info,
  872. struct ext_wait_queue *this)
  873. {
  874. struct task_struct *task;
  875. list_del(&this->list);
  876. task = get_task_struct(this->task);
  877. /* see MQ_BARRIER for purpose/pairing */
  878. smp_store_release(&this->state, STATE_READY);
  879. wake_q_add_safe(wake_q, task);
  880. }
  881. /* pipelined_send() - send a message directly to the task waiting in
  882. * sys_mq_timedreceive() (without inserting message into a queue).
  883. */
  884. static inline void pipelined_send(struct wake_q_head *wake_q,
  885. struct mqueue_inode_info *info,
  886. struct msg_msg *message,
  887. struct ext_wait_queue *receiver)
  888. {
  889. receiver->msg = message;
  890. __pipelined_op(wake_q, info, receiver);
  891. }
  892. /* pipelined_receive() - if there is task waiting in sys_mq_timedsend()
  893. * gets its message and put to the queue (we have one free place for sure). */
  894. static inline void pipelined_receive(struct wake_q_head *wake_q,
  895. struct mqueue_inode_info *info)
  896. {
  897. struct ext_wait_queue *sender = wq_get_first_waiter(info, SEND);
  898. if (!sender) {
  899. /* for poll */
  900. wake_up_interruptible(&info->wait_q);
  901. return;
  902. }
  903. if (msg_insert(sender->msg, info))
  904. return;
  905. __pipelined_op(wake_q, info, sender);
  906. }
  907. static int do_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr,
  908. size_t msg_len, unsigned int msg_prio,
  909. struct timespec64 *ts)
  910. {
  911. struct inode *inode;
  912. struct ext_wait_queue wait;
  913. struct ext_wait_queue *receiver;
  914. struct msg_msg *msg_ptr;
  915. struct mqueue_inode_info *info;
  916. ktime_t expires, *timeout = NULL;
  917. struct posix_msg_tree_node *new_leaf = NULL;
  918. int ret = 0;
  919. DEFINE_WAKE_Q(wake_q);
  920. if (unlikely(msg_prio >= (unsigned long) MQ_PRIO_MAX))
  921. return -EINVAL;
  922. if (ts) {
  923. expires = timespec64_to_ktime(*ts);
  924. timeout = &expires;
  925. }
  926. audit_mq_sendrecv(mqdes, msg_len, msg_prio, ts);
  927. CLASS(fd, f)(mqdes);
  928. if (fd_empty(f))
  929. return -EBADF;
  930. inode = file_inode(fd_file(f));
  931. if (unlikely(fd_file(f)->f_op != &mqueue_file_operations))
  932. return -EBADF;
  933. info = MQUEUE_I(inode);
  934. audit_file(fd_file(f));
  935. if (unlikely(!(fd_file(f)->f_mode & FMODE_WRITE)))
  936. return -EBADF;
  937. if (unlikely(msg_len > info->attr.mq_msgsize))
  938. return -EMSGSIZE;
  939. /* First try to allocate memory, before doing anything with
  940. * existing queues. */
  941. msg_ptr = load_msg(u_msg_ptr, msg_len);
  942. if (IS_ERR(msg_ptr))
  943. return PTR_ERR(msg_ptr);
  944. msg_ptr->m_ts = msg_len;
  945. msg_ptr->m_type = msg_prio;
  946. /*
  947. * msg_insert really wants us to have a valid, spare node struct so
  948. * it doesn't have to kmalloc a GFP_ATOMIC allocation, but it will
  949. * fall back to that if necessary.
  950. */
  951. if (!info->node_cache)
  952. new_leaf = kmalloc_obj(*new_leaf);
  953. spin_lock(&info->lock);
  954. if (!info->node_cache && new_leaf) {
  955. /* Save our speculative allocation into the cache */
  956. INIT_LIST_HEAD(&new_leaf->msg_list);
  957. info->node_cache = new_leaf;
  958. new_leaf = NULL;
  959. } else {
  960. kfree(new_leaf);
  961. }
  962. if (info->attr.mq_curmsgs == info->attr.mq_maxmsg) {
  963. if (fd_file(f)->f_flags & O_NONBLOCK) {
  964. ret = -EAGAIN;
  965. } else {
  966. wait.task = current;
  967. wait.msg = (void *) msg_ptr;
  968. /* memory barrier not required, we hold info->lock */
  969. WRITE_ONCE(wait.state, STATE_NONE);
  970. ret = wq_sleep(info, SEND, timeout, &wait);
  971. /*
  972. * wq_sleep must be called with info->lock held, and
  973. * returns with the lock released
  974. */
  975. goto out_free;
  976. }
  977. } else {
  978. receiver = wq_get_first_waiter(info, RECV);
  979. if (receiver) {
  980. pipelined_send(&wake_q, info, msg_ptr, receiver);
  981. } else {
  982. /* adds message to the queue */
  983. ret = msg_insert(msg_ptr, info);
  984. if (ret)
  985. goto out_unlock;
  986. __do_notify(info);
  987. }
  988. simple_inode_init_ts(inode);
  989. }
  990. out_unlock:
  991. spin_unlock(&info->lock);
  992. wake_up_q(&wake_q);
  993. out_free:
  994. if (ret)
  995. free_msg(msg_ptr);
  996. return ret;
  997. }
  998. static int do_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr,
  999. size_t msg_len, unsigned int __user *u_msg_prio,
  1000. struct timespec64 *ts)
  1001. {
  1002. ssize_t ret;
  1003. struct msg_msg *msg_ptr;
  1004. struct inode *inode;
  1005. struct mqueue_inode_info *info;
  1006. struct ext_wait_queue wait;
  1007. ktime_t expires, *timeout = NULL;
  1008. struct posix_msg_tree_node *new_leaf = NULL;
  1009. if (ts) {
  1010. expires = timespec64_to_ktime(*ts);
  1011. timeout = &expires;
  1012. }
  1013. audit_mq_sendrecv(mqdes, msg_len, 0, ts);
  1014. CLASS(fd, f)(mqdes);
  1015. if (fd_empty(f))
  1016. return -EBADF;
  1017. inode = file_inode(fd_file(f));
  1018. if (unlikely(fd_file(f)->f_op != &mqueue_file_operations))
  1019. return -EBADF;
  1020. info = MQUEUE_I(inode);
  1021. audit_file(fd_file(f));
  1022. if (unlikely(!(fd_file(f)->f_mode & FMODE_READ)))
  1023. return -EBADF;
  1024. /* checks if buffer is big enough */
  1025. if (unlikely(msg_len < info->attr.mq_msgsize))
  1026. return -EMSGSIZE;
  1027. /*
  1028. * msg_insert really wants us to have a valid, spare node struct so
  1029. * it doesn't have to kmalloc a GFP_ATOMIC allocation, but it will
  1030. * fall back to that if necessary.
  1031. */
  1032. if (!info->node_cache)
  1033. new_leaf = kmalloc_obj(*new_leaf);
  1034. spin_lock(&info->lock);
  1035. if (!info->node_cache && new_leaf) {
  1036. /* Save our speculative allocation into the cache */
  1037. INIT_LIST_HEAD(&new_leaf->msg_list);
  1038. info->node_cache = new_leaf;
  1039. } else {
  1040. kfree(new_leaf);
  1041. }
  1042. if (info->attr.mq_curmsgs == 0) {
  1043. if (fd_file(f)->f_flags & O_NONBLOCK) {
  1044. spin_unlock(&info->lock);
  1045. ret = -EAGAIN;
  1046. } else {
  1047. wait.task = current;
  1048. /* memory barrier not required, we hold info->lock */
  1049. WRITE_ONCE(wait.state, STATE_NONE);
  1050. ret = wq_sleep(info, RECV, timeout, &wait);
  1051. msg_ptr = wait.msg;
  1052. }
  1053. } else {
  1054. DEFINE_WAKE_Q(wake_q);
  1055. msg_ptr = msg_get(info);
  1056. simple_inode_init_ts(inode);
  1057. /* There is now free space in queue. */
  1058. pipelined_receive(&wake_q, info);
  1059. spin_unlock(&info->lock);
  1060. wake_up_q(&wake_q);
  1061. ret = 0;
  1062. }
  1063. if (ret == 0) {
  1064. ret = msg_ptr->m_ts;
  1065. if ((u_msg_prio && put_user(msg_ptr->m_type, u_msg_prio)) ||
  1066. store_msg(u_msg_ptr, msg_ptr, msg_ptr->m_ts)) {
  1067. ret = -EFAULT;
  1068. }
  1069. free_msg(msg_ptr);
  1070. }
  1071. return ret;
  1072. }
  1073. SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr,
  1074. size_t, msg_len, unsigned int, msg_prio,
  1075. const struct __kernel_timespec __user *, u_abs_timeout)
  1076. {
  1077. struct timespec64 ts, *p = NULL;
  1078. if (u_abs_timeout) {
  1079. int res = prepare_timeout(u_abs_timeout, &ts);
  1080. if (res)
  1081. return res;
  1082. p = &ts;
  1083. }
  1084. return do_mq_timedsend(mqdes, u_msg_ptr, msg_len, msg_prio, p);
  1085. }
  1086. SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr,
  1087. size_t, msg_len, unsigned int __user *, u_msg_prio,
  1088. const struct __kernel_timespec __user *, u_abs_timeout)
  1089. {
  1090. struct timespec64 ts, *p = NULL;
  1091. if (u_abs_timeout) {
  1092. int res = prepare_timeout(u_abs_timeout, &ts);
  1093. if (res)
  1094. return res;
  1095. p = &ts;
  1096. }
  1097. return do_mq_timedreceive(mqdes, u_msg_ptr, msg_len, u_msg_prio, p);
  1098. }
  1099. /*
  1100. * Notes: the case when user wants us to deregister (with NULL as pointer)
  1101. * and he isn't currently owner of notification, will be silently discarded.
  1102. * It isn't explicitly defined in the POSIX.
  1103. */
  1104. static int do_mq_notify(mqd_t mqdes, const struct sigevent *notification)
  1105. {
  1106. int ret;
  1107. struct sock *sock;
  1108. struct inode *inode;
  1109. struct mqueue_inode_info *info;
  1110. struct sk_buff *nc;
  1111. audit_mq_notify(mqdes, notification);
  1112. nc = NULL;
  1113. sock = NULL;
  1114. if (notification != NULL) {
  1115. if (unlikely(notification->sigev_notify != SIGEV_NONE &&
  1116. notification->sigev_notify != SIGEV_SIGNAL &&
  1117. notification->sigev_notify != SIGEV_THREAD))
  1118. return -EINVAL;
  1119. if (notification->sigev_notify == SIGEV_SIGNAL &&
  1120. !valid_signal(notification->sigev_signo)) {
  1121. return -EINVAL;
  1122. }
  1123. if (notification->sigev_notify == SIGEV_THREAD) {
  1124. long timeo;
  1125. /* create the notify skb */
  1126. nc = alloc_skb(NOTIFY_COOKIE_LEN, GFP_KERNEL);
  1127. if (!nc)
  1128. return -ENOMEM;
  1129. if (copy_from_user(nc->data,
  1130. notification->sigev_value.sival_ptr,
  1131. NOTIFY_COOKIE_LEN)) {
  1132. kfree_skb(nc);
  1133. return -EFAULT;
  1134. }
  1135. /* TODO: add a header? */
  1136. skb_put(nc, NOTIFY_COOKIE_LEN);
  1137. /* and attach it to the socket */
  1138. retry:
  1139. sock = netlink_getsockbyfd(notification->sigev_signo);
  1140. if (IS_ERR(sock)) {
  1141. kfree_skb(nc);
  1142. return PTR_ERR(sock);
  1143. }
  1144. timeo = MAX_SCHEDULE_TIMEOUT;
  1145. ret = netlink_attachskb(sock, nc, &timeo, NULL);
  1146. if (ret == 1)
  1147. goto retry;
  1148. if (ret)
  1149. return ret;
  1150. }
  1151. }
  1152. CLASS(fd, f)(mqdes);
  1153. if (fd_empty(f)) {
  1154. ret = -EBADF;
  1155. goto out;
  1156. }
  1157. inode = file_inode(fd_file(f));
  1158. if (unlikely(fd_file(f)->f_op != &mqueue_file_operations)) {
  1159. ret = -EBADF;
  1160. goto out;
  1161. }
  1162. info = MQUEUE_I(inode);
  1163. ret = 0;
  1164. spin_lock(&info->lock);
  1165. if (notification == NULL) {
  1166. if (info->notify_owner == task_tgid(current)) {
  1167. remove_notification(info);
  1168. inode_set_atime_to_ts(inode,
  1169. inode_set_ctime_current(inode));
  1170. }
  1171. } else if (info->notify_owner != NULL) {
  1172. ret = -EBUSY;
  1173. } else {
  1174. switch (notification->sigev_notify) {
  1175. case SIGEV_NONE:
  1176. info->notify.sigev_notify = SIGEV_NONE;
  1177. break;
  1178. case SIGEV_THREAD:
  1179. info->notify_sock = sock;
  1180. info->notify_cookie = nc;
  1181. sock = NULL;
  1182. nc = NULL;
  1183. info->notify.sigev_notify = SIGEV_THREAD;
  1184. break;
  1185. case SIGEV_SIGNAL:
  1186. info->notify.sigev_signo = notification->sigev_signo;
  1187. info->notify.sigev_value = notification->sigev_value;
  1188. info->notify.sigev_notify = SIGEV_SIGNAL;
  1189. info->notify_self_exec_id = current->self_exec_id;
  1190. break;
  1191. }
  1192. info->notify_owner = get_pid(task_tgid(current));
  1193. info->notify_user_ns = get_user_ns(current_user_ns());
  1194. inode_set_atime_to_ts(inode, inode_set_ctime_current(inode));
  1195. }
  1196. spin_unlock(&info->lock);
  1197. out:
  1198. if (sock)
  1199. netlink_detachskb(sock, nc);
  1200. return ret;
  1201. }
  1202. SYSCALL_DEFINE2(mq_notify, mqd_t, mqdes,
  1203. const struct sigevent __user *, u_notification)
  1204. {
  1205. struct sigevent n, *p = NULL;
  1206. if (u_notification) {
  1207. if (copy_from_user(&n, u_notification, sizeof(struct sigevent)))
  1208. return -EFAULT;
  1209. p = &n;
  1210. }
  1211. return do_mq_notify(mqdes, p);
  1212. }
  1213. static int do_mq_getsetattr(int mqdes, struct mq_attr *new, struct mq_attr *old)
  1214. {
  1215. struct inode *inode;
  1216. struct mqueue_inode_info *info;
  1217. if (new && (new->mq_flags & (~O_NONBLOCK)))
  1218. return -EINVAL;
  1219. CLASS(fd, f)(mqdes);
  1220. if (fd_empty(f))
  1221. return -EBADF;
  1222. if (unlikely(fd_file(f)->f_op != &mqueue_file_operations))
  1223. return -EBADF;
  1224. inode = file_inode(fd_file(f));
  1225. info = MQUEUE_I(inode);
  1226. spin_lock(&info->lock);
  1227. if (old) {
  1228. *old = info->attr;
  1229. old->mq_flags = fd_file(f)->f_flags & O_NONBLOCK;
  1230. }
  1231. if (new) {
  1232. audit_mq_getsetattr(mqdes, new);
  1233. spin_lock(&fd_file(f)->f_lock);
  1234. if (new->mq_flags & O_NONBLOCK)
  1235. fd_file(f)->f_flags |= O_NONBLOCK;
  1236. else
  1237. fd_file(f)->f_flags &= ~O_NONBLOCK;
  1238. spin_unlock(&fd_file(f)->f_lock);
  1239. inode_set_atime_to_ts(inode, inode_set_ctime_current(inode));
  1240. }
  1241. spin_unlock(&info->lock);
  1242. return 0;
  1243. }
  1244. SYSCALL_DEFINE3(mq_getsetattr, mqd_t, mqdes,
  1245. const struct mq_attr __user *, u_mqstat,
  1246. struct mq_attr __user *, u_omqstat)
  1247. {
  1248. int ret;
  1249. struct mq_attr mqstat, omqstat;
  1250. struct mq_attr *new = NULL, *old = NULL;
  1251. if (u_mqstat) {
  1252. new = &mqstat;
  1253. if (copy_from_user(new, u_mqstat, sizeof(struct mq_attr)))
  1254. return -EFAULT;
  1255. }
  1256. if (u_omqstat)
  1257. old = &omqstat;
  1258. ret = do_mq_getsetattr(mqdes, new, old);
  1259. if (ret || !old)
  1260. return ret;
  1261. if (copy_to_user(u_omqstat, old, sizeof(struct mq_attr)))
  1262. return -EFAULT;
  1263. return 0;
  1264. }
  1265. #ifdef CONFIG_COMPAT
  1266. struct compat_mq_attr {
  1267. compat_long_t mq_flags; /* message queue flags */
  1268. compat_long_t mq_maxmsg; /* maximum number of messages */
  1269. compat_long_t mq_msgsize; /* maximum message size */
  1270. compat_long_t mq_curmsgs; /* number of messages currently queued */
  1271. compat_long_t __reserved[4]; /* ignored for input, zeroed for output */
  1272. };
  1273. static inline int get_compat_mq_attr(struct mq_attr *attr,
  1274. const struct compat_mq_attr __user *uattr)
  1275. {
  1276. struct compat_mq_attr v;
  1277. if (copy_from_user(&v, uattr, sizeof(*uattr)))
  1278. return -EFAULT;
  1279. memset(attr, 0, sizeof(*attr));
  1280. attr->mq_flags = v.mq_flags;
  1281. attr->mq_maxmsg = v.mq_maxmsg;
  1282. attr->mq_msgsize = v.mq_msgsize;
  1283. attr->mq_curmsgs = v.mq_curmsgs;
  1284. return 0;
  1285. }
  1286. static inline int put_compat_mq_attr(const struct mq_attr *attr,
  1287. struct compat_mq_attr __user *uattr)
  1288. {
  1289. struct compat_mq_attr v;
  1290. memset(&v, 0, sizeof(v));
  1291. v.mq_flags = attr->mq_flags;
  1292. v.mq_maxmsg = attr->mq_maxmsg;
  1293. v.mq_msgsize = attr->mq_msgsize;
  1294. v.mq_curmsgs = attr->mq_curmsgs;
  1295. if (copy_to_user(uattr, &v, sizeof(*uattr)))
  1296. return -EFAULT;
  1297. return 0;
  1298. }
  1299. COMPAT_SYSCALL_DEFINE4(mq_open, const char __user *, u_name,
  1300. int, oflag, compat_mode_t, mode,
  1301. struct compat_mq_attr __user *, u_attr)
  1302. {
  1303. struct mq_attr attr, *p = NULL;
  1304. if (u_attr && oflag & O_CREAT) {
  1305. p = &attr;
  1306. if (get_compat_mq_attr(&attr, u_attr))
  1307. return -EFAULT;
  1308. }
  1309. return do_mq_open(u_name, oflag, mode, p);
  1310. }
  1311. COMPAT_SYSCALL_DEFINE2(mq_notify, mqd_t, mqdes,
  1312. const struct compat_sigevent __user *, u_notification)
  1313. {
  1314. struct sigevent n, *p = NULL;
  1315. if (u_notification) {
  1316. if (get_compat_sigevent(&n, u_notification))
  1317. return -EFAULT;
  1318. if (n.sigev_notify == SIGEV_THREAD)
  1319. n.sigev_value.sival_ptr = compat_ptr(n.sigev_value.sival_int);
  1320. p = &n;
  1321. }
  1322. return do_mq_notify(mqdes, p);
  1323. }
  1324. COMPAT_SYSCALL_DEFINE3(mq_getsetattr, mqd_t, mqdes,
  1325. const struct compat_mq_attr __user *, u_mqstat,
  1326. struct compat_mq_attr __user *, u_omqstat)
  1327. {
  1328. int ret;
  1329. struct mq_attr mqstat, omqstat;
  1330. struct mq_attr *new = NULL, *old = NULL;
  1331. if (u_mqstat) {
  1332. new = &mqstat;
  1333. if (get_compat_mq_attr(new, u_mqstat))
  1334. return -EFAULT;
  1335. }
  1336. if (u_omqstat)
  1337. old = &omqstat;
  1338. ret = do_mq_getsetattr(mqdes, new, old);
  1339. if (ret || !old)
  1340. return ret;
  1341. if (put_compat_mq_attr(old, u_omqstat))
  1342. return -EFAULT;
  1343. return 0;
  1344. }
  1345. #endif
  1346. #ifdef CONFIG_COMPAT_32BIT_TIME
  1347. static int compat_prepare_timeout(const struct old_timespec32 __user *p,
  1348. struct timespec64 *ts)
  1349. {
  1350. if (get_old_timespec32(ts, p))
  1351. return -EFAULT;
  1352. if (!timespec64_valid(ts))
  1353. return -EINVAL;
  1354. return 0;
  1355. }
  1356. SYSCALL_DEFINE5(mq_timedsend_time32, mqd_t, mqdes,
  1357. const char __user *, u_msg_ptr,
  1358. unsigned int, msg_len, unsigned int, msg_prio,
  1359. const struct old_timespec32 __user *, u_abs_timeout)
  1360. {
  1361. struct timespec64 ts, *p = NULL;
  1362. if (u_abs_timeout) {
  1363. int res = compat_prepare_timeout(u_abs_timeout, &ts);
  1364. if (res)
  1365. return res;
  1366. p = &ts;
  1367. }
  1368. return do_mq_timedsend(mqdes, u_msg_ptr, msg_len, msg_prio, p);
  1369. }
  1370. SYSCALL_DEFINE5(mq_timedreceive_time32, mqd_t, mqdes,
  1371. char __user *, u_msg_ptr,
  1372. unsigned int, msg_len, unsigned int __user *, u_msg_prio,
  1373. const struct old_timespec32 __user *, u_abs_timeout)
  1374. {
  1375. struct timespec64 ts, *p = NULL;
  1376. if (u_abs_timeout) {
  1377. int res = compat_prepare_timeout(u_abs_timeout, &ts);
  1378. if (res)
  1379. return res;
  1380. p = &ts;
  1381. }
  1382. return do_mq_timedreceive(mqdes, u_msg_ptr, msg_len, u_msg_prio, p);
  1383. }
  1384. #endif
  1385. static const struct inode_operations mqueue_dir_inode_operations = {
  1386. .lookup = simple_lookup,
  1387. .create = mqueue_create,
  1388. .unlink = mqueue_unlink,
  1389. };
  1390. static const struct file_operations mqueue_file_operations = {
  1391. .flush = mqueue_flush_file,
  1392. .poll = mqueue_poll_file,
  1393. .read = mqueue_read_file,
  1394. .llseek = default_llseek,
  1395. };
  1396. static const struct super_operations mqueue_super_ops = {
  1397. .alloc_inode = mqueue_alloc_inode,
  1398. .free_inode = mqueue_free_inode,
  1399. .evict_inode = mqueue_evict_inode,
  1400. .statfs = simple_statfs,
  1401. };
  1402. static const struct fs_context_operations mqueue_fs_context_ops = {
  1403. .free = mqueue_fs_context_free,
  1404. .get_tree = mqueue_get_tree,
  1405. };
  1406. static struct file_system_type mqueue_fs_type = {
  1407. .name = "mqueue",
  1408. .init_fs_context = mqueue_init_fs_context,
  1409. .kill_sb = kill_anon_super,
  1410. .fs_flags = FS_USERNS_MOUNT,
  1411. };
  1412. int mq_init_ns(struct ipc_namespace *ns)
  1413. {
  1414. struct vfsmount *m;
  1415. ns->mq_queues_count = 0;
  1416. ns->mq_queues_max = DFLT_QUEUESMAX;
  1417. ns->mq_msg_max = DFLT_MSGMAX;
  1418. ns->mq_msgsize_max = DFLT_MSGSIZEMAX;
  1419. ns->mq_msg_default = DFLT_MSG;
  1420. ns->mq_msgsize_default = DFLT_MSGSIZE;
  1421. m = mq_create_mount(ns);
  1422. if (IS_ERR(m))
  1423. return PTR_ERR(m);
  1424. ns->mq_mnt = m;
  1425. return 0;
  1426. }
  1427. void mq_clear_sbinfo(struct ipc_namespace *ns)
  1428. {
  1429. ns->mq_mnt->mnt_sb->s_fs_info = NULL;
  1430. }
  1431. static int __init init_mqueue_fs(void)
  1432. {
  1433. int error;
  1434. mqueue_inode_cachep = kmem_cache_create("mqueue_inode_cache",
  1435. sizeof(struct mqueue_inode_info), 0,
  1436. SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, init_once);
  1437. if (mqueue_inode_cachep == NULL)
  1438. return -ENOMEM;
  1439. if (!setup_mq_sysctls(&init_ipc_ns)) {
  1440. pr_warn("sysctl registration failed\n");
  1441. error = -ENOMEM;
  1442. goto out_kmem;
  1443. }
  1444. error = register_filesystem(&mqueue_fs_type);
  1445. if (error)
  1446. goto out_sysctl;
  1447. spin_lock_init(&mq_lock);
  1448. error = mq_init_ns(&init_ipc_ns);
  1449. if (error)
  1450. goto out_filesystem;
  1451. return 0;
  1452. out_filesystem:
  1453. unregister_filesystem(&mqueue_fs_type);
  1454. out_sysctl:
  1455. retire_mq_sysctls(&init_ipc_ns);
  1456. out_kmem:
  1457. kmem_cache_destroy(mqueue_inode_cachep);
  1458. return error;
  1459. }
  1460. device_initcall(init_mqueue_fs);