pidfs.c 29 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136
  1. // SPDX-License-Identifier: GPL-2.0
  2. #include <linux/anon_inodes.h>
  3. #include <linux/exportfs.h>
  4. #include <linux/file.h>
  5. #include <linux/fs.h>
  6. #include <linux/cgroup.h>
  7. #include <linux/magic.h>
  8. #include <linux/mount.h>
  9. #include <linux/pid.h>
  10. #include <linux/pidfs.h>
  11. #include <linux/pid_namespace.h>
  12. #include <linux/poll.h>
  13. #include <linux/proc_fs.h>
  14. #include <linux/proc_ns.h>
  15. #include <linux/pseudo_fs.h>
  16. #include <linux/ptrace.h>
  17. #include <linux/seq_file.h>
  18. #include <uapi/linux/pidfd.h>
  19. #include <linux/ipc_namespace.h>
  20. #include <linux/time_namespace.h>
  21. #include <linux/utsname.h>
  22. #include <net/net_namespace.h>
  23. #include <linux/coredump.h>
  24. #include <linux/rhashtable.h>
  25. #include <linux/xattr.h>
  26. #include <linux/cookie.h>
  27. #include "internal.h"
  28. #include "mount.h"
  29. #define PIDFS_PID_DEAD ERR_PTR(-ESRCH)
  30. static struct kmem_cache *pidfs_attr_cachep __ro_after_init;
  31. static struct kmem_cache *pidfs_xattr_cachep __ro_after_init;
  32. static struct path pidfs_root_path = {};
  33. void pidfs_get_root(struct path *path)
  34. {
  35. *path = pidfs_root_path;
  36. path_get(path);
  37. }
  /* Bit numbers used with set_bit()/test_bit() on pidfs_attr::attr_mask. */
  enum pidfs_attr_mask_bits {
      /* Exit information was recorded (set by pidfs_exit()). */
      PIDFS_ATTR_BIT_EXIT     = 0,
      /* Coredump information was recorded (set by pidfs_coredump()). */
      PIDFS_ATTR_BIT_COREDUMP = 1,
  };
  42. struct pidfs_attr {
  43. unsigned long attr_mask;
  44. struct simple_xattrs *xattrs;
  45. struct /* exit info */ {
  46. __u64 cgroupid;
  47. __s32 exit_code;
  48. };
  49. __u32 coredump_mask;
  50. __u32 coredump_signal;
  51. };
  52. static struct rhashtable pidfs_ino_ht;
  53. static const struct rhashtable_params pidfs_ino_ht_params = {
  54. .key_offset = offsetof(struct pid, ino),
  55. .key_len = sizeof(u64),
  56. .head_offset = offsetof(struct pid, pidfs_hash),
  57. .automatic_shrinking = true,
  58. };
  59. /*
  60. * inode number handling
  61. *
  62. * On 64 bit nothing special happens. The 64bit number assigned
  63. * to struct pid is the inode number.
  64. *
  65. * On 32 bit the 64 bit number assigned to struct pid is split
  66. * into two 32 bit numbers. The lower 32 bits are used as the
  67. * inode number and the upper 32 bits are used as the inode
  68. * generation number.
  69. *
  70. * On 32 bit pidfs_ino() will return the lower 32 bit. When
  71. * pidfs_ino() returns zero a wrap around happened. When a
  72. * wraparound happens the 64 bit number will be incremented by 1
  73. * so inode numbering starts at 1 again.
  74. *
  75. * On 64 bit comparing two pidfds is as simple as comparing
  76. * inode numbers.
  77. *
  78. * When a wraparound happens on 32 bit multiple pidfds with the
  79. * same inode number are likely to exist (This isn't a problem
  80. * since before pidfs pidfds used the anonymous inode meaning
  81. * all pidfds had the same inode number.). Userspace can
  82. * reconstruct the 64 bit identifier by retrieving both the
  83. * inode number and the inode generation number to compare or
  84. * use file handles.
  85. */
  86. #if BITS_PER_LONG == 32
  87. DEFINE_SPINLOCK(pidfs_ino_lock);
  88. static u64 pidfs_ino_nr = 1;
  89. static inline unsigned long pidfs_ino(u64 ino)
  90. {
  91. return lower_32_bits(ino);
  92. }
  93. /* On 32 bit the generation number are the upper 32 bits. */
  94. static inline u32 pidfs_gen(u64 ino)
  95. {
  96. return upper_32_bits(ino);
  97. }
  98. static inline u64 pidfs_alloc_ino(void)
  99. {
  100. u64 ino;
  101. spin_lock(&pidfs_ino_lock);
  102. if (pidfs_ino(pidfs_ino_nr) == 0)
  103. pidfs_ino_nr++;
  104. ino = pidfs_ino_nr++;
  105. spin_unlock(&pidfs_ino_lock);
  106. return ino;
  107. }
  108. #else
  109. /* On 64 bit simply return ino. */
  110. static inline unsigned long pidfs_ino(u64 ino)
  111. {
  112. return ino;
  113. }
  114. /* On 64 bit the generation number is 0. */
  115. static inline u32 pidfs_gen(u64 ino)
  116. {
  117. return 0;
  118. }
  119. DEFINE_COOKIE(pidfs_ino_cookie);
  120. static u64 pidfs_alloc_ino(void)
  121. {
  122. u64 ino;
  123. preempt_disable();
  124. ino = gen_cookie_next(&pidfs_ino_cookie);
  125. preempt_enable();
  126. VFS_WARN_ON_ONCE(ino < 1);
  127. return ino;
  128. }
  129. #endif
  130. void pidfs_prepare_pid(struct pid *pid)
  131. {
  132. pid->stashed = NULL;
  133. pid->attr = NULL;
  134. pid->ino = 0;
  135. }
  136. int pidfs_add_pid(struct pid *pid)
  137. {
  138. int ret;
  139. pid->ino = pidfs_alloc_ino();
  140. ret = rhashtable_insert_fast(&pidfs_ino_ht, &pid->pidfs_hash,
  141. pidfs_ino_ht_params);
  142. if (unlikely(ret))
  143. pid->ino = 0;
  144. return ret;
  145. }
  146. void pidfs_remove_pid(struct pid *pid)
  147. {
  148. if (likely(pid->ino))
  149. rhashtable_remove_fast(&pidfs_ino_ht, &pid->pidfs_hash,
  150. pidfs_ino_ht_params);
  151. }
  152. void pidfs_free_pid(struct pid *pid)
  153. {
  154. struct pidfs_attr *attr __free(kfree) = no_free_ptr(pid->attr);
  155. struct simple_xattrs *xattrs __free(kfree) = NULL;
  156. /*
  157. * Any dentry must've been wiped from the pid by now.
  158. * Otherwise there's a reference count bug.
  159. */
  160. VFS_WARN_ON_ONCE(pid->stashed);
  161. /*
  162. * This if an error occurred during e.g., task creation that
  163. * causes us to never go through the exit path.
  164. */
  165. if (unlikely(!attr))
  166. return;
  167. /* This never had a pidfd created. */
  168. if (IS_ERR(attr))
  169. return;
  170. xattrs = no_free_ptr(attr->xattrs);
  171. if (xattrs)
  172. simple_xattrs_free(xattrs, NULL);
  173. }
  174. #ifdef CONFIG_PROC_FS
  175. /**
  176. * pidfd_show_fdinfo - print information about a pidfd
  177. * @m: proc fdinfo file
  178. * @f: file referencing a pidfd
  179. *
  180. * Pid:
  181. * This function will print the pid that a given pidfd refers to in the
  182. * pid namespace of the procfs instance.
  183. * If the pid namespace of the process is not a descendant of the pid
  184. * namespace of the procfs instance 0 will be shown as its pid. This is
  185. * similar to calling getppid() on a process whose parent is outside of
  186. * its pid namespace.
  187. *
  188. * NSpid:
  189. * If pid namespaces are supported then this function will also print
  190. * the pid of a given pidfd refers to for all descendant pid namespaces
  191. * starting from the current pid namespace of the instance, i.e. the
  192. * Pid field and the first entry in the NSpid field will be identical.
  193. * If the pid namespace of the process is not a descendant of the pid
  194. * namespace of the procfs instance 0 will be shown as its first NSpid
  195. * entry and no others will be shown.
  196. * Note that this differs from the Pid and NSpid fields in
  197. * /proc/<pid>/status where Pid and NSpid are always shown relative to
  198. * the pid namespace of the procfs instance. The difference becomes
  199. * obvious when sending around a pidfd between pid namespaces from a
  200. * different branch of the tree, i.e. where no ancestral relation is
  201. * present between the pid namespaces:
  202. * - create two new pid namespaces ns1 and ns2 in the initial pid
  203. * namespace (also take care to create new mount namespaces in the
  204. * new pid namespace and mount procfs)
  205. * - create a process with a pidfd in ns1
  206. * - send pidfd from ns1 to ns2
  207. * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid
  208. * have exactly one entry, which is 0
  209. */
  210. static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
  211. {
  212. struct pid *pid = pidfd_pid(f);
  213. struct pid_namespace *ns;
  214. pid_t nr = -1;
  215. if (likely(pid_has_task(pid, PIDTYPE_PID))) {
  216. ns = proc_pid_ns(file_inode(m->file)->i_sb);
  217. nr = pid_nr_ns(pid, ns);
  218. }
  219. seq_put_decimal_ll(m, "Pid:\t", nr);
  220. #ifdef CONFIG_PID_NS
  221. seq_put_decimal_ll(m, "\nNSpid:\t", nr);
  222. if (nr > 0) {
  223. int i;
  224. /* If nr is non-zero it means that 'pid' is valid and that
  225. * ns, i.e. the pid namespace associated with the procfs
  226. * instance, is in the pid namespace hierarchy of pid.
  227. * Start at one below the already printed level.
  228. */
  229. for (i = ns->level + 1; i <= pid->level; i++)
  230. seq_put_decimal_ll(m, "\t", pid->numbers[i].nr);
  231. }
  232. #endif
  233. seq_putc(m, '\n');
  234. }
  235. #endif
  236. /*
  237. * Poll support for process exit notification.
  238. */
  239. static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
  240. {
  241. struct pid *pid = pidfd_pid(file);
  242. struct task_struct *task;
  243. __poll_t poll_flags = 0;
  244. poll_wait(file, &pid->wait_pidfd, pts);
  245. /*
  246. * Don't wake waiters if the thread-group leader exited
  247. * prematurely. They either get notified when the last subthread
  248. * exits or not at all if one of the remaining subthreads execs
  249. * and assumes the struct pid of the old thread-group leader.
  250. */
  251. guard(rcu)();
  252. task = pid_task(pid, PIDTYPE_PID);
  253. if (!task)
  254. poll_flags = EPOLLIN | EPOLLRDNORM | EPOLLHUP;
  255. else if (task->exit_state && !delay_group_leader(task))
  256. poll_flags = EPOLLIN | EPOLLRDNORM;
  257. return poll_flags;
  258. }
  259. static inline bool pid_in_current_pidns(const struct pid *pid)
  260. {
  261. const struct pid_namespace *ns = task_active_pid_ns(current);
  262. if (ns->level <= pid->level)
  263. return pid->numbers[ns->level].ns == ns;
  264. return false;
  265. }
  266. static __u32 pidfs_coredump_mask(unsigned long mm_flags)
  267. {
  268. switch (__get_dumpable(mm_flags)) {
  269. case SUID_DUMP_USER:
  270. return PIDFD_COREDUMP_USER;
  271. case SUID_DUMP_ROOT:
  272. return PIDFD_COREDUMP_ROOT;
  273. case SUID_DUMP_DISABLE:
  274. return PIDFD_COREDUMP_SKIP;
  275. default:
  276. WARN_ON_ONCE(true);
  277. }
  278. return 0;
  279. }
  /*
   * Every PIDFD_INFO_* flag pidfd_info() may report back in the mask.
   * This must be updated whenever a new flag is added; pidfd_info() warns
   * if it is about to return a flag that is missing here.
   */
  #define PIDFD_INFO_SUPPORTED \
      (PIDFD_INFO_PID | PIDFD_INFO_CREDS | PIDFD_INFO_CGROUPID | \
       PIDFD_INFO_EXIT | \
       PIDFD_INFO_COREDUMP | PIDFD_INFO_COREDUMP_SIGNAL | \
       PIDFD_INFO_SUPPORTED_MASK)
  288. static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
  289. {
  290. struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg;
  291. struct task_struct *task __free(put_task) = NULL;
  292. struct pid *pid = pidfd_pid(file);
  293. size_t usize = _IOC_SIZE(cmd);
  294. struct pidfd_info kinfo = {};
  295. struct user_namespace *user_ns;
  296. struct pidfs_attr *attr;
  297. const struct cred *c;
  298. __u64 mask;
  299. BUILD_BUG_ON(sizeof(struct pidfd_info) != PIDFD_INFO_SIZE_VER2);
  300. if (!uinfo)
  301. return -EINVAL;
  302. if (usize < PIDFD_INFO_SIZE_VER0)
  303. return -EINVAL; /* First version, no smaller struct possible */
  304. if (copy_from_user(&mask, &uinfo->mask, sizeof(mask)))
  305. return -EFAULT;
  306. /*
  307. * Restrict information retrieval to tasks within the caller's pid
  308. * namespace hierarchy.
  309. */
  310. if (!pid_in_current_pidns(pid))
  311. return -EREMOTE;
  312. attr = READ_ONCE(pid->attr);
  313. if (mask & PIDFD_INFO_EXIT) {
  314. if (test_bit(PIDFS_ATTR_BIT_EXIT, &attr->attr_mask)) {
  315. smp_rmb();
  316. kinfo.mask |= PIDFD_INFO_EXIT;
  317. #ifdef CONFIG_CGROUPS
  318. kinfo.cgroupid = attr->cgroupid;
  319. kinfo.mask |= PIDFD_INFO_CGROUPID;
  320. #endif
  321. kinfo.exit_code = attr->exit_code;
  322. }
  323. }
  324. if (mask & PIDFD_INFO_COREDUMP) {
  325. if (test_bit(PIDFS_ATTR_BIT_COREDUMP, &attr->attr_mask)) {
  326. smp_rmb();
  327. kinfo.mask |= PIDFD_INFO_COREDUMP | PIDFD_INFO_COREDUMP_SIGNAL;
  328. kinfo.coredump_mask = attr->coredump_mask;
  329. kinfo.coredump_signal = attr->coredump_signal;
  330. }
  331. }
  332. task = get_pid_task(pid, PIDTYPE_PID);
  333. if (!task) {
  334. /*
  335. * If the task has already been reaped, only exit
  336. * information is available
  337. */
  338. if (!(mask & PIDFD_INFO_EXIT))
  339. return -ESRCH;
  340. goto copy_out;
  341. }
  342. c = get_task_cred(task);
  343. if (!c)
  344. return -ESRCH;
  345. if ((mask & PIDFD_INFO_COREDUMP) && !kinfo.coredump_mask) {
  346. guard(task_lock)(task);
  347. if (task->mm) {
  348. unsigned long flags = __mm_flags_get_dumpable(task->mm);
  349. kinfo.coredump_mask = pidfs_coredump_mask(flags);
  350. kinfo.mask |= PIDFD_INFO_COREDUMP;
  351. /* No coredump actually took place, so no coredump signal. */
  352. }
  353. }
  354. /* Unconditionally return identifiers and credentials, the rest only on request */
  355. user_ns = current_user_ns();
  356. kinfo.ruid = from_kuid_munged(user_ns, c->uid);
  357. kinfo.rgid = from_kgid_munged(user_ns, c->gid);
  358. kinfo.euid = from_kuid_munged(user_ns, c->euid);
  359. kinfo.egid = from_kgid_munged(user_ns, c->egid);
  360. kinfo.suid = from_kuid_munged(user_ns, c->suid);
  361. kinfo.sgid = from_kgid_munged(user_ns, c->sgid);
  362. kinfo.fsuid = from_kuid_munged(user_ns, c->fsuid);
  363. kinfo.fsgid = from_kgid_munged(user_ns, c->fsgid);
  364. kinfo.mask |= PIDFD_INFO_CREDS;
  365. put_cred(c);
  366. #ifdef CONFIG_CGROUPS
  367. if (!kinfo.cgroupid) {
  368. struct cgroup *cgrp;
  369. rcu_read_lock();
  370. cgrp = task_dfl_cgroup(task);
  371. kinfo.cgroupid = cgroup_id(cgrp);
  372. kinfo.mask |= PIDFD_INFO_CGROUPID;
  373. rcu_read_unlock();
  374. }
  375. #endif
  376. /*
  377. * Copy pid/tgid last, to reduce the chances the information might be
  378. * stale. Note that it is not possible to ensure it will be valid as the
  379. * task might return as soon as the copy_to_user finishes, but that's ok
  380. * and userspace expects that might happen and can act accordingly, so
  381. * this is just best-effort. What we can do however is checking that all
  382. * the fields are set correctly, or return ESRCH to avoid providing
  383. * incomplete information. */
  384. kinfo.ppid = task_ppid_vnr(task);
  385. kinfo.tgid = task_tgid_vnr(task);
  386. kinfo.pid = task_pid_vnr(task);
  387. kinfo.mask |= PIDFD_INFO_PID;
  388. if (kinfo.pid == 0 || kinfo.tgid == 0)
  389. return -ESRCH;
  390. copy_out:
  391. if (mask & PIDFD_INFO_SUPPORTED_MASK) {
  392. kinfo.mask |= PIDFD_INFO_SUPPORTED_MASK;
  393. kinfo.supported_mask = PIDFD_INFO_SUPPORTED;
  394. }
  395. /* Are there bits in the return mask not present in PIDFD_INFO_SUPPORTED? */
  396. WARN_ON_ONCE(~PIDFD_INFO_SUPPORTED & kinfo.mask);
  397. /*
  398. * If userspace and the kernel have the same struct size it can just
  399. * be copied. If userspace provides an older struct, only the bits that
  400. * userspace knows about will be copied. If userspace provides a new
  401. * struct, only the bits that the kernel knows about will be copied.
  402. */
  403. return copy_struct_to_user(uinfo, usize, &kinfo, sizeof(kinfo), NULL);
  404. }
  405. static bool pidfs_ioctl_valid(unsigned int cmd)
  406. {
  407. switch (cmd) {
  408. case FS_IOC_GETVERSION:
  409. case PIDFD_GET_CGROUP_NAMESPACE:
  410. case PIDFD_GET_IPC_NAMESPACE:
  411. case PIDFD_GET_MNT_NAMESPACE:
  412. case PIDFD_GET_NET_NAMESPACE:
  413. case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE:
  414. case PIDFD_GET_TIME_NAMESPACE:
  415. case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE:
  416. case PIDFD_GET_UTS_NAMESPACE:
  417. case PIDFD_GET_USER_NAMESPACE:
  418. case PIDFD_GET_PID_NAMESPACE:
  419. return true;
  420. }
  421. /* Extensible ioctls require some more careful checks. */
  422. switch (_IOC_NR(cmd)) {
  423. case _IOC_NR(PIDFD_GET_INFO):
  424. /*
  425. * Try to prevent performing a pidfd ioctl when someone
  426. * erronously mistook the file descriptor for a pidfd.
  427. * This is not perfect but will catch most cases.
  428. */
  429. return extensible_ioctl_valid(cmd, PIDFD_GET_INFO, PIDFD_INFO_SIZE_VER0);
  430. }
  431. return false;
  432. }
  433. static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
  434. {
  435. struct task_struct *task __free(put_task) = NULL;
  436. struct nsproxy *nsp __free(put_nsproxy) = NULL;
  437. struct ns_common *ns_common = NULL;
  438. if (!pidfs_ioctl_valid(cmd))
  439. return -ENOIOCTLCMD;
  440. if (cmd == FS_IOC_GETVERSION) {
  441. if (!arg)
  442. return -EINVAL;
  443. __u32 __user *argp = (__u32 __user *)arg;
  444. return put_user(file_inode(file)->i_generation, argp);
  445. }
  446. /* Extensible IOCTL that does not open namespace FDs, take a shortcut */
  447. if (_IOC_NR(cmd) == _IOC_NR(PIDFD_GET_INFO))
  448. return pidfd_info(file, cmd, arg);
  449. task = get_pid_task(pidfd_pid(file), PIDTYPE_PID);
  450. if (!task)
  451. return -ESRCH;
  452. if (arg)
  453. return -EINVAL;
  454. scoped_guard(task_lock, task) {
  455. nsp = task->nsproxy;
  456. if (nsp)
  457. get_nsproxy(nsp);
  458. }
  459. if (!nsp)
  460. return -ESRCH; /* just pretend it didn't exist */
  461. /*
  462. * We're trying to open a file descriptor to the namespace so perform a
  463. * filesystem cred ptrace check. Also, we mirror nsfs behavior.
  464. */
  465. if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
  466. return -EACCES;
  467. switch (cmd) {
  468. /* Namespaces that hang of nsproxy. */
  469. case PIDFD_GET_CGROUP_NAMESPACE:
  470. #ifdef CONFIG_CGROUPS
  471. if (!ns_ref_get(nsp->cgroup_ns))
  472. break;
  473. ns_common = to_ns_common(nsp->cgroup_ns);
  474. #endif
  475. break;
  476. case PIDFD_GET_IPC_NAMESPACE:
  477. #ifdef CONFIG_IPC_NS
  478. if (!ns_ref_get(nsp->ipc_ns))
  479. break;
  480. ns_common = to_ns_common(nsp->ipc_ns);
  481. #endif
  482. break;
  483. case PIDFD_GET_MNT_NAMESPACE:
  484. if (!ns_ref_get(nsp->mnt_ns))
  485. break;
  486. ns_common = to_ns_common(nsp->mnt_ns);
  487. break;
  488. case PIDFD_GET_NET_NAMESPACE:
  489. #ifdef CONFIG_NET_NS
  490. if (!ns_ref_get(nsp->net_ns))
  491. break;
  492. ns_common = to_ns_common(nsp->net_ns);
  493. #endif
  494. break;
  495. case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE:
  496. #ifdef CONFIG_PID_NS
  497. if (!ns_ref_get(nsp->pid_ns_for_children))
  498. break;
  499. ns_common = to_ns_common(nsp->pid_ns_for_children);
  500. #endif
  501. break;
  502. case PIDFD_GET_TIME_NAMESPACE:
  503. #ifdef CONFIG_TIME_NS
  504. if (!ns_ref_get(nsp->time_ns))
  505. break;
  506. ns_common = to_ns_common(nsp->time_ns);
  507. #endif
  508. break;
  509. case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE:
  510. #ifdef CONFIG_TIME_NS
  511. if (!ns_ref_get(nsp->time_ns_for_children))
  512. break;
  513. ns_common = to_ns_common(nsp->time_ns_for_children);
  514. #endif
  515. break;
  516. case PIDFD_GET_UTS_NAMESPACE:
  517. #ifdef CONFIG_UTS_NS
  518. if (!ns_ref_get(nsp->uts_ns))
  519. break;
  520. ns_common = to_ns_common(nsp->uts_ns);
  521. #endif
  522. break;
  523. /* Namespaces that don't hang of nsproxy. */
  524. case PIDFD_GET_USER_NAMESPACE:
  525. #ifdef CONFIG_USER_NS
  526. scoped_guard(rcu) {
  527. struct user_namespace *user_ns;
  528. user_ns = task_cred_xxx(task, user_ns);
  529. if (ns_ref_get(user_ns))
  530. ns_common = to_ns_common(user_ns);
  531. }
  532. #endif
  533. break;
  534. case PIDFD_GET_PID_NAMESPACE:
  535. #ifdef CONFIG_PID_NS
  536. scoped_guard(rcu) {
  537. struct pid_namespace *pid_ns;
  538. pid_ns = task_active_pid_ns(task);
  539. if (ns_ref_get(pid_ns))
  540. ns_common = to_ns_common(pid_ns);
  541. }
  542. #endif
  543. break;
  544. default:
  545. return -ENOIOCTLCMD;
  546. }
  547. if (!ns_common)
  548. return -EOPNOTSUPP;
  549. /* open_namespace() unconditionally consumes the reference */
  550. return open_namespace(ns_common);
  551. }
  552. static const struct file_operations pidfs_file_operations = {
  553. .poll = pidfd_poll,
  554. #ifdef CONFIG_PROC_FS
  555. .show_fdinfo = pidfd_show_fdinfo,
  556. #endif
  557. .unlocked_ioctl = pidfd_ioctl,
  558. .compat_ioctl = compat_ptr_ioctl,
  559. };
  560. struct pid *pidfd_pid(const struct file *file)
  561. {
  562. if (file->f_op != &pidfs_file_operations)
  563. return ERR_PTR(-EBADF);
  564. return file_inode(file)->i_private;
  565. }
  566. /*
  567. * We're called from release_task(). We know there's at least one
  568. * reference to struct pid being held that won't be released until the
  569. * task has been reaped which cannot happen until we're out of
  570. * release_task().
  571. *
  572. * If this struct pid has at least once been referred to by a pidfd then
  573. * pid->attr will be allocated. If not we mark the struct pid as dead so
  574. * anyone who is trying to register it with pidfs will fail to do so.
  575. * Otherwise we would hand out pidfs for reaped tasks without having
  576. * exit information available.
  577. *
  578. * Worst case is that we've filled in the info and the pid gets freed
  579. * right away in free_pid() when no one holds a pidfd anymore. Since
  580. * pidfs_exit() currently is placed after exit_task_work() we know that
  581. * it cannot be us aka the exiting task holding a pidfd to itself.
  582. */
  583. void pidfs_exit(struct task_struct *tsk)
  584. {
  585. struct pid *pid = task_pid(tsk);
  586. struct pidfs_attr *attr;
  587. #ifdef CONFIG_CGROUPS
  588. struct cgroup *cgrp;
  589. #endif
  590. might_sleep();
  591. /* Synchronize with pidfs_register_pid(). */
  592. scoped_guard(spinlock_irq, &pid->wait_pidfd.lock) {
  593. attr = pid->attr;
  594. if (!attr) {
  595. /*
  596. * No one ever held a pidfd for this struct pid.
  597. * Mark it as dead so no one can add a pidfs
  598. * entry anymore. We're about to be reaped and
  599. * so no exit information would be available.
  600. */
  601. pid->attr = PIDFS_PID_DEAD;
  602. return;
  603. }
  604. }
  605. /*
  606. * If @pid->attr is set someone might still legitimately hold a
  607. * pidfd to @pid or someone might concurrently still be getting
  608. * a reference to an already stashed dentry from @pid->stashed.
  609. * So defer cleaning @pid->attr until the last reference to @pid
  610. * is put
  611. */
  612. #ifdef CONFIG_CGROUPS
  613. rcu_read_lock();
  614. cgrp = task_dfl_cgroup(tsk);
  615. attr->cgroupid = cgroup_id(cgrp);
  616. rcu_read_unlock();
  617. #endif
  618. attr->exit_code = tsk->exit_code;
  619. /* Ensure that PIDFD_GET_INFO sees either all or nothing. */
  620. smp_wmb();
  621. set_bit(PIDFS_ATTR_BIT_EXIT, &attr->attr_mask);
  622. }
  623. #ifdef CONFIG_COREDUMP
  624. void pidfs_coredump(const struct coredump_params *cprm)
  625. {
  626. struct pid *pid = cprm->pid;
  627. struct pidfs_attr *attr;
  628. attr = READ_ONCE(pid->attr);
  629. VFS_WARN_ON_ONCE(!attr);
  630. VFS_WARN_ON_ONCE(attr == PIDFS_PID_DEAD);
  631. /* Note how we were coredumped and that we coredumped. */
  632. attr->coredump_mask = pidfs_coredump_mask(cprm->mm_flags) |
  633. PIDFD_COREDUMPED;
  634. /* If coredumping is set to skip we should never end up here. */
  635. VFS_WARN_ON_ONCE(attr->coredump_mask & PIDFD_COREDUMP_SKIP);
  636. /* Expose the signal number that caused the coredump. */
  637. attr->coredump_signal = cprm->siginfo->si_signo;
  638. smp_wmb();
  639. set_bit(PIDFS_ATTR_BIT_COREDUMP, &attr->attr_mask);
  640. }
  641. #endif
  642. static struct vfsmount *pidfs_mnt __ro_after_init;
  /*
   * The vfs falls back to simple_setattr() if i_op->setattr() isn't
   * implemented. Until there is a clean permission concept for pidfds,
   * route every request through anon_inode_setattr() instead.
   * NOTE(review): the previous comment said such requests are rejected
   * completely; confirm against anon_inode_setattr().
   */
  static int pidfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
                           struct iattr *attr)
  {
      return anon_inode_setattr(idmap, dentry, attr);
  }
  653. static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path,
  654. struct kstat *stat, u32 request_mask,
  655. unsigned int query_flags)
  656. {
  657. return anon_inode_getattr(idmap, path, stat, request_mask, query_flags);
  658. }
  659. static ssize_t pidfs_listxattr(struct dentry *dentry, char *buf, size_t size)
  660. {
  661. struct inode *inode = d_inode(dentry);
  662. struct pid *pid = inode->i_private;
  663. struct pidfs_attr *attr = pid->attr;
  664. struct simple_xattrs *xattrs;
  665. xattrs = READ_ONCE(attr->xattrs);
  666. if (!xattrs)
  667. return 0;
  668. return simple_xattr_list(inode, xattrs, buf, size);
  669. }
  670. static const struct inode_operations pidfs_inode_operations = {
  671. .getattr = pidfs_getattr,
  672. .setattr = pidfs_setattr,
  673. .listxattr = pidfs_listxattr,
  674. };
  675. static void pidfs_evict_inode(struct inode *inode)
  676. {
  677. struct pid *pid = inode->i_private;
  678. clear_inode(inode);
  679. put_pid(pid);
  680. }
  681. static const struct super_operations pidfs_sops = {
  682. .drop_inode = inode_just_drop,
  683. .evict_inode = pidfs_evict_inode,
  684. .statfs = simple_statfs,
  685. };
  /*
   * 'lsof' has knowledge of our historical anon_inode use, and expects
   * the pidfs dentry name to start with 'anon_inode'.
   */
  static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen)
  {
      return dynamic_dname(buffer, buflen, "anon_inode:[pidfd]");
  }
  694. const struct dentry_operations pidfs_dentry_operations = {
  695. .d_dname = pidfs_dname,
  696. .d_prune = stashed_dentry_prune,
  697. };
  698. static int pidfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
  699. struct inode *parent)
  700. {
  701. const struct pid *pid = inode->i_private;
  702. if (*max_len < 2) {
  703. *max_len = 2;
  704. return FILEID_INVALID;
  705. }
  706. *max_len = 2;
  707. *(u64 *)fh = pid->ino;
  708. return FILEID_KERNFS;
  709. }
  710. /* Find a struct pid based on the inode number. */
  711. static struct pid *pidfs_ino_get_pid(u64 ino)
  712. {
  713. struct pid *pid;
  714. struct pidfs_attr *attr;
  715. guard(rcu)();
  716. pid = rhashtable_lookup(&pidfs_ino_ht, &ino, pidfs_ino_ht_params);
  717. if (!pid)
  718. return NULL;
  719. attr = READ_ONCE(pid->attr);
  720. if (IS_ERR_OR_NULL(attr))
  721. return NULL;
  722. if (test_bit(PIDFS_ATTR_BIT_EXIT, &attr->attr_mask))
  723. return NULL;
  724. /* Within our pid namespace hierarchy? */
  725. if (pid_vnr(pid) == 0)
  726. return NULL;
  727. return get_pid(pid);
  728. }
  729. static struct dentry *pidfs_fh_to_dentry(struct super_block *sb,
  730. struct fid *fid, int fh_len,
  731. int fh_type)
  732. {
  733. int ret;
  734. u64 pid_ino;
  735. struct path path;
  736. struct pid *pid;
  737. if (fh_len < 2)
  738. return NULL;
  739. switch (fh_type) {
  740. case FILEID_KERNFS:
  741. pid_ino = *(u64 *)fid;
  742. break;
  743. default:
  744. return NULL;
  745. }
  746. pid = pidfs_ino_get_pid(pid_ino);
  747. if (!pid)
  748. return NULL;
  749. ret = path_from_stashed(&pid->stashed, pidfs_mnt, pid, &path);
  750. if (ret < 0)
  751. return ERR_PTR(ret);
  752. VFS_WARN_ON_ONCE(!pid->attr);
  753. mntput(path.mnt);
  754. return path.dentry;
  755. }
  /*
   * Open flags accepted from open_by_handle_at(); anything else is
   * nonsensical for a pidfd and gets rejected. Note that PIDFD_THREAD is
   * defined as O_EXCL, and PIDFD_NONBLOCK as O_NONBLOCK.
   */
  #define VALID_FILE_HANDLE_OPEN_FLAGS \
      (O_RDONLY | O_WRONLY | O_RDWR | O_CLOEXEC | O_NONBLOCK | O_EXCL)
  763. static int pidfs_export_permission(struct handle_to_path_ctx *ctx,
  764. unsigned int oflags)
  765. {
  766. if (oflags & ~(VALID_FILE_HANDLE_OPEN_FLAGS | O_LARGEFILE))
  767. return -EINVAL;
  768. /*
  769. * pidfd_ino_get_pid() will verify that the struct pid is part
  770. * of the caller's pid namespace hierarchy. No further
  771. * permission checks are needed.
  772. */
  773. return 0;
  774. }
  775. static struct file *pidfs_export_open(const struct path *path, unsigned int oflags)
  776. {
  777. /*
  778. * Clear O_LARGEFILE as open_by_handle_at() forces it and raise
  779. * O_RDWR as pidfds always are.
  780. */
  781. oflags &= ~O_LARGEFILE;
  782. return dentry_open(path, oflags | O_RDWR, current_cred());
  783. }
  784. static const struct export_operations pidfs_export_operations = {
  785. .encode_fh = pidfs_encode_fh,
  786. .fh_to_dentry = pidfs_fh_to_dentry,
  787. .open = pidfs_export_open,
  788. .permission = pidfs_export_permission,
  789. };
  790. static int pidfs_init_inode(struct inode *inode, void *data)
  791. {
  792. const struct pid *pid = data;
  793. inode->i_private = data;
  794. inode->i_flags |= S_PRIVATE | S_ANON_INODE;
  795. /* We allow to set xattrs. */
  796. inode->i_flags &= ~S_IMMUTABLE;
  797. inode->i_mode |= S_IRWXU;
  798. inode->i_op = &pidfs_inode_operations;
  799. inode->i_fop = &pidfs_file_operations;
  800. inode->i_ino = pidfs_ino(pid->ino);
  801. inode->i_generation = pidfs_gen(pid->ino);
  802. return 0;
  803. }
  /*
   * ->put_data() stashed operation: @data is a struct pid; drop the
   * reference on it via put_pid().
   */
  static void pidfs_put_data(void *data)
  {
  	put_pid(data);
  }
  809. /**
  810. * pidfs_register_pid - register a struct pid in pidfs
  811. * @pid: pid to pin
  812. *
  813. * Register a struct pid in pidfs.
  814. *
  815. * Return: On success zero, on error a negative error code is returned.
  816. */
  817. int pidfs_register_pid(struct pid *pid)
  818. {
  819. struct pidfs_attr *new_attr __free(kfree) = NULL;
  820. struct pidfs_attr *attr;
  821. might_sleep();
  822. if (!pid)
  823. return 0;
  824. attr = READ_ONCE(pid->attr);
  825. if (unlikely(attr == PIDFS_PID_DEAD))
  826. return PTR_ERR(PIDFS_PID_DEAD);
  827. if (attr)
  828. return 0;
  829. new_attr = kmem_cache_zalloc(pidfs_attr_cachep, GFP_KERNEL);
  830. if (!new_attr)
  831. return -ENOMEM;
  832. /* Synchronize with pidfs_exit(). */
  833. guard(spinlock_irq)(&pid->wait_pidfd.lock);
  834. attr = pid->attr;
  835. if (unlikely(attr == PIDFS_PID_DEAD))
  836. return PTR_ERR(PIDFS_PID_DEAD);
  837. if (unlikely(attr))
  838. return 0;
  839. pid->attr = no_free_ptr(new_attr);
  840. return 0;
  841. }
  842. static struct dentry *pidfs_stash_dentry(struct dentry **stashed,
  843. struct dentry *dentry)
  844. {
  845. int ret;
  846. struct pid *pid = d_inode(dentry)->i_private;
  847. VFS_WARN_ON_ONCE(stashed != &pid->stashed);
  848. ret = pidfs_register_pid(pid);
  849. if (ret)
  850. return ERR_PTR(ret);
  851. return stash_dentry(stashed, dentry);
  852. }
  853. static const struct stashed_operations pidfs_stashed_ops = {
  854. .stash_dentry = pidfs_stash_dentry,
  855. .init_inode = pidfs_init_inode,
  856. .put_data = pidfs_put_data,
  857. };
  858. static int pidfs_xattr_get(const struct xattr_handler *handler,
  859. struct dentry *unused, struct inode *inode,
  860. const char *suffix, void *value, size_t size)
  861. {
  862. struct pid *pid = inode->i_private;
  863. struct pidfs_attr *attr = pid->attr;
  864. const char *name;
  865. struct simple_xattrs *xattrs;
  866. xattrs = READ_ONCE(attr->xattrs);
  867. if (!xattrs)
  868. return 0;
  869. name = xattr_full_name(handler, suffix);
  870. return simple_xattr_get(xattrs, name, value, size);
  871. }
  872. static int pidfs_xattr_set(const struct xattr_handler *handler,
  873. struct mnt_idmap *idmap, struct dentry *unused,
  874. struct inode *inode, const char *suffix,
  875. const void *value, size_t size, int flags)
  876. {
  877. struct pid *pid = inode->i_private;
  878. struct pidfs_attr *attr = pid->attr;
  879. const char *name;
  880. struct simple_xattrs *xattrs;
  881. struct simple_xattr *old_xattr;
  882. /* Ensure we're the only one to set @attr->xattrs. */
  883. WARN_ON_ONCE(!inode_is_locked(inode));
  884. xattrs = READ_ONCE(attr->xattrs);
  885. if (!xattrs) {
  886. xattrs = kmem_cache_zalloc(pidfs_xattr_cachep, GFP_KERNEL);
  887. if (!xattrs)
  888. return -ENOMEM;
  889. simple_xattrs_init(xattrs);
  890. smp_store_release(&pid->attr->xattrs, xattrs);
  891. }
  892. name = xattr_full_name(handler, suffix);
  893. old_xattr = simple_xattr_set(xattrs, name, value, size, flags);
  894. if (IS_ERR(old_xattr))
  895. return PTR_ERR(old_xattr);
  896. simple_xattr_free(old_xattr);
  897. return 0;
  898. }
  899. static const struct xattr_handler pidfs_trusted_xattr_handler = {
  900. .prefix = XATTR_TRUSTED_PREFIX,
  901. .get = pidfs_xattr_get,
  902. .set = pidfs_xattr_set,
  903. };
  904. static const struct xattr_handler *const pidfs_xattr_handlers[] = {
  905. &pidfs_trusted_xattr_handler,
  906. NULL
  907. };
  908. static int pidfs_init_fs_context(struct fs_context *fc)
  909. {
  910. struct pseudo_fs_context *ctx;
  911. ctx = init_pseudo(fc, PID_FS_MAGIC);
  912. if (!ctx)
  913. return -ENOMEM;
  914. fc->s_iflags |= SB_I_NOEXEC;
  915. fc->s_iflags |= SB_I_NODEV;
  916. ctx->s_d_flags |= DCACHE_DONTCACHE;
  917. ctx->ops = &pidfs_sops;
  918. ctx->eops = &pidfs_export_operations;
  919. ctx->dops = &pidfs_dentry_operations;
  920. ctx->xattr = pidfs_xattr_handlers;
  921. fc->s_fs_info = (void *)&pidfs_stashed_ops;
  922. return 0;
  923. }
  924. static struct file_system_type pidfs_type = {
  925. .name = "pidfs",
  926. .init_fs_context = pidfs_init_fs_context,
  927. .kill_sb = kill_anon_super,
  928. };
  929. struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
  930. {
  931. struct file *pidfd_file;
  932. struct path path __free(path_put) = {};
  933. int ret;
  934. /*
  935. * Ensure that PIDFD_STALE can be passed as a flag without
  936. * overloading other uapi pidfd flags.
  937. */
  938. BUILD_BUG_ON(PIDFD_STALE == PIDFD_THREAD);
  939. BUILD_BUG_ON(PIDFD_STALE == PIDFD_NONBLOCK);
  940. ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path);
  941. if (ret < 0)
  942. return ERR_PTR(ret);
  943. VFS_WARN_ON_ONCE(!pid->attr);
  944. flags &= ~PIDFD_STALE;
  945. flags |= O_RDWR;
  946. pidfd_file = dentry_open(&path, flags, current_cred());
  947. /* Raise PIDFD_THREAD explicitly as do_dentry_open() strips it. */
  948. if (!IS_ERR(pidfd_file))
  949. pidfd_file->f_flags |= (flags & PIDFD_THREAD);
  950. return pidfd_file;
  951. }
  952. void __init pidfs_init(void)
  953. {
  954. if (rhashtable_init(&pidfs_ino_ht, &pidfs_ino_ht_params))
  955. panic("Failed to initialize pidfs hashtable");
  956. pidfs_attr_cachep = kmem_cache_create("pidfs_attr_cache", sizeof(struct pidfs_attr), 0,
  957. (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
  958. SLAB_ACCOUNT | SLAB_PANIC), NULL);
  959. pidfs_xattr_cachep = kmem_cache_create("pidfs_xattr_cache",
  960. sizeof(struct simple_xattrs), 0,
  961. (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
  962. SLAB_ACCOUNT | SLAB_PANIC), NULL);
  963. pidfs_mnt = kern_mount(&pidfs_type);
  964. if (IS_ERR(pidfs_mnt))
  965. panic("Failed to mount pidfs pseudo filesystem");
  966. pidfs_root_path.mnt = pidfs_mnt;
  967. pidfs_root_path.dentry = pidfs_mnt->mnt_root;
  968. }