pid_namespace.c 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * Pid namespaces
  4. *
  5. * Authors:
  6. * (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc.
  7. * (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM
  8. * Many thanks to Oleg Nesterov for comments and help
  9. *
  10. */
  11. #include <linux/pid.h>
  12. #include <linux/pid_namespace.h>
  13. #include <linux/user_namespace.h>
  14. #include <linux/syscalls.h>
  15. #include <linux/cred.h>
  16. #include <linux/err.h>
  17. #include <linux/acct.h>
  18. #include <linux/slab.h>
  19. #include <linux/proc_ns.h>
  20. #include <linux/reboot.h>
  21. #include <linux/export.h>
  22. #include <linux/sched/task.h>
  23. #include <linux/sched/signal.h>
  24. #include <linux/idr.h>
  25. #include <linux/nstree.h>
  26. #include <uapi/linux/wait.h>
  27. #include "pid_sysctl.h"
  28. static DEFINE_MUTEX(pid_caches_mutex);
  29. static struct kmem_cache *pid_ns_cachep;
  30. /* Write once array, filled from the beginning. */
  31. static struct kmem_cache *pid_cache[MAX_PID_NS_LEVEL];
  32. /*
  33. * creates the kmem cache to allocate pids from.
  34. * @level: pid namespace level
  35. */
  36. static struct kmem_cache *create_pid_cachep(unsigned int level)
  37. {
  38. /* Level 0 is init_pid_ns.pid_cachep */
  39. struct kmem_cache **pkc = &pid_cache[level - 1];
  40. struct kmem_cache *kc;
  41. char name[4 + 10 + 1];
  42. unsigned int len;
  43. kc = READ_ONCE(*pkc);
  44. if (kc)
  45. return kc;
  46. snprintf(name, sizeof(name), "pid_%u", level + 1);
  47. len = struct_size_t(struct pid, numbers, level + 1);
  48. mutex_lock(&pid_caches_mutex);
  49. /* Name collision forces to do allocation under mutex. */
  50. if (!*pkc)
  51. *pkc = kmem_cache_create(name, len, 0,
  52. SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
  53. mutex_unlock(&pid_caches_mutex);
  54. /* current can fail, but someone else can succeed. */
  55. return READ_ONCE(*pkc);
  56. }
  57. static struct ucounts *inc_pid_namespaces(struct user_namespace *ns)
  58. {
  59. return inc_ucount(ns, current_euid(), UCOUNT_PID_NAMESPACES);
  60. }
/*
 * Counterpart of inc_pid_namespaces(): drop one UCOUNT_PID_NAMESPACES
 * count from @ucounts.
 */
static void dec_pid_namespaces(struct ucounts *ucounts)
{
	dec_ucount(ucounts, UCOUNT_PID_NAMESPACES);
}
  65. static void destroy_pid_namespace_work(struct work_struct *work);
  66. static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns,
  67. struct pid_namespace *parent_pid_ns)
  68. {
  69. struct pid_namespace *ns;
  70. unsigned int level = parent_pid_ns->level + 1;
  71. struct ucounts *ucounts;
  72. int err;
  73. err = -EINVAL;
  74. if (!in_userns(parent_pid_ns->user_ns, user_ns))
  75. goto out;
  76. err = -ENOSPC;
  77. if (level > MAX_PID_NS_LEVEL)
  78. goto out;
  79. ucounts = inc_pid_namespaces(user_ns);
  80. if (!ucounts)
  81. goto out;
  82. err = -ENOMEM;
  83. ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
  84. if (ns == NULL)
  85. goto out_dec;
  86. idr_init(&ns->idr);
  87. ns->pid_cachep = create_pid_cachep(level);
  88. if (ns->pid_cachep == NULL)
  89. goto out_free_idr;
  90. err = ns_common_init(ns);
  91. if (err)
  92. goto out_free_idr;
  93. ns->pid_max = PID_MAX_LIMIT;
  94. err = register_pidns_sysctls(ns);
  95. if (err)
  96. goto out_free_inum;
  97. ns->level = level;
  98. ns->parent = get_pid_ns(parent_pid_ns);
  99. ns->user_ns = get_user_ns(user_ns);
  100. ns->ucounts = ucounts;
  101. ns->pid_allocated = PIDNS_ADDING;
  102. INIT_WORK(&ns->work, destroy_pid_namespace_work);
  103. #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
  104. ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns);
  105. #endif
  106. ns_tree_add(ns);
  107. return ns;
  108. out_free_inum:
  109. ns_common_free(ns);
  110. out_free_idr:
  111. idr_destroy(&ns->idr);
  112. kmem_cache_free(pid_ns_cachep, ns);
  113. out_dec:
  114. dec_pid_namespaces(ucounts);
  115. out:
  116. return ERR_PTR(err);
  117. }
  118. static void delayed_free_pidns(struct rcu_head *p)
  119. {
  120. struct pid_namespace *ns = container_of(p, struct pid_namespace, rcu);
  121. dec_pid_namespaces(ns->ucounts);
  122. put_user_ns(ns->user_ns);
  123. kmem_cache_free(pid_ns_cachep, ns);
  124. }
/*
 * Tear down a single pid namespace: remove it from the namespace tree,
 * unregister its sysctls, release its ns_common state and its idr, and
 * defer the final free to delayed_free_pidns() via call_rcu().
 *
 * The parent reference is not dropped here; destroy_pid_namespace_work()
 * handles the walk up the parent chain.
 */
static void destroy_pid_namespace(struct pid_namespace *ns)
{
	ns_tree_remove(ns);
	unregister_pidns_sysctls(ns);

	ns_common_free(ns);

	idr_destroy(&ns->idr);
	/* The object itself is freed from the RCU callback. */
	call_rcu(&ns->rcu, delayed_free_pidns);
}
  133. static void destroy_pid_namespace_work(struct work_struct *work)
  134. {
  135. struct pid_namespace *ns =
  136. container_of(work, struct pid_namespace, work);
  137. do {
  138. struct pid_namespace *parent;
  139. parent = ns->parent;
  140. destroy_pid_namespace(ns);
  141. ns = parent;
  142. } while (ns != &init_pid_ns && ns_ref_put(ns));
  143. }
  144. struct pid_namespace *copy_pid_ns(u64 flags,
  145. struct user_namespace *user_ns, struct pid_namespace *old_ns)
  146. {
  147. if (!(flags & CLONE_NEWPID))
  148. return get_pid_ns(old_ns);
  149. if (task_active_pid_ns(current) != old_ns)
  150. return ERR_PTR(-EINVAL);
  151. return create_pid_namespace(user_ns, old_ns);
  152. }
  153. void put_pid_ns(struct pid_namespace *ns)
  154. {
  155. if (ns && ns_ref_put(ns))
  156. schedule_work(&ns->work);
  157. }
  158. EXPORT_SYMBOL_GPL(put_pid_ns);
  159. void zap_pid_ns_processes(struct pid_namespace *pid_ns)
  160. {
  161. int nr;
  162. int rc;
  163. struct task_struct *task, *me = current;
  164. int init_pids = thread_group_leader(me) ? 1 : 2;
  165. struct pid *pid;
  166. /* Don't allow any more processes into the pid namespace */
  167. disable_pid_allocation(pid_ns);
  168. /*
  169. * Ignore SIGCHLD causing any terminated children to autoreap.
  170. * This speeds up the namespace shutdown, plus see the comment
  171. * below.
  172. */
  173. spin_lock_irq(&me->sighand->siglock);
  174. me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN;
  175. spin_unlock_irq(&me->sighand->siglock);
  176. /*
  177. * The last thread in the cgroup-init thread group is terminating.
  178. * Find remaining pid_ts in the namespace, signal and wait for them
  179. * to exit.
  180. *
  181. * Note: This signals each threads in the namespace - even those that
  182. * belong to the same thread group, To avoid this, we would have
  183. * to walk the entire tasklist looking a processes in this
  184. * namespace, but that could be unnecessarily expensive if the
  185. * pid namespace has just a few processes. Or we need to
  186. * maintain a tasklist for each pid namespace.
  187. *
  188. */
  189. rcu_read_lock();
  190. read_lock(&tasklist_lock);
  191. nr = 2;
  192. idr_for_each_entry_continue(&pid_ns->idr, pid, nr) {
  193. task = pid_task(pid, PIDTYPE_PID);
  194. if (task && !__fatal_signal_pending(task))
  195. group_send_sig_info(SIGKILL, SEND_SIG_PRIV, task, PIDTYPE_MAX);
  196. }
  197. read_unlock(&tasklist_lock);
  198. rcu_read_unlock();
  199. /*
  200. * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD.
  201. * kernel_wait4() will also block until our children traced from the
  202. * parent namespace are detached and become EXIT_DEAD.
  203. */
  204. do {
  205. clear_thread_flag(TIF_SIGPENDING);
  206. clear_thread_flag(TIF_NOTIFY_SIGNAL);
  207. rc = kernel_wait4(-1, NULL, __WALL, NULL);
  208. } while (rc != -ECHILD);
  209. /*
  210. * kernel_wait4() misses EXIT_DEAD children, and EXIT_ZOMBIE
  211. * process whose parents processes are outside of the pid
  212. * namespace. Such processes are created with setns()+fork().
  213. *
  214. * If those EXIT_ZOMBIE processes are not reaped by their
  215. * parents before their parents exit, they will be reparented
  216. * to pid_ns->child_reaper. Thus pidns->child_reaper needs to
  217. * stay valid until they all go away.
  218. *
  219. * The code relies on the pid_ns->child_reaper ignoring
  220. * SIGCHILD to cause those EXIT_ZOMBIE processes to be
  221. * autoreaped if reparented.
  222. *
  223. * Semantically it is also desirable to wait for EXIT_ZOMBIE
  224. * processes before allowing the child_reaper to be reaped, as
  225. * that gives the invariant that when the init process of a
  226. * pid namespace is reaped all of the processes in the pid
  227. * namespace are gone.
  228. *
  229. * Once all of the other tasks are gone from the pid_namespace
  230. * free_pid() will awaken this task.
  231. */
  232. for (;;) {
  233. set_current_state(TASK_INTERRUPTIBLE);
  234. if (pid_ns->pid_allocated == init_pids)
  235. break;
  236. schedule();
  237. }
  238. __set_current_state(TASK_RUNNING);
  239. if (pid_ns->reboot)
  240. current->signal->group_exit_code = pid_ns->reboot;
  241. acct_exit_ns(pid_ns);
  242. return;
  243. }
  244. #ifdef CONFIG_CHECKPOINT_RESTORE
  245. static int pid_ns_ctl_handler(const struct ctl_table *table, int write,
  246. void *buffer, size_t *lenp, loff_t *ppos)
  247. {
  248. struct pid_namespace *pid_ns = task_active_pid_ns(current);
  249. struct ctl_table tmp = *table;
  250. int ret, next;
  251. if (write && !checkpoint_restore_ns_capable(pid_ns->user_ns))
  252. return -EPERM;
  253. next = idr_get_cursor(&pid_ns->idr) - 1;
  254. tmp.data = &next;
  255. tmp.extra2 = &pid_ns->pid_max;
  256. ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
  257. if (!ret && write)
  258. idr_set_cursor(&pid_ns->idr, next + 1);
  259. return ret;
  260. }
  261. static const struct ctl_table pid_ns_ctl_table[] = {
  262. {
  263. .procname = "ns_last_pid",
  264. .maxlen = sizeof(int),
  265. .mode = 0666, /* permissions are checked in the handler */
  266. .proc_handler = pid_ns_ctl_handler,
  267. .extra1 = SYSCTL_ZERO,
  268. .extra2 = &init_pid_ns.pid_max,
  269. },
  270. };
  271. #endif /* CONFIG_CHECKPOINT_RESTORE */
  272. int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
  273. {
  274. if (pid_ns == &init_pid_ns)
  275. return 0;
  276. switch (cmd) {
  277. case LINUX_REBOOT_CMD_RESTART2:
  278. case LINUX_REBOOT_CMD_RESTART:
  279. pid_ns->reboot = SIGHUP;
  280. break;
  281. case LINUX_REBOOT_CMD_POWER_OFF:
  282. case LINUX_REBOOT_CMD_HALT:
  283. pid_ns->reboot = SIGINT;
  284. break;
  285. default:
  286. return -EINVAL;
  287. }
  288. read_lock(&tasklist_lock);
  289. send_sig(SIGKILL, pid_ns->child_reaper, 1);
  290. read_unlock(&tasklist_lock);
  291. do_exit(0);
  292. /* Not reached */
  293. return 0;
  294. }
  295. static struct ns_common *pidns_get(struct task_struct *task)
  296. {
  297. struct pid_namespace *ns;
  298. rcu_read_lock();
  299. ns = task_active_pid_ns(task);
  300. if (ns)
  301. get_pid_ns(ns);
  302. rcu_read_unlock();
  303. return ns ? &ns->ns : NULL;
  304. }
  305. static struct ns_common *pidns_for_children_get(struct task_struct *task)
  306. {
  307. struct pid_namespace *ns = NULL;
  308. task_lock(task);
  309. if (task->nsproxy) {
  310. ns = task->nsproxy->pid_ns_for_children;
  311. get_pid_ns(ns);
  312. }
  313. task_unlock(task);
  314. if (ns) {
  315. read_lock(&tasklist_lock);
  316. if (!ns->child_reaper) {
  317. put_pid_ns(ns);
  318. ns = NULL;
  319. }
  320. read_unlock(&tasklist_lock);
  321. }
  322. return ns ? &ns->ns : NULL;
  323. }
  324. static void pidns_put(struct ns_common *ns)
  325. {
  326. put_pid_ns(to_pid_ns(ns));
  327. }
  328. bool pidns_is_ancestor(struct pid_namespace *child,
  329. struct pid_namespace *ancestor)
  330. {
  331. struct pid_namespace *ns;
  332. if (child->level < ancestor->level)
  333. return false;
  334. for (ns = child; ns->level > ancestor->level; ns = ns->parent)
  335. ;
  336. return ns == ancestor;
  337. }
  338. static int pidns_install(struct nsset *nsset, struct ns_common *ns)
  339. {
  340. struct nsproxy *nsproxy = nsset->nsproxy;
  341. struct pid_namespace *active = task_active_pid_ns(current);
  342. struct pid_namespace *new = to_pid_ns(ns);
  343. if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) ||
  344. !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN))
  345. return -EPERM;
  346. /*
  347. * Only allow entering the current active pid namespace
  348. * or a child of the current active pid namespace.
  349. *
  350. * This is required for fork to return a usable pid value and
  351. * this maintains the property that processes and their
  352. * children can not escape their current pid namespace.
  353. */
  354. if (!pidns_is_ancestor(new, active))
  355. return -EINVAL;
  356. put_pid_ns(nsproxy->pid_ns_for_children);
  357. nsproxy->pid_ns_for_children = get_pid_ns(new);
  358. return 0;
  359. }
  360. static struct ns_common *pidns_get_parent(struct ns_common *ns)
  361. {
  362. struct pid_namespace *active = task_active_pid_ns(current);
  363. struct pid_namespace *pid_ns, *p;
  364. /* See if the parent is in the current namespace */
  365. pid_ns = p = to_pid_ns(ns)->parent;
  366. for (;;) {
  367. if (!p)
  368. return ERR_PTR(-EPERM);
  369. if (p == active)
  370. break;
  371. p = p->parent;
  372. }
  373. return &get_pid_ns(pid_ns)->ns;
  374. }
  375. static struct user_namespace *pidns_owner(struct ns_common *ns)
  376. {
  377. return to_pid_ns(ns)->user_ns;
  378. }
  379. const struct proc_ns_operations pidns_operations = {
  380. .name = "pid",
  381. .get = pidns_get,
  382. .put = pidns_put,
  383. .install = pidns_install,
  384. .owner = pidns_owner,
  385. .get_parent = pidns_get_parent,
  386. };
  387. const struct proc_ns_operations pidns_for_children_operations = {
  388. .name = "pid_for_children",
  389. .real_ns_name = "pid",
  390. .get = pidns_for_children_get,
  391. .put = pidns_put,
  392. .install = pidns_install,
  393. .owner = pidns_owner,
  394. .get_parent = pidns_get_parent,
  395. };
/*
 * Boot-time setup: create the slab cache for struct pid_namespace, register
 * the "kernel" sysctl table (ns_last_pid) when CONFIG_CHECKPOINT_RESTORE is
 * enabled, register the pid namespace vm sysctl table, and add init_pid_ns
 * to the namespace tree. Always returns 0.
 */
static __init int pid_namespaces_init(void)
{
	/* SLAB_PANIC is passed, so no failure check follows. */
	pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC | SLAB_ACCOUNT);

#ifdef CONFIG_CHECKPOINT_RESTORE
	register_sysctl_init("kernel", pid_ns_ctl_table);
#endif

	register_pid_ns_sysctl_table_vm();
	ns_tree_add(&init_pid_ns);
	return 0;
}

__initcall(pid_namespaces_init);