net_namespace.c 37 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  3. #include <linux/workqueue.h>
  4. #include <linux/rtnetlink.h>
  5. #include <linux/cache.h>
  6. #include <linux/slab.h>
  7. #include <linux/list.h>
  8. #include <linux/delay.h>
  9. #include <linux/sched.h>
  10. #include <linux/idr.h>
  11. #include <linux/rculist.h>
  12. #include <linux/nsproxy.h>
  13. #include <linux/fs.h>
  14. #include <linux/proc_ns.h>
  15. #include <linux/file.h>
  16. #include <linux/export.h>
  17. #include <linux/user_namespace.h>
  18. #include <linux/net_namespace.h>
  19. #include <linux/sched/task.h>
  20. #include <linux/uidgid.h>
  21. #include <linux/proc_fs.h>
  22. #include <linux/nstree.h>
  23. #include <net/aligned_data.h>
  24. #include <net/sock.h>
  25. #include <net/netlink.h>
  26. #include <net/net_namespace.h>
  27. #include <net/netns/generic.h>
/*
 * Our network namespace constructor/destructor lists
 */

/* Registered pernet_operations; walked by setup_net() and ops_undo_list(). */
static LIST_HEAD(pernet_list);
/* Initially points at the pernet_list head itself. */
static struct list_head *first_device = &pernet_list;

/* Network namespaces that setup_net() has published, linked via net->list. */
LIST_HEAD(net_namespace_list);
EXPORT_SYMBOL_GPL(net_namespace_list);

/* Protects net_namespace_list. Nests inside rtnl_lock() */
DECLARE_RWSEM(net_rwsem);
EXPORT_SYMBOL_GPL(net_rwsem);

#ifdef CONFIG_KEYS
/* Key domain tag for init_net, created with a single reference. */
static struct key_tag init_net_key_domain = { .usage = REFCOUNT_INIT(1) };
#endif

/* The initial network namespace. */
struct net init_net;
EXPORT_SYMBOL(init_net);

static bool init_net_initialized;
/*
 * pernet_ops_rwsem: protects: pernet_list, net_generic_ids,
 * init_net_initialized and first_device pointer.
 * This is internal net namespace object. Please, don't use it
 * outside.
 */
DECLARE_RWSEM(pernet_ops_rwsem);

/*
 * Size of struct net_generic expressed in pointer-sized slots, rounded up.
 * net_assign_generic() refuses any id below this value.
 */
#define MIN_PERNET_OPS_ID	\
	((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *))

#define INITIAL_NET_GEN_PTRS	13 /* +1 for len +2 for rcu_head */

/* Number of ptr[] slots net_alloc_generic() sizes new arrays for. */
static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;
  55. static struct net_generic *net_alloc_generic(void)
  56. {
  57. unsigned int gen_ptrs = READ_ONCE(max_gen_ptrs);
  58. unsigned int generic_size;
  59. struct net_generic *ng;
  60. generic_size = offsetof(struct net_generic, ptr[gen_ptrs]);
  61. ng = kzalloc(generic_size, GFP_KERNEL);
  62. if (ng)
  63. ng->s.len = gen_ptrs;
  64. return ng;
  65. }
  66. static int net_assign_generic(struct net *net, unsigned int id, void *data)
  67. {
  68. struct net_generic *ng, *old_ng;
  69. BUG_ON(id < MIN_PERNET_OPS_ID);
  70. old_ng = rcu_dereference_protected(net->gen,
  71. lockdep_is_held(&pernet_ops_rwsem));
  72. if (old_ng->s.len > id) {
  73. old_ng->ptr[id] = data;
  74. return 0;
  75. }
  76. ng = net_alloc_generic();
  77. if (!ng)
  78. return -ENOMEM;
  79. /*
  80. * Some synchronisation notes:
  81. *
  82. * The net_generic explores the net->gen array inside rcu
  83. * read section. Besides once set the net->gen->ptr[x]
  84. * pointer never changes (see rules in netns/generic.h).
  85. *
  86. * That said, we simply duplicate this array and schedule
  87. * the old copy for kfree after a grace period.
  88. */
  89. memcpy(&ng->ptr[MIN_PERNET_OPS_ID], &old_ng->ptr[MIN_PERNET_OPS_ID],
  90. (old_ng->s.len - MIN_PERNET_OPS_ID) * sizeof(void *));
  91. ng->ptr[id] = data;
  92. rcu_assign_pointer(net->gen, ng);
  93. kfree_rcu(old_ng, s.rcu);
  94. return 0;
  95. }
  96. static int ops_init(const struct pernet_operations *ops, struct net *net)
  97. {
  98. struct net_generic *ng;
  99. int err = -ENOMEM;
  100. void *data = NULL;
  101. if (ops->id) {
  102. data = kzalloc(ops->size, GFP_KERNEL);
  103. if (!data)
  104. goto out;
  105. err = net_assign_generic(net, *ops->id, data);
  106. if (err)
  107. goto cleanup;
  108. }
  109. err = 0;
  110. if (ops->init)
  111. err = ops->init(net);
  112. if (!err)
  113. return 0;
  114. if (ops->id) {
  115. ng = rcu_dereference_protected(net->gen,
  116. lockdep_is_held(&pernet_ops_rwsem));
  117. ng->ptr[*ops->id] = NULL;
  118. }
  119. cleanup:
  120. kfree(data);
  121. out:
  122. return err;
  123. }
  124. static void ops_pre_exit_list(const struct pernet_operations *ops,
  125. struct list_head *net_exit_list)
  126. {
  127. struct net *net;
  128. if (ops->pre_exit) {
  129. list_for_each_entry(net, net_exit_list, exit_list)
  130. ops->pre_exit(net);
  131. }
  132. }
  133. static void ops_exit_rtnl_list(const struct list_head *ops_list,
  134. const struct pernet_operations *ops,
  135. struct list_head *net_exit_list)
  136. {
  137. const struct pernet_operations *saved_ops = ops;
  138. LIST_HEAD(dev_kill_list);
  139. struct net *net;
  140. rtnl_lock();
  141. list_for_each_entry(net, net_exit_list, exit_list) {
  142. __rtnl_net_lock(net);
  143. ops = saved_ops;
  144. list_for_each_entry_continue_reverse(ops, ops_list, list) {
  145. if (ops->exit_rtnl)
  146. ops->exit_rtnl(net, &dev_kill_list);
  147. }
  148. __rtnl_net_unlock(net);
  149. }
  150. unregister_netdevice_many(&dev_kill_list);
  151. rtnl_unlock();
  152. }
  153. static void ops_exit_list(const struct pernet_operations *ops,
  154. struct list_head *net_exit_list)
  155. {
  156. if (ops->exit) {
  157. struct net *net;
  158. list_for_each_entry(net, net_exit_list, exit_list) {
  159. ops->exit(net);
  160. cond_resched();
  161. }
  162. }
  163. if (ops->exit_batch)
  164. ops->exit_batch(net_exit_list);
  165. }
  166. static void ops_free_list(const struct pernet_operations *ops,
  167. struct list_head *net_exit_list)
  168. {
  169. struct net *net;
  170. if (ops->id) {
  171. list_for_each_entry(net, net_exit_list, exit_list)
  172. kfree(net_generic(net, *ops->id));
  173. }
  174. }
/*
 * Tear down, in reverse registration order, the pernet state of every net
 * on @net_exit_list.
 *
 * @ops_list:      list of pernet_operations to walk
 * @ops:           entry to start from (exclusive); the walk covers the
 *                 entries that precede it.  NULL means "walk the whole list".
 * @net_exit_list: nets being torn down, linked through net->exit_list
 * @expedite_rcu:  use synchronize_rcu_expedited() instead of
 *                 synchronize_rcu() for the grace period
 *
 * The phases run strictly in this order: ->pre_exit() for all ops, an RCU
 * grace period, ->exit_rtnl() (only if some ops provides it),
 * ->exit()/->exit_batch(), and finally freeing of the per-net private data.
 */
static void ops_undo_list(const struct list_head *ops_list,
			  const struct pernet_operations *ops,
			  struct list_head *net_exit_list,
			  bool expedite_rcu)
{
	const struct pernet_operations *saved_ops;
	bool hold_rtnl = false;

	/*
	 * With no starting entry, use the list head as a pseudo-entry so the
	 * reverse walks below begin at the last real element.
	 */
	if (!ops)
		ops = list_entry(ops_list, typeof(*ops), list);

	/* Each reverse walk advances 'ops'; remember where to restart. */
	saved_ops = ops;

	list_for_each_entry_continue_reverse(ops, ops_list, list) {
		/* Note whether any ops needs the rtnl phase at all. */
		hold_rtnl |= !!ops->exit_rtnl;
		ops_pre_exit_list(ops, net_exit_list);
	}

	/* Another CPU might be rcu-iterating the list, wait for it.
	 * This needs to be before calling the exit() notifiers, so the
	 * rcu_barrier() after ops_undo_list() isn't sufficient alone.
	 * Also the pre_exit() and exit() methods need this barrier.
	 */
	if (expedite_rcu)
		synchronize_rcu_expedited();
	else
		synchronize_rcu();

	if (hold_rtnl)
		ops_exit_rtnl_list(ops_list, saved_ops, net_exit_list);

	ops = saved_ops;
	list_for_each_entry_continue_reverse(ops, ops_list, list)
		ops_exit_list(ops, net_exit_list);

	/* Private areas go last, after every exit hook has run. */
	ops = saved_ops;
	list_for_each_entry_continue_reverse(ops, ops_list, list)
		ops_free_list(ops, net_exit_list);
}
  207. static void ops_undo_single(struct pernet_operations *ops,
  208. struct list_head *net_exit_list)
  209. {
  210. LIST_HEAD(ops_list);
  211. list_add(&ops->list, &ops_list);
  212. ops_undo_list(&ops_list, NULL, net_exit_list, false);
  213. list_del(&ops->list);
  214. }
  215. /* should be called with nsid_lock held */
  216. static int alloc_netid(struct net *net, struct net *peer, int reqid)
  217. {
  218. int min = 0, max = 0;
  219. if (reqid >= 0) {
  220. min = reqid;
  221. max = reqid + 1;
  222. }
  223. return idr_alloc(&net->netns_ids, peer, min, max, GFP_ATOMIC);
  224. }
  225. /* This function is used by idr_for_each(). If net is equal to peer, the
  226. * function returns the id so that idr_for_each() stops. Because we cannot
  227. * returns the id 0 (idr_for_each() will not stop), we return the magic value
  228. * NET_ID_ZERO (-1) for it.
  229. */
  230. #define NET_ID_ZERO -1
  231. static int net_eq_idr(int id, void *net, void *peer)
  232. {
  233. if (net_eq(net, peer))
  234. return id ? : NET_ID_ZERO;
  235. return 0;
  236. }
  237. /* Must be called from RCU-critical section or with nsid_lock held */
  238. static int __peernet2id(const struct net *net, struct net *peer)
  239. {
  240. int id = idr_for_each(&net->netns_ids, net_eq_idr, peer);
  241. /* Magic value for id 0. */
  242. if (id == NET_ID_ZERO)
  243. return 0;
  244. if (id > 0)
  245. return id;
  246. return NETNSA_NSID_NOT_ASSIGNED;
  247. }
  248. static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid,
  249. struct nlmsghdr *nlh, gfp_t gfp);
  250. /* This function returns the id of a peer netns. If no id is assigned, one will
  251. * be allocated and returned.
  252. */
  253. int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp)
  254. {
  255. int id;
  256. if (!check_net(net))
  257. return NETNSA_NSID_NOT_ASSIGNED;
  258. spin_lock(&net->nsid_lock);
  259. id = __peernet2id(net, peer);
  260. if (id >= 0) {
  261. spin_unlock(&net->nsid_lock);
  262. return id;
  263. }
  264. /* When peer is obtained from RCU lists, we may race with
  265. * its cleanup. Check whether it's alive, and this guarantees
  266. * we never hash a peer back to net->netns_ids, after it has
  267. * just been idr_remove()'d from there in cleanup_net().
  268. */
  269. if (!maybe_get_net(peer)) {
  270. spin_unlock(&net->nsid_lock);
  271. return NETNSA_NSID_NOT_ASSIGNED;
  272. }
  273. id = alloc_netid(net, peer, -1);
  274. spin_unlock(&net->nsid_lock);
  275. put_net(peer);
  276. if (id < 0)
  277. return NETNSA_NSID_NOT_ASSIGNED;
  278. rtnl_net_notifyid(net, RTM_NEWNSID, id, 0, NULL, gfp);
  279. return id;
  280. }
  281. EXPORT_SYMBOL_GPL(peernet2id_alloc);
  282. /* This function returns, if assigned, the id of a peer netns. */
  283. int peernet2id(const struct net *net, struct net *peer)
  284. {
  285. int id;
  286. rcu_read_lock();
  287. id = __peernet2id(net, peer);
  288. rcu_read_unlock();
  289. return id;
  290. }
  291. EXPORT_SYMBOL(peernet2id);
  292. /* This function returns true is the peer netns has an id assigned into the
  293. * current netns.
  294. */
  295. bool peernet_has_id(const struct net *net, struct net *peer)
  296. {
  297. return peernet2id(net, peer) >= 0;
  298. }
  299. struct net *get_net_ns_by_id(const struct net *net, int id)
  300. {
  301. struct net *peer;
  302. if (id < 0)
  303. return NULL;
  304. rcu_read_lock();
  305. peer = idr_find(&net->netns_ids, id);
  306. if (peer)
  307. peer = maybe_get_net(peer);
  308. rcu_read_unlock();
  309. return peer;
  310. }
  311. EXPORT_SYMBOL_GPL(get_net_ns_by_id);
  312. static __net_init void preinit_net_sysctl(struct net *net)
  313. {
  314. net->core.sysctl_somaxconn = SOMAXCONN;
  315. /* Limits per socket sk_omem_alloc usage.
  316. * TCP zerocopy regular usage needs 128 KB.
  317. */
  318. net->core.sysctl_optmem_max = 128 * 1024;
  319. net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED;
  320. net->core.sysctl_tstamp_allow_data = 1;
  321. net->core.sysctl_txq_reselection = msecs_to_jiffies(1000);
  322. }
  323. /* init code that must occur even if setup_net() is not called. */
  324. static __net_init int preinit_net(struct net *net, struct user_namespace *user_ns)
  325. {
  326. int ret;
  327. ret = ns_common_init(net);
  328. if (ret)
  329. return ret;
  330. refcount_set(&net->passive, 1);
  331. ref_tracker_dir_init(&net->refcnt_tracker, 128, "net_refcnt");
  332. ref_tracker_dir_init(&net->notrefcnt_tracker, 128, "net_notrefcnt");
  333. get_random_bytes(&net->hash_mix, sizeof(u32));
  334. net->dev_base_seq = 1;
  335. net->user_ns = user_ns;
  336. idr_init(&net->netns_ids);
  337. spin_lock_init(&net->nsid_lock);
  338. mutex_init(&net->ipv4.ra_mutex);
  339. #ifdef CONFIG_DEBUG_NET_SMALL_RTNL
  340. mutex_init(&net->rtnl_mutex);
  341. lock_set_cmp_fn(&net->rtnl_mutex, rtnl_net_lock_cmp_fn, NULL);
  342. #endif
  343. INIT_LIST_HEAD(&net->ptype_all);
  344. INIT_LIST_HEAD(&net->ptype_specific);
  345. preinit_net_sysctl(net);
  346. return 0;
  347. }
  348. /*
  349. * setup_net runs the initializers for the network namespace object.
  350. */
  351. static __net_init int setup_net(struct net *net)
  352. {
  353. /* Must be called with pernet_ops_rwsem held */
  354. const struct pernet_operations *ops;
  355. LIST_HEAD(net_exit_list);
  356. int error = 0;
  357. net->net_cookie = ns_tree_gen_id(net);
  358. list_for_each_entry(ops, &pernet_list, list) {
  359. error = ops_init(ops, net);
  360. if (error < 0)
  361. goto out_undo;
  362. }
  363. down_write(&net_rwsem);
  364. list_add_tail_rcu(&net->list, &net_namespace_list);
  365. up_write(&net_rwsem);
  366. ns_tree_add_raw(net);
  367. out:
  368. return error;
  369. out_undo:
  370. /* Walk through the list backwards calling the exit functions
  371. * for the pernet modules whose init functions did not fail.
  372. */
  373. list_add(&net->exit_list, &net_exit_list);
  374. ops_undo_list(&pernet_list, ops, &net_exit_list, false);
  375. rcu_barrier();
  376. goto out;
  377. }
  378. #ifdef CONFIG_NET_NS
  379. static struct ucounts *inc_net_namespaces(struct user_namespace *ns)
  380. {
  381. return inc_ucount(ns, current_euid(), UCOUNT_NET_NAMESPACES);
  382. }
  383. static void dec_net_namespaces(struct ucounts *ucounts)
  384. {
  385. dec_ucount(ucounts, UCOUNT_NET_NAMESPACES);
  386. }
/* Slab cache that struct net objects are allocated from and freed to. */
static struct kmem_cache *net_cachep __ro_after_init;
/* Workqueue on which __put_net() queues the netns cleanup work. */
static struct workqueue_struct *netns_wq;
  389. static struct net *net_alloc(void)
  390. {
  391. struct net *net = NULL;
  392. struct net_generic *ng;
  393. ng = net_alloc_generic();
  394. if (!ng)
  395. goto out;
  396. net = kmem_cache_zalloc(net_cachep, GFP_KERNEL);
  397. if (!net)
  398. goto out_free;
  399. #ifdef CONFIG_KEYS
  400. net->key_domain = kzalloc_obj(struct key_tag);
  401. if (!net->key_domain)
  402. goto out_free_2;
  403. refcount_set(&net->key_domain->usage, 1);
  404. #endif
  405. rcu_assign_pointer(net->gen, ng);
  406. out:
  407. return net;
  408. #ifdef CONFIG_KEYS
  409. out_free_2:
  410. kmem_cache_free(net_cachep, net);
  411. net = NULL;
  412. #endif
  413. out_free:
  414. kfree(ng);
  415. goto out;
  416. }
  417. static LLIST_HEAD(defer_free_list);
  418. static void net_complete_free(void)
  419. {
  420. struct llist_node *kill_list;
  421. struct net *net, *next;
  422. /* Get the list of namespaces to free from last round. */
  423. kill_list = llist_del_all(&defer_free_list);
  424. llist_for_each_entry_safe(net, next, kill_list, defer_free_list)
  425. kmem_cache_free(net_cachep, net);
  426. }
  427. void net_passive_dec(struct net *net)
  428. {
  429. if (refcount_dec_and_test(&net->passive)) {
  430. kfree(rcu_access_pointer(net->gen));
  431. /* There should not be any trackers left there. */
  432. ref_tracker_dir_exit(&net->notrefcnt_tracker);
  433. /* Wait for an extra rcu_barrier() before final free. */
  434. llist_add(&net->defer_free_list, &defer_free_list);
  435. }
  436. }
  437. void net_drop_ns(struct ns_common *ns)
  438. {
  439. if (ns)
  440. net_passive_dec(to_net_ns(ns));
  441. }
  442. struct net *copy_net_ns(u64 flags,
  443. struct user_namespace *user_ns, struct net *old_net)
  444. {
  445. struct ucounts *ucounts;
  446. struct net *net;
  447. int rv;
  448. if (!(flags & CLONE_NEWNET))
  449. return get_net(old_net);
  450. ucounts = inc_net_namespaces(user_ns);
  451. if (!ucounts)
  452. return ERR_PTR(-ENOSPC);
  453. net = net_alloc();
  454. if (!net) {
  455. rv = -ENOMEM;
  456. goto dec_ucounts;
  457. }
  458. rv = preinit_net(net, user_ns);
  459. if (rv < 0)
  460. goto dec_ucounts;
  461. net->ucounts = ucounts;
  462. get_user_ns(user_ns);
  463. rv = down_read_killable(&pernet_ops_rwsem);
  464. if (rv < 0)
  465. goto put_userns;
  466. rv = setup_net(net);
  467. up_read(&pernet_ops_rwsem);
  468. if (rv < 0) {
  469. put_userns:
  470. ns_common_free(net);
  471. #ifdef CONFIG_KEYS
  472. key_remove_domain(net->key_domain);
  473. #endif
  474. put_user_ns(user_ns);
  475. net_passive_dec(net);
  476. dec_ucounts:
  477. dec_net_namespaces(ucounts);
  478. return ERR_PTR(rv);
  479. }
  480. return net;
  481. }
  482. /**
  483. * net_ns_get_ownership - get sysfs ownership data for @net
  484. * @net: network namespace in question (can be NULL)
  485. * @uid: kernel user ID for sysfs objects
  486. * @gid: kernel group ID for sysfs objects
  487. *
  488. * Returns the uid/gid pair of root in the user namespace associated with the
  489. * given network namespace.
  490. */
  491. void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid)
  492. {
  493. if (net) {
  494. kuid_t ns_root_uid = make_kuid(net->user_ns, 0);
  495. kgid_t ns_root_gid = make_kgid(net->user_ns, 0);
  496. if (uid_valid(ns_root_uid))
  497. *uid = ns_root_uid;
  498. if (gid_valid(ns_root_gid))
  499. *gid = ns_root_gid;
  500. } else {
  501. *uid = GLOBAL_ROOT_UID;
  502. *gid = GLOBAL_ROOT_GID;
  503. }
  504. }
  505. EXPORT_SYMBOL_GPL(net_ns_get_ownership);
  506. static void unhash_nsid(struct net *last)
  507. {
  508. struct net *tmp, *peer;
  509. /* This function is only called from cleanup_net() work,
  510. * and this work is the only process, that may delete
  511. * a net from net_namespace_list. So, when the below
  512. * is executing, the list may only grow. Thus, we do not
  513. * use for_each_net_rcu() or net_rwsem.
  514. */
  515. for_each_net(tmp) {
  516. int id = 0;
  517. spin_lock(&tmp->nsid_lock);
  518. while ((peer = idr_get_next(&tmp->netns_ids, &id))) {
  519. int curr_id = id;
  520. id++;
  521. if (!peer->is_dying)
  522. continue;
  523. idr_remove(&tmp->netns_ids, curr_id);
  524. spin_unlock(&tmp->nsid_lock);
  525. rtnl_net_notifyid(tmp, RTM_DELNSID, curr_id, 0, NULL,
  526. GFP_KERNEL);
  527. spin_lock(&tmp->nsid_lock);
  528. }
  529. spin_unlock(&tmp->nsid_lock);
  530. if (tmp == last)
  531. break;
  532. }
  533. }
/* Namespaces queued by __put_net() and waiting for cleanup_net(). */
static LLIST_HEAD(cleanup_list);

/* Task currently running cleanup_net(), or NULL when it is not running. */
struct task_struct *cleanup_net_task;

/*
 * Work function that destroys every namespace queued on cleanup_list.
 *
 * The queued nets are unlinked from net_namespace_list and marked dying,
 * nsids referring to them are removed from the surviving nets, the pernet
 * exit hooks are run, and finally the per-net resources are released.
 * pernet_ops_rwsem is held for reading from the unlink until the exit
 * hooks have completed.
 */
static void cleanup_net(struct work_struct *work)
{
	struct llist_node *net_kill_list;
	struct net *net, *tmp, *last;
	LIST_HEAD(net_exit_list);

	WRITE_ONCE(cleanup_net_task, current);

	/* Atomically snapshot the list of namespaces to cleanup */
	net_kill_list = llist_del_all(&cleanup_list);

	down_read(&pernet_ops_rwsem);

	/* Don't let anyone else find us. */
	down_write(&net_rwsem);
	llist_for_each_entry(net, net_kill_list, cleanup_list) {
		ns_tree_remove(net);
		list_del_rcu(&net->list);
		net->is_dying = true;
	}
	/* Cache last net. After we unlock rtnl, no new net
	 * added to net_namespace_list can assign nsid pointer
	 * to a net from net_kill_list (see peernet2id_alloc()).
	 * So, we skip them in unhash_nsid().
	 *
	 * Note, that unhash_nsid() does not delete nsid links
	 * between net_kill_list's nets, as they've already
	 * deleted from net_namespace_list. But, this would be
	 * useless anyway, as netns_ids are destroyed there.
	 *
	 * NOTE(review): the lock actually dropped right below is
	 * net_rwsem, not rtnl; the wording above looks stale - confirm.
	 */
	last = list_last_entry(&net_namespace_list, struct net, list);
	up_write(&net_rwsem);

	unhash_nsid(last);

	/* Build the exit list consumed by the pernet exit hooks. */
	llist_for_each_entry(net, net_kill_list, cleanup_list) {
		idr_destroy(&net->netns_ids);
		list_add_tail(&net->exit_list, &net_exit_list);
	}

	ops_undo_list(&pernet_list, NULL, &net_exit_list, true);

	up_read(&pernet_ops_rwsem);

	/* Ensure there are no outstanding rcu callbacks using this
	 * network namespace.
	 */
	rcu_barrier();

	/* Free the nets that net_passive_dec() deferred earlier. */
	net_complete_free();

	/* Finally it is safe to free my network namespace structure */
	list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) {
		list_del_init(&net->exit_list);
		ns_common_free(net);
		dec_net_namespaces(net->ucounts);
#ifdef CONFIG_KEYS
		key_remove_domain(net->key_domain);
#endif
		put_user_ns(net->user_ns);
		net_passive_dec(net);
	}
	WRITE_ONCE(cleanup_net_task, NULL);
}
  589. /**
  590. * net_ns_barrier - wait until concurrent net_cleanup_work is done
  591. *
  592. * cleanup_net runs from work queue and will first remove namespaces
  593. * from the global list, then run net exit functions.
  594. *
  595. * Call this in module exit path to make sure that all netns
  596. * ->exit ops have been invoked before the function is removed.
  597. */
  598. void net_ns_barrier(void)
  599. {
  600. down_write(&pernet_ops_rwsem);
  601. up_write(&pernet_ops_rwsem);
  602. }
  603. EXPORT_SYMBOL(net_ns_barrier);
  604. static DECLARE_WORK(net_cleanup_work, cleanup_net);
  605. void __put_net(struct net *net)
  606. {
  607. ref_tracker_dir_exit(&net->refcnt_tracker);
  608. /* Cleanup the network namespace in process context */
  609. if (llist_add(&net->cleanup_list, &cleanup_list))
  610. queue_work(netns_wq, &net_cleanup_work);
  611. }
  612. EXPORT_SYMBOL_GPL(__put_net);
  613. /**
  614. * get_net_ns - increment the refcount of the network namespace
  615. * @ns: common namespace (net)
  616. *
  617. * Returns the net's common namespace or ERR_PTR() if ref is zero.
  618. */
  619. struct ns_common *get_net_ns(struct ns_common *ns)
  620. {
  621. struct net *net;
  622. net = maybe_get_net(container_of(ns, struct net, ns));
  623. if (net)
  624. return &net->ns;
  625. return ERR_PTR(-EINVAL);
  626. }
  627. EXPORT_SYMBOL_GPL(get_net_ns);
  628. struct net *get_net_ns_by_fd(int fd)
  629. {
  630. CLASS(fd, f)(fd);
  631. if (fd_empty(f))
  632. return ERR_PTR(-EBADF);
  633. if (proc_ns_file(fd_file(f))) {
  634. struct ns_common *ns = get_proc_ns(file_inode(fd_file(f)));
  635. if (ns->ops == &netns_operations)
  636. return get_net(container_of(ns, struct net, ns));
  637. }
  638. return ERR_PTR(-EINVAL);
  639. }
  640. EXPORT_SYMBOL_GPL(get_net_ns_by_fd);
  641. #endif
  642. struct net *get_net_ns_by_pid(pid_t pid)
  643. {
  644. struct task_struct *tsk;
  645. struct net *net;
  646. /* Lookup the network namespace */
  647. net = ERR_PTR(-ESRCH);
  648. rcu_read_lock();
  649. tsk = find_task_by_vpid(pid);
  650. if (tsk) {
  651. struct nsproxy *nsproxy;
  652. task_lock(tsk);
  653. nsproxy = tsk->nsproxy;
  654. if (nsproxy)
  655. net = get_net(nsproxy->net_ns);
  656. task_unlock(tsk);
  657. }
  658. rcu_read_unlock();
  659. return net;
  660. }
  661. EXPORT_SYMBOL_GPL(get_net_ns_by_pid);
  662. #ifdef CONFIG_NET_NS_REFCNT_TRACKER
  663. static void net_ns_net_debugfs(struct net *net)
  664. {
  665. ref_tracker_dir_symlink(&net->refcnt_tracker, "netns-%llx-%u-refcnt",
  666. net->net_cookie, net->ns.inum);
  667. ref_tracker_dir_symlink(&net->notrefcnt_tracker, "netns-%llx-%u-notrefcnt",
  668. net->net_cookie, net->ns.inum);
  669. }
  670. static int __init init_net_debugfs(void)
  671. {
  672. ref_tracker_dir_debugfs(&init_net.refcnt_tracker);
  673. ref_tracker_dir_debugfs(&init_net.notrefcnt_tracker);
  674. net_ns_net_debugfs(&init_net);
  675. return 0;
  676. }
  677. late_initcall(init_net_debugfs);
  678. #else
  679. static void net_ns_net_debugfs(struct net *net)
  680. {
  681. }
  682. #endif
  683. static __net_init int net_ns_net_init(struct net *net)
  684. {
  685. net_ns_net_debugfs(net);
  686. return 0;
  687. }
  688. static struct pernet_operations __net_initdata net_ns_ops = {
  689. .init = net_ns_net_init,
  690. };
/*
 * Netlink attribute policy shared by the nsid request parsers in this file
 * (rtnl_net_newid() and rtnl_net_valid_getid_req()).
 */
static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = {
	[NETNSA_NONE]		= { .type = NLA_UNSPEC },
	[NETNSA_NSID]		= { .type = NLA_S32 },	/* read with nla_get_s32() */
	[NETNSA_PID]		= { .type = NLA_U32 },	/* passed to get_net_ns_by_pid() */
	[NETNSA_FD]		= { .type = NLA_U32 },	/* passed to get_net_ns_by_fd() */
	[NETNSA_TARGET_NSID]	= { .type = NLA_S32 },
};
  698. static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh,
  699. struct netlink_ext_ack *extack)
  700. {
  701. struct net *net = sock_net(skb->sk);
  702. struct nlattr *tb[NETNSA_MAX + 1];
  703. struct nlattr *nla;
  704. struct net *peer;
  705. int nsid, err;
  706. err = nlmsg_parse_deprecated(nlh, sizeof(struct rtgenmsg), tb,
  707. NETNSA_MAX, rtnl_net_policy, extack);
  708. if (err < 0)
  709. return err;
  710. if (!tb[NETNSA_NSID]) {
  711. NL_SET_ERR_MSG(extack, "nsid is missing");
  712. return -EINVAL;
  713. }
  714. nsid = nla_get_s32(tb[NETNSA_NSID]);
  715. if (tb[NETNSA_PID]) {
  716. peer = get_net_ns_by_pid(nla_get_u32(tb[NETNSA_PID]));
  717. nla = tb[NETNSA_PID];
  718. } else if (tb[NETNSA_FD]) {
  719. peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD]));
  720. nla = tb[NETNSA_FD];
  721. } else {
  722. NL_SET_ERR_MSG(extack, "Peer netns reference is missing");
  723. return -EINVAL;
  724. }
  725. if (IS_ERR(peer)) {
  726. NL_SET_BAD_ATTR(extack, nla);
  727. NL_SET_ERR_MSG(extack, "Peer netns reference is invalid");
  728. return PTR_ERR(peer);
  729. }
  730. spin_lock(&net->nsid_lock);
  731. if (__peernet2id(net, peer) >= 0) {
  732. spin_unlock(&net->nsid_lock);
  733. err = -EEXIST;
  734. NL_SET_BAD_ATTR(extack, nla);
  735. NL_SET_ERR_MSG(extack,
  736. "Peer netns already has a nsid assigned");
  737. goto out;
  738. }
  739. err = alloc_netid(net, peer, nsid);
  740. spin_unlock(&net->nsid_lock);
  741. if (err >= 0) {
  742. rtnl_net_notifyid(net, RTM_NEWNSID, err, NETLINK_CB(skb).portid,
  743. nlh, GFP_KERNEL);
  744. err = 0;
  745. } else if (err == -ENOSPC && nsid >= 0) {
  746. err = -EEXIST;
  747. NL_SET_BAD_ATTR(extack, tb[NETNSA_NSID]);
  748. NL_SET_ERR_MSG(extack, "The specified nsid is already used");
  749. }
  750. out:
  751. put_net(peer);
  752. return err;
  753. }
  754. static int rtnl_net_get_size(void)
  755. {
  756. return NLMSG_ALIGN(sizeof(struct rtgenmsg))
  757. + nla_total_size(sizeof(s32)) /* NETNSA_NSID */
  758. + nla_total_size(sizeof(s32)) /* NETNSA_CURRENT_NSID */
  759. ;
  760. }
/* Parameters consumed by rtnl_net_fill() when building an nsid message. */
struct net_fill_args {
	u32 portid;	/* passed to nlmsg_put() as the port id */
	u32 seq;	/* passed to nlmsg_put() as the sequence number */
	int flags;	/* passed to nlmsg_put() as the message flags */
	int cmd;	/* passed to nlmsg_put() as the message type */
	int nsid;	/* value of the NETNSA_NSID attribute */
	bool add_ref;	/* when true, NETNSA_CURRENT_NSID is emitted too */
	int ref_nsid;	/* value of NETNSA_CURRENT_NSID (used if add_ref) */
};
  770. static int rtnl_net_fill(struct sk_buff *skb, struct net_fill_args *args)
  771. {
  772. struct nlmsghdr *nlh;
  773. struct rtgenmsg *rth;
  774. nlh = nlmsg_put(skb, args->portid, args->seq, args->cmd, sizeof(*rth),
  775. args->flags);
  776. if (!nlh)
  777. return -EMSGSIZE;
  778. rth = nlmsg_data(nlh);
  779. rth->rtgen_family = AF_UNSPEC;
  780. if (nla_put_s32(skb, NETNSA_NSID, args->nsid))
  781. goto nla_put_failure;
  782. if (args->add_ref &&
  783. nla_put_s32(skb, NETNSA_CURRENT_NSID, args->ref_nsid))
  784. goto nla_put_failure;
  785. nlmsg_end(skb, nlh);
  786. return 0;
  787. nla_put_failure:
  788. nlmsg_cancel(skb, nlh);
  789. return -EMSGSIZE;
  790. }
  791. static int rtnl_net_valid_getid_req(struct sk_buff *skb,
  792. const struct nlmsghdr *nlh,
  793. struct nlattr **tb,
  794. struct netlink_ext_ack *extack)
  795. {
  796. int i, err;
  797. if (!netlink_strict_get_check(skb))
  798. return nlmsg_parse_deprecated(nlh, sizeof(struct rtgenmsg),
  799. tb, NETNSA_MAX, rtnl_net_policy,
  800. extack);
  801. err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct rtgenmsg), tb,
  802. NETNSA_MAX, rtnl_net_policy,
  803. extack);
  804. if (err)
  805. return err;
  806. for (i = 0; i <= NETNSA_MAX; i++) {
  807. if (!tb[i])
  808. continue;
  809. switch (i) {
  810. case NETNSA_PID:
  811. case NETNSA_FD:
  812. case NETNSA_NSID:
  813. case NETNSA_TARGET_NSID:
  814. break;
  815. default:
  816. NL_SET_ERR_MSG(extack, "Unsupported attribute in peer netns getid request");
  817. return -EINVAL;
  818. }
  819. }
  820. return 0;
  821. }
  822. static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh,
  823. struct netlink_ext_ack *extack)
  824. {
  825. struct net *net = sock_net(skb->sk);
  826. struct nlattr *tb[NETNSA_MAX + 1];
  827. struct net_fill_args fillargs = {
  828. .portid = NETLINK_CB(skb).portid,
  829. .seq = nlh->nlmsg_seq,
  830. .cmd = RTM_NEWNSID,
  831. };
  832. struct net *peer, *target = net;
  833. struct nlattr *nla;
  834. struct sk_buff *msg;
  835. int err;
  836. err = rtnl_net_valid_getid_req(skb, nlh, tb, extack);
  837. if (err < 0)
  838. return err;
  839. if (tb[NETNSA_PID]) {
  840. peer = get_net_ns_by_pid(nla_get_u32(tb[NETNSA_PID]));
  841. nla = tb[NETNSA_PID];
  842. } else if (tb[NETNSA_FD]) {
  843. peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD]));
  844. nla = tb[NETNSA_FD];
  845. } else if (tb[NETNSA_NSID]) {
  846. peer = get_net_ns_by_id(net, nla_get_s32(tb[NETNSA_NSID]));
  847. if (!peer)
  848. peer = ERR_PTR(-ENOENT);
  849. nla = tb[NETNSA_NSID];
  850. } else {
  851. NL_SET_ERR_MSG(extack, "Peer netns reference is missing");
  852. return -EINVAL;
  853. }
  854. if (IS_ERR(peer)) {
  855. NL_SET_BAD_ATTR(extack, nla);
  856. NL_SET_ERR_MSG(extack, "Peer netns reference is invalid");
  857. return PTR_ERR(peer);
  858. }
  859. if (tb[NETNSA_TARGET_NSID]) {
  860. int id = nla_get_s32(tb[NETNSA_TARGET_NSID]);
  861. target = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, id);
  862. if (IS_ERR(target)) {
  863. NL_SET_BAD_ATTR(extack, tb[NETNSA_TARGET_NSID]);
  864. NL_SET_ERR_MSG(extack,
  865. "Target netns reference is invalid");
  866. err = PTR_ERR(target);
  867. goto out;
  868. }
  869. fillargs.add_ref = true;
  870. fillargs.ref_nsid = peernet2id(net, peer);
  871. }
  872. msg = nlmsg_new(rtnl_net_get_size(), GFP_KERNEL);
  873. if (!msg) {
  874. err = -ENOMEM;
  875. goto out;
  876. }
  877. fillargs.nsid = peernet2id(target, peer);
  878. err = rtnl_net_fill(msg, &fillargs);
  879. if (err < 0)
  880. goto err_out;
  881. err = rtnl_unicast(msg, net, NETLINK_CB(skb).portid);
  882. goto out;
  883. err_out:
  884. nlmsg_free(msg);
  885. out:
  886. if (fillargs.add_ref)
  887. put_net(target);
  888. put_net(peer);
  889. return err;
  890. }
  891. struct rtnl_net_dump_cb {
  892. struct net *tgt_net;
  893. struct net *ref_net;
  894. struct sk_buff *skb;
  895. struct net_fill_args fillargs;
  896. int idx;
  897. int s_idx;
  898. };
  899. /* Runs in RCU-critical section. */
  900. static int rtnl_net_dumpid_one(int id, void *peer, void *data)
  901. {
  902. struct rtnl_net_dump_cb *net_cb = (struct rtnl_net_dump_cb *)data;
  903. int ret;
  904. if (net_cb->idx < net_cb->s_idx)
  905. goto cont;
  906. net_cb->fillargs.nsid = id;
  907. if (net_cb->fillargs.add_ref)
  908. net_cb->fillargs.ref_nsid = __peernet2id(net_cb->ref_net, peer);
  909. ret = rtnl_net_fill(net_cb->skb, &net_cb->fillargs);
  910. if (ret < 0)
  911. return ret;
  912. cont:
  913. net_cb->idx++;
  914. return 0;
  915. }
  916. static int rtnl_valid_dump_net_req(const struct nlmsghdr *nlh, struct sock *sk,
  917. struct rtnl_net_dump_cb *net_cb,
  918. struct netlink_callback *cb)
  919. {
  920. struct netlink_ext_ack *extack = cb->extack;
  921. struct nlattr *tb[NETNSA_MAX + 1];
  922. int err, i;
  923. err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct rtgenmsg), tb,
  924. NETNSA_MAX, rtnl_net_policy,
  925. extack);
  926. if (err < 0)
  927. return err;
  928. for (i = 0; i <= NETNSA_MAX; i++) {
  929. if (!tb[i])
  930. continue;
  931. if (i == NETNSA_TARGET_NSID) {
  932. struct net *net;
  933. net = rtnl_get_net_ns_capable(sk, nla_get_s32(tb[i]));
  934. if (IS_ERR(net)) {
  935. NL_SET_BAD_ATTR(extack, tb[i]);
  936. NL_SET_ERR_MSG(extack,
  937. "Invalid target network namespace id");
  938. return PTR_ERR(net);
  939. }
  940. net_cb->fillargs.add_ref = true;
  941. net_cb->ref_net = net_cb->tgt_net;
  942. net_cb->tgt_net = net;
  943. } else {
  944. NL_SET_BAD_ATTR(extack, tb[i]);
  945. NL_SET_ERR_MSG(extack,
  946. "Unsupported attribute in dump request");
  947. return -EINVAL;
  948. }
  949. }
  950. return 0;
  951. }
  952. static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb)
  953. {
  954. struct rtnl_net_dump_cb net_cb = {
  955. .tgt_net = sock_net(skb->sk),
  956. .skb = skb,
  957. .fillargs = {
  958. .portid = NETLINK_CB(cb->skb).portid,
  959. .seq = cb->nlh->nlmsg_seq,
  960. .flags = NLM_F_MULTI,
  961. .cmd = RTM_NEWNSID,
  962. },
  963. .idx = 0,
  964. .s_idx = cb->args[0],
  965. };
  966. int err = 0;
  967. if (cb->strict_check) {
  968. err = rtnl_valid_dump_net_req(cb->nlh, skb->sk, &net_cb, cb);
  969. if (err < 0)
  970. goto end;
  971. }
  972. rcu_read_lock();
  973. idr_for_each(&net_cb.tgt_net->netns_ids, rtnl_net_dumpid_one, &net_cb);
  974. rcu_read_unlock();
  975. cb->args[0] = net_cb.idx;
  976. end:
  977. if (net_cb.fillargs.add_ref)
  978. put_net(net_cb.tgt_net);
  979. return err;
  980. }
  981. static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid,
  982. struct nlmsghdr *nlh, gfp_t gfp)
  983. {
  984. struct net_fill_args fillargs = {
  985. .portid = portid,
  986. .seq = nlh ? nlh->nlmsg_seq : 0,
  987. .cmd = cmd,
  988. .nsid = id,
  989. };
  990. struct sk_buff *msg;
  991. int err = -ENOMEM;
  992. msg = nlmsg_new(rtnl_net_get_size(), gfp);
  993. if (!msg)
  994. goto out;
  995. err = rtnl_net_fill(msg, &fillargs);
  996. if (err < 0)
  997. goto err_out;
  998. rtnl_notify(msg, net, portid, RTNLGRP_NSID, nlh, gfp);
  999. return;
  1000. err_out:
  1001. nlmsg_free(msg);
  1002. out:
  1003. rtnl_set_sk_err(net, RTNLGRP_NSID, err);
  1004. }
  1005. #ifdef CONFIG_NET_NS
  1006. static void __init netns_ipv4_struct_check(void)
  1007. {
  1008. /* TX readonly hotpath cache lines */
  1009. CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
  1010. sysctl_tcp_early_retrans);
  1011. CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
  1012. sysctl_tcp_tso_win_divisor);
  1013. CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
  1014. sysctl_tcp_tso_rtt_log);
  1015. CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
  1016. sysctl_tcp_autocorking);
  1017. CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
  1018. sysctl_tcp_min_snd_mss);
  1019. CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
  1020. sysctl_tcp_notsent_lowat);
  1021. CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
  1022. sysctl_tcp_limit_output_bytes);
  1023. CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
  1024. sysctl_tcp_min_rtt_wlen);
  1025. CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
  1026. sysctl_tcp_wmem);
  1027. CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
  1028. sysctl_ip_fwd_use_pmtu);
  1029. /* RX readonly hotpath cache line */
  1030. CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
  1031. sysctl_tcp_moderate_rcvbuf);
  1032. CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
  1033. sysctl_tcp_rcvbuf_low_rtt);
  1034. CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
  1035. sysctl_ip_early_demux);
  1036. CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
  1037. sysctl_tcp_early_demux);
  1038. CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
  1039. sysctl_tcp_l3mdev_accept);
  1040. CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
  1041. sysctl_tcp_reordering);
  1042. CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
  1043. sysctl_tcp_rmem);
  1044. }
  1045. #endif
  1046. static const struct rtnl_msg_handler net_ns_rtnl_msg_handlers[] __initconst = {
  1047. {.msgtype = RTM_NEWNSID, .doit = rtnl_net_newid,
  1048. .flags = RTNL_FLAG_DOIT_UNLOCKED},
  1049. {.msgtype = RTM_GETNSID, .doit = rtnl_net_getid,
  1050. .dumpit = rtnl_net_dumpid,
  1051. .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED},
  1052. };
  1053. void __init net_ns_init(void)
  1054. {
  1055. struct net_generic *ng;
  1056. #ifdef CONFIG_NET_NS
  1057. netns_ipv4_struct_check();
  1058. net_cachep = kmem_cache_create("net_namespace", sizeof(struct net),
  1059. SMP_CACHE_BYTES,
  1060. SLAB_PANIC|SLAB_ACCOUNT, NULL);
  1061. /* Create workqueue for cleanup */
  1062. netns_wq = create_singlethread_workqueue("netns");
  1063. if (!netns_wq)
  1064. panic("Could not create netns workq");
  1065. #endif
  1066. ng = net_alloc_generic();
  1067. if (!ng)
  1068. panic("Could not allocate generic netns");
  1069. rcu_assign_pointer(init_net.gen, ng);
  1070. #ifdef CONFIG_KEYS
  1071. init_net.key_domain = &init_net_key_domain;
  1072. #endif
  1073. /*
  1074. * This currently cannot fail as the initial network namespace
  1075. * has a static inode number.
  1076. */
  1077. if (preinit_net(&init_net, &init_user_ns))
  1078. panic("Could not preinitialize the initial network namespace");
  1079. down_write(&pernet_ops_rwsem);
  1080. if (setup_net(&init_net))
  1081. panic("Could not setup the initial network namespace");
  1082. init_net_initialized = true;
  1083. up_write(&pernet_ops_rwsem);
  1084. if (register_pernet_subsys(&net_ns_ops))
  1085. panic("Could not register network namespace subsystems");
  1086. rtnl_register_many(net_ns_rtnl_msg_handlers);
  1087. }
  1088. #ifdef CONFIG_NET_NS
  1089. static int __register_pernet_operations(struct list_head *list,
  1090. struct pernet_operations *ops)
  1091. {
  1092. LIST_HEAD(net_exit_list);
  1093. struct net *net;
  1094. int error;
  1095. list_add_tail(&ops->list, list);
  1096. if (ops->init || ops->id) {
  1097. /* We held write locked pernet_ops_rwsem, and parallel
  1098. * setup_net() and cleanup_net() are not possible.
  1099. */
  1100. for_each_net(net) {
  1101. error = ops_init(ops, net);
  1102. if (error)
  1103. goto out_undo;
  1104. list_add_tail(&net->exit_list, &net_exit_list);
  1105. }
  1106. }
  1107. return 0;
  1108. out_undo:
  1109. /* If I have an error cleanup all namespaces I initialized */
  1110. list_del(&ops->list);
  1111. ops_undo_single(ops, &net_exit_list);
  1112. return error;
  1113. }
  1114. static void __unregister_pernet_operations(struct pernet_operations *ops)
  1115. {
  1116. LIST_HEAD(net_exit_list);
  1117. struct net *net;
  1118. /* See comment in __register_pernet_operations() */
  1119. for_each_net(net)
  1120. list_add_tail(&net->exit_list, &net_exit_list);
  1121. list_del(&ops->list);
  1122. ops_undo_single(ops, &net_exit_list);
  1123. }
  1124. #else
  1125. static int __register_pernet_operations(struct list_head *list,
  1126. struct pernet_operations *ops)
  1127. {
  1128. if (!init_net_initialized) {
  1129. list_add_tail(&ops->list, list);
  1130. return 0;
  1131. }
  1132. return ops_init(ops, &init_net);
  1133. }
  1134. static void __unregister_pernet_operations(struct pernet_operations *ops)
  1135. {
  1136. if (!init_net_initialized) {
  1137. list_del(&ops->list);
  1138. } else {
  1139. LIST_HEAD(net_exit_list);
  1140. list_add(&init_net.exit_list, &net_exit_list);
  1141. ops_undo_single(ops, &net_exit_list);
  1142. }
  1143. }
  1144. #endif /* CONFIG_NET_NS */
  1145. static DEFINE_IDA(net_generic_ids);
  1146. static int register_pernet_operations(struct list_head *list,
  1147. struct pernet_operations *ops)
  1148. {
  1149. int error;
  1150. if (WARN_ON(!!ops->id ^ !!ops->size))
  1151. return -EINVAL;
  1152. if (ops->id) {
  1153. error = ida_alloc_min(&net_generic_ids, MIN_PERNET_OPS_ID,
  1154. GFP_KERNEL);
  1155. if (error < 0)
  1156. return error;
  1157. *ops->id = error;
  1158. /* This does not require READ_ONCE as writers already hold
  1159. * pernet_ops_rwsem. But WRITE_ONCE is needed to protect
  1160. * net_alloc_generic.
  1161. */
  1162. WRITE_ONCE(max_gen_ptrs, max(max_gen_ptrs, *ops->id + 1));
  1163. }
  1164. error = __register_pernet_operations(list, ops);
  1165. if (error) {
  1166. rcu_barrier();
  1167. if (ops->id)
  1168. ida_free(&net_generic_ids, *ops->id);
  1169. }
  1170. return error;
  1171. }
  1172. static void unregister_pernet_operations(struct pernet_operations *ops)
  1173. {
  1174. __unregister_pernet_operations(ops);
  1175. rcu_barrier();
  1176. if (ops->id)
  1177. ida_free(&net_generic_ids, *ops->id);
  1178. }
  1179. /**
  1180. * register_pernet_subsys - register a network namespace subsystem
  1181. * @ops: pernet operations structure for the subsystem
  1182. *
  1183. * Register a subsystem which has init and exit functions
  1184. * that are called when network namespaces are created and
  1185. * destroyed respectively.
  1186. *
  1187. * When registered all network namespace init functions are
  1188. * called for every existing network namespace. Allowing kernel
  1189. * modules to have a race free view of the set of network namespaces.
  1190. *
  1191. * When a new network namespace is created all of the init
  1192. * methods are called in the order in which they were registered.
  1193. *
  1194. * When a network namespace is destroyed all of the exit methods
  1195. * are called in the reverse of the order with which they were
  1196. * registered.
  1197. */
  1198. int register_pernet_subsys(struct pernet_operations *ops)
  1199. {
  1200. int error;
  1201. down_write(&pernet_ops_rwsem);
  1202. error = register_pernet_operations(first_device, ops);
  1203. up_write(&pernet_ops_rwsem);
  1204. return error;
  1205. }
  1206. EXPORT_SYMBOL_GPL(register_pernet_subsys);
  1207. /**
  1208. * unregister_pernet_subsys - unregister a network namespace subsystem
  1209. * @ops: pernet operations structure to manipulate
  1210. *
  1211. * Remove the pernet operations structure from the list to be
  1212. * used when network namespaces are created or destroyed. In
  1213. * addition run the exit method for all existing network
  1214. * namespaces.
  1215. */
  1216. void unregister_pernet_subsys(struct pernet_operations *ops)
  1217. {
  1218. down_write(&pernet_ops_rwsem);
  1219. unregister_pernet_operations(ops);
  1220. up_write(&pernet_ops_rwsem);
  1221. }
  1222. EXPORT_SYMBOL_GPL(unregister_pernet_subsys);
  1223. /**
  1224. * register_pernet_device - register a network namespace device
  1225. * @ops: pernet operations structure for the subsystem
  1226. *
  1227. * Register a device which has init and exit functions
  1228. * that are called when network namespaces are created and
  1229. * destroyed respectively.
  1230. *
  1231. * When registered all network namespace init functions are
  1232. * called for every existing network namespace. Allowing kernel
  1233. * modules to have a race free view of the set of network namespaces.
  1234. *
  1235. * When a new network namespace is created all of the init
  1236. * methods are called in the order in which they were registered.
  1237. *
  1238. * When a network namespace is destroyed all of the exit methods
  1239. * are called in the reverse of the order with which they were
  1240. * registered.
  1241. */
  1242. int register_pernet_device(struct pernet_operations *ops)
  1243. {
  1244. int error;
  1245. down_write(&pernet_ops_rwsem);
  1246. error = register_pernet_operations(&pernet_list, ops);
  1247. if (!error && (first_device == &pernet_list))
  1248. first_device = &ops->list;
  1249. up_write(&pernet_ops_rwsem);
  1250. return error;
  1251. }
  1252. EXPORT_SYMBOL_GPL(register_pernet_device);
  1253. /**
  1254. * unregister_pernet_device - unregister a network namespace netdevice
  1255. * @ops: pernet operations structure to manipulate
  1256. *
  1257. * Remove the pernet operations structure from the list to be
  1258. * used when network namespaces are created or destroyed. In
  1259. * addition run the exit method for all existing network
  1260. * namespaces.
  1261. */
  1262. void unregister_pernet_device(struct pernet_operations *ops)
  1263. {
  1264. down_write(&pernet_ops_rwsem);
  1265. if (&ops->list == first_device)
  1266. first_device = first_device->next;
  1267. unregister_pernet_operations(ops);
  1268. up_write(&pernet_ops_rwsem);
  1269. }
  1270. EXPORT_SYMBOL_GPL(unregister_pernet_device);
  1271. #ifdef CONFIG_NET_NS
  1272. static struct ns_common *netns_get(struct task_struct *task)
  1273. {
  1274. struct net *net = NULL;
  1275. struct nsproxy *nsproxy;
  1276. task_lock(task);
  1277. nsproxy = task->nsproxy;
  1278. if (nsproxy)
  1279. net = get_net(nsproxy->net_ns);
  1280. task_unlock(task);
  1281. return net ? &net->ns : NULL;
  1282. }
  1283. static void netns_put(struct ns_common *ns)
  1284. {
  1285. put_net(to_net_ns(ns));
  1286. }
  1287. static int netns_install(struct nsset *nsset, struct ns_common *ns)
  1288. {
  1289. struct nsproxy *nsproxy = nsset->nsproxy;
  1290. struct net *net = to_net_ns(ns);
  1291. if (!ns_capable(net->user_ns, CAP_SYS_ADMIN) ||
  1292. !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN))
  1293. return -EPERM;
  1294. put_net(nsproxy->net_ns);
  1295. nsproxy->net_ns = get_net(net);
  1296. return 0;
  1297. }
  1298. static struct user_namespace *netns_owner(struct ns_common *ns)
  1299. {
  1300. return to_net_ns(ns)->user_ns;
  1301. }
  1302. const struct proc_ns_operations netns_operations = {
  1303. .name = "net",
  1304. .get = netns_get,
  1305. .put = netns_put,
  1306. .install = netns_install,
  1307. .owner = netns_owner,
  1308. };
  1309. #endif