start_up.c 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Copyright (C) 2021 Benjamin Berg <benjamin@sipsolutions.net>
  4. * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
  5. */
  6. #include <stdio.h>
  7. #include <stdlib.h>
  8. #include <stdarg.h>
  9. #include <unistd.h>
  10. #include <errno.h>
  11. #include <fcntl.h>
  12. #include <sched.h>
  13. #include <signal.h>
  14. #include <string.h>
  15. #include <sys/mman.h>
  16. #include <sys/stat.h>
  17. #include <sys/wait.h>
  18. #include <sys/time.h>
  19. #include <sys/resource.h>
  20. #include <asm/ldt.h>
  21. #include <asm/unistd.h>
  22. #include <init.h>
  23. #include <os.h>
  24. #include <smp.h>
  25. #include <kern_util.h>
  26. #include <mem_user.h>
  27. #include <ptrace_user.h>
  28. #include <stdbool.h>
  29. #include <stub-data.h>
  30. #include <sys/prctl.h>
  31. #include <linux/seccomp.h>
  32. #include <linux/filter.h>
  33. #include <sysdep/mcontext.h>
  34. #include <sysdep/stub.h>
  35. #include <registers.h>
  36. #include <skas.h>
  37. #include "internal.h"
  38. static void ptrace_child(void)
  39. {
  40. int ret;
  41. /* Calling os_getpid because some libcs cached getpid incorrectly */
  42. int pid = os_getpid(), ppid = getppid();
  43. int sc_result;
  44. if (change_sig(SIGWINCH, 0) < 0 ||
  45. ptrace(PTRACE_TRACEME, 0, 0, 0) < 0) {
  46. perror("ptrace");
  47. kill(pid, SIGKILL);
  48. }
  49. kill(pid, SIGSTOP);
  50. /*
  51. * This syscall will be intercepted by the parent. Don't call more than
  52. * once, please.
  53. */
  54. sc_result = os_getpid();
  55. if (sc_result == pid)
  56. /* Nothing modified by the parent, we are running normally. */
  57. ret = 1;
  58. else if (sc_result == ppid)
  59. /*
  60. * Expected in check_ptrace and check_sysemu when they succeed
  61. * in modifying the stack frame
  62. */
  63. ret = 0;
  64. else
  65. /* Serious trouble! This could be caused by a bug in host 2.6
  66. * SKAS3/2.6 patch before release -V6, together with a bug in
  67. * the UML code itself.
  68. */
  69. ret = 2;
  70. exit(ret);
  71. }
  72. static void fatal_perror(const char *str)
  73. {
  74. perror(str);
  75. exit(1);
  76. }
  77. static void fatal(char *fmt, ...)
  78. {
  79. va_list list;
  80. va_start(list, fmt);
  81. vfprintf(stderr, fmt, list);
  82. va_end(list);
  83. exit(1);
  84. }
  85. static void non_fatal(char *fmt, ...)
  86. {
  87. va_list list;
  88. va_start(list, fmt);
  89. vfprintf(stderr, fmt, list);
  90. va_end(list);
  91. }
  92. static int start_ptraced_child(void)
  93. {
  94. int pid, n, status;
  95. fflush(stdout);
  96. pid = fork();
  97. if (pid == 0)
  98. ptrace_child();
  99. else if (pid < 0)
  100. fatal_perror("start_ptraced_child : fork failed");
  101. CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED));
  102. if (n < 0)
  103. fatal_perror("check_ptrace : waitpid failed");
  104. if (!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP))
  105. fatal("check_ptrace : expected SIGSTOP, got status = %d",
  106. status);
  107. return pid;
  108. }
  109. static void stop_ptraced_child(int pid, int exitcode)
  110. {
  111. int status, n;
  112. if (ptrace(PTRACE_CONT, pid, 0, 0) < 0)
  113. fatal_perror("stop_ptraced_child : ptrace failed");
  114. CATCH_EINTR(n = waitpid(pid, &status, 0));
  115. if (!WIFEXITED(status) || (WEXITSTATUS(status) != exitcode)) {
  116. int exit_with = WEXITSTATUS(status);
  117. fatal("stop_ptraced_child : child exited with exitcode %d, "
  118. "while expecting %d; status 0x%x\n", exit_with,
  119. exitcode, status);
  120. }
  121. }
  122. static void __init check_sysemu(void)
  123. {
  124. int pid, n, status, count=0;
  125. os_info("Checking syscall emulation for ptrace...");
  126. pid = start_ptraced_child();
  127. if ((ptrace(PTRACE_SETOPTIONS, pid, 0,
  128. (void *) PTRACE_O_TRACESYSGOOD) < 0))
  129. fatal_perror("check_sysemu: PTRACE_SETOPTIONS failed");
  130. while (1) {
  131. count++;
  132. if (ptrace(PTRACE_SYSEMU_SINGLESTEP, pid, 0, 0) < 0)
  133. goto fail;
  134. CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED));
  135. if (n < 0)
  136. fatal_perror("check_sysemu: wait failed");
  137. if (WIFSTOPPED(status) &&
  138. (WSTOPSIG(status) == (SIGTRAP|0x80))) {
  139. if (!count) {
  140. non_fatal("check_sysemu: SYSEMU_SINGLESTEP "
  141. "doesn't singlestep");
  142. goto fail;
  143. }
  144. n = ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_RET_OFFSET,
  145. os_getpid());
  146. if (n < 0)
  147. fatal_perror("check_sysemu : failed to modify "
  148. "system call return");
  149. break;
  150. }
  151. else if (WIFSTOPPED(status) && (WSTOPSIG(status) == SIGTRAP))
  152. count++;
  153. else {
  154. non_fatal("check_sysemu: expected SIGTRAP or "
  155. "(SIGTRAP | 0x80), got status = %d\n",
  156. status);
  157. goto fail;
  158. }
  159. }
  160. stop_ptraced_child(pid, 0);
  161. os_info("OK\n");
  162. return;
  163. fail:
  164. stop_ptraced_child(pid, 1);
  165. fatal("missing\n");
  166. }
  167. static void __init check_ptrace(void)
  168. {
  169. int pid, syscall, n, status;
  170. os_info("Checking that ptrace can change system call numbers...");
  171. pid = start_ptraced_child();
  172. if ((ptrace(PTRACE_SETOPTIONS, pid, 0,
  173. (void *) PTRACE_O_TRACESYSGOOD) < 0))
  174. fatal_perror("check_ptrace: PTRACE_SETOPTIONS failed");
  175. while (1) {
  176. if (ptrace(PTRACE_SYSCALL, pid, 0, 0) < 0)
  177. fatal_perror("check_ptrace : ptrace failed");
  178. CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED));
  179. if (n < 0)
  180. fatal_perror("check_ptrace : wait failed");
  181. if (!WIFSTOPPED(status) ||
  182. (WSTOPSIG(status) != (SIGTRAP | 0x80)))
  183. fatal("check_ptrace : expected (SIGTRAP|0x80), "
  184. "got status = %d", status);
  185. syscall = ptrace(PTRACE_PEEKUSER, pid, PT_SYSCALL_NR_OFFSET,
  186. 0);
  187. if (syscall == __NR_getpid) {
  188. n = ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_NR_OFFSET,
  189. __NR_getppid);
  190. if (n < 0)
  191. fatal_perror("check_ptrace : failed to modify "
  192. "system call");
  193. break;
  194. }
  195. }
  196. stop_ptraced_child(pid, 0);
  197. os_info("OK\n");
  198. check_sysemu();
  199. }
  200. extern unsigned long host_fp_size;
  201. extern unsigned long exec_regs[MAX_REG_NR];
  202. extern unsigned long *exec_fp_regs;
  203. __initdata static struct stub_data *seccomp_test_stub_data;
  204. static void __init sigsys_handler(int sig, siginfo_t *info, void *p)
  205. {
  206. ucontext_t *uc = p;
  207. /* Stow away the location of the mcontext in the stack */
  208. seccomp_test_stub_data->mctx_offset = (unsigned long)&uc->uc_mcontext -
  209. (unsigned long)&seccomp_test_stub_data->sigstack[0];
  210. /* Prevent libc from clearing memory (mctx_offset in particular) */
  211. syscall(__NR_exit, 0);
  212. }
  213. static int __init seccomp_helper(void *data)
  214. {
  215. static struct sock_filter filter[] = {
  216. BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
  217. offsetof(struct seccomp_data, nr)),
  218. BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_clock_nanosleep, 1, 0),
  219. BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
  220. BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRAP),
  221. };
  222. static struct sock_fprog prog = {
  223. .len = ARRAY_SIZE(filter),
  224. .filter = filter,
  225. };
  226. struct sigaction sa;
  227. /* close_range is needed for the stub */
  228. if (stub_syscall3(__NR_close_range, 1, ~0U, 0))
  229. exit(1);
  230. set_sigstack(seccomp_test_stub_data->sigstack,
  231. sizeof(seccomp_test_stub_data->sigstack));
  232. sa.sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO;
  233. sa.sa_sigaction = (void *) sigsys_handler;
  234. sa.sa_restorer = NULL;
  235. if (sigaction(SIGSYS, &sa, NULL) < 0)
  236. exit(2);
  237. prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
  238. if (syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
  239. SECCOMP_FILTER_FLAG_TSYNC, &prog) != 0)
  240. exit(3);
  241. sleep(0);
  242. /* Never reached. */
  243. _exit(4);
  244. }
  245. static bool __init init_seccomp(void)
  246. {
  247. int pid;
  248. int status;
  249. int n;
  250. unsigned long sp;
  251. /*
  252. * We check that we can install a seccomp filter and then exit(0)
  253. * from a trapped syscall.
  254. *
  255. * Note that we cannot verify that no seccomp filter already exists
  256. * for a syscall that results in the process/thread to be killed.
  257. */
  258. os_info("Checking that seccomp filters can be installed...");
  259. seccomp_test_stub_data = mmap(0, sizeof(*seccomp_test_stub_data),
  260. PROT_READ | PROT_WRITE,
  261. MAP_SHARED | MAP_ANON, 0, 0);
  262. /* Use the syscall data area as stack, we just need something */
  263. sp = (unsigned long)&seccomp_test_stub_data->syscall_data +
  264. sizeof(seccomp_test_stub_data->syscall_data) -
  265. sizeof(void *);
  266. pid = clone(seccomp_helper, (void *)sp, CLONE_VFORK | CLONE_VM, NULL);
  267. if (pid < 0)
  268. fatal_perror("check_seccomp : clone failed");
  269. CATCH_EINTR(n = waitpid(pid, &status, __WCLONE));
  270. if (n < 0)
  271. fatal_perror("check_seccomp : waitpid failed");
  272. if (WIFEXITED(status) && WEXITSTATUS(status) == 0) {
  273. struct uml_pt_regs *regs;
  274. unsigned long fp_size;
  275. int r;
  276. /* Fill in the host_fp_size from the mcontext. */
  277. regs = calloc(1, sizeof(struct uml_pt_regs));
  278. get_stub_state(regs, seccomp_test_stub_data, &fp_size);
  279. host_fp_size = fp_size;
  280. free(regs);
  281. /* Repeat with the correct size */
  282. regs = calloc(1, sizeof(struct uml_pt_regs) + host_fp_size);
  283. r = get_stub_state(regs, seccomp_test_stub_data, NULL);
  284. /* Store as the default startup registers */
  285. exec_fp_regs = malloc(host_fp_size);
  286. memcpy(exec_regs, regs->gp, sizeof(exec_regs));
  287. memcpy(exec_fp_regs, regs->fp, host_fp_size);
  288. munmap(seccomp_test_stub_data, sizeof(*seccomp_test_stub_data));
  289. free(regs);
  290. if (r) {
  291. os_info("failed to fetch registers: %d\n", r);
  292. return false;
  293. }
  294. os_info("OK\n");
  295. return true;
  296. }
  297. if (WIFEXITED(status) && WEXITSTATUS(status) == 2)
  298. os_info("missing\n");
  299. else
  300. os_info("error\n");
  301. munmap(seccomp_test_stub_data, sizeof(*seccomp_test_stub_data));
  302. return false;
  303. }
  304. static void __init check_coredump_limit(void)
  305. {
  306. struct rlimit lim;
  307. int err = getrlimit(RLIMIT_CORE, &lim);
  308. if (err) {
  309. perror("Getting core dump limit");
  310. return;
  311. }
  312. os_info("Core dump limits :\n\tsoft - ");
  313. if (lim.rlim_cur == RLIM_INFINITY)
  314. os_info("NONE\n");
  315. else
  316. os_info("%llu\n", (unsigned long long)lim.rlim_cur);
  317. os_info("\thard - ");
  318. if (lim.rlim_max == RLIM_INFINITY)
  319. os_info("NONE\n");
  320. else
  321. os_info("%llu\n", (unsigned long long)lim.rlim_max);
  322. }
  323. void __init get_host_cpu_features(
  324. void (*flags_helper_func)(char *line),
  325. void (*cache_helper_func)(char *line))
  326. {
  327. FILE *cpuinfo;
  328. char *line = NULL;
  329. size_t len = 0;
  330. int done_parsing = 0;
  331. cpuinfo = fopen("/proc/cpuinfo", "r");
  332. if (cpuinfo == NULL) {
  333. os_info("Failed to get host CPU features\n");
  334. } else {
  335. while ((getline(&line, &len, cpuinfo)) != -1) {
  336. if (strstr(line, "flags")) {
  337. flags_helper_func(line);
  338. done_parsing++;
  339. }
  340. if (strstr(line, "cache_alignment")) {
  341. cache_helper_func(line);
  342. done_parsing++;
  343. }
  344. free(line);
  345. line = NULL;
  346. if (done_parsing > 1)
  347. break;
  348. }
  349. fclose(cpuinfo);
  350. }
  351. }
  352. static int seccomp_config __initdata;
  353. static int __init uml_seccomp_config(char *line, int *add)
  354. {
  355. *add = 0;
  356. if (strcmp(line, "off") == 0)
  357. seccomp_config = 0;
  358. else if (strcmp(line, "auto") == 0)
  359. seccomp_config = 1;
  360. else if (strcmp(line, "on") == 0)
  361. seccomp_config = 2;
  362. else
  363. fatal("Invalid seccomp option '%s', expected on/auto/off\n",
  364. line);
  365. return 0;
  366. }
  367. __uml_setup("seccomp=", uml_seccomp_config,
  368. "seccomp=<on/auto/off>\n"
  369. " Configure whether or not SECCOMP is used. With SECCOMP, userspace\n"
  370. " processes work collaboratively with the kernel instead of being\n"
  371. " traced using ptrace. All syscalls from the application are caught and\n"
  372. " redirected using a signal. This signal handler in turn is permitted to\n"
  373. " do the selected set of syscalls to communicate with the UML kernel and\n"
  374. " do the required memory management.\n"
  375. "\n"
  376. " This method is overall faster than the ptrace based userspace, primarily\n"
  377. " because it reduces the number of context switches for (minor) page faults.\n"
  378. "\n"
  379. " However, the SECCOMP filter is not (yet) restrictive enough to prevent\n"
  380. " userspace from reading and writing all physical memory. Userspace\n"
  381. " processes could also trick the stub into disabling SIGALRM which\n"
  382. " prevents it from being interrupted for scheduling purposes.\n"
  383. "\n"
  384. " This is insecure and should only be used with a trusted userspace\n\n"
  385. );
  386. void __init os_early_checks(void)
  387. {
  388. int pid;
  389. /* Print out the core dump limits early */
  390. check_coredump_limit();
  391. /* Need to check this early because mmapping happens before the
  392. * kernel is running.
  393. */
  394. check_tmpexec();
  395. if (seccomp_config) {
  396. if (init_seccomp()) {
  397. using_seccomp = 1;
  398. return;
  399. }
  400. if (seccomp_config == 2)
  401. fatal("SECCOMP userspace requested but not functional!\n");
  402. }
  403. if (uml_ncpus > 1)
  404. fatal("SMP is not supported with PTRACE userspace.\n");
  405. using_seccomp = 0;
  406. check_ptrace();
  407. pid = start_ptraced_child();
  408. if (init_pid_registers(pid))
  409. fatal("Failed to initialize default registers");
  410. stop_ptraced_child(pid, 1);
  411. }