proc-pid-vm.c 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592
  1. /*
  2. * Copyright (c) 2019 Alexey Dobriyan <adobriyan@gmail.com>
  3. *
  4. * Permission to use, copy, modify, and distribute this software for any
  5. * purpose with or without fee is hereby granted, provided that the above
  6. * copyright notice and this permission notice appear in all copies.
  7. *
  8. * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  9. * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  10. * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  11. * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  12. * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  13. * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  14. * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  15. */
  16. /*
  17. * Fork and exec tiny 1 page executable which precisely controls its VM.
  18. * Test /proc/$PID/maps
  19. * Test /proc/$PID/smaps
  20. * Test /proc/$PID/smaps_rollup
  21. * Test /proc/$PID/statm
  22. *
  23. * FIXME require CONFIG_TMPFS which can be disabled
  24. * FIXME test other values from "smaps"
  25. * FIXME support other archs
  26. */
  27. #undef NDEBUG
  28. #include <assert.h>
  29. #include <errno.h>
  30. #include <sched.h>
  31. #include <signal.h>
  32. #include <stdbool.h>
  33. #include <stdint.h>
  34. #include <stdio.h>
  35. #include <string.h>
  36. #include <stdlib.h>
  37. #include <sys/mount.h>
  38. #include <sys/types.h>
  39. #include <sys/stat.h>
  40. #include <sys/wait.h>
  41. #include <fcntl.h>
  42. #include <unistd.h>
  43. #include <sys/syscall.h>
  44. #include <sys/uio.h>
  45. #include <linux/kdev_t.h>
  46. #include <sys/time.h>
  47. #include <sys/resource.h>
  48. #include <linux/fs.h>
  49. #ifndef __maybe_unused
  50. #define __maybe_unused __attribute__((__unused__))
  51. #endif
  52. #include "kselftest.h"
  53. static inline long sys_execveat(int dirfd, const char *pathname, char **argv, char **envp, int flags)
  54. {
  55. return syscall(SYS_execveat, dirfd, pathname, argv, envp, flags);
  56. }
  57. static void make_private_tmp(void)
  58. {
  59. if (unshare(CLONE_NEWNS) == -1) {
  60. if (errno == ENOSYS || errno == EPERM) {
  61. exit(4);
  62. }
  63. exit(1);
  64. }
  65. if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) == -1) {
  66. exit(1);
  67. }
  68. if (mount(NULL, "/tmp", "tmpfs", 0, NULL) == -1) {
  69. exit(1);
  70. }
  71. }
  72. static pid_t pid = -1;
  73. static void ate(void)
  74. {
  75. if (pid > 0) {
  76. kill(pid, SIGTERM);
  77. }
  78. }
  79. struct elf64_hdr {
  80. uint8_t e_ident[16];
  81. uint16_t e_type;
  82. uint16_t e_machine;
  83. uint32_t e_version;
  84. uint64_t e_entry;
  85. uint64_t e_phoff;
  86. uint64_t e_shoff;
  87. uint32_t e_flags;
  88. uint16_t e_ehsize;
  89. uint16_t e_phentsize;
  90. uint16_t e_phnum;
  91. uint16_t e_shentsize;
  92. uint16_t e_shnum;
  93. uint16_t e_shstrndx;
  94. };
  95. struct elf64_phdr {
  96. uint32_t p_type;
  97. uint32_t p_flags;
  98. uint64_t p_offset;
  99. uint64_t p_vaddr;
  100. uint64_t p_paddr;
  101. uint64_t p_filesz;
  102. uint64_t p_memsz;
  103. uint64_t p_align;
  104. };
  105. #ifdef __x86_64__
  106. #define PAGE_SIZE 4096
  107. #define VADDR (1UL << 32)
  108. #define MAPS_OFFSET 73
  109. #define syscall 0x0f, 0x05
  110. #define mov_rdi(x) \
  111. 0x48, 0xbf, \
  112. (x)&0xff, ((x)>>8)&0xff, ((x)>>16)&0xff, ((x)>>24)&0xff, \
  113. ((x)>>32)&0xff, ((x)>>40)&0xff, ((x)>>48)&0xff, ((x)>>56)&0xff
  114. #define mov_rsi(x) \
  115. 0x48, 0xbe, \
  116. (x)&0xff, ((x)>>8)&0xff, ((x)>>16)&0xff, ((x)>>24)&0xff, \
  117. ((x)>>32)&0xff, ((x)>>40)&0xff, ((x)>>48)&0xff, ((x)>>56)&0xff
  118. #define mov_eax(x) \
  119. 0xb8, (x)&0xff, ((x)>>8)&0xff, ((x)>>16)&0xff, ((x)>>24)&0xff
/*
 * Machine code of the test executable. make_exe() writes it right after the
 * ELF and program headers, which is where it points e_entry.
 *
 * The code unmaps everything above its own page, writes one byte to fd 0
 * to tell the parent it is ready, then loops on pause() forever.
 */
static const uint8_t payload[] = {
	/* Casually unmap stack, vDSO and everything else. */
	/*
	 * munmap(VADDR + 4096, (1ULL << 47) - 4096 - VADDR - 4096):
	 * the range starts right after the executable's page and ends
	 * at (1ULL << 47) - 4096.
	 */
	mov_rdi(VADDR + 4096),
	mov_rsi((1ULL << 47) - 4096 - VADDR - 4096),
	mov_eax(11),
	syscall,
	/* Ping parent. */
	/*
	 * write(0, &c, 1);
	 * rsi is loaded with the address of the next instruction, so the
	 * byte sent is simply a byte of this code; the parent reads one
	 * byte and discards its value.
	 */
	0x31, 0xff, /* xor edi, edi */
	0x48, 0x8d, 0x35, 0x00, 0x00, 0x00, 0x00, /* lea rsi, [rip] */
	0xba, 0x01, 0x00, 0x00, 0x00, /* mov edx, 1 */
	mov_eax(1),
	syscall,
	/* 1: pause(); */
	mov_eax(34),
	syscall,
	/* Jump back 9 bytes: over this jmp, the syscall and the mov eax. */
	0xeb, 0xf7, /* jmp 1b */
};
  139. static int make_exe(const uint8_t *payload, size_t len)
  140. {
  141. struct elf64_hdr h;
  142. struct elf64_phdr ph;
  143. struct iovec iov[3] = {
  144. {&h, sizeof(struct elf64_hdr)},
  145. {&ph, sizeof(struct elf64_phdr)},
  146. {(void *)payload, len},
  147. };
  148. int fd, fd1;
  149. char buf[64];
  150. memset(&h, 0, sizeof(h));
  151. h.e_ident[0] = 0x7f;
  152. h.e_ident[1] = 'E';
  153. h.e_ident[2] = 'L';
  154. h.e_ident[3] = 'F';
  155. h.e_ident[4] = 2;
  156. h.e_ident[5] = 1;
  157. h.e_ident[6] = 1;
  158. h.e_ident[7] = 0;
  159. h.e_type = 2;
  160. h.e_machine = 0x3e;
  161. h.e_version = 1;
  162. h.e_entry = VADDR + sizeof(struct elf64_hdr) + sizeof(struct elf64_phdr);
  163. h.e_phoff = sizeof(struct elf64_hdr);
  164. h.e_shoff = 0;
  165. h.e_flags = 0;
  166. h.e_ehsize = sizeof(struct elf64_hdr);
  167. h.e_phentsize = sizeof(struct elf64_phdr);
  168. h.e_phnum = 1;
  169. h.e_shentsize = 0;
  170. h.e_shnum = 0;
  171. h.e_shstrndx = 0;
  172. memset(&ph, 0, sizeof(ph));
  173. ph.p_type = 1;
  174. ph.p_flags = (1<<2)|1;
  175. ph.p_offset = 0;
  176. ph.p_vaddr = VADDR;
  177. ph.p_paddr = 0;
  178. ph.p_filesz = sizeof(struct elf64_hdr) + sizeof(struct elf64_phdr) + len;
  179. ph.p_memsz = sizeof(struct elf64_hdr) + sizeof(struct elf64_phdr) + len;
  180. ph.p_align = 4096;
  181. fd = openat(AT_FDCWD, "/tmp", O_WRONLY|O_EXCL|O_TMPFILE, 0700);
  182. if (fd == -1) {
  183. exit(1);
  184. }
  185. if (writev(fd, iov, 3) != sizeof(struct elf64_hdr) + sizeof(struct elf64_phdr) + len) {
  186. exit(1);
  187. }
  188. /* Avoid ETXTBSY on exec. */
  189. snprintf(buf, sizeof(buf), "/proc/self/fd/%u", fd);
  190. fd1 = open(buf, O_RDONLY|O_CLOEXEC);
  191. close(fd);
  192. return fd1;
  193. }
  194. #endif
  195. /*
  196. * 0: vsyscall VMA doesn't exist vsyscall=none
  197. * 1: vsyscall VMA is --xp vsyscall=xonly
  198. * 2: vsyscall VMA is r-xp vsyscall=emulate
  199. */
  200. static volatile int g_vsyscall;
  201. static const char *str_vsyscall __maybe_unused;
  202. static const char str_vsyscall_0[] __maybe_unused = "";
  203. static const char str_vsyscall_1[] __maybe_unused =
  204. "ffffffffff600000-ffffffffff601000 --xp 00000000 00:00 0 [vsyscall]\n";
  205. static const char str_vsyscall_2[] __maybe_unused =
  206. "ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]\n";
  207. #ifdef __x86_64__
  208. static void sigaction_SIGSEGV(int _, siginfo_t *__, void *___)
  209. {
  210. _exit(g_vsyscall);
  211. }
  212. /*
  213. * vsyscall page can't be unmapped, probe it directly.
  214. */
  215. static void vsyscall(void)
  216. {
  217. pid_t pid;
  218. int wstatus;
  219. pid = fork();
  220. if (pid < 0) {
  221. fprintf(stderr, "fork, errno %d\n", errno);
  222. exit(1);
  223. }
  224. if (pid == 0) {
  225. struct rlimit rlim = {0, 0};
  226. (void)setrlimit(RLIMIT_CORE, &rlim);
  227. /* Hide "segfault at ffffffffff600000" messages. */
  228. struct sigaction act;
  229. memset(&act, 0, sizeof(struct sigaction));
  230. act.sa_flags = SA_SIGINFO;
  231. act.sa_sigaction = sigaction_SIGSEGV;
  232. (void)sigaction(SIGSEGV, &act, NULL);
  233. g_vsyscall = 0;
  234. /* gettimeofday(NULL, NULL); */
  235. uint64_t rax = 0xffffffffff600000;
  236. asm volatile (
  237. "call *%[rax]"
  238. : [rax] "+a" (rax)
  239. : "D" (NULL), "S" (NULL)
  240. : "rcx", "r11"
  241. );
  242. g_vsyscall = 1;
  243. *(volatile int *)0xffffffffff600000UL;
  244. g_vsyscall = 2;
  245. exit(g_vsyscall);
  246. }
  247. waitpid(pid, &wstatus, 0);
  248. if (WIFEXITED(wstatus)) {
  249. g_vsyscall = WEXITSTATUS(wstatus);
  250. } else {
  251. fprintf(stderr, "error: wstatus %08x\n", wstatus);
  252. exit(1);
  253. }
  254. }
  255. int main(void)
  256. {
  257. int pipefd[2];
  258. int exec_fd;
  259. vsyscall();
  260. switch (g_vsyscall) {
  261. case 0:
  262. str_vsyscall = str_vsyscall_0;
  263. break;
  264. case 1:
  265. str_vsyscall = str_vsyscall_1;
  266. break;
  267. case 2:
  268. str_vsyscall = str_vsyscall_2;
  269. break;
  270. default:
  271. abort();
  272. }
  273. atexit(ate);
  274. make_private_tmp();
  275. /* Reserve fd 0 for 1-byte pipe ping from child. */
  276. close(0);
  277. if (open("/", O_RDONLY|O_DIRECTORY|O_PATH) != 0) {
  278. return 1;
  279. }
  280. exec_fd = make_exe(payload, sizeof(payload));
  281. if (pipe(pipefd) == -1) {
  282. return 1;
  283. }
  284. if (dup2(pipefd[1], 0) != 0) {
  285. return 1;
  286. }
  287. pid = fork();
  288. if (pid == -1) {
  289. return 1;
  290. }
  291. if (pid == 0) {
  292. sys_execveat(exec_fd, "", NULL, NULL, AT_EMPTY_PATH);
  293. return 1;
  294. }
  295. char _;
  296. if (read(pipefd[0], &_, 1) != 1) {
  297. return 1;
  298. }
  299. struct stat st;
  300. if (fstat(exec_fd, &st) == -1) {
  301. return 1;
  302. }
  303. /* Generate "head -n1 /proc/$PID/maps" */
  304. char buf0[256];
  305. memset(buf0, ' ', sizeof(buf0));
  306. int len = snprintf(buf0, sizeof(buf0),
  307. "%08lx-%08lx r-xp 00000000 %02lx:%02lx %llu",
  308. VADDR, VADDR + PAGE_SIZE,
  309. MAJOR(st.st_dev), MINOR(st.st_dev),
  310. (unsigned long long)st.st_ino);
  311. buf0[len] = ' ';
  312. snprintf(buf0 + MAPS_OFFSET, sizeof(buf0) - MAPS_OFFSET,
  313. "/tmp/#%llu (deleted)\n", (unsigned long long)st.st_ino);
  314. /* Test /proc/$PID/maps */
  315. {
  316. const size_t len = strlen(buf0) + strlen(str_vsyscall);
  317. char buf[256];
  318. ssize_t rv;
  319. int fd;
  320. snprintf(buf, sizeof(buf), "/proc/%u/maps", pid);
  321. fd = open(buf, O_RDONLY);
  322. if (fd == -1) {
  323. return 1;
  324. }
  325. rv = read(fd, buf, sizeof(buf));
  326. assert(rv == len);
  327. assert(memcmp(buf, buf0, strlen(buf0)) == 0);
  328. if (g_vsyscall > 0) {
  329. assert(memcmp(buf + strlen(buf0), str_vsyscall, strlen(str_vsyscall)) == 0);
  330. }
  331. }
  332. /* Test /proc/$PID/smaps */
  333. {
  334. char buf[4096];
  335. ssize_t rv;
  336. int fd;
  337. snprintf(buf, sizeof(buf), "/proc/%u/smaps", pid);
  338. fd = open(buf, O_RDONLY);
  339. if (fd == -1) {
  340. return 1;
  341. }
  342. rv = read(fd, buf, sizeof(buf));
  343. assert(0 <= rv && rv <= sizeof(buf));
  344. assert(rv >= strlen(buf0));
  345. assert(memcmp(buf, buf0, strlen(buf0)) == 0);
  346. #define RSS1 "Rss: 4 kB\n"
  347. #define RSS2 "Rss: 0 kB\n"
  348. #define PSS1 "Pss: 4 kB\n"
  349. #define PSS2 "Pss: 0 kB\n"
  350. assert(memmem(buf, rv, RSS1, strlen(RSS1)) ||
  351. memmem(buf, rv, RSS2, strlen(RSS2)));
  352. assert(memmem(buf, rv, PSS1, strlen(PSS1)) ||
  353. memmem(buf, rv, PSS2, strlen(PSS2)));
  354. static const char *S[] = {
  355. "Size: 4 kB\n",
  356. "KernelPageSize: 4 kB\n",
  357. "MMUPageSize: 4 kB\n",
  358. "Anonymous: 0 kB\n",
  359. "AnonHugePages: 0 kB\n",
  360. "Shared_Hugetlb: 0 kB\n",
  361. "Private_Hugetlb: 0 kB\n",
  362. "Locked: 0 kB\n",
  363. };
  364. int i;
  365. for (i = 0; i < ARRAY_SIZE(S); i++) {
  366. assert(memmem(buf, rv, S[i], strlen(S[i])));
  367. }
  368. if (g_vsyscall > 0) {
  369. assert(memmem(buf, rv, str_vsyscall, strlen(str_vsyscall)));
  370. }
  371. }
  372. /* Test /proc/$PID/smaps_rollup */
  373. {
  374. char bufr[256];
  375. memset(bufr, ' ', sizeof(bufr));
  376. len = snprintf(bufr, sizeof(bufr),
  377. "%08lx-%08lx ---p 00000000 00:00 0",
  378. VADDR, VADDR + PAGE_SIZE);
  379. bufr[len] = ' ';
  380. snprintf(bufr + MAPS_OFFSET, sizeof(bufr) - MAPS_OFFSET,
  381. "[rollup]\n");
  382. char buf[1024];
  383. ssize_t rv;
  384. int fd;
  385. snprintf(buf, sizeof(buf), "/proc/%u/smaps_rollup", pid);
  386. fd = open(buf, O_RDONLY);
  387. if (fd == -1) {
  388. return 1;
  389. }
  390. rv = read(fd, buf, sizeof(buf));
  391. assert(0 <= rv && rv <= sizeof(buf));
  392. assert(rv >= strlen(bufr));
  393. assert(memcmp(buf, bufr, strlen(bufr)) == 0);
  394. assert(memmem(buf, rv, RSS1, strlen(RSS1)) ||
  395. memmem(buf, rv, RSS2, strlen(RSS2)));
  396. assert(memmem(buf, rv, PSS1, strlen(PSS1)) ||
  397. memmem(buf, rv, PSS2, strlen(PSS2)));
  398. static const char *S[] = {
  399. "Anonymous: 0 kB\n",
  400. "AnonHugePages: 0 kB\n",
  401. "Shared_Hugetlb: 0 kB\n",
  402. "Private_Hugetlb: 0 kB\n",
  403. "Locked: 0 kB\n",
  404. };
  405. int i;
  406. for (i = 0; i < ARRAY_SIZE(S); i++) {
  407. assert(memmem(buf, rv, S[i], strlen(S[i])));
  408. }
  409. }
  410. /* Test /proc/$PID/statm */
  411. {
  412. char buf[64];
  413. ssize_t rv;
  414. int fd;
  415. snprintf(buf, sizeof(buf), "/proc/%u/statm", pid);
  416. fd = open(buf, O_RDONLY);
  417. if (fd == -1) {
  418. return 1;
  419. }
  420. rv = read(fd, buf, sizeof(buf));
  421. assert(rv == 7 * 2);
  422. assert(buf[0] == '1'); /* ->total_vm */
  423. assert(buf[1] == ' ');
  424. assert(buf[2] == '0' || buf[2] == '1'); /* rss */
  425. assert(buf[3] == ' ');
  426. assert(buf[4] == '0' || buf[2] == '1'); /* file rss */
  427. assert(buf[5] == ' ');
  428. assert(buf[6] == '1'); /* ELF executable segments */
  429. assert(buf[7] == ' ');
  430. assert(buf[8] == '0');
  431. assert(buf[9] == ' ');
  432. assert(buf[10] == '0'); /* ->data_vm + ->stack_vm */
  433. assert(buf[11] == ' ');
  434. assert(buf[12] == '0');
  435. assert(buf[13] == '\n');
  436. }
  437. /* Test PROCMAP_QUERY ioctl() for /proc/$PID/maps */
  438. {
  439. char path_buf[256], exp_path_buf[256];
  440. struct procmap_query q;
  441. int fd, err;
  442. snprintf(path_buf, sizeof(path_buf), "/proc/%u/maps", pid);
  443. fd = open(path_buf, O_RDONLY);
  444. if (fd == -1)
  445. return 1;
  446. /* CASE 1: exact MATCH at VADDR */
  447. memset(&q, 0, sizeof(q));
  448. q.size = sizeof(q);
  449. q.query_addr = VADDR;
  450. q.query_flags = 0;
  451. q.vma_name_addr = (__u64)(unsigned long)path_buf;
  452. q.vma_name_size = sizeof(path_buf);
  453. err = ioctl(fd, PROCMAP_QUERY, &q);
  454. assert(err == 0);
  455. assert(q.query_addr == VADDR);
  456. assert(q.query_flags == 0);
  457. assert(q.vma_flags == (PROCMAP_QUERY_VMA_READABLE | PROCMAP_QUERY_VMA_EXECUTABLE));
  458. assert(q.vma_start == VADDR);
  459. assert(q.vma_end == VADDR + PAGE_SIZE);
  460. assert(q.vma_page_size == PAGE_SIZE);
  461. assert(q.vma_offset == 0);
  462. assert(q.inode == st.st_ino);
  463. assert(q.dev_major == MAJOR(st.st_dev));
  464. assert(q.dev_minor == MINOR(st.st_dev));
  465. snprintf(exp_path_buf, sizeof(exp_path_buf),
  466. "/tmp/#%llu (deleted)", (unsigned long long)st.st_ino);
  467. assert(q.vma_name_size == strlen(exp_path_buf) + 1);
  468. assert(strcmp(path_buf, exp_path_buf) == 0);
  469. /* CASE 2: NO MATCH at VADDR-1 */
  470. memset(&q, 0, sizeof(q));
  471. q.size = sizeof(q);
  472. q.query_addr = VADDR - 1;
  473. q.query_flags = 0; /* exact match */
  474. err = ioctl(fd, PROCMAP_QUERY, &q);
  475. err = err < 0 ? -errno : 0;
  476. assert(err == -ENOENT);
  477. /* CASE 3: MATCH COVERING_OR_NEXT_VMA at VADDR - 1 */
  478. memset(&q, 0, sizeof(q));
  479. q.size = sizeof(q);
  480. q.query_addr = VADDR - 1;
  481. q.query_flags = PROCMAP_QUERY_COVERING_OR_NEXT_VMA;
  482. err = ioctl(fd, PROCMAP_QUERY, &q);
  483. assert(err == 0);
  484. assert(q.query_addr == VADDR - 1);
  485. assert(q.query_flags == PROCMAP_QUERY_COVERING_OR_NEXT_VMA);
  486. assert(q.vma_start == VADDR);
  487. assert(q.vma_end == VADDR + PAGE_SIZE);
  488. /* CASE 4: NO MATCH at VADDR + PAGE_SIZE */
  489. memset(&q, 0, sizeof(q));
  490. q.size = sizeof(q);
  491. q.query_addr = VADDR + PAGE_SIZE; /* point right after the VMA */
  492. q.query_flags = PROCMAP_QUERY_COVERING_OR_NEXT_VMA;
  493. err = ioctl(fd, PROCMAP_QUERY, &q);
  494. err = err < 0 ? -errno : 0;
  495. assert(err == -ENOENT);
  496. /* CASE 5: NO MATCH WRITABLE at VADDR */
  497. memset(&q, 0, sizeof(q));
  498. q.size = sizeof(q);
  499. q.query_addr = VADDR;
  500. q.query_flags = PROCMAP_QUERY_VMA_WRITABLE;
  501. err = ioctl(fd, PROCMAP_QUERY, &q);
  502. err = err < 0 ? -errno : 0;
  503. assert(err == -ENOENT);
  504. }
  505. return 0;
  506. }
  507. #else
  508. int main(void)
  509. {
  510. return 4;
  511. }
  512. #endif