util.c 39 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. #include <linux/mm.h>
  3. #include <linux/slab.h>
  4. #include <linux/string.h>
  5. #include <linux/compiler.h>
  6. #include <linux/export.h>
  7. #include <linux/err.h>
  8. #include <linux/sched.h>
  9. #include <linux/sched/mm.h>
  10. #include <linux/sched/signal.h>
  11. #include <linux/sched/task_stack.h>
  12. #include <linux/security.h>
  13. #include <linux/swap.h>
  14. #include <linux/swapops.h>
  15. #include <linux/sysctl.h>
  16. #include <linux/mman.h>
  17. #include <linux/hugetlb.h>
  18. #include <linux/vmalloc.h>
  19. #include <linux/userfaultfd_k.h>
  20. #include <linux/elf.h>
  21. #include <linux/elf-randomize.h>
  22. #include <linux/personality.h>
  23. #include <linux/random.h>
  24. #include <linux/processor.h>
  25. #include <linux/sizes.h>
  26. #include <linux/compat.h>
  27. #include <linux/fsnotify.h>
  28. #include <linux/page_idle.h>
  29. #include <linux/uaccess.h>
  30. #include <kunit/visibility.h>
  31. #include "internal.h"
  32. #include "swap.h"
  /**
   * kfree_const - free memory unless it lives in kernel .rodata
   * @x: pointer to the memory
   *
   * Pointers for which is_kernel_rodata() reports true are left untouched;
   * every other pointer is passed on to kfree().
   */
  void kfree_const(const void *x)
  {
  	if (is_kernel_rodata((unsigned long)x))
  		return;

  	kfree(x);
  }
  EXPORT_SYMBOL(kfree_const);
  45. /**
  46. * __kmemdup_nul - Create a NUL-terminated string from @s, which might be unterminated.
  47. * @s: The data to copy
  48. * @len: The size of the data, not including the NUL terminator
  49. * @gfp: the GFP mask used in the kmalloc() call when allocating memory
  50. *
  51. * Return: newly allocated copy of @s with NUL-termination or %NULL in
  52. * case of error
  53. */
  54. static __always_inline char *__kmemdup_nul(const char *s, size_t len, gfp_t gfp)
  55. {
  56. char *buf;
  57. /* '+1' for the NUL terminator */
  58. buf = kmalloc_track_caller(len + 1, gfp);
  59. if (!buf)
  60. return NULL;
  61. memcpy(buf, s, len);
  62. /* Ensure the buf is always NUL-terminated, regardless of @s. */
  63. buf[len] = '\0';
  64. return buf;
  65. }
  66. /**
  67. * kstrdup - allocate space for and copy an existing string
  68. * @s: the string to duplicate
  69. * @gfp: the GFP mask used in the kmalloc() call when allocating memory
  70. *
  71. * Return: newly allocated copy of @s or %NULL in case of error
  72. */
  73. noinline
  74. char *kstrdup(const char *s, gfp_t gfp)
  75. {
  76. return s ? __kmemdup_nul(s, strlen(s), gfp) : NULL;
  77. }
  78. EXPORT_SYMBOL(kstrdup);
  79. /**
  80. * kstrdup_const - conditionally duplicate an existing const string
  81. * @s: the string to duplicate
  82. * @gfp: the GFP mask used in the kmalloc() call when allocating memory
  83. *
  84. * Note: Strings allocated by kstrdup_const should be freed by kfree_const and
  85. * must not be passed to krealloc().
  86. *
  87. * Return: source string if it is in .rodata section otherwise
  88. * fallback to kstrdup.
  89. */
  90. const char *kstrdup_const(const char *s, gfp_t gfp)
  91. {
  92. if (is_kernel_rodata((unsigned long)s))
  93. return s;
  94. return kstrdup(s, gfp);
  95. }
  96. EXPORT_SYMBOL(kstrdup_const);
  97. /**
  98. * kstrndup - allocate space for and copy an existing string
  99. * @s: the string to duplicate
  100. * @max: read at most @max chars from @s
  101. * @gfp: the GFP mask used in the kmalloc() call when allocating memory
  102. *
  103. * Note: Use kmemdup_nul() instead if the size is known exactly.
  104. *
  105. * Return: newly allocated copy of @s or %NULL in case of error
  106. */
  107. char *kstrndup(const char *s, size_t max, gfp_t gfp)
  108. {
  109. return s ? __kmemdup_nul(s, strnlen(s, max), gfp) : NULL;
  110. }
  111. EXPORT_SYMBOL(kstrndup);
  112. /**
  113. * kmemdup - duplicate region of memory
  114. *
  115. * @src: memory region to duplicate
  116. * @len: memory region length
  117. * @gfp: GFP mask to use
  118. *
  119. * Return: newly allocated copy of @src or %NULL in case of error,
  120. * result is physically contiguous. Use kfree() to free.
  121. */
  122. void *kmemdup_noprof(const void *src, size_t len, gfp_t gfp)
  123. {
  124. void *p;
  125. p = kmalloc_node_track_caller_noprof(len, gfp, NUMA_NO_NODE, _RET_IP_);
  126. if (p)
  127. memcpy(p, src, len);
  128. return p;
  129. }
  130. EXPORT_SYMBOL(kmemdup_noprof);
  131. /**
  132. * kmemdup_array - duplicate a given array.
  133. *
  134. * @src: array to duplicate.
  135. * @count: number of elements to duplicate from array.
  136. * @element_size: size of each element of array.
  137. * @gfp: GFP mask to use.
  138. *
  139. * Return: duplicated array of @src or %NULL in case of error,
  140. * result is physically contiguous. Use kfree() to free.
  141. */
  142. void *kmemdup_array(const void *src, size_t count, size_t element_size, gfp_t gfp)
  143. {
  144. return kmemdup(src, size_mul(element_size, count), gfp);
  145. }
  146. EXPORT_SYMBOL(kmemdup_array);
  147. /**
  148. * kvmemdup - duplicate region of memory
  149. *
  150. * @src: memory region to duplicate
  151. * @len: memory region length
  152. * @gfp: GFP mask to use
  153. *
  154. * Return: newly allocated copy of @src or %NULL in case of error,
  155. * result may be not physically contiguous. Use kvfree() to free.
  156. */
  157. void *kvmemdup(const void *src, size_t len, gfp_t gfp)
  158. {
  159. void *p;
  160. p = kvmalloc(len, gfp);
  161. if (p)
  162. memcpy(p, src, len);
  163. return p;
  164. }
  165. EXPORT_SYMBOL(kvmemdup);
  166. /**
  167. * kmemdup_nul - Create a NUL-terminated string from unterminated data
  168. * @s: The data to stringify
  169. * @len: The size of the data
  170. * @gfp: the GFP mask used in the kmalloc() call when allocating memory
  171. *
  172. * Return: newly allocated copy of @s with NUL-termination or %NULL in
  173. * case of error
  174. */
  175. char *kmemdup_nul(const char *s, size_t len, gfp_t gfp)
  176. {
  177. return s ? __kmemdup_nul(s, len, gfp) : NULL;
  178. }
  179. EXPORT_SYMBOL(kmemdup_nul);
  180. static kmem_buckets *user_buckets __ro_after_init;
  181. static int __init init_user_buckets(void)
  182. {
  183. user_buckets = kmem_buckets_create("memdup_user", 0, 0, INT_MAX, NULL);
  184. return 0;
  185. }
  186. subsys_initcall(init_user_buckets);
  187. /**
  188. * memdup_user - duplicate memory region from user space
  189. *
  190. * @src: source address in user space
  191. * @len: number of bytes to copy
  192. *
  193. * Return: an ERR_PTR() on failure. Result is physically
  194. * contiguous, to be freed by kfree().
  195. */
  196. void *memdup_user(const void __user *src, size_t len)
  197. {
  198. void *p;
  199. p = kmem_buckets_alloc_track_caller(user_buckets, len, GFP_USER | __GFP_NOWARN);
  200. if (!p)
  201. return ERR_PTR(-ENOMEM);
  202. if (copy_from_user(p, src, len)) {
  203. kfree(p);
  204. return ERR_PTR(-EFAULT);
  205. }
  206. return p;
  207. }
  208. EXPORT_SYMBOL(memdup_user);
  209. /**
  210. * vmemdup_user - duplicate memory region from user space
  211. *
  212. * @src: source address in user space
  213. * @len: number of bytes to copy
  214. *
  215. * Return: an ERR_PTR() on failure. Result may be not
  216. * physically contiguous. Use kvfree() to free.
  217. */
  218. void *vmemdup_user(const void __user *src, size_t len)
  219. {
  220. void *p;
  221. p = kmem_buckets_valloc(user_buckets, len, GFP_USER);
  222. if (!p)
  223. return ERR_PTR(-ENOMEM);
  224. if (copy_from_user(p, src, len)) {
  225. kvfree(p);
  226. return ERR_PTR(-EFAULT);
  227. }
  228. return p;
  229. }
  230. EXPORT_SYMBOL(vmemdup_user);
  231. /**
  232. * strndup_user - duplicate an existing string from user space
  233. * @s: The string to duplicate
  234. * @n: Maximum number of bytes to copy, including the trailing NUL.
  235. *
  236. * Return: newly allocated copy of @s or an ERR_PTR() in case of error
  237. */
  238. char *strndup_user(const char __user *s, long n)
  239. {
  240. char *p;
  241. long length;
  242. length = strnlen_user(s, n);
  243. if (!length)
  244. return ERR_PTR(-EFAULT);
  245. if (length > n)
  246. return ERR_PTR(-EINVAL);
  247. p = memdup_user(s, length);
  248. if (IS_ERR(p))
  249. return p;
  250. p[length - 1] = '\0';
  251. return p;
  252. }
  253. EXPORT_SYMBOL(strndup_user);
  254. /**
  255. * memdup_user_nul - duplicate memory region from user space and NUL-terminate
  256. *
  257. * @src: source address in user space
  258. * @len: number of bytes to copy
  259. *
  260. * Return: an ERR_PTR() on failure.
  261. */
  262. void *memdup_user_nul(const void __user *src, size_t len)
  263. {
  264. char *p;
  265. p = kmem_buckets_alloc_track_caller(user_buckets, len + 1, GFP_USER | __GFP_NOWARN);
  266. if (!p)
  267. return ERR_PTR(-ENOMEM);
  268. if (copy_from_user(p, src, len)) {
  269. kfree(p);
  270. return ERR_PTR(-EFAULT);
  271. }
  272. p[len] = '\0';
  273. return p;
  274. }
  275. EXPORT_SYMBOL(memdup_user_nul);
  276. /* Check if the vma is being used as a stack by this task */
  277. int vma_is_stack_for_current(const struct vm_area_struct *vma)
  278. {
  279. struct task_struct * __maybe_unused t = current;
  280. return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
  281. }
  282. /*
  283. * Change backing file, only valid to use during initial VMA setup.
  284. */
  285. void vma_set_file(struct vm_area_struct *vma, struct file *file)
  286. {
  287. /* Changing an anonymous vma with this is illegal */
  288. get_file(file);
  289. swap(vma->vm_file, file);
  290. fput(file);
  291. }
  292. EXPORT_SYMBOL(vma_set_file);
  293. #ifndef STACK_RND_MASK
  294. #define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */
  295. #endif
  296. unsigned long randomize_stack_top(unsigned long stack_top)
  297. {
  298. unsigned long random_variable = 0;
  299. if (current->flags & PF_RANDOMIZE) {
  300. random_variable = get_random_long();
  301. random_variable &= STACK_RND_MASK;
  302. random_variable <<= PAGE_SHIFT;
  303. }
  304. #ifdef CONFIG_STACK_GROWSUP
  305. return PAGE_ALIGN(stack_top) + random_variable;
  306. #else
  307. return PAGE_ALIGN(stack_top) - random_variable;
  308. #endif
  309. }
  310. /**
  311. * randomize_page - Generate a random, page aligned address
  312. * @start: The smallest acceptable address the caller will take.
  313. * @range: The size of the area, starting at @start, within which the
  314. * random address must fall.
  315. *
  316. * If @start + @range would overflow, @range is capped.
  317. *
  318. * NOTE: Historical use of randomize_range, which this replaces, presumed that
  319. * @start was already page aligned. We now align it regardless.
  320. *
  321. * Return: A page aligned address within [start, start + range). On error,
  322. * @start is returned.
  323. */
  324. unsigned long randomize_page(unsigned long start, unsigned long range)
  325. {
  326. if (!PAGE_ALIGNED(start)) {
  327. range -= PAGE_ALIGN(start) - start;
  328. start = PAGE_ALIGN(start);
  329. }
  330. if (start > ULONG_MAX - range)
  331. range = ULONG_MAX - start;
  332. range >>= PAGE_SHIFT;
  333. if (range == 0)
  334. return start;
  335. return start + (get_random_long() % range << PAGE_SHIFT);
  336. }
  337. #ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
  338. unsigned long __weak arch_randomize_brk(struct mm_struct *mm)
  339. {
  340. /* Is the current task 32bit ? */
  341. if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task())
  342. return randomize_page(mm->brk, SZ_32M);
  343. return randomize_page(mm->brk, SZ_1G);
  344. }
  345. unsigned long arch_mmap_rnd(void)
  346. {
  347. unsigned long rnd;
  348. #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
  349. if (is_compat_task())
  350. rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
  351. else
  352. #endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */
  353. rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
  354. return rnd << PAGE_SHIFT;
  355. }
  356. static int mmap_is_legacy(const struct rlimit *rlim_stack)
  357. {
  358. if (current->personality & ADDR_COMPAT_LAYOUT)
  359. return 1;
  360. /* On parisc the stack always grows up - so a unlimited stack should
  361. * not be an indicator to use the legacy memory layout. */
  362. if (rlim_stack->rlim_cur == RLIM_INFINITY &&
  363. !IS_ENABLED(CONFIG_STACK_GROWSUP))
  364. return 1;
  365. return sysctl_legacy_va_layout;
  366. }
  367. /*
  368. * Leave enough space between the mmap area and the stack to honour ulimit in
  369. * the face of randomisation.
  370. */
  371. #define MIN_GAP (SZ_128M)
  372. #define MAX_GAP (STACK_TOP / 6 * 5)
  373. static unsigned long mmap_base(const unsigned long rnd, const struct rlimit *rlim_stack)
  374. {
  375. #ifdef CONFIG_STACK_GROWSUP
  376. /*
  377. * For an upwards growing stack the calculation is much simpler.
  378. * Memory for the maximum stack size is reserved at the top of the
  379. * task. mmap_base starts directly below the stack and grows
  380. * downwards.
  381. */
  382. return PAGE_ALIGN_DOWN(mmap_upper_limit(rlim_stack) - rnd);
  383. #else
  384. unsigned long gap = rlim_stack->rlim_cur;
  385. unsigned long pad = stack_guard_gap;
  386. /* Account for stack randomization if necessary */
  387. if (current->flags & PF_RANDOMIZE)
  388. pad += (STACK_RND_MASK << PAGE_SHIFT);
  389. /* Values close to RLIM_INFINITY can overflow. */
  390. if (gap + pad > gap)
  391. gap += pad;
  392. if (gap < MIN_GAP && MIN_GAP < MAX_GAP)
  393. gap = MIN_GAP;
  394. else if (gap > MAX_GAP)
  395. gap = MAX_GAP;
  396. return PAGE_ALIGN(STACK_TOP - gap - rnd);
  397. #endif
  398. }
  399. void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack)
  400. {
  401. unsigned long random_factor = 0UL;
  402. if (current->flags & PF_RANDOMIZE)
  403. random_factor = arch_mmap_rnd();
  404. if (mmap_is_legacy(rlim_stack)) {
  405. mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
  406. mm_flags_clear(MMF_TOPDOWN, mm);
  407. } else {
  408. mm->mmap_base = mmap_base(random_factor, rlim_stack);
  409. mm_flags_set(MMF_TOPDOWN, mm);
  410. }
  411. }
  412. #elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
  413. void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack)
  414. {
  415. mm->mmap_base = TASK_UNMAPPED_BASE;
  416. mm_flags_clear(MMF_TOPDOWN, mm);
  417. }
  418. #endif
  419. #ifdef CONFIG_MMU
  420. EXPORT_SYMBOL_IF_KUNIT(arch_pick_mmap_layout);
  421. #endif
  422. /**
  423. * __account_locked_vm - account locked pages to an mm's locked_vm
  424. * @mm: mm to account against
  425. * @pages: number of pages to account
  426. * @inc: %true if @pages should be considered positive, %false if not
  427. * @task: task used to check RLIMIT_MEMLOCK
  428. * @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped
  429. *
  430. * Assumes @task and @mm are valid (i.e. at least one reference on each), and
  431. * that mmap_lock is held as writer.
  432. *
  433. * Return:
  434. * * 0 on success
  435. * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
  436. */
  437. int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
  438. const struct task_struct *task, bool bypass_rlim)
  439. {
  440. unsigned long locked_vm, limit;
  441. int ret = 0;
  442. mmap_assert_write_locked(mm);
  443. locked_vm = mm->locked_vm;
  444. if (inc) {
  445. if (!bypass_rlim) {
  446. limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
  447. if (locked_vm + pages > limit)
  448. ret = -ENOMEM;
  449. }
  450. if (!ret)
  451. mm->locked_vm = locked_vm + pages;
  452. } else {
  453. WARN_ON_ONCE(pages > locked_vm);
  454. mm->locked_vm = locked_vm - pages;
  455. }
  456. pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid,
  457. (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT,
  458. locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK),
  459. ret ? " - exceeded" : "");
  460. return ret;
  461. }
  462. EXPORT_SYMBOL_GPL(__account_locked_vm);
  463. /**
  464. * account_locked_vm - account locked pages to an mm's locked_vm
  465. * @mm: mm to account against, may be NULL
  466. * @pages: number of pages to account
  467. * @inc: %true if @pages should be considered positive, %false if not
  468. *
  469. * Assumes a non-NULL @mm is valid (i.e. at least one reference on it).
  470. *
  471. * Return:
  472. * * 0 on success, or if mm is NULL
  473. * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
  474. */
  475. int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc)
  476. {
  477. int ret;
  478. if (pages == 0 || !mm)
  479. return 0;
  480. mmap_write_lock(mm);
  481. ret = __account_locked_vm(mm, pages, inc, current,
  482. capable(CAP_IPC_LOCK));
  483. mmap_write_unlock(mm);
  484. return ret;
  485. }
  486. EXPORT_SYMBOL_GPL(account_locked_vm);
  487. unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
  488. unsigned long len, unsigned long prot,
  489. unsigned long flag, unsigned long pgoff)
  490. {
  491. loff_t off = (loff_t)pgoff << PAGE_SHIFT;
  492. unsigned long ret;
  493. struct mm_struct *mm = current->mm;
  494. unsigned long populate;
  495. LIST_HEAD(uf);
  496. ret = security_mmap_file(file, prot, flag);
  497. if (!ret)
  498. ret = fsnotify_mmap_perm(file, prot, off, len);
  499. if (!ret) {
  500. if (mmap_write_lock_killable(mm))
  501. return -EINTR;
  502. ret = do_mmap(file, addr, len, prot, flag, 0, pgoff, &populate,
  503. &uf);
  504. mmap_write_unlock(mm);
  505. userfaultfd_unmap_complete(mm, &uf);
  506. if (populate)
  507. mm_populate(ret, populate);
  508. }
  509. return ret;
  510. }
  511. /*
  512. * Perform a userland memory mapping into the current process address space. See
  513. * the comment for do_mmap() for more details on this operation in general.
  514. *
  515. * This differs from do_mmap() in that:
  516. *
  517. * a. An offset parameter is provided rather than pgoff, which is both checked
  518. * for overflow and page alignment.
  519. * b. mmap locking is performed on the caller's behalf.
  520. * c. Userfaultfd unmap events and memory population are handled.
  521. *
  522. * This means that this function performs essentially the same work as if
  523. * userland were invoking mmap (2).
  524. *
  525. * Returns either an error, or the address at which the requested mapping has
  526. * been performed.
  527. */
  528. unsigned long vm_mmap(struct file *file, unsigned long addr,
  529. unsigned long len, unsigned long prot,
  530. unsigned long flag, unsigned long offset)
  531. {
  532. if (unlikely(offset + PAGE_ALIGN(len) < offset))
  533. return -EINVAL;
  534. if (unlikely(offset_in_page(offset)))
  535. return -EINVAL;
  536. return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
  537. }
  538. EXPORT_SYMBOL(vm_mmap);
  539. /**
  540. * __vmalloc_array - allocate memory for a virtually contiguous array.
  541. * @n: number of elements.
  542. * @size: element size.
  543. * @flags: the type of memory to allocate (see kmalloc).
  544. */
  545. void *__vmalloc_array_noprof(size_t n, size_t size, gfp_t flags)
  546. {
  547. size_t bytes;
  548. if (unlikely(check_mul_overflow(n, size, &bytes)))
  549. return NULL;
  550. return __vmalloc_noprof(bytes, flags);
  551. }
  552. EXPORT_SYMBOL(__vmalloc_array_noprof);
  553. /**
  554. * vmalloc_array - allocate memory for a virtually contiguous array.
  555. * @n: number of elements.
  556. * @size: element size.
  557. */
  558. void *vmalloc_array_noprof(size_t n, size_t size)
  559. {
  560. return __vmalloc_array_noprof(n, size, GFP_KERNEL);
  561. }
  562. EXPORT_SYMBOL(vmalloc_array_noprof);
  563. /**
  564. * __vcalloc - allocate and zero memory for a virtually contiguous array.
  565. * @n: number of elements.
  566. * @size: element size.
  567. * @flags: the type of memory to allocate (see kmalloc).
  568. */
  569. void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags)
  570. {
  571. return __vmalloc_array_noprof(n, size, flags | __GFP_ZERO);
  572. }
  573. EXPORT_SYMBOL(__vcalloc_noprof);
  574. /**
  575. * vcalloc - allocate and zero memory for a virtually contiguous array.
  576. * @n: number of elements.
  577. * @size: element size.
  578. */
  579. void *vcalloc_noprof(size_t n, size_t size)
  580. {
  581. return __vmalloc_array_noprof(n, size, GFP_KERNEL | __GFP_ZERO);
  582. }
  583. EXPORT_SYMBOL(vcalloc_noprof);
  584. struct anon_vma *folio_anon_vma(const struct folio *folio)
  585. {
  586. unsigned long mapping = (unsigned long)folio->mapping;
  587. if ((mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON)
  588. return NULL;
  589. return (void *)(mapping - FOLIO_MAPPING_ANON);
  590. }
  591. /**
  592. * folio_mapping - Find the mapping where this folio is stored.
  593. * @folio: The folio.
  594. *
  595. * For folios which are in the page cache, return the mapping that this
  596. * page belongs to. Folios in the swap cache return the swap mapping
  597. * this page is stored in (which is different from the mapping for the
  598. * swap file or swap device where the data is stored).
  599. *
  600. * You can call this for folios which aren't in the swap cache or page
  601. * cache and it will return NULL.
  602. */
  603. struct address_space *folio_mapping(const struct folio *folio)
  604. {
  605. struct address_space *mapping;
  606. /* This happens if someone calls flush_dcache_page on slab page */
  607. if (unlikely(folio_test_slab(folio)))
  608. return NULL;
  609. if (unlikely(folio_test_swapcache(folio)))
  610. return swap_address_space(folio->swap);
  611. mapping = folio->mapping;
  612. if ((unsigned long)mapping & FOLIO_MAPPING_FLAGS)
  613. return NULL;
  614. return mapping;
  615. }
  616. EXPORT_SYMBOL(folio_mapping);
  /**
   * folio_copy - Copy the contents of one folio to another.
   * @dst: Folio to copy to.
   * @src: Folio to copy from.
   *
   * The bytes in the folio represented by @src are copied to @dst.
   * Assumes the caller has validated that @dst is at least as large as @src.
   * Can be called in atomic context for order-0 folios, but if the folio is
   * larger, it may sleep.
   */
  void folio_copy(struct folio *dst, struct folio *src)
  {
  	long nr = folio_nr_pages(src);
  	long idx = 0;

  	while (true) {
  		copy_highpage(folio_page(dst, idx), folio_page(src, idx));
  		idx++;
  		if (idx == nr)
  			return;
  		/* Only reached between pages, never after the last one. */
  		cond_resched();
  	}
  }
  EXPORT_SYMBOL(folio_copy);
  639. int folio_mc_copy(struct folio *dst, struct folio *src)
  640. {
  641. long nr = folio_nr_pages(src);
  642. long i = 0;
  643. for (;;) {
  644. if (copy_mc_highpage(folio_page(dst, i), folio_page(src, i)))
  645. return -EHWPOISON;
  646. if (++i == nr)
  647. break;
  648. cond_resched();
  649. }
  650. return 0;
  651. }
  652. EXPORT_SYMBOL(folio_mc_copy);
  653. int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
  654. static int sysctl_overcommit_ratio __read_mostly = 50;
  655. static unsigned long sysctl_overcommit_kbytes __read_mostly;
  656. int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
  657. unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
  658. unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
  659. #ifdef CONFIG_SYSCTL
  660. static int overcommit_ratio_handler(const struct ctl_table *table, int write,
  661. void *buffer, size_t *lenp, loff_t *ppos)
  662. {
  663. int ret;
  664. ret = proc_dointvec(table, write, buffer, lenp, ppos);
  665. if (ret == 0 && write)
  666. sysctl_overcommit_kbytes = 0;
  667. return ret;
  668. }
  669. static void sync_overcommit_as(struct work_struct *dummy)
  670. {
  671. percpu_counter_sync(&vm_committed_as);
  672. }
  673. static int overcommit_policy_handler(const struct ctl_table *table, int write,
  674. void *buffer, size_t *lenp, loff_t *ppos)
  675. {
  676. struct ctl_table t;
  677. int new_policy = -1;
  678. int ret;
  679. /*
  680. * The deviation of sync_overcommit_as could be big with loose policy
  681. * like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing policy to
  682. * strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply
  683. * with the strict "NEVER", and to avoid possible race condition (even
  684. * though user usually won't too frequently do the switching to policy
  685. * OVERCOMMIT_NEVER), the switch is done in the following order:
  686. * 1. changing the batch
  687. * 2. sync percpu count on each CPU
  688. * 3. switch the policy
  689. */
  690. if (write) {
  691. t = *table;
  692. t.data = &new_policy;
  693. ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
  694. if (ret || new_policy == -1)
  695. return ret;
  696. mm_compute_batch(new_policy);
  697. if (new_policy == OVERCOMMIT_NEVER)
  698. schedule_on_each_cpu(sync_overcommit_as);
  699. sysctl_overcommit_memory = new_policy;
  700. } else {
  701. ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
  702. }
  703. return ret;
  704. }
  705. static int overcommit_kbytes_handler(const struct ctl_table *table, int write,
  706. void *buffer, size_t *lenp, loff_t *ppos)
  707. {
  708. int ret;
  709. ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
  710. if (ret == 0 && write)
  711. sysctl_overcommit_ratio = 0;
  712. return ret;
  713. }
  714. static const struct ctl_table util_sysctl_table[] = {
  715. {
  716. .procname = "overcommit_memory",
  717. .data = &sysctl_overcommit_memory,
  718. .maxlen = sizeof(sysctl_overcommit_memory),
  719. .mode = 0644,
  720. .proc_handler = overcommit_policy_handler,
  721. .extra1 = SYSCTL_ZERO,
  722. .extra2 = SYSCTL_TWO,
  723. },
  724. {
  725. .procname = "overcommit_ratio",
  726. .data = &sysctl_overcommit_ratio,
  727. .maxlen = sizeof(sysctl_overcommit_ratio),
  728. .mode = 0644,
  729. .proc_handler = overcommit_ratio_handler,
  730. },
  731. {
  732. .procname = "overcommit_kbytes",
  733. .data = &sysctl_overcommit_kbytes,
  734. .maxlen = sizeof(sysctl_overcommit_kbytes),
  735. .mode = 0644,
  736. .proc_handler = overcommit_kbytes_handler,
  737. },
  738. {
  739. .procname = "user_reserve_kbytes",
  740. .data = &sysctl_user_reserve_kbytes,
  741. .maxlen = sizeof(sysctl_user_reserve_kbytes),
  742. .mode = 0644,
  743. .proc_handler = proc_doulongvec_minmax,
  744. },
  745. {
  746. .procname = "admin_reserve_kbytes",
  747. .data = &sysctl_admin_reserve_kbytes,
  748. .maxlen = sizeof(sysctl_admin_reserve_kbytes),
  749. .mode = 0644,
  750. .proc_handler = proc_doulongvec_minmax,
  751. },
  752. };
  753. static int __init init_vm_util_sysctls(void)
  754. {
  755. register_sysctl_init("vm", util_sysctl_table);
  756. return 0;
  757. }
  758. subsys_initcall(init_vm_util_sysctls);
  759. #endif /* CONFIG_SYSCTL */
  760. /*
  761. * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
  762. */
  763. unsigned long vm_commit_limit(void)
  764. {
  765. unsigned long allowed;
  766. if (sysctl_overcommit_kbytes)
  767. allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);
  768. else
  769. allowed = ((totalram_pages() - hugetlb_total_pages())
  770. * sysctl_overcommit_ratio / 100);
  771. allowed += total_swap_pages;
  772. return allowed;
  773. }
/*
 * Make sure vm_committed_as sits in one cacheline and does not share that
 * cacheline with other variables. It can be updated by several CPUs
 * frequently.
 */
struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
/*
 * The global memory commitment made in the system can be a metric
 * that can be used to drive ballooning decisions when Linux is hosted
 * as a guest. On Hyper-V, the host implements a policy engine for dynamically
 * balancing memory across competing virtual machines that are hosted.
 * Several metrics drive this policy engine including the guest reported
 * memory commitment.
 *
 * The time cost of this is very low for small platforms, and for big
 * platform like a 2S/36C/72T Skylake server, in worst case where
 * vm_committed_as's spinlock is under severe contention, the time cost
 * could be about 30~40 microseconds.
 *
 * Returns the value of percpu_counter_sum_positive() on vm_committed_as.
 */
unsigned long vm_memory_committed(void)
{
	return percpu_counter_sum_positive(&vm_committed_as);
}
EXPORT_SYMBOL_GPL(vm_memory_committed);
  797. /*
  798. * Check that a process has enough memory to allocate a new virtual
  799. * mapping. 0 means there is enough memory for the allocation to
  800. * succeed and -ENOMEM implies there is not.
  801. *
  802. * We currently support three overcommit policies, which are set via the
  803. * vm.overcommit_memory sysctl. See Documentation/mm/overcommit-accounting.rst
  804. *
  805. * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
  806. * Additional code 2002 Jul 20 by Robert Love.
  807. *
  808. * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
  809. *
  810. * Note this is a helper function intended to be used by LSMs which
  811. * wish to use this logic.
  812. */
  813. int __vm_enough_memory(const struct mm_struct *mm, long pages, int cap_sys_admin)
  814. {
  815. long allowed;
  816. unsigned long bytes_failed;
  817. vm_acct_memory(pages);
  818. /*
  819. * Sometimes we want to use more memory than we have
  820. */
  821. if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
  822. return 0;
  823. if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
  824. if (pages > totalram_pages() + total_swap_pages)
  825. goto error;
  826. return 0;
  827. }
  828. allowed = vm_commit_limit();
  829. /*
  830. * Reserve some for root
  831. */
  832. if (!cap_sys_admin)
  833. allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
  834. /*
  835. * Don't let a single process grow so big a user can't recover
  836. */
  837. if (mm) {
  838. long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
  839. allowed -= min_t(long, mm->total_vm / 32, reserve);
  840. }
  841. if (percpu_counter_read_positive(&vm_committed_as) < allowed)
  842. return 0;
  843. error:
  844. bytes_failed = pages << PAGE_SHIFT;
  845. pr_warn_ratelimited("%s: pid: %d, comm: %s, bytes: %lu not enough memory for the allocation\n",
  846. __func__, current->pid, current->comm, bytes_failed);
  847. vm_unacct_memory(pages);
  848. return -ENOMEM;
  849. }
/**
 * get_cmdline() - copy the cmdline value to a buffer.
 * @task: the task whose cmdline value to copy.
 * @buffer: the buffer to copy to.
 * @buflen: the length of the buffer. Larger cmdline values are truncated
 *          to this length.
 *
 * The argument area [arg_start, arg_end) is copied first. If the last byte
 * copied is not NUL and the buffer still has room, the copy either stops at
 * the first NUL found in what was copied or continues into the environment
 * area [env_start, env_end) and is then cut at the first NUL.
 *
 * Return: the size of the cmdline field copied. Note that the copy does
 * not guarantee an ending NULL byte. 0 is returned when @task has no mm or
 * when mm->arg_end is zero.
 */
int get_cmdline(struct task_struct *task, char *buffer, int buflen)
{
	int res = 0;
	unsigned int len;
	struct mm_struct *mm = get_task_mm(task);
	unsigned long arg_start, arg_end, env_start, env_end;

	if (!mm)
		goto out;
	if (!mm->arg_end)
		goto out_mm;	/* Shh! No looking before we're done */

	/* Read all four bounds under arg_lock so they are taken together. */
	spin_lock(&mm->arg_lock);
	arg_start = mm->arg_start;
	arg_end = mm->arg_end;
	env_start = mm->env_start;
	env_end = mm->env_end;
	spin_unlock(&mm->arg_lock);

	/* Clamp the argument area to the caller's buffer. */
	len = arg_end - arg_start;

	if (len > buflen)
		len = buflen;

	res = access_process_vm(task, arg_start, buffer, len, FOLL_FORCE);

	/*
	 * If the nul at the end of args has been overwritten, then
	 * assume application is using setproctitle(3).
	 */
	if (res > 0 && buffer[res-1] != '\0' && len < buflen) {
		len = strnlen(buffer, res);
		if (len < res) {
			/* A NUL exists inside the copied args: stop there. */
			res = len;
		} else {
			/* No NUL at all: append the environment area. */
			len = env_end - env_start;
			if (len > buflen - res)
				len = buflen - res;
			res += access_process_vm(task, env_start,
						 buffer+res, len,
						 FOLL_FORCE);
			/* Cut the result at the first NUL, if any. */
			res = strnlen(buffer, res);
		}
	}
out_mm:
	mmput(mm);
out:
	return res;
}
  903. int __weak memcmp_pages(struct page *page1, struct page *page2)
  904. {
  905. char *addr1, *addr2;
  906. int ret;
  907. addr1 = kmap_local_page(page1);
  908. addr2 = kmap_local_page(page2);
  909. ret = memcmp(addr1, addr2, PAGE_SIZE);
  910. kunmap_local(addr2);
  911. kunmap_local(addr1);
  912. return ret;
  913. }
  914. #ifdef CONFIG_PRINTK
  915. /**
  916. * mem_dump_obj - Print available provenance information
  917. * @object: object for which to find provenance information.
  918. *
  919. * This function uses pr_cont(), so that the caller is expected to have
  920. * printed out whatever preamble is appropriate. The provenance information
  921. * depends on the type of object and on how much debugging is enabled.
  922. * For example, for a slab-cache object, the slab name is printed, and,
  923. * if available, the return address and stack trace from the allocation
  924. * and last free path of that object.
  925. */
  926. void mem_dump_obj(void *object)
  927. {
  928. const char *type;
  929. if (kmem_dump_obj(object))
  930. return;
  931. if (vmalloc_dump_obj(object))
  932. return;
  933. if (is_vmalloc_addr(object))
  934. type = "vmalloc memory";
  935. else if (virt_addr_valid(object))
  936. type = "non-slab/vmalloc memory";
  937. else if (object == NULL)
  938. type = "NULL pointer";
  939. else if (object == ZERO_SIZE_PTR)
  940. type = "zero-size pointer";
  941. else
  942. type = "non-paged memory";
  943. pr_cont(" %s\n", type);
  944. }
  945. EXPORT_SYMBOL_GPL(mem_dump_obj);
  946. #endif
/*
 * A driver might set a page logically offline -- PageOffline() -- and
 * turn the page inaccessible in the hypervisor; after that, access to page
 * content can be fatal.
 *
 * Some special PFN walkers -- i.e., /proc/kcore -- read content of random
 * pages after checking PageOffline(); however, these PFN walkers can race
 * with drivers that set PageOffline().
 *
 * page_offline_freeze()/page_offline_thaw() allows for a subsystem to
 * synchronize with such drivers, achieving that a page cannot be set
 * PageOffline() while frozen.
 *
 * page_offline_begin()/page_offline_end() is used by drivers that care about
 * such races when setting a page PageOffline().
 *
 * freeze/thaw take and release the semaphore for reading; begin/end take
 * and release it for writing.
 */
static DECLARE_RWSEM(page_offline_rwsem);
/* Take page_offline_rwsem for reading; paired with page_offline_thaw(). */
void page_offline_freeze(void)
{
	down_read(&page_offline_rwsem);
}
/* Release the read side of page_offline_rwsem taken by page_offline_freeze(). */
void page_offline_thaw(void)
{
	up_read(&page_offline_rwsem);
}
/* Take page_offline_rwsem for writing; paired with page_offline_end(). */
void page_offline_begin(void)
{
	down_write(&page_offline_rwsem);
}
EXPORT_SYMBOL(page_offline_begin);
/* Release the write side of page_offline_rwsem taken by page_offline_begin(). */
void page_offline_end(void)
{
	up_write(&page_offline_rwsem);
}
EXPORT_SYMBOL(page_offline_end);
  982. #ifndef flush_dcache_folio
  983. void flush_dcache_folio(struct folio *folio)
  984. {
  985. long i, nr = folio_nr_pages(folio);
  986. for (i = 0; i < nr; i++)
  987. flush_dcache_page(folio_page(folio, i));
  988. }
  989. EXPORT_SYMBOL(flush_dcache_folio);
  990. #endif
  991. /**
  992. * __compat_vma_mmap() - See description for compat_vma_mmap()
  993. * for details. This is the same operation, only with a specific file operations
  994. * struct which may or may not be the same as vma->vm_file->f_op.
  995. * @f_op: The file operations whose .mmap_prepare() hook is specified.
  996. * @file: The file which backs or will back the mapping.
  997. * @vma: The VMA to apply the .mmap_prepare() hook to.
  998. * Returns: 0 on success or error.
  999. */
  1000. int __compat_vma_mmap(const struct file_operations *f_op,
  1001. struct file *file, struct vm_area_struct *vma)
  1002. {
  1003. struct vm_area_desc desc = {
  1004. .mm = vma->vm_mm,
  1005. .file = file,
  1006. .start = vma->vm_start,
  1007. .end = vma->vm_end,
  1008. .pgoff = vma->vm_pgoff,
  1009. .vm_file = vma->vm_file,
  1010. .vma_flags = vma->flags,
  1011. .page_prot = vma->vm_page_prot,
  1012. .action.type = MMAP_NOTHING, /* Default */
  1013. };
  1014. int err;
  1015. err = f_op->mmap_prepare(&desc);
  1016. if (err)
  1017. return err;
  1018. mmap_action_prepare(&desc.action, &desc);
  1019. set_vma_from_desc(vma, &desc);
  1020. return mmap_action_complete(&desc.action, vma);
  1021. }
  1022. EXPORT_SYMBOL(__compat_vma_mmap);
/**
 * compat_vma_mmap() - Apply the file's .mmap_prepare() hook to an
 * existing VMA and execute any requested actions.
 * @file: The file which possesses an f_op->mmap_prepare() hook.
 * @vma: The VMA to apply the .mmap_prepare() hook to.
 *
 * Ordinarily, .mmap_prepare() is invoked directly upon mmap(). However, certain
 * stacked filesystems invoke a nested mmap hook of an underlying file.
 *
 * Until all filesystems are converted to use .mmap_prepare(), we must be
 * conservative and continue to invoke these stacked filesystems using the
 * deprecated .mmap() hook.
 *
 * However we have a problem if the underlying file system possesses an
 * .mmap_prepare() hook, as we are in a different context when we invoke the
 * .mmap() hook, already having a VMA to deal with.
 *
 * compat_vma_mmap() is a compatibility function that takes VMA state,
 * establishes a struct vm_area_desc descriptor, passes to the underlying
 * .mmap_prepare() hook and applies any changes performed by it.
 *
 * Once the conversion of filesystems is complete this function will no longer
 * be required and will be removed.
 *
 * Returns: 0 on success or error.
 */
int compat_vma_mmap(struct file *file, struct vm_area_struct *vma)
{
	/* Same as __compat_vma_mmap(), using the file's own f_op. */
	return __compat_vma_mmap(file->f_op, file, vma);
}
EXPORT_SYMBOL(compat_vma_mmap);
  1054. static void set_ps_flags(struct page_snapshot *ps, const struct folio *folio,
  1055. const struct page *page)
  1056. {
  1057. /*
  1058. * Only the first page of a high-order buddy page has PageBuddy() set.
  1059. * So we have to check manually whether this page is part of a high-
  1060. * order buddy page.
  1061. */
  1062. if (PageBuddy(page))
  1063. ps->flags |= PAGE_SNAPSHOT_PG_BUDDY;
  1064. else if (page_count(page) == 0 && is_free_buddy_page(page))
  1065. ps->flags |= PAGE_SNAPSHOT_PG_BUDDY;
  1066. if (folio_test_idle(folio))
  1067. ps->flags |= PAGE_SNAPSHOT_PG_IDLE;
  1068. }
/**
 * snapshot_page() - Create a snapshot of a struct page
 * @ps: Pointer to a struct page_snapshot to store the page snapshot
 * @page: The page to snapshot
 *
 * Create a snapshot of the page and store both its struct page and struct
 * folio representations in @ps.
 *
 * A snapshot is marked as "faithful" if the compound state of @page was
 * stable and allowed safe reconstruction of the folio representation. In
 * rare cases where this is not possible (e.g. due to folio splitting),
 * snapshot_page() falls back to treating @page as a single page and the
 * snapshot is marked as "unfaithful". The snapshot_page_is_faithful()
 * helper can be used to check for this condition.
 */
void snapshot_page(struct page_snapshot *ps, const struct page *page)
{
	unsigned long head, nr_pages = 1;
	struct folio *foliop;
	/* Number of retries allowed before giving up on a faithful snapshot. */
	int loops = 5;

	ps->pfn = page_to_pfn(page);
	ps->flags = PAGE_SNAPSHOT_FAITHFUL;

again:
	/* Start every attempt from a zeroed folio and a fresh copy of @page. */
	memset(&ps->folio_snapshot, 0, sizeof(struct folio));
	memcpy(&ps->page_snapshot, page, sizeof(*page));
	head = ps->page_snapshot.compound_head;
	if ((head & 1) == 0) {
		/* Bit 0 clear: @page is not a tail page, so its index is 0. */
		ps->idx = 0;
		foliop = (struct folio *)&ps->page_snapshot;
		if (!folio_test_large(foliop)) {
			/* Not a large folio: the page copy is the whole snapshot. */
			set_ps_flags(ps, page_folio(page), page);
			memcpy(&ps->folio_snapshot, foliop,
			       sizeof(struct page));
			return;
		}
		foliop = (struct folio *)page;
	} else {
		/* Bit 0 set: the remaining bits hold the head page address. */
		foliop = (struct folio *)(head - 1);
		ps->idx = folio_page_idx(foliop, page);
	}

	if (ps->idx < MAX_FOLIO_NR_PAGES) {
		/* Copy the first two pages, then the third if the folio has more than one page. */
		memcpy(&ps->folio_snapshot, foliop, 2 * sizeof(struct page));
		nr_pages = folio_nr_pages(&ps->folio_snapshot);
		if (nr_pages > 1)
			memcpy(&ps->folio_snapshot.__page_2, &foliop->__page_2,
			       sizeof(struct page));
		set_ps_flags(ps, foliop, page);
	}

	if (ps->idx > nr_pages) {
		/* Index and folio size disagree: retry while attempts remain. */
		if (loops-- > 0)
			goto again;
		/* Give up: snapshot @page as a single page, marked unfaithful. */
		clear_compound_head(&ps->page_snapshot);
		foliop = (struct folio *)&ps->page_snapshot;
		memcpy(&ps->folio_snapshot, foliop, sizeof(struct page));
		ps->flags = 0;
		ps->idx = 0;
	}
}
  1127. static int mmap_action_finish(struct mmap_action *action,
  1128. const struct vm_area_struct *vma, int err)
  1129. {
  1130. /*
  1131. * If an error occurs, unmap the VMA altogether and return an error. We
  1132. * only clear the newly allocated VMA, since this function is only
  1133. * invoked if we do NOT merge, so we only clean up the VMA we created.
  1134. */
  1135. if (err) {
  1136. const size_t len = vma_pages(vma) << PAGE_SHIFT;
  1137. do_munmap(current->mm, vma->vm_start, len, NULL);
  1138. if (action->error_hook) {
  1139. /* We may want to filter the error. */
  1140. err = action->error_hook(err);
  1141. /* The caller should not clear the error. */
  1142. VM_WARN_ON_ONCE(!err);
  1143. }
  1144. return err;
  1145. }
  1146. if (action->success_hook)
  1147. return action->success_hook(vma);
  1148. return 0;
  1149. }
  1150. #ifdef CONFIG_MMU
/**
 * mmap_action_prepare - Perform preparatory setup for a VMA descriptor
 * action which needs to be performed.
 * @action: The action to perform.
 * @desc: The VMA descriptor to prepare for @action.
 *
 * Dispatches on @action->type; MMAP_NOTHING requires no preparation.
 */
void mmap_action_prepare(struct mmap_action *action,
			 struct vm_area_desc *desc)
{
	switch (action->type) {
	case MMAP_NOTHING:
		break;
	case MMAP_REMAP_PFN:
		remap_pfn_range_prepare(desc, action->remap.start_pfn);
		break;
	case MMAP_IO_REMAP_PFN:
		io_remap_pfn_range_prepare(desc, action->remap.start_pfn,
					   action->remap.size);
		break;
	}
}
EXPORT_SYMBOL(mmap_action_prepare);
/**
 * mmap_action_complete - Execute VMA descriptor action.
 * @action: The action to perform.
 * @vma: The VMA to perform the action upon.
 *
 * Similar to mmap_action_prepare().
 *
 * Return: 0 on success, or error, at which point the VMA will be unmapped.
 */
int mmap_action_complete(struct mmap_action *action,
			 struct vm_area_struct *vma)
{
	int err = 0;

	switch (action->type) {
	case MMAP_NOTHING:
		break;
	case MMAP_REMAP_PFN:
		err = remap_pfn_range_complete(vma, action->remap.start,
				action->remap.start_pfn, action->remap.size,
				action->remap.pgprot);
		break;
	case MMAP_IO_REMAP_PFN:
		err = io_remap_pfn_range_complete(vma, action->remap.start,
				action->remap.start_pfn, action->remap.size,
				action->remap.pgprot);
		break;
	}

	/* Runs the success/error hooks and unmaps the VMA on error. */
	return mmap_action_finish(action, vma, err);
}
EXPORT_SYMBOL(mmap_action_complete);
  1203. #else
/*
 * !CONFIG_MMU variant of mmap_action_prepare(): only MMAP_NOTHING is
 * supported; the PFN remap actions trigger a one-time warning.
 */
void mmap_action_prepare(struct mmap_action *action,
			struct vm_area_desc *desc)
{
	switch (action->type) {
	case MMAP_NOTHING:
		break;
	case MMAP_REMAP_PFN:
	case MMAP_IO_REMAP_PFN:
		WARN_ON_ONCE(1); /* nommu cannot handle these. */
		break;
	}
}
EXPORT_SYMBOL(mmap_action_prepare);
/*
 * !CONFIG_MMU variant of mmap_action_complete(): the PFN remap actions warn
 * once and fail with -EINVAL. The result is passed to mmap_action_finish()
 * in all cases.
 */
int mmap_action_complete(struct mmap_action *action,
			struct vm_area_struct *vma)
{
	int err = 0;

	switch (action->type) {
	case MMAP_NOTHING:
		break;
	case MMAP_REMAP_PFN:
	case MMAP_IO_REMAP_PFN:
		WARN_ON_ONCE(1); /* nommu cannot handle this. */

		err = -EINVAL;
		break;
	}

	return mmap_action_finish(action, vma, err);
}
EXPORT_SYMBOL(mmap_action_complete);
  1233. #endif
  1234. #ifdef CONFIG_MMU
/**
 * folio_pte_batch - detect a PTE batch for a large folio
 * @folio: The large folio to detect a PTE batch for.
 * @ptep: Page table pointer for the first entry.
 * @pte: Page table entry for the first page.
 * @max_nr: The maximum number of table entries to consider.
 *
 * This is a simplified variant of folio_pte_batch_flags().
 *
 * Detect a PTE batch: consecutive (present) PTEs that map consecutive
 * pages of the same large folio in a single VMA and a single page table.
 *
 * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN,
 * the accessed bit, writable bit, dirty bit and soft-dirty bit.
 *
 * ptep must map any page of the folio. max_nr must be at least one and
 * must be limited by the caller so scanning cannot exceed a single VMA and
 * a single page table.
 *
 * Return: the number of table entries in the batch.
 */
unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte,
		unsigned int max_nr)
{
	/* Called with a NULL second argument and a flags value of 0. */
	return folio_pte_batch_flags(folio, NULL, ptep, &pte, max_nr, 0);
}
  1261. #endif /* CONFIG_MMU */
  1262. #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
  1263. /**
  1264. * page_range_contiguous - test whether the page range is contiguous
  1265. * @page: the start of the page range.
  1266. * @nr_pages: the number of pages in the range.
  1267. *
  1268. * Test whether the page range is contiguous, such that they can be iterated
  1269. * naively, corresponding to iterating a contiguous PFN range.
  1270. *
  1271. * This function should primarily only be used for debug checks, or when
  1272. * working with page ranges that are not naturally contiguous (e.g., pages
  1273. * within a folio are).
  1274. *
  1275. * Returns true if contiguous, otherwise false.
  1276. */
  1277. bool page_range_contiguous(const struct page *page, unsigned long nr_pages)
  1278. {
  1279. const unsigned long start_pfn = page_to_pfn(page);
  1280. const unsigned long end_pfn = start_pfn + nr_pages;
  1281. unsigned long pfn;
  1282. /*
  1283. * The memmap is allocated per memory section, so no need to check
  1284. * within the first section. However, we need to check each other
  1285. * spanned memory section once, making sure the first page in a
  1286. * section could similarly be reached by just iterating pages.
  1287. */
  1288. for (pfn = ALIGN(start_pfn, PAGES_PER_SECTION);
  1289. pfn < end_pfn; pfn += PAGES_PER_SECTION)
  1290. if (unlikely(page + (pfn - start_pfn) != pfn_to_page(pfn)))
  1291. return false;
  1292. return true;
  1293. }
  1294. EXPORT_SYMBOL(page_range_contiguous);
  1295. #endif