memory.c 33 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Memory subsystem support
  4. *
  5. * Written by Matt Tolentino <matthew.e.tolentino@intel.com>
  6. * Dave Hansen <haveblue@us.ibm.com>
  7. *
  8. * This file provides the necessary infrastructure to represent
  9. * a SPARSEMEM-memory-model system's physical memory in /sysfs.
  10. * All arch-independent code that assumes MEMORY_HOTPLUG requires
  11. * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
  12. */
  13. #include <linux/module.h>
  14. #include <linux/init.h>
  15. #include <linux/topology.h>
  16. #include <linux/capability.h>
  17. #include <linux/device.h>
  18. #include <linux/memory.h>
  19. #include <linux/memory_hotplug.h>
  20. #include <linux/mm.h>
  21. #include <linux/stat.h>
  22. #include <linux/slab.h>
  23. #include <linux/xarray.h>
  24. #include <linux/export.h>
  25. #include <linux/atomic.h>
  26. #include <linux/uaccess.h>
  27. #define MEMORY_CLASS_NAME "memory"
  28. static const char *const online_type_to_str[] = {
  29. [MMOP_OFFLINE] = "offline",
  30. [MMOP_ONLINE] = "online",
  31. [MMOP_ONLINE_KERNEL] = "online_kernel",
  32. [MMOP_ONLINE_MOVABLE] = "online_movable",
  33. };
  34. int mhp_online_type_from_str(const char *str)
  35. {
  36. int i;
  37. for (i = 0; i < ARRAY_SIZE(online_type_to_str); i++) {
  38. if (sysfs_streq(str, online_type_to_str[i]))
  39. return i;
  40. }
  41. return -EINVAL;
  42. }
  43. #define to_memory_block(dev) container_of(dev, struct memory_block, dev)
  44. int sections_per_block;
  45. EXPORT_SYMBOL(sections_per_block);
  46. static int memory_subsys_online(struct device *dev);
  47. static int memory_subsys_offline(struct device *dev);
  48. static const struct bus_type memory_subsys = {
  49. .name = MEMORY_CLASS_NAME,
  50. .dev_name = MEMORY_CLASS_NAME,
  51. .online = memory_subsys_online,
  52. .offline = memory_subsys_offline,
  53. };
  54. /*
  55. * Memory blocks are cached in a local radix tree to avoid
  56. * a costly linear search for the corresponding device on
  57. * the subsystem bus.
  58. */
  59. static DEFINE_XARRAY(memory_blocks);
  60. /*
  61. * Memory groups, indexed by memory group id (mgid).
  62. */
  63. static DEFINE_XARRAY_FLAGS(memory_groups, XA_FLAGS_ALLOC);
  64. #define MEMORY_GROUP_MARK_DYNAMIC XA_MARK_1
  65. static BLOCKING_NOTIFIER_HEAD(memory_chain);
  66. int register_memory_notifier(struct notifier_block *nb)
  67. {
  68. return blocking_notifier_chain_register(&memory_chain, nb);
  69. }
  70. EXPORT_SYMBOL(register_memory_notifier);
  71. void unregister_memory_notifier(struct notifier_block *nb)
  72. {
  73. blocking_notifier_chain_unregister(&memory_chain, nb);
  74. }
  75. EXPORT_SYMBOL(unregister_memory_notifier);
  76. static void memory_block_release(struct device *dev)
  77. {
  78. struct memory_block *mem = to_memory_block(dev);
  79. /* Verify that the altmap is freed */
  80. WARN_ON(mem->altmap);
  81. kfree(mem);
  82. }
  83. /* Max block size to be set by memory_block_advise_max_size */
  84. static unsigned long memory_block_advised_size;
  85. static bool memory_block_advised_size_queried;
  86. /**
  87. * memory_block_advise_max_size() - advise memory hotplug on the max suggested
  88. * block size, usually for alignment.
  89. * @size: suggestion for maximum block size. must be aligned on power of 2.
  90. *
  91. * Early boot software (pre-allocator init) may advise archs on the max block
  92. * size. This value can only decrease after initialization, as the intent is
  93. * to identify the largest supported alignment for all sources.
  94. *
  95. * Use of this value is arch-defined, as is min/max block size.
  96. *
  97. * Return: 0 on success
  98. * -EINVAL if size is 0 or not pow2 aligned
  99. * -EBUSY if value has already been probed
  100. */
  101. int __init memory_block_advise_max_size(unsigned long size)
  102. {
  103. if (!size || !is_power_of_2(size))
  104. return -EINVAL;
  105. if (memory_block_advised_size_queried)
  106. return -EBUSY;
  107. if (memory_block_advised_size)
  108. memory_block_advised_size = min(memory_block_advised_size, size);
  109. else
  110. memory_block_advised_size = size;
  111. return 0;
  112. }
  113. /**
  114. * memory_block_advised_max_size() - query advised max hotplug block size.
  115. *
  116. * After the first call, the value can never change. Callers looking for the
  117. * actual block size should use memory_block_size_bytes. This interface is
  118. * intended for use by arch-init when initializing the hotplug block size.
  119. *
  120. * Return: advised size in bytes, or 0 if never set.
  121. */
  122. unsigned long memory_block_advised_max_size(void)
  123. {
  124. memory_block_advised_size_queried = true;
  125. return memory_block_advised_size;
  126. }
  127. unsigned long __weak memory_block_size_bytes(void)
  128. {
  129. return MIN_MEMORY_BLOCK_SIZE;
  130. }
  131. EXPORT_SYMBOL_GPL(memory_block_size_bytes);
  132. /* Show the memory block ID, relative to the memory block size */
  133. static ssize_t phys_index_show(struct device *dev,
  134. struct device_attribute *attr, char *buf)
  135. {
  136. struct memory_block *mem = to_memory_block(dev);
  137. return sysfs_emit(buf, "%08lx\n", memory_block_id(mem->start_section_nr));
  138. }
  139. /*
  140. * Legacy interface that we cannot remove. Always indicate "removable"
  141. * with CONFIG_MEMORY_HOTREMOVE - bad heuristic.
  142. */
  143. static ssize_t removable_show(struct device *dev, struct device_attribute *attr,
  144. char *buf)
  145. {
  146. return sysfs_emit(buf, "%d\n", (int)IS_ENABLED(CONFIG_MEMORY_HOTREMOVE));
  147. }
  148. /*
  149. * online, offline, going offline, etc.
  150. */
  151. static ssize_t state_show(struct device *dev, struct device_attribute *attr,
  152. char *buf)
  153. {
  154. struct memory_block *mem = to_memory_block(dev);
  155. const char *output;
  156. /*
  157. * We can probably put these states in a nice little array
  158. * so that they're not open-coded
  159. */
  160. switch (mem->state) {
  161. case MEM_ONLINE:
  162. output = "online";
  163. break;
  164. case MEM_OFFLINE:
  165. output = "offline";
  166. break;
  167. case MEM_GOING_OFFLINE:
  168. output = "going-offline";
  169. break;
  170. default:
  171. WARN_ON(1);
  172. return sysfs_emit(buf, "ERROR-UNKNOWN-%d\n", mem->state);
  173. }
  174. return sysfs_emit(buf, "%s\n", output);
  175. }
  176. int memory_notify(enum memory_block_state state, void *v)
  177. {
  178. return blocking_notifier_call_chain(&memory_chain, state, v);
  179. }
  180. #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
  181. static unsigned long memblk_nr_poison(struct memory_block *mem);
  182. #else
  183. static inline unsigned long memblk_nr_poison(struct memory_block *mem)
  184. {
  185. return 0;
  186. }
  187. #endif
  188. /*
  189. * Must acquire mem_hotplug_lock in write mode.
  190. */
  191. static int memory_block_online(struct memory_block *mem)
  192. {
  193. unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
  194. unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
  195. unsigned long nr_vmemmap_pages = 0;
  196. struct zone *zone;
  197. int ret;
  198. if (memblk_nr_poison(mem))
  199. return -EHWPOISON;
  200. zone = zone_for_pfn_range(mem->online_type, mem->nid, mem->group,
  201. start_pfn, nr_pages);
  202. /*
  203. * Although vmemmap pages have a different lifecycle than the pages
  204. * they describe (they remain until the memory is unplugged), doing
  205. * their initialization and accounting at memory onlining/offlining
  206. * stage helps to keep accounting easier to follow - e.g vmemmaps
  207. * belong to the same zone as the memory they backed.
  208. */
  209. if (mem->altmap)
  210. nr_vmemmap_pages = mem->altmap->free;
  211. mem_hotplug_begin();
  212. if (nr_vmemmap_pages) {
  213. ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, zone);
  214. if (ret)
  215. goto out;
  216. }
  217. ret = online_pages(start_pfn + nr_vmemmap_pages,
  218. nr_pages - nr_vmemmap_pages, zone, mem->group);
  219. if (ret) {
  220. if (nr_vmemmap_pages)
  221. mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
  222. goto out;
  223. }
  224. /*
  225. * Account once onlining succeeded. If the zone was unpopulated, it is
  226. * now already properly populated.
  227. */
  228. if (nr_vmemmap_pages)
  229. adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
  230. nr_vmemmap_pages);
  231. mem->zone = zone;
  232. out:
  233. mem_hotplug_done();
  234. return ret;
  235. }
  236. /*
  237. * Must acquire mem_hotplug_lock in write mode.
  238. */
  239. static int memory_block_offline(struct memory_block *mem)
  240. {
  241. unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
  242. unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
  243. unsigned long nr_vmemmap_pages = 0;
  244. int ret;
  245. if (!mem->zone)
  246. return -EINVAL;
  247. /*
  248. * Unaccount before offlining, such that unpopulated zone and kthreads
  249. * can properly be torn down in offline_pages().
  250. */
  251. if (mem->altmap)
  252. nr_vmemmap_pages = mem->altmap->free;
  253. mem_hotplug_begin();
  254. if (nr_vmemmap_pages)
  255. adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
  256. -nr_vmemmap_pages);
  257. ret = offline_pages(start_pfn + nr_vmemmap_pages,
  258. nr_pages - nr_vmemmap_pages, mem->zone, mem->group);
  259. if (ret) {
  260. /* offline_pages() failed. Account back. */
  261. if (nr_vmemmap_pages)
  262. adjust_present_page_count(pfn_to_page(start_pfn),
  263. mem->group, nr_vmemmap_pages);
  264. goto out;
  265. }
  266. if (nr_vmemmap_pages)
  267. mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
  268. mem->zone = NULL;
  269. out:
  270. mem_hotplug_done();
  271. return ret;
  272. }
  273. /*
  274. * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
  275. * OK to have direct references to sparsemem variables in here.
  276. */
  277. static int
  278. memory_block_action(struct memory_block *mem, unsigned long action)
  279. {
  280. int ret;
  281. switch (action) {
  282. case MEM_ONLINE:
  283. ret = memory_block_online(mem);
  284. break;
  285. case MEM_OFFLINE:
  286. ret = memory_block_offline(mem);
  287. break;
  288. default:
  289. WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
  290. "%ld\n", __func__, mem->start_section_nr, action, action);
  291. ret = -EINVAL;
  292. }
  293. return ret;
  294. }
  295. static int memory_block_change_state(struct memory_block *mem,
  296. unsigned long to_state, unsigned long from_state_req)
  297. {
  298. int ret = 0;
  299. if (mem->state != from_state_req)
  300. return -EINVAL;
  301. if (to_state == MEM_OFFLINE)
  302. mem->state = MEM_GOING_OFFLINE;
  303. ret = memory_block_action(mem, to_state);
  304. mem->state = ret ? from_state_req : to_state;
  305. return ret;
  306. }
  307. /* The device lock serializes operations on memory_subsys_[online|offline] */
  308. static int memory_subsys_online(struct device *dev)
  309. {
  310. struct memory_block *mem = to_memory_block(dev);
  311. int ret;
  312. if (mem->state == MEM_ONLINE)
  313. return 0;
  314. /*
  315. * When called via device_online() without configuring the online_type,
  316. * we want to default to MMOP_ONLINE.
  317. */
  318. if (mem->online_type == MMOP_OFFLINE)
  319. mem->online_type = MMOP_ONLINE;
  320. ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
  321. mem->online_type = MMOP_OFFLINE;
  322. return ret;
  323. }
  324. static int memory_subsys_offline(struct device *dev)
  325. {
  326. struct memory_block *mem = to_memory_block(dev);
  327. if (mem->state == MEM_OFFLINE)
  328. return 0;
  329. return memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
  330. }
  331. static ssize_t state_store(struct device *dev, struct device_attribute *attr,
  332. const char *buf, size_t count)
  333. {
  334. const int online_type = mhp_online_type_from_str(buf);
  335. struct memory_block *mem = to_memory_block(dev);
  336. int ret;
  337. if (online_type < 0)
  338. return -EINVAL;
  339. ret = lock_device_hotplug_sysfs();
  340. if (ret)
  341. return ret;
  342. switch (online_type) {
  343. case MMOP_ONLINE_KERNEL:
  344. case MMOP_ONLINE_MOVABLE:
  345. case MMOP_ONLINE:
  346. /* mem->online_type is protected by device_hotplug_lock */
  347. mem->online_type = online_type;
  348. ret = device_online(&mem->dev);
  349. break;
  350. case MMOP_OFFLINE:
  351. ret = device_offline(&mem->dev);
  352. break;
  353. default:
  354. ret = -EINVAL; /* should never happen */
  355. }
  356. unlock_device_hotplug();
  357. if (ret < 0)
  358. return ret;
  359. if (ret)
  360. return -EINVAL;
  361. return count;
  362. }
  363. /*
  364. * Legacy interface that we cannot remove: s390x exposes the storage increment
  365. * covered by a memory block, allowing for identifying which memory blocks
  366. * comprise a storage increment. Since a memory block spans complete
  367. * storage increments nowadays, this interface is basically unused. Other
  368. * archs never exposed != 0.
  369. */
  370. static ssize_t phys_device_show(struct device *dev,
  371. struct device_attribute *attr, char *buf)
  372. {
  373. struct memory_block *mem = to_memory_block(dev);
  374. unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
  375. return sysfs_emit(buf, "%d\n",
  376. arch_get_memory_phys_device(start_pfn));
  377. }
  378. #ifdef CONFIG_MEMORY_HOTREMOVE
  379. static int print_allowed_zone(char *buf, int len, int nid,
  380. struct memory_group *group,
  381. unsigned long start_pfn, unsigned long nr_pages,
  382. int online_type, struct zone *default_zone)
  383. {
  384. struct zone *zone;
  385. zone = zone_for_pfn_range(online_type, nid, group, start_pfn, nr_pages);
  386. if (zone == default_zone)
  387. return 0;
  388. return sysfs_emit_at(buf, len, " %s", zone->name);
  389. }
  390. static ssize_t valid_zones_show(struct device *dev,
  391. struct device_attribute *attr, char *buf)
  392. {
  393. struct memory_block *mem = to_memory_block(dev);
  394. unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
  395. unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
  396. struct memory_group *group = mem->group;
  397. struct zone *default_zone;
  398. int nid = mem->nid;
  399. int len;
  400. /*
  401. * Check the existing zone. Make sure that we do that only on the
  402. * online nodes otherwise the page_zone is not reliable
  403. */
  404. if (mem->state == MEM_ONLINE) {
  405. /*
  406. * If !mem->zone, the memory block spans multiple zones and
  407. * cannot get offlined.
  408. */
  409. return sysfs_emit(buf, "%s\n",
  410. mem->zone ? mem->zone->name : "none");
  411. }
  412. default_zone = zone_for_pfn_range(MMOP_ONLINE, nid, group,
  413. start_pfn, nr_pages);
  414. len = sysfs_emit(buf, "%s", default_zone->name);
  415. len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages,
  416. MMOP_ONLINE_KERNEL, default_zone);
  417. len += print_allowed_zone(buf, len, nid, group, start_pfn, nr_pages,
  418. MMOP_ONLINE_MOVABLE, default_zone);
  419. len += sysfs_emit_at(buf, len, "\n");
  420. return len;
  421. }
  422. static DEVICE_ATTR_RO(valid_zones);
  423. #endif
  424. static DEVICE_ATTR_RO(phys_index);
  425. static DEVICE_ATTR_RW(state);
  426. static DEVICE_ATTR_RO(phys_device);
  427. static DEVICE_ATTR_RO(removable);
  428. /*
  429. * Show the memory block size (shared by all memory blocks).
  430. */
  431. static ssize_t block_size_bytes_show(struct device *dev,
  432. struct device_attribute *attr, char *buf)
  433. {
  434. return sysfs_emit(buf, "%lx\n", memory_block_size_bytes());
  435. }
  436. static DEVICE_ATTR_RO(block_size_bytes);
  437. /*
  438. * Memory auto online policy.
  439. */
  440. static ssize_t auto_online_blocks_show(struct device *dev,
  441. struct device_attribute *attr, char *buf)
  442. {
  443. return sysfs_emit(buf, "%s\n",
  444. online_type_to_str[mhp_get_default_online_type()]);
  445. }
  446. static ssize_t auto_online_blocks_store(struct device *dev,
  447. struct device_attribute *attr,
  448. const char *buf, size_t count)
  449. {
  450. const int online_type = mhp_online_type_from_str(buf);
  451. if (online_type < 0)
  452. return -EINVAL;
  453. mhp_set_default_online_type(online_type);
  454. return count;
  455. }
  456. static DEVICE_ATTR_RW(auto_online_blocks);
  457. #ifdef CONFIG_CRASH_HOTPLUG
  458. #include <linux/kexec.h>
  459. static ssize_t crash_hotplug_show(struct device *dev,
  460. struct device_attribute *attr, char *buf)
  461. {
  462. return sysfs_emit(buf, "%d\n", crash_check_hotplug_support());
  463. }
  464. static DEVICE_ATTR_RO(crash_hotplug);
  465. #endif
  466. /*
  467. * Some architectures will have custom drivers to do this, and
  468. * will not need to do it from userspace. The fake hot-add code
  469. * as well as ppc64 will do all of their discovery in userspace
  470. * and will require this interface.
  471. */
  472. #ifdef CONFIG_ARCH_MEMORY_PROBE
  473. static ssize_t probe_store(struct device *dev, struct device_attribute *attr,
  474. const char *buf, size_t count)
  475. {
  476. u64 phys_addr;
  477. int nid, ret;
  478. unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;
  479. ret = kstrtoull(buf, 0, &phys_addr);
  480. if (ret)
  481. return ret;
  482. if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
  483. return -EINVAL;
  484. ret = lock_device_hotplug_sysfs();
  485. if (ret)
  486. return ret;
  487. nid = memory_add_physaddr_to_nid(phys_addr);
  488. ret = __add_memory(nid, phys_addr,
  489. MIN_MEMORY_BLOCK_SIZE * sections_per_block,
  490. MHP_NONE);
  491. if (ret)
  492. goto out;
  493. ret = count;
  494. out:
  495. unlock_device_hotplug();
  496. return ret;
  497. }
  498. static DEVICE_ATTR_WO(probe);
  499. #endif
  500. #ifdef CONFIG_MEMORY_FAILURE
  501. /*
  502. * Support for offlining pages of memory
  503. */
  504. /* Soft offline a page */
  505. static ssize_t soft_offline_page_store(struct device *dev,
  506. struct device_attribute *attr,
  507. const char *buf, size_t count)
  508. {
  509. int ret;
  510. u64 pfn;
  511. if (!capable(CAP_SYS_ADMIN))
  512. return -EPERM;
  513. if (kstrtoull(buf, 0, &pfn) < 0)
  514. return -EINVAL;
  515. pfn >>= PAGE_SHIFT;
  516. ret = soft_offline_page(pfn, 0);
  517. return ret == 0 ? count : ret;
  518. }
  519. /* Forcibly offline a page, including killing processes. */
  520. static ssize_t hard_offline_page_store(struct device *dev,
  521. struct device_attribute *attr,
  522. const char *buf, size_t count)
  523. {
  524. int ret;
  525. u64 pfn;
  526. if (!capable(CAP_SYS_ADMIN))
  527. return -EPERM;
  528. if (kstrtoull(buf, 0, &pfn) < 0)
  529. return -EINVAL;
  530. pfn >>= PAGE_SHIFT;
  531. ret = memory_failure(pfn, MF_SW_SIMULATED);
  532. if (ret == -EOPNOTSUPP)
  533. ret = 0;
  534. return ret ? ret : count;
  535. }
  536. static DEVICE_ATTR_WO(soft_offline_page);
  537. static DEVICE_ATTR_WO(hard_offline_page);
  538. #endif
  539. /* See phys_device_show(). */
  540. int __weak arch_get_memory_phys_device(unsigned long start_pfn)
  541. {
  542. return 0;
  543. }
  544. /*
  545. * A reference for the returned memory block device is acquired.
  546. *
  547. * Called under device_hotplug_lock.
  548. */
  549. struct memory_block *find_memory_block_by_id(unsigned long block_id)
  550. {
  551. struct memory_block *mem;
  552. mem = xa_load(&memory_blocks, block_id);
  553. if (mem)
  554. get_device(&mem->dev);
  555. return mem;
  556. }
  557. /*
  558. * Called under device_hotplug_lock.
  559. */
  560. struct memory_block *find_memory_block(unsigned long section_nr)
  561. {
  562. unsigned long block_id = memory_block_id(section_nr);
  563. return find_memory_block_by_id(block_id);
  564. }
  565. static struct attribute *memory_memblk_attrs[] = {
  566. &dev_attr_phys_index.attr,
  567. &dev_attr_state.attr,
  568. &dev_attr_phys_device.attr,
  569. &dev_attr_removable.attr,
  570. #ifdef CONFIG_MEMORY_HOTREMOVE
  571. &dev_attr_valid_zones.attr,
  572. #endif
  573. NULL
  574. };
  575. static const struct attribute_group memory_memblk_attr_group = {
  576. .attrs = memory_memblk_attrs,
  577. };
  578. static const struct attribute_group *memory_memblk_attr_groups[] = {
  579. &memory_memblk_attr_group,
  580. NULL,
  581. };
  582. static int __add_memory_block(struct memory_block *memory)
  583. {
  584. int ret;
  585. memory->dev.bus = &memory_subsys;
  586. memory->dev.id = memory->start_section_nr / sections_per_block;
  587. memory->dev.release = memory_block_release;
  588. memory->dev.groups = memory_memblk_attr_groups;
  589. memory->dev.offline = memory->state == MEM_OFFLINE;
  590. ret = device_register(&memory->dev);
  591. if (ret) {
  592. put_device(&memory->dev);
  593. return ret;
  594. }
  595. ret = xa_err(xa_store(&memory_blocks, memory->dev.id, memory,
  596. GFP_KERNEL));
  597. if (ret)
  598. device_unregister(&memory->dev);
  599. return ret;
  600. }
  601. static struct zone *early_node_zone_for_memory_block(struct memory_block *mem,
  602. int nid)
  603. {
  604. const unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
  605. const unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
  606. struct zone *zone, *matching_zone = NULL;
  607. pg_data_t *pgdat = NODE_DATA(nid);
  608. int i;
  609. /*
  610. * This logic only works for early memory, when the applicable zones
  611. * already span the memory block. We don't expect overlapping zones on
  612. * a single node for early memory. So if we're told that some PFNs
  613. * of a node fall into this memory block, we can assume that all node
  614. * zones that intersect with the memory block are actually applicable.
  615. * No need to look at the memmap.
  616. */
  617. for (i = 0; i < MAX_NR_ZONES; i++) {
  618. zone = pgdat->node_zones + i;
  619. if (!populated_zone(zone))
  620. continue;
  621. if (!zone_intersects(zone, start_pfn, nr_pages))
  622. continue;
  623. if (!matching_zone) {
  624. matching_zone = zone;
  625. continue;
  626. }
  627. /* Spans multiple zones ... */
  628. matching_zone = NULL;
  629. break;
  630. }
  631. return matching_zone;
  632. }
  633. #ifdef CONFIG_NUMA
  634. /**
  635. * memory_block_add_nid_early() - Indicate that early system RAM falling into
  636. * this memory block device (partially) belongs
  637. * to the given node.
  638. * @mem: The memory block device.
  639. * @nid: The node id.
  640. *
  641. * Indicate that early system RAM falling into this memory block (partially)
  642. * belongs to the given node. This will also properly set/adjust mem->zone based
  643. * on the zone ranges of the given node.
  644. *
  645. * Memory hotplug handles this on memory block creation, where we can only have
  646. * a single nid span a memory block.
  647. */
  648. void memory_block_add_nid_early(struct memory_block *mem, int nid)
  649. {
  650. if (mem->nid != nid) {
  651. /*
  652. * For early memory we have to determine the zone when setting
  653. * the node id and handle multiple nodes spanning a single
  654. * memory block by indicate via zone == NULL that we're not
  655. * dealing with a single zone. So if we're setting the node id
  656. * the first time, determine if there is a single zone. If we're
  657. * setting the node id a second time to a different node,
  658. * invalidate the single detected zone.
  659. */
  660. if (mem->nid == NUMA_NO_NODE)
  661. mem->zone = early_node_zone_for_memory_block(mem, nid);
  662. else
  663. mem->zone = NULL;
  664. /*
  665. * If this memory block spans multiple nodes, we only indicate
  666. * the last processed node. If we span multiple nodes (not applicable
  667. * to hotplugged memory), zone == NULL will prohibit memory offlining
  668. * and consequently unplug.
  669. */
  670. mem->nid = nid;
  671. }
  672. }
  673. #endif
  674. static int add_memory_block(unsigned long block_id, int nid, unsigned long state,
  675. struct vmem_altmap *altmap,
  676. struct memory_group *group)
  677. {
  678. struct memory_block *mem;
  679. int ret = 0;
  680. mem = find_memory_block_by_id(block_id);
  681. if (mem) {
  682. put_device(&mem->dev);
  683. return -EEXIST;
  684. }
  685. mem = kzalloc_obj(*mem);
  686. if (!mem)
  687. return -ENOMEM;
  688. mem->start_section_nr = block_id * sections_per_block;
  689. mem->state = state;
  690. mem->nid = nid;
  691. mem->altmap = altmap;
  692. INIT_LIST_HEAD(&mem->group_next);
  693. #ifndef CONFIG_NUMA
  694. if (state == MEM_ONLINE)
  695. /*
  696. * MEM_ONLINE at this point implies early memory. With NUMA,
  697. * we'll determine the zone when setting the node id via
  698. * memory_block_add_nid(). Memory hotplug updated the zone
  699. * manually when memory onlining/offlining succeeds.
  700. */
  701. mem->zone = early_node_zone_for_memory_block(mem, NUMA_NO_NODE);
  702. #endif /* CONFIG_NUMA */
  703. ret = __add_memory_block(mem);
  704. if (ret)
  705. return ret;
  706. if (group) {
  707. mem->group = group;
  708. list_add(&mem->group_next, &group->memory_blocks);
  709. }
  710. return 0;
  711. }
  712. static void remove_memory_block(struct memory_block *memory)
  713. {
  714. if (WARN_ON_ONCE(memory->dev.bus != &memory_subsys))
  715. return;
  716. WARN_ON(xa_erase(&memory_blocks, memory->dev.id) == NULL);
  717. if (memory->group) {
  718. list_del(&memory->group_next);
  719. memory->group = NULL;
  720. }
  721. /* drop the ref. we got via find_memory_block() */
  722. put_device(&memory->dev);
  723. device_unregister(&memory->dev);
  724. }
  725. /*
  726. * Create memory block devices for the given memory area. Start and size
  727. * have to be aligned to memory block granularity. Memory block devices
  728. * will be initialized as offline.
  729. *
  730. * Called under device_hotplug_lock.
  731. */
  732. int create_memory_block_devices(unsigned long start, unsigned long size,
  733. int nid, struct vmem_altmap *altmap,
  734. struct memory_group *group)
  735. {
  736. const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
  737. unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
  738. struct memory_block *mem;
  739. unsigned long block_id;
  740. int ret = 0;
  741. if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
  742. !IS_ALIGNED(size, memory_block_size_bytes())))
  743. return -EINVAL;
  744. for (block_id = start_block_id; block_id != end_block_id; block_id++) {
  745. ret = add_memory_block(block_id, nid, MEM_OFFLINE, altmap, group);
  746. if (ret)
  747. break;
  748. }
  749. if (ret) {
  750. end_block_id = block_id;
  751. for (block_id = start_block_id; block_id != end_block_id;
  752. block_id++) {
  753. mem = find_memory_block_by_id(block_id);
  754. if (WARN_ON_ONCE(!mem))
  755. continue;
  756. remove_memory_block(mem);
  757. }
  758. }
  759. return ret;
  760. }
  761. /*
  762. * Remove memory block devices for the given memory area. Start and size
  763. * have to be aligned to memory block granularity. Memory block devices
  764. * have to be offline.
  765. *
  766. * Called under device_hotplug_lock.
  767. */
  768. void remove_memory_block_devices(unsigned long start, unsigned long size)
  769. {
  770. const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
  771. const unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
  772. struct memory_block *mem;
  773. unsigned long block_id;
  774. if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
  775. !IS_ALIGNED(size, memory_block_size_bytes())))
  776. return;
  777. for (block_id = start_block_id; block_id != end_block_id; block_id++) {
  778. mem = find_memory_block_by_id(block_id);
  779. if (WARN_ON_ONCE(!mem))
  780. continue;
  781. num_poisoned_pages_sub(-1UL, memblk_nr_poison(mem));
  782. unregister_memory_block_under_nodes(mem);
  783. remove_memory_block(mem);
  784. }
  785. }
/*
 * Attributes attached to the memory subsystem root; this table reaches
 * subsys_system_register() through memory_root_attr_groups below. Entries
 * wrapped in a CONFIG_* guard are only built in when that option is set.
 */
static struct attribute *memory_root_attrs[] = {
#ifdef CONFIG_ARCH_MEMORY_PROBE
	&dev_attr_probe.attr,
#endif

#ifdef CONFIG_MEMORY_FAILURE
	&dev_attr_soft_offline_page.attr,
	&dev_attr_hard_offline_page.attr,
#endif

	&dev_attr_block_size_bytes.attr,
	&dev_attr_auto_online_blocks.attr,
#ifdef CONFIG_CRASH_HOTPLUG
	&dev_attr_crash_hotplug.attr,
#endif
	NULL	/* sentinel: terminates the attribute list */
};

/* Single, unnamed attribute group wrapping the root attributes above. */
static const struct attribute_group memory_root_attr_group = {
	.attrs = memory_root_attrs,
};

/* NULL-terminated group list handed to subsys_system_register(). */
static const struct attribute_group *memory_root_attr_groups[] = {
	&memory_root_attr_group,
	NULL,
};
  808. /*
  809. * Initialize the sysfs support for memory devices. At the time this function
  810. * is called, we cannot have concurrent creation/deletion of memory block
  811. * devices, the device_hotplug_lock is not needed.
  812. */
  813. void __init memory_dev_init(void)
  814. {
  815. int ret;
  816. unsigned long block_sz, block_id, nr;
  817. /* Validate the configured memory block size */
  818. block_sz = memory_block_size_bytes();
  819. if (!is_power_of_2(block_sz) || block_sz < MIN_MEMORY_BLOCK_SIZE)
  820. panic("Memory block size not suitable: 0x%lx\n", block_sz);
  821. sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;
  822. ret = subsys_system_register(&memory_subsys, memory_root_attr_groups);
  823. if (ret)
  824. panic("%s() failed to register subsystem: %d\n", __func__, ret);
  825. /*
  826. * Create entries for memory sections that were found during boot
  827. * and have been initialized. Use @block_id to track the last
  828. * handled block and initialize it to an invalid value (ULONG_MAX)
  829. * to bypass the block ID matching check for the first present
  830. * block so that it can be covered.
  831. */
  832. block_id = ULONG_MAX;
  833. for_each_present_section_nr(0, nr) {
  834. if (block_id != ULONG_MAX && memory_block_id(nr) == block_id)
  835. continue;
  836. block_id = memory_block_id(nr);
  837. ret = add_memory_block(block_id, NUMA_NO_NODE, MEM_ONLINE, NULL, NULL);
  838. if (ret) {
  839. panic("%s() failed to add memory block: %d\n",
  840. __func__, ret);
  841. }
  842. }
  843. }
  844. /**
  845. * walk_memory_blocks - walk through all present memory blocks overlapped
  846. * by the range [start, start + size)
  847. *
  848. * @start: start address of the memory range
  849. * @size: size of the memory range
  850. * @arg: argument passed to func
  851. * @func: callback for each memory section walked
  852. *
  853. * This function walks through all present memory blocks overlapped by the
  854. * range [start, start + size), calling func on each memory block.
  855. *
  856. * In case func() returns an error, walking is aborted and the error is
  857. * returned.
  858. *
  859. * Called under device_hotplug_lock.
  860. */
  861. int walk_memory_blocks(unsigned long start, unsigned long size,
  862. void *arg, walk_memory_blocks_func_t func)
  863. {
  864. const unsigned long start_block_id = phys_to_block_id(start);
  865. const unsigned long end_block_id = phys_to_block_id(start + size - 1);
  866. struct memory_block *mem;
  867. unsigned long block_id;
  868. int ret = 0;
  869. if (!size)
  870. return 0;
  871. for (block_id = start_block_id; block_id <= end_block_id; block_id++) {
  872. mem = find_memory_block_by_id(block_id);
  873. if (!mem)
  874. continue;
  875. ret = func(mem, arg);
  876. put_device(&mem->dev);
  877. if (ret)
  878. break;
  879. }
  880. return ret;
  881. }
/*
 * Context handed through bus_for_each_dev() so that the per-device
 * callback can forward to a walk_memory_blocks_func_t.
 */
struct for_each_memory_block_cb_data {
	walk_memory_blocks_func_t func;	/* function to call per memory block */
	void *arg;			/* opaque argument passed on to func */
};
  886. static int for_each_memory_block_cb(struct device *dev, void *data)
  887. {
  888. struct memory_block *mem = to_memory_block(dev);
  889. struct for_each_memory_block_cb_data *cb_data = data;
  890. return cb_data->func(mem, cb_data->arg);
  891. }
  892. /**
  893. * for_each_memory_block - walk through all present memory blocks
  894. *
  895. * @arg: argument passed to func
  896. * @func: callback for each memory block walked
  897. *
  898. * This function walks through all present memory blocks, calling func on
  899. * each memory block.
  900. *
  901. * In case func() returns an error, walking is aborted and the error is
  902. * returned.
  903. */
  904. int for_each_memory_block(void *arg, walk_memory_blocks_func_t func)
  905. {
  906. struct for_each_memory_block_cb_data cb_data = {
  907. .func = func,
  908. .arg = arg,
  909. };
  910. return bus_for_each_dev(&memory_subsys, NULL, &cb_data,
  911. for_each_memory_block_cb);
  912. }
  913. /*
  914. * This is an internal helper to unify allocation and initialization of
  915. * memory groups. Note that the passed memory group will be copied to a
  916. * dynamically allocated memory group. After this call, the passed
  917. * memory group should no longer be used.
  918. */
  919. static int memory_group_register(struct memory_group group)
  920. {
  921. struct memory_group *new_group;
  922. uint32_t mgid;
  923. int ret;
  924. if (!node_possible(group.nid))
  925. return -EINVAL;
  926. new_group = kzalloc_obj(group);
  927. if (!new_group)
  928. return -ENOMEM;
  929. *new_group = group;
  930. INIT_LIST_HEAD(&new_group->memory_blocks);
  931. ret = xa_alloc(&memory_groups, &mgid, new_group, xa_limit_31b,
  932. GFP_KERNEL);
  933. if (ret) {
  934. kfree(new_group);
  935. return ret;
  936. } else if (group.is_dynamic) {
  937. xa_set_mark(&memory_groups, mgid, MEMORY_GROUP_MARK_DYNAMIC);
  938. }
  939. return mgid;
  940. }
  941. /**
  942. * memory_group_register_static() - Register a static memory group.
  943. * @nid: The node id.
  944. * @max_pages: The maximum number of pages we'll have in this static memory
  945. * group.
  946. *
  947. * Register a new static memory group and return the memory group id.
  948. * All memory in the group belongs to a single unit, such as a DIMM. All
  949. * memory belonging to a static memory group is added in one go to be removed
  950. * in one go -- it's static.
  951. *
  952. * Returns an error if out of memory, if the node id is invalid, if no new
  953. * memory groups can be registered, or if max_pages is invalid (0). Otherwise,
  954. * returns the new memory group id.
  955. */
  956. int memory_group_register_static(int nid, unsigned long max_pages)
  957. {
  958. struct memory_group group = {
  959. .nid = nid,
  960. .s = {
  961. .max_pages = max_pages,
  962. },
  963. };
  964. if (!max_pages)
  965. return -EINVAL;
  966. return memory_group_register(group);
  967. }
  968. EXPORT_SYMBOL_GPL(memory_group_register_static);
  969. /**
  970. * memory_group_register_dynamic() - Register a dynamic memory group.
  971. * @nid: The node id.
  972. * @unit_pages: Unit in pages in which is memory added/removed in this dynamic
  973. * memory group.
  974. *
  975. * Register a new dynamic memory group and return the memory group id.
  976. * Memory within a dynamic memory group is added/removed dynamically
  977. * in unit_pages.
  978. *
  979. * Returns an error if out of memory, if the node id is invalid, if no new
  980. * memory groups can be registered, or if unit_pages is invalid (0, not a
  981. * power of two, smaller than a single memory block). Otherwise, returns the
  982. * new memory group id.
  983. */
  984. int memory_group_register_dynamic(int nid, unsigned long unit_pages)
  985. {
  986. struct memory_group group = {
  987. .nid = nid,
  988. .is_dynamic = true,
  989. .d = {
  990. .unit_pages = unit_pages,
  991. },
  992. };
  993. if (!unit_pages || !is_power_of_2(unit_pages) ||
  994. unit_pages < PHYS_PFN(memory_block_size_bytes()))
  995. return -EINVAL;
  996. return memory_group_register(group);
  997. }
  998. EXPORT_SYMBOL_GPL(memory_group_register_dynamic);
  999. /**
  1000. * memory_group_unregister() - Unregister a memory group.
  1001. * @mgid: the memory group id
  1002. *
  1003. * Unregister a memory group. If any memory block still belongs to this
  1004. * memory group, unregistering will fail.
  1005. *
  1006. * Returns -EINVAL if the memory group id is invalid, returns -EBUSY if some
  1007. * memory blocks still belong to this memory group and returns 0 if
  1008. * unregistering succeeded.
  1009. */
  1010. int memory_group_unregister(int mgid)
  1011. {
  1012. struct memory_group *group;
  1013. if (mgid < 0)
  1014. return -EINVAL;
  1015. group = xa_load(&memory_groups, mgid);
  1016. if (!group)
  1017. return -EINVAL;
  1018. if (!list_empty(&group->memory_blocks))
  1019. return -EBUSY;
  1020. xa_erase(&memory_groups, mgid);
  1021. kfree(group);
  1022. return 0;
  1023. }
  1024. EXPORT_SYMBOL_GPL(memory_group_unregister);
  1025. /*
  1026. * This is an internal helper only to be used in core memory hotplug code to
  1027. * lookup a memory group. We don't care about locking, as we don't expect a
  1028. * memory group to get unregistered while adding memory to it -- because
  1029. * the group and the memory is managed by the same driver.
  1030. */
  1031. struct memory_group *memory_group_find_by_id(int mgid)
  1032. {
  1033. return xa_load(&memory_groups, mgid);
  1034. }
  1035. /*
  1036. * This is an internal helper only to be used in core memory hotplug code to
  1037. * walk all dynamic memory groups excluding a given memory group, either
  1038. * belonging to a specific node, or belonging to any node.
  1039. */
  1040. int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func,
  1041. struct memory_group *excluded, void *arg)
  1042. {
  1043. struct memory_group *group;
  1044. unsigned long index;
  1045. int ret = 0;
  1046. xa_for_each_marked(&memory_groups, index, group,
  1047. MEMORY_GROUP_MARK_DYNAMIC) {
  1048. if (group == excluded)
  1049. continue;
  1050. #ifdef CONFIG_NUMA
  1051. if (nid != NUMA_NO_NODE && group->nid != nid)
  1052. continue;
  1053. #endif /* CONFIG_NUMA */
  1054. ret = func(group, arg);
  1055. if (ret)
  1056. break;
  1057. }
  1058. return ret;
  1059. }
  1060. #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
  1061. void memblk_nr_poison_inc(unsigned long pfn)
  1062. {
  1063. const unsigned long block_id = pfn_to_block_id(pfn);
  1064. struct memory_block *mem = find_memory_block_by_id(block_id);
  1065. if (mem)
  1066. atomic_long_inc(&mem->nr_hwpoison);
  1067. }
  1068. void memblk_nr_poison_sub(unsigned long pfn, long i)
  1069. {
  1070. const unsigned long block_id = pfn_to_block_id(pfn);
  1071. struct memory_block *mem = find_memory_block_by_id(block_id);
  1072. if (mem)
  1073. atomic_long_sub(i, &mem->nr_hwpoison);
  1074. }
  1075. static unsigned long memblk_nr_poison(struct memory_block *mem)
  1076. {
  1077. return atomic_long_read(&mem->nr_hwpoison);
  1078. }
  1079. #endif