Blame - mm/memory_hotplug.c - yocto/kernel/common

blob: f9d320828067fbe1b75bdb8db88f41aeb8fd891f [file] [log] [blame]

Thomas Gleixner	457c899	2019-05-19 13:08:55 +0100	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0-only
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	2	/*
				3	* linux/mm/memory_hotplug.c
				4	*
				5	* Copyright (C)
				6	*/
				7
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	8	#include <linux/stddef.h>
				9	#include <linux/mm.h>
Ingo Molnar	174cd4b	2017-02-02 19:15:33 +0100	[diff] [blame]	10	#include <linux/sched/signal.h>
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	11	#include <linux/swap.h>
				12	#include <linux/interrupt.h>
				13	#include <linux/pagemap.h>
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	14	#include <linux/compiler.h>
Paul Gortmaker	b95f1b31	2011-10-16 02:01:52 -0400	[diff] [blame]	15	#include <linux/export.h>
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	16	#include <linux/pagevec.h>
Chandra Seetharaman	2d1d43f	2006-09-29 02:01:25 -0700	[diff] [blame]	17	#include <linux/writeback.h>
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	18	#include <linux/slab.h>
				19	#include <linux/sysctl.h>
				20	#include <linux/cpu.h>
				21	#include <linux/memory.h>
Dan Williams	4b94ffd	2016-01-15 16:56:22 -0800	[diff] [blame]	22	#include <linux/memremap.h>
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	23	#include <linux/memory_hotplug.h>
				24	#include <linux/highmem.h>
				25	#include <linux/vmalloc.h>
KAMEZAWA Hiroyuki	0a54703	2006-06-27 02:53:35 -0700	[diff] [blame]	26	#include <linux/ioport.h>
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	27	#include <linux/delay.h>
				28	#include <linux/migrate.h>
				29	#include <linux/page-isolation.h>
Badari Pulavarty	7108878	2008-10-18 20:25:58 -0700	[diff] [blame]	30	#include <linux/pfn.h>
Andi Kleen	6ad696d	2009-11-17 14:06:22 -0800	[diff] [blame]	31	#include <linux/suspend.h>
KOSAKI Motohiro	6d9c285	2009-12-14 17:58:11 -0800	[diff] [blame]	32	#include <linux/mm_inline.h>
akpm@linux-foundation.org	d96ae53	2010-03-05 13:41:58 -0800	[diff] [blame]	33	#include <linux/firmware-map.h>
Tang Chen	60a5a19	2013-02-22 16:33:14 -0800	[diff] [blame]	34	#include <linux/stop_machine.h>
Naoya Horiguchi	c8721bb	2013-09-11 14:22:09 -0700	[diff] [blame]	35	#include <linux/hugetlb.h>
Tang Chen	c532092	2013-11-12 15:08:10 -0800	[diff] [blame]	36	#include <linux/memblock.h>
Vlastimil Babka	698b1b3	2016-03-17 14:18:08 -0700	[diff] [blame]	37	#include <linux/compaction.h>
Michal Hocko	b15c872	2018-12-28 00:38:01 -0800	[diff] [blame]	38	#include <linux/rmap.h>
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	39
				40	#include <asm/tlbflush.h>
				41
Adrian Bunk	1e5ad9a	2008-04-28 20:40:08 +0300	[diff] [blame]	42	#include "internal.h"
Dan Williams	e900a91	2019-05-14 15:41:28 -0700	[diff] [blame]	43	#include "shuffle.h"
Adrian Bunk	1e5ad9a	2008-04-28 20:40:08 +0300	[diff] [blame]	44
Oscar Salvador	e3a9d9f	2021-05-04 18:39:48 -0700	[diff] [blame]	45
/*
 * memory_hotplug.memmap_on_memory parameter
 *
 * Boolean flag that keeps its zero (false) initial value unless the module
 * parameter sets it. The parameter is only registered when
 * CONFIG_MHP_MEMMAP_ON_MEMORY is enabled, and it is exposed with mode 0444.
 */
static bool memmap_on_memory __ro_after_init;
#ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
module_param(memmap_on_memory, bool, 0444);
MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug");
#endif
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	54
/*
 * Online policies selectable through memory_hotplug.online_policy. The
 * values double as indices into online_policy_to_str[].
 */
enum {
	ONLINE_POLICY_CONTIG_ZONES = 0,
	ONLINE_POLICY_AUTO_MOVABLE,
};
				59
/*
 * User-visible name of each ONLINE_POLICY_* value, indexed by that value.
 * Used both for parsing (set_online_policy) and printing (get_online_policy).
 * NOTE(review): the table has external linkage and is not const-qualified;
 * confirm whether anything outside this file references it before tightening.
 */
const char *online_policy_to_str[] = {
	[ONLINE_POLICY_CONTIG_ZONES] = "contig-zones",
	[ONLINE_POLICY_AUTO_MOVABLE] = "auto-movable",
};
				64
				65	static int set_online_policy(const char val, const struct kernel_param kp)
				66	{
				67	int ret = sysfs_match_string(online_policy_to_str, val);
				68
				69	if (ret < 0)
				70	return ret;
				71	((int )kp->arg) = ret;
				72	return 0;
				73	}
				74
				75	static int get_online_policy(char buffer, const struct kernel_param kp)
				76	{
				77	return sprintf(buffer, "%s\n", online_policy_to_str[((int )kp->arg)]);
				78	}
				79
/*
 * memory_hotplug.online_policy: configure online behavior when onlining without
 * specifying a zone (MMOP_ONLINE)
 *
 * "contig-zones": keep zone contiguous
 * "auto-movable": online memory to ZONE_MOVABLE if the configuration
 *		   (auto_movable_ratio, auto_movable_numa_aware) allows for it
 *
 * The value is stored as an ONLINE_POLICY_* index; the custom ops below
 * translate between that index and the policy name strings.
 */
static int online_policy __read_mostly = ONLINE_POLICY_CONTIG_ZONES;
static const struct kernel_param_ops online_policy_ops = {
	.set = set_online_policy,
	.get = get_online_policy,
};
module_param_cb(online_policy, &online_policy_ops, &online_policy, 0644);
MODULE_PARM_DESC(online_policy,
		"Set the online policy (\"contig-zones\", \"auto-movable\") "
		"Default: \"contig-zones\"");
				97
/*
 * memory_hotplug.auto_movable_ratio: specify maximum MOVABLE:KERNEL ratio
 *
 * The ratio represents an upper limit and the kernel might decide to not
 * online some memory to ZONE_MOVABLE -- e.g., because hotplugged KERNEL memory
 * doesn't allow for more MOVABLE memory.
 *
 * The value is given in percent (see the parameter description below).
 */
static unsigned int auto_movable_ratio __read_mostly = 301;
module_param(auto_movable_ratio, uint, 0644);
MODULE_PARM_DESC(auto_movable_ratio,
		"Set the maximum ratio of MOVABLE:KERNEL memory in the system "
		"in percent for \"auto-movable\" online policy. Default: 301");
				110
/*
 * memory_hotplug.auto_movable_numa_aware: consider numa node stats
 *
 * Only defined when CONFIG_NUMA is enabled; defaults to true.
 */
#ifdef CONFIG_NUMA
static bool auto_movable_numa_aware __read_mostly = true;
module_param(auto_movable_numa_aware, bool, 0644);
MODULE_PARM_DESC(auto_movable_numa_aware,
		"Consider numa node stats in addition to global stats in "
		"\"auto-movable\" online policy. Default: true");
#endif /* CONFIG_NUMA */
				121
/*
 * online_page_callback contains pointer to current page onlining function.
 * Initially it is generic_online_page(). If it is required it could be
 * changed by calling set_online_page_callback() for callback registration
 * and restore_online_page_callback() for generic callback restore.
 */

static online_page_callback_t online_page_callback = generic_online_page;
/* Held by set/restore_online_page_callback() while they update the pointer. */
static DEFINE_MUTEX(online_page_callback_lock);

/*
 * Taken for read by get_online_mems()/put_online_mems() and for write by
 * mem_hotplug_begin()/mem_hotplug_done().
 */
DEFINE_STATIC_PERCPU_RWSEM(mem_hotplug_lock);
KOSAKI Motohiro	20d6c96	2010-12-02 14:31:19 -0800	[diff] [blame]	133
/*
 * Take mem_hotplug_lock for reading. Pair with put_online_mems().
 * mem_hotplug_begin() takes the same lock for writing.
 */
void get_online_mems(void)
{
	percpu_down_read(&mem_hotplug_lock);
}
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	138
/* Drop the read side of mem_hotplug_lock taken by get_online_mems(). */
void put_online_mems(void)
{
	percpu_up_read(&mem_hotplug_lock);
}
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	143
/*
 * Global flag, false by default. Nothing in the visible part of this file
 * writes it -- NOTE(review): presumably set during early boot parameter
 * handling elsewhere; confirm against the setter.
 */
bool movable_node_enabled = false;
				145
/*
 * Default online type: MMOP_ONLINE when
 * CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE is set, MMOP_OFFLINE otherwise.
 * Can be overridden on the command line via "memhp_default_state=".
 */
#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
int mhp_default_online_type = MMOP_OFFLINE;
#else
int mhp_default_online_type = MMOP_ONLINE;
#endif
Vitaly Kuznetsov	31bc385	2016-03-15 14:56:48 -0700	[diff] [blame]	151
Vitaly Kuznetsov	86dd995	2016-05-19 17:13:06 -0700	[diff] [blame]	152	static int __init setup_memhp_default_state(char *str)
				153	{
Anshuman Khandual	1adf8b4	2021-02-25 17:17:13 -0800	[diff] [blame]	154	const int online_type = mhp_online_type_from_str(str);
David Hildenbrand	5f47adf	2020-04-06 20:07:44 -0700	[diff] [blame]	155
				156	if (online_type >= 0)
Anshuman Khandual	1adf8b4	2021-02-25 17:17:13 -0800	[diff] [blame]	157	mhp_default_online_type = online_type;
Vitaly Kuznetsov	86dd995	2016-05-19 17:13:06 -0700	[diff] [blame]	158
				159	return 1;
				160	}
				161	__setup("memhp_default_state=", setup_memhp_default_state);
				162
/*
 * Enter a memory hotplug operation: take the CPU hotplug read lock first,
 * then mem_hotplug_lock for writing. mem_hotplug_done() releases both in
 * the reverse order.
 */
void mem_hotplug_begin(void)
{
	cpus_read_lock();
	percpu_down_write(&mem_hotplug_lock);
}
				168
/*
 * Leave a memory hotplug operation: release the locks taken by
 * mem_hotplug_begin() in the reverse order of acquisition.
 */
void mem_hotplug_done(void)
{
	percpu_up_write(&mem_hotplug_lock);
	cpus_read_unlock();
}
KOSAKI Motohiro	20d6c96	2010-12-02 14:31:19 -0800	[diff] [blame]	174
/*
 * Upper bound checked by register_memory_resource() against start + size
 * while system_state < SYSTEM_RUNNING. Defaults to "no limit" (U64_MAX).
 */
u64 max_mem_size = U64_MAX;
				176
Keith Mannthey	45e0b78	2006-09-30 23:27:09 -0700	[diff] [blame]	177	/* add this memory to iomem resource */
David Hildenbrand	7b7b272	2020-06-04 16:48:41 -0700	[diff] [blame]	178	static struct resource *register_memory_resource(u64 start, u64 size,
				179	const char *resource_name)
Keith Mannthey	45e0b78	2006-09-30 23:27:09 -0700	[diff] [blame]	180	{
Dave Hansen	2794129	2019-02-25 10:57:36 -0800	[diff] [blame]	181	struct resource *res;
				182	unsigned long flags = IORESOURCE_SYSTEM_RAM \| IORESOURCE_BUSY;
David Hildenbrand	7b7b272	2020-06-04 16:48:41 -0700	[diff] [blame]	183
				184	if (strcmp(resource_name, "System RAM"))
David Hildenbrand	7cf603d	2020-10-15 20:08:33 -0700	[diff] [blame]	185	flags \|= IORESOURCE_SYSRAM_DRIVER_MANAGED;
Juergen Gross	357b4da	2019-02-14 11:42:39 +0100	[diff] [blame]	186
Anshuman Khandual	bca3fea	2021-02-25 17:17:33 -0800	[diff] [blame]	187	if (!mhp_range_allowed(start, size, true))
				188	return ERR_PTR(-E2BIG);
				189
Baoquan He	f3cd4c8	2020-04-06 20:06:50 -0700	[diff] [blame]	190	/*
				191	* Make sure value parsed from 'mem=' only restricts memory adding
				192	* while booting, so that memory hotplug won't be impacted. Please
				193	* refer to document of 'mem=' in kernel-parameters.txt for more
				194	* details.
				195	*/
				196	if (start + size > max_mem_size && system_state < SYSTEM_RUNNING)
Juergen Gross	357b4da	2019-02-14 11:42:39 +0100	[diff] [blame]	197	return ERR_PTR(-E2BIG);
				198
Dave Hansen	2794129	2019-02-25 10:57:36 -0800	[diff] [blame]	199	/*
				200	* Request ownership of the new memory range. This might be
				201	* a child of an existing resource that was present but
				202	* not marked as busy.
				203	*/
				204	res = __request_region(&iomem_resource, start, size,
				205	resource_name, flags);
Keith Mannthey	45e0b78	2006-09-30 23:27:09 -0700	[diff] [blame]	206
Dave Hansen	2794129	2019-02-25 10:57:36 -0800	[diff] [blame]	207	if (!res) {
				208	pr_debug("Unable to reserve System RAM region: %016llx->%016llx\n",
				209	start, start + size);
Vitaly Kuznetsov	6f754ba	2016-01-14 15:21:55 -0800	[diff] [blame]	210	return ERR_PTR(-EEXIST);
Keith Mannthey	45e0b78	2006-09-30 23:27:09 -0700	[diff] [blame]	211	}
				212	return res;
				213	}
				214
				215	static void release_memory_resource(struct resource *res)
				216	{
				217	if (!res)
				218	return;
				219	release_resource(res);
				220	kfree(res);
Keith Mannthey	45e0b78	2006-09-30 23:27:09 -0700	[diff] [blame]	221	}
				222
Keith Mannthey	5394702	2006-09-30 23:27:08 -0700	[diff] [blame]	223	#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
Dan Williams	7ea6216	2019-07-18 15:58:22 -0700	[diff] [blame]	224	static int check_pfn_span(unsigned long pfn, unsigned long nr_pages,
				225	const char *reason)
				226	{
				227	/*
				228	* Disallow all operations smaller than a sub-section and only
				229	* allow operations smaller than a section for
				230	* SPARSEMEM_VMEMMAP. Note that check_hotplug_memory_range()
				231	* enforces a larger memory_block_size_bytes() granularity for
				232	* memory that will be marked online, so this check should only
				233	* fire for direct arch_{add,remove}_memory() users outside of
				234	* add_memory_resource().
				235	*/
				236	unsigned long min_align;
				237
				238	if (IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP))
				239	min_align = PAGES_PER_SUBSECTION;
				240	else
				241	min_align = PAGES_PER_SECTION;
				242	if (!IS_ALIGNED(pfn, min_align)
				243	\|\| !IS_ALIGNED(nr_pages, min_align)) {
				244	WARN(1, "Misaligned __%s_pages start: %#lx end: #%lx\n",
				245	reason, pfn, pfn + nr_pages - 1);
				246	return -EINVAL;
				247	}
				248	return 0;
				249	}
				250
David Rientjes	4edd7ce	2013-04-29 15:08:22 -0700	[diff] [blame]	251	/*
Dan Williams	9f605f2	2021-02-25 17:16:57 -0800	[diff] [blame]	252	* Return page for the valid pfn only if the page is online. All pfn
				253	* walkers which rely on the fully initialized page->flags and others
				254	* should use this rather than pfn_valid && pfn_to_page
				255	*/
				256	struct page *pfn_to_online_page(unsigned long pfn)
				257	{
				258	unsigned long nr = pfn_to_section_nr(pfn);
Dan Williams	1f90a34	2021-02-25 17:17:05 -0800	[diff] [blame]	259	struct dev_pagemap *pgmap;
Dan Williams	9f9b02e	2021-02-25 17:17:01 -0800	[diff] [blame]	260	struct mem_section *ms;
Dan Williams	9f605f2	2021-02-25 17:16:57 -0800	[diff] [blame]	261
Dan Williams	9f9b02e	2021-02-25 17:17:01 -0800	[diff] [blame]	262	if (nr >= NR_MEM_SECTIONS)
				263	return NULL;
				264
				265	ms = __nr_to_section(nr);
				266	if (!online_section(ms))
				267	return NULL;
				268
				269	/*
				270	* Save some code text when online_section() +
				271	* pfn_section_valid() are sufficient.
				272	*/
				273	if (IS_ENABLED(CONFIG_HAVE_ARCH_PFN_VALID) && !pfn_valid(pfn))
				274	return NULL;
				275
				276	if (!pfn_section_valid(ms, pfn))
				277	return NULL;
				278
Dan Williams	1f90a34	2021-02-25 17:17:05 -0800	[diff] [blame]	279	if (!online_device_section(ms))
				280	return pfn_to_page(pfn);
				281
				282	/*
				283	* Slowpath: when ZONE_DEVICE collides with
				284	* ZONE_{NORMAL,MOVABLE} within the same section some pfns in
				285	* the section may be 'offline' but 'valid'. Only
				286	* get_dev_pagemap() can determine sub-section online status.
				287	*/
				288	pgmap = get_dev_pagemap(pfn, NULL);
				289	put_dev_pagemap(pgmap);
				290
				291	/* The presence of a pgmap indicates ZONE_DEVICE offline pfn */
				292	if (pgmap)
				293	return NULL;
				294
Dan Williams	9f9b02e	2021-02-25 17:17:01 -0800	[diff] [blame]	295	return pfn_to_page(pfn);
Dan Williams	9f605f2	2021-02-25 17:16:57 -0800	[diff] [blame]	296	}
				297	EXPORT_SYMBOL_GPL(pfn_to_online_page);
				298
				299	/*
David Rientjes	4edd7ce	2013-04-29 15:08:22 -0700	[diff] [blame]	300	* Reasonably generic function for adding memory. It is
				301	* expected that archs that support memory hotplug will
				302	* call this function after deciding the zone to which to
				303	* add the new pages.
				304	*/
Dan Williams	7ea6216	2019-07-18 15:58:22 -0700	[diff] [blame]	305	int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
Logan Gunthorpe	f5637d3	2020-04-10 14:33:21 -0700	[diff] [blame]	306	struct mhp_params *params)
David Rientjes	4edd7ce	2013-04-29 15:08:22 -0700	[diff] [blame]	307	{
David Hildenbrand	6cdd0b3	2020-04-06 20:06:56 -0700	[diff] [blame]	308	const unsigned long end_pfn = pfn + nr_pages;
				309	unsigned long cur_nr_pages;
Dan Williams	9a84503	2019-07-18 15:58:43 -0700	[diff] [blame]	310	int err;
Logan Gunthorpe	f5637d3	2020-04-10 14:33:21 -0700	[diff] [blame]	311	struct vmem_altmap *altmap = params->altmap;
Dan Williams	4b94ffd	2016-01-15 16:56:22 -0800	[diff] [blame]	312
Logan Gunthorpe	bfeb022	2020-04-10 14:33:36 -0700	[diff] [blame]	313	if (WARN_ON_ONCE(!params->pgprot.pgprot))
				314	return -EINVAL;
				315
Anshuman Khandual	bca3fea	2021-02-25 17:17:33 -0800	[diff] [blame]	316	VM_BUG_ON(!mhp_range_allowed(PFN_PHYS(pfn), nr_pages * PAGE_SIZE, false));
Alastair D'Silva	dca4436	2019-11-30 17:53:48 -0800	[diff] [blame]	317
Dan Williams	4b94ffd	2016-01-15 16:56:22 -0800	[diff] [blame]	318	if (altmap) {
				319	/*
				320	* Validate altmap is within bounds of the total request
				321	*/
Dan Williams	7ea6216	2019-07-18 15:58:22 -0700	[diff] [blame]	322	if (altmap->base_pfn != pfn
Dan Williams	4b94ffd	2016-01-15 16:56:22 -0800	[diff] [blame]	323	\|\| vmem_altmap_offset(altmap) > nr_pages) {
				324	pr_warn_once("memory add fail, invalid altmap\n");
Dan Williams	7ea6216	2019-07-18 15:58:22 -0700	[diff] [blame]	325	return -EINVAL;
Dan Williams	4b94ffd	2016-01-15 16:56:22 -0800	[diff] [blame]	326	}
				327	altmap->alloc = 0;
				328	}
				329
Dan Williams	7ea6216	2019-07-18 15:58:22 -0700	[diff] [blame]	330	err = check_pfn_span(pfn, nr_pages, "add");
				331	if (err)
				332	return err;
				333
David Hildenbrand	6cdd0b3	2020-04-06 20:06:56 -0700	[diff] [blame]	334	for (; pfn < end_pfn; pfn += cur_nr_pages) {
				335	/* Select all remaining pages up to the next section boundary */
				336	cur_nr_pages = min(end_pfn - pfn,
				337	SECTION_ALIGN_UP(pfn + 1) - pfn);
				338	err = sparse_add_section(nid, pfn, cur_nr_pages, altmap);
Dan Williams	ba72b4c	2019-07-18 15:58:26 -0700	[diff] [blame]	339	if (err)
				340	break;
Michal Hocko	f64ac5e	2017-10-03 16:16:16 -0700	[diff] [blame]	341	cond_resched();
David Rientjes	4edd7ce	2013-04-29 15:08:22 -0700	[diff] [blame]	342	}
Zhu Guihua	c435a39	2015-06-24 16:58:42 -0700	[diff] [blame]	343	vmemmap_populate_print_last();
David Rientjes	4edd7ce	2013-04-29 15:08:22 -0700	[diff] [blame]	344	return err;
				345	}
David Rientjes	4edd7ce	2013-04-29 15:08:22 -0700	[diff] [blame]	346
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	347	/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
YASUAKI ISHIMATSU	d09b013	2017-10-03 16:16:32 -0700	[diff] [blame]	348	static unsigned long find_smallest_section_pfn(int nid, struct zone *zone,
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	349	unsigned long start_pfn,
				350	unsigned long end_pfn)
				351	{
Dan Williams	49ba3c6	2019-07-18 15:58:07 -0700	[diff] [blame]	352	for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SUBSECTION) {
David Hildenbrand	7ce700b	2019-11-21 17:53:56 -0800	[diff] [blame]	353	if (unlikely(!pfn_to_online_page(start_pfn)))
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	354	continue;
				355
				356	if (unlikely(pfn_to_nid(start_pfn) != nid))
				357	continue;
				358
David Hildenbrand	9b05158	2020-02-03 17:34:12 -0800	[diff] [blame]	359	if (zone != page_zone(pfn_to_page(start_pfn)))
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	360	continue;
				361
				362	return start_pfn;
				363	}
				364
				365	return 0;
				366	}
				367
				368	/* find the biggest valid pfn in the range [start_pfn, end_pfn). */
YASUAKI ISHIMATSU	d09b013	2017-10-03 16:16:32 -0700	[diff] [blame]	369	static unsigned long find_biggest_section_pfn(int nid, struct zone *zone,
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	370	unsigned long start_pfn,
				371	unsigned long end_pfn)
				372	{
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	373	unsigned long pfn;
				374
				375	/* pfn is the end pfn of a memory section. */
				376	pfn = end_pfn - 1;
Dan Williams	49ba3c6	2019-07-18 15:58:07 -0700	[diff] [blame]	377	for (; pfn >= start_pfn; pfn -= PAGES_PER_SUBSECTION) {
David Hildenbrand	7ce700b	2019-11-21 17:53:56 -0800	[diff] [blame]	378	if (unlikely(!pfn_to_online_page(pfn)))
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	379	continue;
				380
				381	if (unlikely(pfn_to_nid(pfn) != nid))
				382	continue;
				383
David Hildenbrand	9b05158	2020-02-03 17:34:12 -0800	[diff] [blame]	384	if (zone != page_zone(pfn_to_page(pfn)))
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	385	continue;
				386
				387	return pfn;
				388	}
				389
				390	return 0;
				391	}
				392
				393	static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
				394	unsigned long end_pfn)
				395	{
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	396	unsigned long pfn;
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	397	int nid = zone_to_nid(zone);
				398
David Hildenbrand	5d12071	2020-02-03 17:34:19 -0800	[diff] [blame]	399	if (zone->zone_start_pfn == start_pfn) {
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	400	/*
				401	* If the section is smallest section in the zone, it need
				402	* shrink zone->zone_start_pfn and zone->zone_spanned_pages.
				403	* In this case, we find second smallest valid mem_section
				404	* for shrinking zone.
				405	*/
				406	pfn = find_smallest_section_pfn(nid, zone, end_pfn,
David Hildenbrand	5d12071	2020-02-03 17:34:19 -0800	[diff] [blame]	407	zone_end_pfn(zone));
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	408	if (pfn) {
David Hildenbrand	5d12071	2020-02-03 17:34:19 -0800	[diff] [blame]	409	zone->spanned_pages = zone_end_pfn(zone) - pfn;
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	410	zone->zone_start_pfn = pfn;
David Hildenbrand	950b68d	2020-02-03 17:34:16 -0800	[diff] [blame]	411	} else {
				412	zone->zone_start_pfn = 0;
				413	zone->spanned_pages = 0;
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	414	}
David Hildenbrand	5d12071	2020-02-03 17:34:19 -0800	[diff] [blame]	415	} else if (zone_end_pfn(zone) == end_pfn) {
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	416	/*
				417	* If the section is biggest section in the zone, it need
				418	* shrink zone->spanned_pages.
				419	* In this case, we find second biggest valid mem_section for
				420	* shrinking zone.
				421	*/
David Hildenbrand	5d12071	2020-02-03 17:34:19 -0800	[diff] [blame]	422	pfn = find_biggest_section_pfn(nid, zone, zone->zone_start_pfn,
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	423	start_pfn);
				424	if (pfn)
David Hildenbrand	5d12071	2020-02-03 17:34:19 -0800	[diff] [blame]	425	zone->spanned_pages = pfn - zone->zone_start_pfn + 1;
David Hildenbrand	950b68d	2020-02-03 17:34:16 -0800	[diff] [blame]	426	else {
				427	zone->zone_start_pfn = 0;
				428	zone->spanned_pages = 0;
				429	}
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	430	}
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	431	}
				432
David Hildenbrand	00d6c01	2019-10-18 20:19:33 -0700	[diff] [blame]	433	static void update_pgdat_span(struct pglist_data *pgdat)
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	434	{
David Hildenbrand	00d6c01	2019-10-18 20:19:33 -0700	[diff] [blame]	435	unsigned long node_start_pfn = 0, node_end_pfn = 0;
				436	struct zone *zone;
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	437
David Hildenbrand	00d6c01	2019-10-18 20:19:33 -0700	[diff] [blame]	438	for (zone = pgdat->node_zones;
				439	zone < pgdat->node_zones + MAX_NR_ZONES; zone++) {
Miaohe Lin	6c922cf	2021-02-25 17:17:21 -0800	[diff] [blame]	440	unsigned long end_pfn = zone_end_pfn(zone);
David Hildenbrand	00d6c01	2019-10-18 20:19:33 -0700	[diff] [blame]	441
				442	/* No need to lock the zones, they can't change. */
David Hildenbrand	656d571	2019-11-05 21:17:10 -0800	[diff] [blame]	443	if (!zone->spanned_pages)
				444	continue;
				445	if (!node_end_pfn) {
				446	node_start_pfn = zone->zone_start_pfn;
Miaohe Lin	6c922cf	2021-02-25 17:17:21 -0800	[diff] [blame]	447	node_end_pfn = end_pfn;
David Hildenbrand	656d571	2019-11-05 21:17:10 -0800	[diff] [blame]	448	continue;
				449	}
				450
Miaohe Lin	6c922cf	2021-02-25 17:17:21 -0800	[diff] [blame]	451	if (end_pfn > node_end_pfn)
				452	node_end_pfn = end_pfn;
David Hildenbrand	00d6c01	2019-10-18 20:19:33 -0700	[diff] [blame]	453	if (zone->zone_start_pfn < node_start_pfn)
				454	node_start_pfn = zone->zone_start_pfn;
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	455	}
				456
David Hildenbrand	00d6c01	2019-10-18 20:19:33 -0700	[diff] [blame]	457	pgdat->node_start_pfn = node_start_pfn;
				458	pgdat->node_spanned_pages = node_end_pfn - node_start_pfn;
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	459	}
				460
David Hildenbrand	feee6b2	2020-01-04 12:59:33 -0800	[diff] [blame]	461	void __ref remove_pfn_range_from_zone(struct zone *zone,
				462	unsigned long start_pfn,
				463	unsigned long nr_pages)
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	464	{
Ben Widawsky	b7e3deb	2020-06-25 20:30:51 -0700	[diff] [blame]	465	const unsigned long end_pfn = start_pfn + nr_pages;
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	466	struct pglist_data *pgdat = zone->zone_pgdat;
Oscar Salvador	27cacaa	2021-06-30 18:52:46 -0700	[diff] [blame]	467	unsigned long pfn, cur_nr_pages;
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	468
David Hildenbrand	d33695b	2020-02-03 17:34:09 -0800	[diff] [blame]	469	/* Poison struct pages because they are now uninitialized again. */
Ben Widawsky	b7e3deb	2020-06-25 20:30:51 -0700	[diff] [blame]	470	for (pfn = start_pfn; pfn < end_pfn; pfn += cur_nr_pages) {
				471	cond_resched();
				472
				473	/* Select all remaining pages up to the next section boundary */
				474	cur_nr_pages =
				475	min(end_pfn - pfn, SECTION_ALIGN_UP(pfn + 1) - pfn);
				476	page_init_poison(pfn_to_page(pfn),
				477	sizeof(struct page) * cur_nr_pages);
				478	}
David Hildenbrand	d33695b	2020-02-03 17:34:09 -0800	[diff] [blame]	479
David Hildenbrand	7ce700b	2019-11-21 17:53:56 -0800	[diff] [blame]	480	#ifdef CONFIG_ZONE_DEVICE
				481	/*
				482	* Zone shrinking code cannot properly deal with ZONE_DEVICE. So
				483	* we will not try to shrink the zones - which is okay as
				484	* set_zone_contiguous() cannot deal with ZONE_DEVICE either way.
				485	*/
				486	if (zone_idx(zone) == ZONE_DEVICE)
				487	return;
				488	#endif
				489
David Hildenbrand	feee6b2	2020-01-04 12:59:33 -0800	[diff] [blame]	490	clear_zone_contiguous(zone);
				491
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	492	shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
David Hildenbrand	00d6c01	2019-10-18 20:19:33 -0700	[diff] [blame]	493	update_pgdat_span(pgdat);
David Hildenbrand	feee6b2	2020-01-04 12:59:33 -0800	[diff] [blame]	494
				495	set_zone_contiguous(zone);
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	496	}
				497
/*
 * Remove the (sub-)section range starting at @pfn via
 * sparse_remove_section(). Warns once and does nothing when the section
 * containing @pfn is not valid.
 */
static void __remove_section(unsigned long pfn, unsigned long nr_pages,
			     unsigned long map_offset,
			     struct vmem_altmap *altmap)
{
	struct mem_section *section = __pfn_to_section(pfn);

	if (!WARN_ON_ONCE(!valid_section(section)))
		sparse_remove_section(section, pfn, nr_pages, map_offset,
				      altmap);
}
				509
/**
 * __remove_pages() - remove sections of pages
 * @pfn: starting pageframe (alignment is enforced by check_pfn_span())
 * @nr_pages: number of pages to remove (alignment is enforced by
 *	      check_pfn_span())
 * @altmap: alternative device page map or %NULL if default memmap is used
 *
 * Generic helper function to remove section mappings and sysfs entries
 * for the section of the memory we are removing. Caller needs to make
 * sure that pages are marked reserved and zones are adjust properly by
 * calling offline_pages().
 *
 * The range is processed one section at a time; a misaligned range is
 * rejected without removing anything.
 */
void __remove_pages(unsigned long pfn, unsigned long nr_pages,
		    struct vmem_altmap *altmap)
{
	const unsigned long end_pfn = pfn + nr_pages;
	unsigned long offset = vmem_altmap_offset(altmap);
	unsigned long chunk;

	if (check_pfn_span(pfn, nr_pages, "remove"))
		return;

	while (pfn < end_pfn) {
		cond_resched();

		/* Select all remaining pages up to the next section boundary */
		chunk = min(end_pfn - pfn, SECTION_ALIGN_UP(pfn + 1) - pfn);
		__remove_section(pfn, chunk, offset, altmap);

		/* The altmap offset only applies to the first chunk. */
		offset = 0;
		pfn += chunk;
	}
}
Badari Pulavarty	ea01ea9	2008-04-28 02:12:01 -0700	[diff] [blame]	542
Daniel Kiper	9d0ad8c	2011-07-25 17:12:05 -0700	[diff] [blame]	543	int set_online_page_callback(online_page_callback_t callback)
				544	{
				545	int rc = -EINVAL;
				546
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	547	get_online_mems();
				548	mutex_lock(&online_page_callback_lock);
Daniel Kiper	9d0ad8c	2011-07-25 17:12:05 -0700	[diff] [blame]	549
				550	if (online_page_callback == generic_online_page) {
				551	online_page_callback = callback;
				552	rc = 0;
				553	}
				554
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	555	mutex_unlock(&online_page_callback_lock);
				556	put_online_mems();
Daniel Kiper	9d0ad8c	2011-07-25 17:12:05 -0700	[diff] [blame]	557
				558	return rc;
				559	}
				560	EXPORT_SYMBOL_GPL(set_online_page_callback);
				561
				562	int restore_online_page_callback(online_page_callback_t callback)
				563	{
				564	int rc = -EINVAL;
				565
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	566	get_online_mems();
				567	mutex_lock(&online_page_callback_lock);
Daniel Kiper	9d0ad8c	2011-07-25 17:12:05 -0700	[diff] [blame]	568
				569	if (online_page_callback == callback) {
				570	online_page_callback = generic_online_page;
				571	rc = 0;
				572	}
				573
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	574	mutex_unlock(&online_page_callback_lock);
				575	put_online_mems();
Daniel Kiper	9d0ad8c	2011-07-25 17:12:05 -0700	[diff] [blame]	576
				577	return rc;
				578	}
				579	EXPORT_SYMBOL_GPL(restore_online_page_callback);
				580
/*
 * Default online-page handler: free an order-@order chunk starting at @page
 * via __free_pages_core() and account it in the total RAM counters (and in
 * the highmem counter when the page is a highmem page).
 */
void generic_online_page(struct page *page, unsigned int order)
{
	const unsigned long nr_pages = 1UL << order;

	/*
	 * With debug_pagealloc enabled, freeing the page will try to unmap
	 * it, so map it first. That is preferable to adding a special case
	 * to the page freeing fast path.
	 */
	debug_pagealloc_map_pages(page, 1 << order);
	__free_pages_core(page, order);

	totalram_pages_add(nr_pages);
#ifdef CONFIG_HIGHMEM
	if (PageHighMem(page))
		totalhigh_pages_add(nr_pages);
#endif
}
EXPORT_SYMBOL_GPL(generic_online_page);
Arun KS	a9cd410	2019-03-05 15:42:14 -0800	[diff] [blame]	597
David Hildenbrand	aac6532	2020-10-15 20:08:11 -0700	[diff] [blame]	598	static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	599	{
David Hildenbrand	b2c2ab2	2019-09-23 15:36:02 -0700	[diff] [blame]	600	const unsigned long end_pfn = start_pfn + nr_pages;
				601	unsigned long pfn;
Michal Hocko	2d070ea	2017-07-06 15:37:56 -0700	[diff] [blame]	602
David Hildenbrand	b2c2ab2	2019-09-23 15:36:02 -0700	[diff] [blame]	603	/*
David Hildenbrand	aac6532	2020-10-15 20:08:11 -0700	[diff] [blame]	604	* Online the pages in MAX_ORDER - 1 aligned chunks. The callback might
				605	* decide to not expose all pages to the buddy (e.g., expose them
				606	* later). We account all pages as being online and belonging to this
				607	* zone ("present").
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	608	* When using memmap_on_memory, the range might not be aligned to
				609	* MAX_ORDER_NR_PAGES - 1, but pageblock aligned. __ffs() will detect
				610	* this and the first chunk to online will be pageblock_nr_pages.
David Hildenbrand	b2c2ab2	2019-09-23 15:36:02 -0700	[diff] [blame]	611	*/
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	612	for (pfn = start_pfn; pfn < end_pfn;) {
				613	int order = min(MAX_ORDER - 1UL, __ffs(pfn));
				614
				615	(*online_page_callback)(pfn_to_page(pfn), order);
				616	pfn += (1UL << order);
				617	}
Michal Hocko	2d070ea	2017-07-06 15:37:56 -0700	[diff] [blame]	618
David Hildenbrand	b2c2ab2	2019-09-23 15:36:02 -0700	[diff] [blame]	619	/* mark all involved sections as online */
				620	online_mem_sections(start_pfn, end_pfn);
KAMEZAWA Hiroyuki	75884fb	2007-10-16 01:26:10 -0700	[diff] [blame]	621	}
				622
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	623	/* check which state of node_states will be changed when online memory */
				624	static void node_states_check_changes_online(unsigned long nr_pages,
				625	struct zone zone, struct memory_notify arg)
				626	{
				627	int nid = zone_to_nid(zone);
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	628
Anshuman Khandual	98fa15f	2019-03-05 15:42:58 -0800	[diff] [blame]	629	arg->status_change_nid = NUMA_NO_NODE;
				630	arg->status_change_nid_normal = NUMA_NO_NODE;
				631	arg->status_change_nid_high = NUMA_NO_NODE;
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	632
Lai Jiangshan	6715ddf	2012-12-12 13:51:49 -0800	[diff] [blame]	633	if (!node_state(nid, N_MEMORY))
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	634	arg->status_change_nid = nid;
Oscar Salvador	8efe33f	2018-10-26 15:07:34 -0700	[diff] [blame]	635	if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY))
				636	arg->status_change_nid_normal = nid;
				637	#ifdef CONFIG_HIGHMEM
Baoquan He	d3ba3ae	2019-05-13 17:17:35 -0700	[diff] [blame]	638	if (zone_idx(zone) <= ZONE_HIGHMEM && !node_state(nid, N_HIGH_MEMORY))
Oscar Salvador	8efe33f	2018-10-26 15:07:34 -0700	[diff] [blame]	639	arg->status_change_nid_high = nid;
				640	#endif
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	641	}
				642
				643	static void node_states_set_node(int node, struct memory_notify *arg)
				644	{
				645	if (arg->status_change_nid_normal >= 0)
				646	node_set_state(node, N_NORMAL_MEMORY);
				647
Lai Jiangshan	6715ddf	2012-12-12 13:51:49 -0800	[diff] [blame]	648	if (arg->status_change_nid_high >= 0)
				649	node_set_state(node, N_HIGH_MEMORY);
				650
Oscar Salvador	83d8361	2018-10-26 15:07:25 -0700	[diff] [blame]	651	if (arg->status_change_nid >= 0)
				652	node_set_state(node, N_MEMORY);
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	653	}
				654
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	655	static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn,
				656	unsigned long nr_pages)
				657	{
				658	unsigned long old_end_pfn = zone_end_pfn(zone);
				659
				660	if (zone_is_empty(zone) \|\| start_pfn < zone->zone_start_pfn)
				661	zone->zone_start_pfn = start_pfn;
				662
				663	zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn;
				664	}
				665
				666	static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn,
				667	unsigned long nr_pages)
				668	{
				669	unsigned long old_end_pfn = pgdat_end_pfn(pgdat);
				670
				671	if (!pgdat->node_spanned_pages \|\| start_pfn < pgdat->node_start_pfn)
				672	pgdat->node_start_pfn = start_pfn;
				673
				674	pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn;
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	675
David Hildenbrand	3fccb74	2019-09-23 15:35:37 -0700	[diff] [blame]	676	}
Dan Williams	1f90a34	2021-02-25 17:17:05 -0800	[diff] [blame]	677
				678	static void section_taint_zone_device(unsigned long pfn)
				679	{
				680	struct mem_section *ms = __pfn_to_section(pfn);
				681
				682	ms->section_mem_map \|= SECTION_TAINT_ZONE_DEVICE;
				683	}
				684
David Hildenbrand	3fccb74	2019-09-23 15:35:37 -0700	[diff] [blame]	685	/*
				686	* Associate the pfn range with the given zone, initializing the memmaps
				687	* and resizing the pgdat/zone data to span the added pages. After this
				688	* call, all affected pages are PG_reserved.
David Hildenbrand	d882c00	2020-10-15 20:08:19 -0700	[diff] [blame]	689	*
				690	* All aligned pageblocks are initialized to the specified migratetype
				691	* (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
				692	* zone stats (e.g., nr_isolate_pageblock) are touched.
David Hildenbrand	3fccb74	2019-09-23 15:35:37 -0700	[diff] [blame]	693	*/
Christoph Hellwig	a99583e	2017-12-29 08:53:57 +0100	[diff] [blame]	694	void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
David Hildenbrand	d882c00	2020-10-15 20:08:19 -0700	[diff] [blame]	695	unsigned long nr_pages,
				696	struct vmem_altmap *altmap, int migratetype)
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	697	{
				698	struct pglist_data *pgdat = zone->zone_pgdat;
				699	int nid = pgdat->node_id;
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	700
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	701	clear_zone_contiguous(zone);
				702
Wei Yang	fa004ab	2018-12-28 00:37:10 -0800	[diff] [blame]	703	if (zone_is_empty(zone))
				704	init_currently_empty_zone(zone, start_pfn, nr_pages);
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	705	resize_zone_range(zone, start_pfn, nr_pages);
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	706	resize_pgdat_range(pgdat, start_pfn, nr_pages);
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	707
				708	/*
Dan Williams	1f90a34	2021-02-25 17:17:05 -0800	[diff] [blame]	709	* Subsection population requires care in pfn_to_online_page().
				710	* Set the taint to enable the slow path detection of
				711	* ZONE_DEVICE pages in an otherwise ZONE_{NORMAL,MOVABLE}
				712	* section.
				713	*/
				714	if (zone_is_zone_device(zone)) {
				715	if (!IS_ALIGNED(start_pfn, PAGES_PER_SECTION))
				716	section_taint_zone_device(start_pfn);
				717	if (!IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION))
				718	section_taint_zone_device(start_pfn + nr_pages);
				719	}
				720
				721	/*
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	722	* TODO now we have a visible range of pages which are not associated
				723	* with their zone properly. Not nice but set_pfnblock_flags_mask
				724	* expects the zone spans the pfn range. All the pages in the range
				725	* are reserved so nobody should be touching them so we should be safe
				726	*/
Baoquan He	ab28cb6	2021-02-24 12:06:14 -0800	[diff] [blame]	727	memmap_init_range(nr_pages, nid, zone_idx(zone), start_pfn, 0,
David Hildenbrand	d882c00	2020-10-15 20:08:19 -0700	[diff] [blame]	728	MEMINIT_HOTPLUG, altmap, migratetype);
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	729
				730	set_zone_contiguous(zone);
				731	}
				732
/* Page counters collected while walking zones for the auto-movable policy. */
struct auto_movable_stats {
	/* Early present pages of kernel (non-ZONE_MOVABLE) zones, minus CMA. */
	unsigned long kernel_early_pages;
	/* Present pages of ZONE_MOVABLE, plus CMA pages of kernel zones. */
	unsigned long movable_pages;
};
				737
				738	static void auto_movable_stats_account_zone(struct auto_movable_stats *stats,
				739	struct zone *zone)
				740	{
				741	if (zone_idx(zone) == ZONE_MOVABLE) {
				742	stats->movable_pages += zone->present_pages;
				743	} else {
				744	stats->kernel_early_pages += zone->present_early_pages;
				745	#ifdef CONFIG_CMA
				746	/*
				747	* CMA pages (never on hotplugged memory) behave like
				748	* ZONE_MOVABLE.
				749	*/
				750	stats->movable_pages += zone->cma_pages;
				751	stats->kernel_early_pages -= zone->cma_pages;
				752	#endif /* CONFIG_CMA */
				753	}
				754	}
				755
				756	static bool auto_movable_can_online_movable(int nid, unsigned long nr_pages)
				757	{
				758	struct auto_movable_stats stats = {};
				759	unsigned long kernel_early_pages, movable_pages;
				760	pg_data_t *pgdat = NODE_DATA(nid);
				761	struct zone *zone;
				762	int i;
				763
				764	/* Walk all relevant zones and collect MOVABLE vs. KERNEL stats. */
				765	if (nid == NUMA_NO_NODE) {
				766	/* TODO: cache values */
				767	for_each_populated_zone(zone)
				768	auto_movable_stats_account_zone(&stats, zone);
				769	} else {
				770	for (i = 0; i < MAX_NR_ZONES; i++) {
				771	zone = pgdat->node_zones + i;
				772	if (populated_zone(zone))
				773	auto_movable_stats_account_zone(&stats, zone);
				774	}
				775	}
				776
				777	kernel_early_pages = stats.kernel_early_pages;
				778	movable_pages = stats.movable_pages;
				779
				780	/*
				781	* Test if we could online the given number of pages to ZONE_MOVABLE
				782	* and still stay in the configured ratio.
				783	*/
				784	movable_pages += nr_pages;
				785	return movable_pages <= (auto_movable_ratio * kernel_early_pages) / 100;
				786	}
				787
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	788	/*
Michal Hocko	c246a21	2017-07-06 15:38:18 -0700	[diff] [blame]	789	* Returns a default kernel memory zone for the given pfn range.
				790	* If no kernel zone covers this pfn range it will automatically go
				791	* to the ZONE_NORMAL.
				792	*/
Michal Hocko	c6f03e2	2017-09-06 16:19:40 -0700	[diff] [blame]	793	static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn,
Michal Hocko	c246a21	2017-07-06 15:38:18 -0700	[diff] [blame]	794	unsigned long nr_pages)
				795	{
				796	struct pglist_data *pgdat = NODE_DATA(nid);
				797	int zid;
				798
				799	for (zid = 0; zid <= ZONE_NORMAL; zid++) {
				800	struct zone *zone = &pgdat->node_zones[zid];
				801
				802	if (zone_intersects(zone, start_pfn, nr_pages))
				803	return zone;
				804	}
				805
				806	return &pgdat->node_zones[ZONE_NORMAL];
				807	}
				808
David Hildenbrand	e83a437	2021-09-07 19:55:23 -0700	[diff] [blame^]	809	/*
				810	* Determine to which zone to online memory dynamically based on user
				811	* configuration and system stats. We care about the following ratio:
				812	*
				813	* MOVABLE : KERNEL
				814	*
				815	* Whereby MOVABLE is memory in ZONE_MOVABLE and KERNEL is memory in
				816	* one of the kernel zones. CMA pages inside one of the kernel zones really
				817	* behave like ZONE_MOVABLE, so we treat them accordingly.
				818	*
				819	* We don't allow for hotplugged memory in a KERNEL zone to increase the
				820	* amount of MOVABLE memory we can have, so we end up with:
				821	*
				822	* MOVABLE : KERNEL_EARLY
				823	*
				824	* Whereby KERNEL_EARLY is memory in one of the kernel zones, available since
				825	* boot. We base our calculation on KERNEL_EARLY internally, because:
				826	*
				827	* a) Hotplugged memory in one of the kernel zones can sometimes still get
				828	* hotunplugged, especially when hot(un)plugging individual memory blocks.
				829	* There is no coordination across memory devices, therefore "automatic"
				830	* hotunplugging, as implemented in hypervisors, could result in zone
				831	* imbalances.
				832	* b) Early/boot memory in one of the kernel zones can usually not get
				833	* hotunplugged again (e.g., no firmware interface to unplug, fragmented
				834	* with unmovable allocations). While there are corner cases where it might
				835	* still work, it is barely relevant in practice.
				836	*
				837	* We rely on "present pages" instead of "managed pages", as the latter is
				838	* highly unreliable and dynamic in virtualized environments, and does not
				839	* consider boot time allocations. For example, memory ballooning adjusts the
				840	* managed pages when inflating/deflating the balloon, and balloon compaction
				841	* can even migrate inflated pages between zones.
				842	*
				843	* Using "present pages" is better but some things to keep in mind are:
				844	*
				845	* a) Some memblock allocations, such as for the crashkernel area, are
				846	* effectively unused by the kernel, yet they account to "present pages".
				847	* Fortunately, these allocations are comparatively small in relevant setups
				848	* (e.g., fraction of system memory).
				849	* b) Some hotplugged memory blocks in virtualized environments, especially
				850	* hotplugged by virtio-mem, look like they are completely present, however,
				851	* only parts of the memory block are actually currently usable.
				852	* "present pages" is an upper limit that can get reached at runtime. As
				853	* we base our calculations on KERNEL_EARLY, this is not an issue.
				854	*/
				855	static struct zone *auto_movable_zone_for_pfn(int nid, unsigned long pfn,
				856	unsigned long nr_pages)
				857	{
				858	if (!auto_movable_ratio)
				859	goto kernel_zone;
				860
				861	if (!auto_movable_can_online_movable(NUMA_NO_NODE, nr_pages))
				862	goto kernel_zone;
				863
				864	#ifdef CONFIG_NUMA
				865	if (auto_movable_numa_aware &&
				866	!auto_movable_can_online_movable(nid, nr_pages))
				867	goto kernel_zone;
				868	#endif /* CONFIG_NUMA */
				869
				870	return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];
				871	kernel_zone:
				872	return default_kernel_zone_for_pfn(nid, pfn, nr_pages);
				873	}
				874
Michal Hocko	c6f03e2	2017-09-06 16:19:40 -0700	[diff] [blame]	875	static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn,
				876	unsigned long nr_pages)
Michal Hocko	e5e6893	2017-09-06 16:19:37 -0700	[diff] [blame]	877	{
Michal Hocko	c6f03e2	2017-09-06 16:19:40 -0700	[diff] [blame]	878	struct zone *kernel_zone = default_kernel_zone_for_pfn(nid, start_pfn,
				879	nr_pages);
				880	struct zone *movable_zone = &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];
				881	bool in_kernel = zone_intersects(kernel_zone, start_pfn, nr_pages);
				882	bool in_movable = zone_intersects(movable_zone, start_pfn, nr_pages);
Michal Hocko	e5e6893	2017-09-06 16:19:37 -0700	[diff] [blame]	883
				884	/*
Michal Hocko	c6f03e2	2017-09-06 16:19:40 -0700	[diff] [blame]	885	* We inherit the existing zone in a simple case where zones do not
				886	* overlap in the given range
Michal Hocko	e5e6893	2017-09-06 16:19:37 -0700	[diff] [blame]	887	*/
Michal Hocko	c6f03e2	2017-09-06 16:19:40 -0700	[diff] [blame]	888	if (in_kernel ^ in_movable)
				889	return (in_kernel) ? kernel_zone : movable_zone;
Michal Hocko	e5e6893	2017-09-06 16:19:37 -0700	[diff] [blame]	890
Michal Hocko	c6f03e2	2017-09-06 16:19:40 -0700	[diff] [blame]	891	/*
				892	* If the range doesn't belong to any zone or two zones overlap in the
				893	* given range then we use movable zone only if movable_node is
				894	* enabled because we always online to a kernel zone by default.
				895	*/
				896	return movable_node_enabled ? movable_zone : kernel_zone;
Michal Hocko	9f123ab	2017-07-10 15:48:37 -0700	[diff] [blame]	897	}
				898
David Hildenbrand	7cf209b	2021-09-07 19:54:59 -0700	[diff] [blame]	899	struct zone *zone_for_pfn_range(int online_type, int nid,
				900	unsigned long start_pfn, unsigned long nr_pages)
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	901	{
Michal Hocko	c6f03e2	2017-09-06 16:19:40 -0700	[diff] [blame]	902	if (online_type == MMOP_ONLINE_KERNEL)
				903	return default_kernel_zone_for_pfn(nid, start_pfn, nr_pages);
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	904
Michal Hocko	c6f03e2	2017-09-06 16:19:40 -0700	[diff] [blame]	905	if (online_type == MMOP_ONLINE_MOVABLE)
				906	return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];
Reza Arbab	df429ac	2016-07-26 15:22:23 -0700	[diff] [blame]	907
David Hildenbrand	e83a437	2021-09-07 19:55:23 -0700	[diff] [blame^]	908	if (online_policy == ONLINE_POLICY_AUTO_MOVABLE)
				909	return auto_movable_zone_for_pfn(nid, start_pfn, nr_pages);
				910
Michal Hocko	c6f03e2	2017-09-06 16:19:40 -0700	[diff] [blame]	911	return default_zone_for_pfn(nid, start_pfn, nr_pages);
Michal Hocko	e5e6893	2017-09-06 16:19:37 -0700	[diff] [blame]	912	}
				913
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	914	/*
				915	* This function should only be called by memory_block_{online,offline},
				916	* and {online,offline}_pages.
				917	*/
David Hildenbrand	4b09700	2021-09-07 19:55:19 -0700	[diff] [blame]	918	void adjust_present_page_count(struct page *page, long nr_pages)
David Hildenbrand	f990114	2021-05-04 18:39:39 -0700	[diff] [blame]	919	{
David Hildenbrand	4b09700	2021-09-07 19:55:19 -0700	[diff] [blame]	920	struct zone *zone = page_zone(page);
				921
				922	/*
				923	* We only support onlining/offlining/adding/removing of complete
				924	* memory blocks; therefore, either all is either early or hotplugged.
				925	*/
				926	if (early_section(__pfn_to_section(page_to_pfn(page))))
				927	zone->present_early_pages += nr_pages;
David Hildenbrand	f990114	2021-05-04 18:39:39 -0700	[diff] [blame]	928	zone->present_pages += nr_pages;
David Hildenbrand	f990114	2021-05-04 18:39:39 -0700	[diff] [blame]	929	zone->zone_pgdat->node_present_pages += nr_pages;
David Hildenbrand	f990114	2021-05-04 18:39:39 -0700	[diff] [blame]	930	}
				931
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	932	int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
				933	struct zone *zone)
				934	{
				935	unsigned long end_pfn = pfn + nr_pages;
				936	int ret;
				937
				938	ret = kasan_add_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
				939	if (ret)
				940	return ret;
				941
				942	move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE);
				943
				944	/*
				945	* It might be that the vmemmap_pages fully span sections. If that is
				946	* the case, mark those sections online here as otherwise they will be
				947	* left offline.
				948	*/
				949	if (nr_pages >= PAGES_PER_SECTION)
				950	online_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION));
				951
				952	return ret;
				953	}
				954
				955	void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages)
				956	{
				957	unsigned long end_pfn = pfn + nr_pages;
				958
				959	/*
				960	* It might be that the vmemmap_pages fully span sections. If that is
				961	* the case, mark those sections offline here as otherwise they will be
				962	* left online.
				963	*/
				964	if (nr_pages >= PAGES_PER_SECTION)
				965	offline_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION));
				966
				967	/*
				968	* The pages associated with this vmemmap have been offlined, so
				969	* we can reset its state here.
				970	*/
				971	remove_pfn_range_from_zone(page_zone(pfn_to_page(pfn)), pfn, nr_pages);
				972	kasan_remove_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
				973	}
				974
				975	int __ref online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *zone)
KAMEZAWA Hiroyuki	75884fb	2007-10-16 01:26:10 -0700	[diff] [blame]	976	{
Cody P Schafer	aa47228	2013-07-03 15:02:10 -0700	[diff] [blame]	977	unsigned long flags;
Yasunori Goto	6811378	2006-06-23 02:03:11 -0700	[diff] [blame]	978	int need_zonelists_rebuild = 0;
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	979	const int nid = zone_to_nid(zone);
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	980	int ret;
				981	struct memory_notify arg;
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	982
Oscar Salvador	dd8e2f2	2021-05-04 18:39:36 -0700	[diff] [blame]	983	/*
				984	* {on,off}lining is constrained to full memory sections (or more
Zhen Lei	041711c	2021-06-30 18:53:17 -0700	[diff] [blame]	985	* precisely to memory blocks from the user space POV).
Oscar Salvador	dd8e2f2	2021-05-04 18:39:36 -0700	[diff] [blame]	986	* memmap_on_memory is an exception because it reserves initial part
				987	* of the physical memory space for vmemmaps. That space is pageblock
				988	* aligned.
				989	*/
David Hildenbrand	4986fac	2020-10-15 20:07:50 -0700	[diff] [blame]	990	if (WARN_ON_ONCE(!nr_pages \|\|
Oscar Salvador	dd8e2f2	2021-05-04 18:39:36 -0700	[diff] [blame]	991	!IS_ALIGNED(pfn, pageblock_nr_pages) \|\|
				992	!IS_ALIGNED(pfn + nr_pages, PAGES_PER_SECTION)))
David Hildenbrand	4986fac	2020-10-15 20:07:50 -0700	[diff] [blame]	993	return -EINVAL;
				994
David Hildenbrand	381eab4	2018-10-30 15:10:29 -0700	[diff] [blame]	995	mem_hotplug_begin();
				996
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	997	/* associate pfn range with the zone */
David Hildenbrand	b30c592	2020-10-15 20:08:23 -0700	[diff] [blame]	998	move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE);
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	999
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	1000	arg.start_pfn = pfn;
				1001	arg.nr_pages = nr_pages;
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1002	node_states_check_changes_online(nr_pages, zone, &arg);
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	1003
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	1004	ret = memory_notify(MEM_GOING_ONLINE, &arg);
				1005	ret = notifier_to_errno(ret);
Chen Yucong	e33e33b	2016-03-17 14:19:35 -0700	[diff] [blame]	1006	if (ret)
				1007	goto failed_addition;
				1008
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	1009	/*
David Hildenbrand	b30c592	2020-10-15 20:08:23 -0700	[diff] [blame]	1010	* Fixup the number of isolated pageblocks before marking the sections
				1011	* onlining, such that undo_isolate_page_range() works correctly.
				1012	*/
				1013	spin_lock_irqsave(&zone->lock, flags);
				1014	zone->nr_isolate_pageblock += nr_pages / pageblock_nr_pages;
				1015	spin_unlock_irqrestore(&zone->lock, flags);
				1016
				1017	/*
Yasunori Goto	6811378	2006-06-23 02:03:11 -0700	[diff] [blame]	1018	* If this zone is not populated, then it is not in zonelist.
				1019	* This means the page allocator ignores this zone.
				1020	* So, zonelist must be updated after online.
				1021	*/
Wen Congyang	6dcd73d	2012-12-11 16:01:01 -0800	[diff] [blame]	1022	if (!populated_zone(zone)) {
Yasunori Goto	6811378	2006-06-23 02:03:11 -0700	[diff] [blame]	1023	need_zonelists_rebuild = 1;
Michal Hocko	72675e1	2017-09-06 16:20:24 -0700	[diff] [blame]	1024	setup_zone_pageset(zone);
Wen Congyang	6dcd73d	2012-12-11 16:01:01 -0800	[diff] [blame]	1025	}
Yasunori Goto	6811378	2006-06-23 02:03:11 -0700	[diff] [blame]	1026
David Hildenbrand	aac6532	2020-10-15 20:08:11 -0700	[diff] [blame]	1027	online_pages_range(pfn, nr_pages);
David Hildenbrand	4b09700	2021-09-07 19:55:19 -0700	[diff] [blame]	1028	adjust_present_page_count(pfn_to_page(pfn), nr_pages);
Cody P Schafer	aa47228	2013-07-03 15:02:10 -0700	[diff] [blame]	1029
David Hildenbrand	b30c592	2020-10-15 20:08:23 -0700	[diff] [blame]	1030	node_states_set_node(nid, &arg);
				1031	if (need_zonelists_rebuild)
				1032	build_all_zonelists(NULL);
David Hildenbrand	b30c592	2020-10-15 20:08:23 -0700	[diff] [blame]	1033
				1034	/* Basic onlining is complete, allow allocation of onlined pages. */
				1035	undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE);
				1036
David Hildenbrand	93146d9	2020-08-06 23:25:35 -0700	[diff] [blame]	1037	/*
David Hildenbrand	b86c5fc	2020-10-15 20:09:39 -0700	[diff] [blame]	1038	* Freshly onlined pages aren't shuffled (e.g., all pages are placed to
				1039	* the tail of the freelist when undoing isolation). Shuffle the whole
				1040	* zone to make sure the just onlined pages are properly distributed
				1041	* across the whole freelist - to create an initial shuffle.
David Hildenbrand	93146d9	2020-08-06 23:25:35 -0700	[diff] [blame]	1042	*/
Dan Williams	e900a91	2019-05-14 15:41:28 -0700	[diff] [blame]	1043	shuffle_zone(zone);
				1044
Mel Gorman	b92ca18	2021-06-28 19:42:12 -0700	[diff] [blame]	1045	/* reinitialise watermarks and update pcp limits */
KOSAKI Motohiro	1b79acc	2011-05-24 17:11:32 -0700	[diff] [blame]	1046	init_per_zone_wmark_min();
				1047
David Hildenbrand	ca9a46f	2019-09-23 15:36:08 -0700	[diff] [blame]	1048	kswapd_run(nid);
				1049	kcompactd_run(nid);
Dave Hansen	61b1399	2005-10-29 18:16:56 -0700	[diff] [blame]	1050
Chandra Seetharaman	2d1d43f	2006-09-29 02:01:25 -0700	[diff] [blame]	1051	writeback_set_ratelimit();
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	1052
David Hildenbrand	ca9a46f	2019-09-23 15:36:08 -0700	[diff] [blame]	1053	memory_notify(MEM_ONLINE, &arg);
David Hildenbrand	381eab4	2018-10-30 15:10:29 -0700	[diff] [blame]	1054	mem_hotplug_done();
David Rientjes	30467e0	2015-04-14 15:45:11 -0700	[diff] [blame]	1055	return 0;
Chen Yucong	e33e33b	2016-03-17 14:19:35 -0700	[diff] [blame]	1056
				1057	failed_addition:
				1058	pr_debug("online_pages [mem %#010llx-%#010llx] failed\n",
				1059	(unsigned long long) pfn << PAGE_SHIFT,
				1060	(((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1);
				1061	memory_notify(MEM_CANCEL_ONLINE, &arg);
David Hildenbrand	feee6b2	2020-01-04 12:59:33 -0800	[diff] [blame]	1062	remove_pfn_range_from_zone(zone, pfn, nr_pages);
David Hildenbrand	381eab4	2018-10-30 15:10:29 -0700	[diff] [blame]	1063	mem_hotplug_done();
Chen Yucong	e33e33b	2016-03-17 14:19:35 -0700	[diff] [blame]	1064	return ret;
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	1065	}
Keith Mannthey	5394702	2006-09-30 23:27:08 -0700	[diff] [blame]	1066	#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
Yasunori Goto	bc02af9	2006-06-27 02:53:30 -0700	[diff] [blame]	1067
Tang Chen	0bd8542	2014-11-13 15:19:41 -0800	[diff] [blame]	1068	static void reset_node_present_pages(pg_data_t *pgdat)
				1069	{
				1070	struct zone *z;
				1071
				1072	for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
				1073	z->present_pages = 0;
				1074
				1075	pgdat->node_present_pages = 0;
				1076	}
				1077
Hidetoshi Seto	e131933	2009-11-17 14:06:18 -0800	[diff] [blame]	1078	/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
David Hildenbrand	c68ab18	2020-06-04 16:48:35 -0700	[diff] [blame]	1079	static pg_data_t __ref *hotadd_new_pgdat(int nid)
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1080	{
				1081	struct pglist_data *pgdat;
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1082
Tang Chen	a1e565a	2013-02-22 16:33:18 -0800	[diff] [blame]	1083	pgdat = NODE_DATA(nid);
				1084	if (!pgdat) {
				1085	pgdat = arch_alloc_nodedata(nid);
				1086	if (!pgdat)
				1087	return NULL;
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1088
Wei Yang	33fce01	2019-09-23 15:35:52 -0700	[diff] [blame]	1089	pgdat->per_cpu_nodestats =
				1090	alloc_percpu(struct per_cpu_nodestat);
Tang Chen	a1e565a	2013-02-22 16:33:18 -0800	[diff] [blame]	1091	arch_refresh_nodedata(nid, pgdat);
Gu Zheng	b0dc3a3	2015-03-25 15:55:20 -0700	[diff] [blame]	1092	} else {
Wei Yang	33fce01	2019-09-23 15:35:52 -0700	[diff] [blame]	1093	int cpu;
Mel Gorman	e716f2e	2017-05-03 14:53:45 -0700	[diff] [blame]	1094	/*
Joonsoo Kim	97a225e	2020-06-03 15:59:01 -0700	[diff] [blame]	1095	* Reset the nr_zones, order and highest_zoneidx before reuse.
				1096	* Note that kswapd will init kswapd_highest_zoneidx properly
Mel Gorman	e716f2e	2017-05-03 14:53:45 -0700	[diff] [blame]	1097	* when it starts in the near future.
				1098	*/
Gu Zheng	b0dc3a3	2015-03-25 15:55:20 -0700	[diff] [blame]	1099	pgdat->nr_zones = 0;
Mel Gorman	38087d9	2016-07-28 15:45:49 -0700	[diff] [blame]	1100	pgdat->kswapd_order = 0;
Joonsoo Kim	97a225e	2020-06-03 15:59:01 -0700	[diff] [blame]	1101	pgdat->kswapd_highest_zoneidx = 0;
Wei Yang	33fce01	2019-09-23 15:35:52 -0700	[diff] [blame]	1102	for_each_online_cpu(cpu) {
				1103	struct per_cpu_nodestat *p;
				1104
				1105	p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
				1106	memset(p, 0, sizeof(*p));
				1107	}
Tang Chen	a1e565a	2013-02-22 16:33:18 -0800	[diff] [blame]	1108	}
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1109
				1110	/* we can use NODE_DATA(nid) from here */
Oscar Salvador	03e85f9	2018-08-21 21:53:43 -0700	[diff] [blame]	1111	pgdat->node_id = nid;
David Hildenbrand	c68ab18	2020-06-04 16:48:35 -0700	[diff] [blame]	1112	pgdat->node_start_pfn = 0;
Oscar Salvador	03e85f9	2018-08-21 21:53:43 -0700	[diff] [blame]	1113
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1114	/* init node's zones as empty zones, we don't have any present pages.*/
Oscar Salvador	03e85f9	2018-08-21 21:53:43 -0700	[diff] [blame]	1115	free_area_init_core_hotplug(nid);
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1116
KAMEZAWA Hiroyuki	959ecc4	2011-06-15 15:08:38 -0700	[diff] [blame]	1117	/*
				1118	* The node we allocated has no zone fallback lists. For avoiding
				1119	* to access not-initialized zonelist, build here.
				1120	*/
Michal Hocko	72675e1	2017-09-06 16:20:24 -0700	[diff] [blame]	1121	build_all_zonelists(pgdat);
KAMEZAWA Hiroyuki	959ecc4	2011-06-15 15:08:38 -0700	[diff] [blame]	1122
Tang Chen	f784a3f	2014-11-13 15:19:39 -0800	[diff] [blame]	1123	/*
Tang Chen	0bd8542	2014-11-13 15:19:41 -0800	[diff] [blame]	1124	* When memory is hot-added, all the memory is in offline state. So
				1125	* clear all zones' present_pages because they will be updated in
				1126	* online_pages() and offline_pages().
				1127	*/
Oscar Salvador	03e85f9	2018-08-21 21:53:43 -0700	[diff] [blame]	1128	reset_node_managed_pages(pgdat);
Tang Chen	0bd8542	2014-11-13 15:19:41 -0800	[diff] [blame]	1129	reset_node_present_pages(pgdat);
				1130
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1131	return pgdat;
				1132	}
				1133
Oscar Salvador	b9ff036	2018-08-17 15:46:15 -0700	[diff] [blame]	1134	static void rollback_node_hotadd(int nid)
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1135	{
Oscar Salvador	b9ff036	2018-08-17 15:46:15 -0700	[diff] [blame]	1136	pg_data_t *pgdat = NODE_DATA(nid);
				1137
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1138	arch_refresh_nodedata(nid, NULL);
Reza Arbab	5830169	2016-08-11 15:33:12 -0700	[diff] [blame]	1139	free_percpu(pgdat->per_cpu_nodestats);
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1140	arch_free_nodedata(pgdat);
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1141	}
				1142
KAMEZAWA Hiroyuki	0a54703	2006-06-27 02:53:35 -0700	[diff] [blame]	1143
Mel Gorman	ba2d266	2021-06-30 18:53:35 -0700	[diff] [blame]	1144	/*
				1145	* __try_online_node - online a node if offlined
Mike Rapoport	e8b098f	2018-04-05 16:24:57 -0700	[diff] [blame]	1146	* @nid: the node ID
Oscar Salvador	b9ff036	2018-08-17 15:46:15 -0700	[diff] [blame]	1147	* @set_node_online: Whether we want to online the node
minskey guo	cf23422	2010-05-24 14:32:41 -0700	[diff] [blame]	1148	* called by cpu_up() to online a node without onlined memory.
Oscar Salvador	b9ff036	2018-08-17 15:46:15 -0700	[diff] [blame]	1149	*
				1150	* Returns:
				1151	* 1 -> a new node has been allocated
				1152	* 0 -> the node is already online
				1153	* -ENOMEM -> the node could not be allocated
minskey guo	cf23422	2010-05-24 14:32:41 -0700	[diff] [blame]	1154	*/
David Hildenbrand	c68ab18	2020-06-04 16:48:35 -0700	[diff] [blame]	1155	static int __try_online_node(int nid, bool set_node_online)
minskey guo	cf23422	2010-05-24 14:32:41 -0700	[diff] [blame]	1156	{
Oscar Salvador	b9ff036	2018-08-17 15:46:15 -0700	[diff] [blame]	1157	pg_data_t *pgdat;
				1158	int ret = 1;
minskey guo	cf23422	2010-05-24 14:32:41 -0700	[diff] [blame]	1159
Toshi Kani	01b0f19	2013-11-12 15:07:25 -0800	[diff] [blame]	1160	if (node_online(nid))
				1161	return 0;
				1162
David Hildenbrand	c68ab18	2020-06-04 16:48:35 -0700	[diff] [blame]	1163	pgdat = hotadd_new_pgdat(nid);
David Rientjes	7553e8f	2011-06-22 18:13:01 -0700	[diff] [blame]	1164	if (!pgdat) {
Toshi Kani	01b0f19	2013-11-12 15:07:25 -0800	[diff] [blame]	1165	pr_err("Cannot online node %d due to NULL pgdat\n", nid);
minskey guo	cf23422	2010-05-24 14:32:41 -0700	[diff] [blame]	1166	ret = -ENOMEM;
				1167	goto out;
				1168	}
Oscar Salvador	b9ff036	2018-08-17 15:46:15 -0700	[diff] [blame]	1169
				1170	if (set_node_online) {
				1171	node_set_online(nid);
				1172	ret = register_one_node(nid);
				1173	BUG_ON(ret);
				1174	}
minskey guo	cf23422	2010-05-24 14:32:41 -0700	[diff] [blame]	1175	out:
Oscar Salvador	b9ff036	2018-08-17 15:46:15 -0700	[diff] [blame]	1176	return ret;
				1177	}
				1178
				1179	/*
				1180	* Users of this function always want to online/register the node
				1181	*/
				1182	int try_online_node(int nid)
				1183	{
				1184	int ret;
				1185
				1186	mem_hotplug_begin();
David Hildenbrand	c68ab18	2020-06-04 16:48:35 -0700	[diff] [blame]	1187	ret = __try_online_node(nid, true);
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	1188	mem_hotplug_done();
minskey guo	cf23422	2010-05-24 14:32:41 -0700	[diff] [blame]	1189	return ret;
				1190	}
				1191
Toshi Kani	27356f5	2013-09-11 14:21:49 -0700	[diff] [blame]	1192	static int check_hotplug_memory_range(u64 start, u64 size)
				1193	{
Pavel Tatashin	ba32558	2018-04-05 16:22:39 -0700	[diff] [blame]	1194	/* memory range must be block size aligned */
David Hildenbrand	cec3ebd	2019-07-18 15:56:25 -0700	[diff] [blame]	1195	if (!size \|\| !IS_ALIGNED(start, memory_block_size_bytes()) \|\|
				1196	!IS_ALIGNED(size, memory_block_size_bytes())) {
Pavel Tatashin	ba32558	2018-04-05 16:22:39 -0700	[diff] [blame]	1197	pr_err("Block size [%#lx] unaligned hotplug range: start %#llx, size %#llx",
David Hildenbrand	cec3ebd	2019-07-18 15:56:25 -0700	[diff] [blame]	1198	memory_block_size_bytes(), start, size);
Toshi Kani	27356f5	2013-09-11 14:21:49 -0700	[diff] [blame]	1199	return -EINVAL;
				1200	}
				1201
				1202	return 0;
				1203	}
				1204
Vitaly Kuznetsov	31bc385	2016-03-15 14:56:48 -0700	[diff] [blame]	1205	static int online_memory_block(struct memory_block mem, void arg)
				1206	{
Anshuman Khandual	1adf8b4	2021-02-25 17:17:13 -0800	[diff] [blame]	1207	mem->online_type = mhp_default_online_type;
Nathan Fontenot	dc18d70	2017-02-24 15:00:02 -0800	[diff] [blame]	1208	return device_online(&mem->dev);
Vitaly Kuznetsov	31bc385	2016-03-15 14:56:48 -0700	[diff] [blame]	1209	}
				1210
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	1211	bool mhp_supports_memmap_on_memory(unsigned long size)
				1212	{
				1213	unsigned long nr_vmemmap_pages = size / PAGE_SIZE;
				1214	unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
				1215	unsigned long remaining_size = size - vmemmap_size;
				1216
				1217	/*
				1218	* Besides having arch support and the feature enabled at runtime, we
				1219	* need a few more assumptions to hold true:
				1220	*
				1221	* a) We span a single memory block: memory onlining/offlinin;g happens
				1222	* in memory block granularity. We don't want the vmemmap of online
				1223	* memory blocks to reside on offline memory blocks. In the future,
				1224	* we might want to support variable-sized memory blocks to make the
				1225	* feature more versatile.
				1226	*
				1227	* b) The vmemmap pages span complete PMDs: We don't want vmemmap code
				1228	* to populate memory from the altmap for unrelated parts (i.e.,
				1229	* other memory blocks)
				1230	*
				1231	* c) The vmemmap pages (and thereby the pages that will be exposed to
				1232	* the buddy) have to cover full pageblocks: memory onlining/offlining
				1233	* code requires applicable ranges to be page-aligned, for example, to
				1234	* set the migratetypes properly.
				1235	*
				1236	* TODO: Although we have a check here to make sure that vmemmap pages
				1237	* fully populate a PMD, it is not the right place to check for
				1238	* this. A much better solution involves improving vmemmap code
				1239	* to fallback to base pages when trying to populate vmemmap using
				1240	* altmap as an alternative source of memory, and we do not exactly
				1241	* populate a single PMD.
				1242	*/
				1243	return memmap_on_memory &&
Muchun Song	2d7a217	2021-06-30 18:48:25 -0700	[diff] [blame]	1244	!hugetlb_free_vmemmap_enabled &&
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	1245	IS_ENABLED(CONFIG_MHP_MEMMAP_ON_MEMORY) &&
				1246	size == memory_block_size_bytes() &&
				1247	IS_ALIGNED(vmemmap_size, PMD_SIZE) &&
				1248	IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT));
				1249	}
				1250
David Hildenbrand	8df1d0e	2018-10-30 15:10:24 -0700	[diff] [blame]	1251	/*
				1252	* NOTE: The caller must call lock_device_hotplug() to serialize hotplug
				1253	* and online/offline operations (triggered e.g. by sysfs).
				1254	*
				1255	* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG
				1256	*/
David Hildenbrand	b611719	2020-10-15 20:08:44 -0700	[diff] [blame]	1257	int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
Yasunori Goto	bc02af9	2006-06-27 02:53:30 -0700	[diff] [blame]	1258	{
Catalin Marinas	d15dfd3	2021-03-09 12:26:01 +0000	[diff] [blame]	1259	struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	1260	struct vmem_altmap mhp_altmap = {};
David Vrabel	62cedb9	2015-06-25 16:35:49 +0100	[diff] [blame]	1261	u64 start, size;
Oscar Salvador	b9ff036	2018-08-17 15:46:15 -0700	[diff] [blame]	1262	bool new_node = false;
Yasunori Goto	bc02af9	2006-06-27 02:53:30 -0700	[diff] [blame]	1263	int ret;
				1264
David Vrabel	62cedb9	2015-06-25 16:35:49 +0100	[diff] [blame]	1265	start = res->start;
				1266	size = resource_size(res);
				1267
Toshi Kani	27356f5	2013-09-11 14:21:49 -0700	[diff] [blame]	1268	ret = check_hotplug_memory_range(start, size);
				1269	if (ret)
				1270	return ret;
				1271
Vishal Verma	fa6d9ec	2020-06-04 16:48:25 -0700	[diff] [blame]	1272	if (!node_possible(nid)) {
				1273	WARN(1, "node %d was absent from the node_possible_map\n", nid);
				1274	return -EINVAL;
				1275	}
				1276
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	1277	mem_hotplug_begin();
Nathan Zimmer	ac13c46	2014-01-23 15:53:26 -0800	[diff] [blame]	1278
David Hildenbrand	52219ae	2020-06-04 16:48:38 -0700	[diff] [blame]	1279	if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
				1280	memblock_add_node(start, size, nid);
Tang Chen	7f36e3e	2015-09-04 15:42:32 -0700	[diff] [blame]	1281
David Hildenbrand	c68ab18	2020-06-04 16:48:35 -0700	[diff] [blame]	1282	ret = __try_online_node(nid, false);
Oscar Salvador	b9ff036	2018-08-17 15:46:15 -0700	[diff] [blame]	1283	if (ret < 0)
				1284	goto error;
				1285	new_node = ret;
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1286
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	1287	/*
				1288	* Self hosted memmap array
				1289	*/
				1290	if (mhp_flags & MHP_MEMMAP_ON_MEMORY) {
				1291	if (!mhp_supports_memmap_on_memory(size)) {
				1292	ret = -EINVAL;
				1293	goto error;
				1294	}
				1295	mhp_altmap.free = PHYS_PFN(size);
				1296	mhp_altmap.base_pfn = PHYS_PFN(start);
				1297	params.altmap = &mhp_altmap;
				1298	}
				1299
Yasunori Goto	bc02af9	2006-06-27 02:53:30 -0700	[diff] [blame]	1300	/* call arch's memory hotadd */
Logan Gunthorpe	f5637d3	2020-04-10 14:33:21 -0700	[diff] [blame]	1301	ret = arch_add_memory(nid, start, size, &params);
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1302	if (ret < 0)
				1303	goto error;
				1304
David Hildenbrand	db051a0	2019-07-18 15:56:56 -0700	[diff] [blame]	1305	/* create memory block devices after memory was added */
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	1306	ret = create_memory_block_devices(start, size, mhp_altmap.alloc);
David Hildenbrand	db051a0	2019-07-18 15:56:56 -0700	[diff] [blame]	1307	if (ret) {
David Hildenbrand	65a2aa5	2021-09-07 19:55:04 -0700	[diff] [blame]	1308	arch_remove_memory(start, size, NULL);
David Hildenbrand	db051a0	2019-07-18 15:56:56 -0700	[diff] [blame]	1309	goto error;
				1310	}
				1311
Tang Chen	a1e565a	2013-02-22 16:33:18 -0800	[diff] [blame]	1312	if (new_node) {
Oscar Salvador	d5b6f6a3	2018-08-17 15:46:18 -0700	[diff] [blame]	1313	/* If sysfs file of new node can't be created, cpu on the node
Yasunori Goto	0fc4415	2006-06-27 02:53:38 -0700	[diff] [blame]	1314	* can't be hot-added. There is no rollback way now.
				1315	* So, check by BUG_ON() to catch it reluctantly..
Oscar Salvador	d5b6f6a3	2018-08-17 15:46:18 -0700	[diff] [blame]	1316	* We online node here. We can't roll back from here.
Yasunori Goto	0fc4415	2006-06-27 02:53:38 -0700	[diff] [blame]	1317	*/
Oscar Salvador	d5b6f6a3	2018-08-17 15:46:18 -0700	[diff] [blame]	1318	node_set_online(nid);
				1319	ret = __register_one_node(nid);
Yasunori Goto	0fc4415	2006-06-27 02:53:38 -0700	[diff] [blame]	1320	BUG_ON(ret);
				1321	}
				1322
Oscar Salvador	d5b6f6a3	2018-08-17 15:46:18 -0700	[diff] [blame]	1323	/* link memory sections under this node.*/
Laurent Dufour	90c7eae	2020-10-15 20:09:15 -0700	[diff] [blame]	1324	link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1),
				1325	MEMINIT_HOTPLUG);
Oscar Salvador	d5b6f6a3	2018-08-17 15:46:18 -0700	[diff] [blame]	1326
akpm@linux-foundation.org	d96ae53	2010-03-05 13:41:58 -0800	[diff] [blame]	1327	/* create new memmap entry */
David Hildenbrand	7b7b272	2020-06-04 16:48:41 -0700	[diff] [blame]	1328	if (!strcmp(res->name, "System RAM"))
				1329	firmware_map_add_hotplug(start, start + size, "System RAM");
akpm@linux-foundation.org	d96ae53	2010-03-05 13:41:58 -0800	[diff] [blame]	1330
David Hildenbrand	381eab4	2018-10-30 15:10:29 -0700	[diff] [blame]	1331	/* device_online() will take the lock when calling online_pages() */
				1332	mem_hotplug_done();
				1333
David Hildenbrand	9ca6551	2020-10-15 20:08:49 -0700	[diff] [blame]	1334	/*
				1335	* In case we're allowed to merge the resource, flag it and trigger
				1336	* merging now that adding succeeded.
				1337	*/
David Hildenbrand	2601126	2021-02-25 17:17:17 -0800	[diff] [blame]	1338	if (mhp_flags & MHP_MERGE_RESOURCE)
David Hildenbrand	9ca6551	2020-10-15 20:08:49 -0700	[diff] [blame]	1339	merge_system_ram_resource(res);
				1340
Vitaly Kuznetsov	31bc385	2016-03-15 14:56:48 -0700	[diff] [blame]	1341	/* online pages if requested */
Anshuman Khandual	1adf8b4	2021-02-25 17:17:13 -0800	[diff] [blame]	1342	if (mhp_default_online_type != MMOP_OFFLINE)
David Hildenbrand	fbcf73c	2019-07-18 15:57:46 -0700	[diff] [blame]	1343	walk_memory_blocks(start, size, NULL, online_memory_block);
Vitaly Kuznetsov	31bc385	2016-03-15 14:56:48 -0700	[diff] [blame]	1344
David Hildenbrand	381eab4	2018-10-30 15:10:29 -0700	[diff] [blame]	1345	return ret;
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1346	error:
				1347	/* rollback pgdat allocation and others */
Oscar Salvador	b9ff036	2018-08-17 15:46:15 -0700	[diff] [blame]	1348	if (new_node)
				1349	rollback_node_hotadd(nid);
David Hildenbrand	52219ae	2020-06-04 16:48:38 -0700	[diff] [blame]	1350	if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
				1351	memblock_remove(start, size);
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	1352	mem_hotplug_done();
Yasunori Goto	bc02af9	2006-06-27 02:53:30 -0700	[diff] [blame]	1353	return ret;
				1354	}
David Vrabel	62cedb9	2015-06-25 16:35:49 +0100	[diff] [blame]	1355
David Hildenbrand	8df1d0e	2018-10-30 15:10:24 -0700	[diff] [blame]	1356	/* requires device_hotplug_lock, see add_memory_resource() */
David Hildenbrand	b611719	2020-10-15 20:08:44 -0700	[diff] [blame]	1357	int __ref __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags)
David Vrabel	62cedb9	2015-06-25 16:35:49 +0100	[diff] [blame]	1358	{
				1359	struct resource *res;
				1360	int ret;
				1361
David Hildenbrand	7b7b272	2020-06-04 16:48:41 -0700	[diff] [blame]	1362	res = register_memory_resource(start, size, "System RAM");
Vitaly Kuznetsov	6f754ba	2016-01-14 15:21:55 -0800	[diff] [blame]	1363	if (IS_ERR(res))
				1364	return PTR_ERR(res);
David Vrabel	62cedb9	2015-06-25 16:35:49 +0100	[diff] [blame]	1365
David Hildenbrand	b611719	2020-10-15 20:08:44 -0700	[diff] [blame]	1366	ret = add_memory_resource(nid, res, mhp_flags);
David Vrabel	62cedb9	2015-06-25 16:35:49 +0100	[diff] [blame]	1367	if (ret < 0)
				1368	release_memory_resource(res);
				1369	return ret;
				1370	}
David Hildenbrand	8df1d0e	2018-10-30 15:10:24 -0700	[diff] [blame]	1371
David Hildenbrand	b611719	2020-10-15 20:08:44 -0700	[diff] [blame]	1372	int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags)
David Hildenbrand	8df1d0e	2018-10-30 15:10:24 -0700	[diff] [blame]	1373	{
				1374	int rc;
				1375
				1376	lock_device_hotplug();
David Hildenbrand	b611719	2020-10-15 20:08:44 -0700	[diff] [blame]	1377	rc = __add_memory(nid, start, size, mhp_flags);
David Hildenbrand	8df1d0e	2018-10-30 15:10:24 -0700	[diff] [blame]	1378	unlock_device_hotplug();
				1379
				1380	return rc;
				1381	}
Yasunori Goto	bc02af9	2006-06-27 02:53:30 -0700	[diff] [blame]	1382	EXPORT_SYMBOL_GPL(add_memory);
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1383
David Hildenbrand	7b7b272	2020-06-04 16:48:41 -0700	[diff] [blame]	1384	/*
				1385	* Add special, driver-managed memory to the system as system RAM. Such
				1386	* memory is not exposed via the raw firmware-provided memmap as system
				1387	* RAM, instead, it is detected and added by a driver - during cold boot,
				1388	* after a reboot, and after kexec.
				1389	*
				1390	* Reasons why this memory should not be used for the initial memmap of a
				1391	* kexec kernel or for placing kexec images:
				1392	* - The booting kernel is in charge of determining how this memory will be
				1393	* used (e.g., use persistent memory as system RAM)
				1394	* - Coordination with a hypervisor is required before this memory
				1395	* can be used (e.g., inaccessible parts).
				1396	*
				1397	* For this memory, no entries in /sys/firmware/memmap ("raw firmware-provided
				1398	* memory map") are created. Also, the created memory resource is flagged
David Hildenbrand	7cf603d	2020-10-15 20:08:33 -0700	[diff] [blame]	1399	* with IORESOURCE_SYSRAM_DRIVER_MANAGED, so in-kernel users can special-case
David Hildenbrand	7b7b272	2020-06-04 16:48:41 -0700	[diff] [blame]	1400	* this memory as well (esp., not place kexec images onto it).
				1401	*
				1402	* The resource_name (visible via /proc/iomem) has to have the format
				1403	* "System RAM ($DRIVER)".
				1404	*/
				1405	int add_memory_driver_managed(int nid, u64 start, u64 size,
David Hildenbrand	b611719	2020-10-15 20:08:44 -0700	[diff] [blame]	1406	const char *resource_name, mhp_t mhp_flags)
David Hildenbrand	7b7b272	2020-06-04 16:48:41 -0700	[diff] [blame]	1407	{
				1408	struct resource *res;
				1409	int rc;
				1410
				1411	if (!resource_name \|\|
				1412	strstr(resource_name, "System RAM (") != resource_name \|\|
				1413	resource_name[strlen(resource_name) - 1] != ')')
				1414	return -EINVAL;
				1415
				1416	lock_device_hotplug();
				1417
				1418	res = register_memory_resource(start, size, resource_name);
				1419	if (IS_ERR(res)) {
				1420	rc = PTR_ERR(res);
				1421	goto out_unlock;
				1422	}
				1423
David Hildenbrand	b611719	2020-10-15 20:08:44 -0700	[diff] [blame]	1424	rc = add_memory_resource(nid, res, mhp_flags);
David Hildenbrand	7b7b272	2020-06-04 16:48:41 -0700	[diff] [blame]	1425	if (rc < 0)
				1426	release_memory_resource(res);
				1427
				1428	out_unlock:
				1429	unlock_device_hotplug();
				1430	return rc;
				1431	}
				1432	EXPORT_SYMBOL_GPL(add_memory_driver_managed);
				1433
Anshuman Khandual	bca3fea	2021-02-25 17:17:33 -0800	[diff] [blame]	1434	/*
				1435	* Platforms should define arch_get_mappable_range() that provides
				1436	* maximum possible addressable physical memory range for which the
				1437	* linear mapping could be created. The platform returned address
				1438	* range must adhere to these following semantics.
				1439	*
				1440	* - range.start <= range.end
				1441	* - Range includes both end points [range.start..range.end]
				1442	*
				1443	* There is also a fallback definition provided here, allowing the
				1444	* entire possible physical address range in case any platform does
				1445	* not define arch_get_mappable_range().
				1446	*/
				1447	struct range __weak arch_get_mappable_range(void)
				1448	{
				1449	struct range mhp_range = {
				1450	.start = 0UL,
				1451	.end = -1ULL,
				1452	};
				1453	return mhp_range;
				1454	}
				1455
				1456	struct range mhp_get_pluggable_range(bool need_mapping)
				1457	{
				1458	const u64 max_phys = (1ULL << MAX_PHYSMEM_BITS) - 1;
				1459	struct range mhp_range;
				1460
				1461	if (need_mapping) {
				1462	mhp_range = arch_get_mappable_range();
				1463	if (mhp_range.start > max_phys) {
				1464	mhp_range.start = 0;
				1465	mhp_range.end = 0;
				1466	}
				1467	mhp_range.end = min_t(u64, mhp_range.end, max_phys);
				1468	} else {
				1469	mhp_range.start = 0;
				1470	mhp_range.end = max_phys;
				1471	}
				1472	return mhp_range;
				1473	}
				1474	EXPORT_SYMBOL_GPL(mhp_get_pluggable_range);
				1475
				1476	bool mhp_range_allowed(u64 start, u64 size, bool need_mapping)
				1477	{
				1478	struct range mhp_range = mhp_get_pluggable_range(need_mapping);
				1479	u64 end = start + size;
				1480
				1481	if (start < end && start >= mhp_range.start && (end - 1) <= mhp_range.end)
				1482	return true;
				1483
				1484	pr_warn("Hotplug memory [%#llx-%#llx] exceeds maximum addressable range [%#llx-%#llx]\n",
				1485	start, end, mhp_range.start, mhp_range.end);
				1486	return false;
				1487	}
				1488
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1489	#ifdef CONFIG_MEMORY_HOTREMOVE
				1490	/*
David Hildenbrand	9291799	2020-02-03 17:34:26 -0800	[diff] [blame]	1491	* Confirm all pages in a range [start, end) belong to the same zone (skipping
				1492	* memory holes). When true, return the zone.
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1493	*/
David Hildenbrand	9291799	2020-02-03 17:34:26 -0800	[diff] [blame]	1494	struct zone *test_pages_in_a_zone(unsigned long start_pfn,
				1495	unsigned long end_pfn)
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1496	{
Andrew Banman	5f0f288	2015-12-29 14:54:25 -0800	[diff] [blame]	1497	unsigned long pfn, sec_end_pfn;
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1498	struct zone *zone = NULL;
				1499	struct page *page;
Mike Rapoport	673d40c	2021-09-07 19:54:55 -0700	[diff] [blame]	1500
Toshi Kani	deb88a2	2017-02-03 13:13:20 -0800	[diff] [blame]	1501	for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn + 1);
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1502	pfn < end_pfn;
Toshi Kani	deb88a2	2017-02-03 13:13:20 -0800	[diff] [blame]	1503	pfn = sec_end_pfn, sec_end_pfn += PAGES_PER_SECTION) {
Andrew Banman	5f0f288	2015-12-29 14:54:25 -0800	[diff] [blame]	1504	/* Make sure the memory section is present first */
				1505	if (!present_section_nr(pfn_to_section_nr(pfn)))
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1506	continue;
Andrew Banman	5f0f288	2015-12-29 14:54:25 -0800	[diff] [blame]	1507	for (; pfn < sec_end_pfn && pfn < end_pfn;
				1508	pfn += MAX_ORDER_NR_PAGES) {
Mikhail Zaslonko	24feb47	2019-02-01 14:20:38 -0800	[diff] [blame]	1509	/* Check if we got outside of the zone */
Mike Rapoport	673d40c	2021-09-07 19:54:55 -0700	[diff] [blame]	1510	if (zone && !zone_spans_pfn(zone, pfn))
David Hildenbrand	9291799	2020-02-03 17:34:26 -0800	[diff] [blame]	1511	return NULL;
Mike Rapoport	673d40c	2021-09-07 19:54:55 -0700	[diff] [blame]	1512	page = pfn_to_page(pfn);
Andrew Banman	5f0f288	2015-12-29 14:54:25 -0800	[diff] [blame]	1513	if (zone && page_zone(page) != zone)
David Hildenbrand	9291799	2020-02-03 17:34:26 -0800	[diff] [blame]	1514	return NULL;
Andrew Banman	5f0f288	2015-12-29 14:54:25 -0800	[diff] [blame]	1515	zone = page_zone(page);
				1516	}
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1517	}
Toshi Kani	deb88a2	2017-02-03 13:13:20 -0800	[diff] [blame]	1518
David Hildenbrand	9291799	2020-02-03 17:34:26 -0800	[diff] [blame]	1519	return zone;
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1520	}
				1521
				1522	/*
Yisheng Xie	0efadf4	2017-02-24 14:57:39 -0800	[diff] [blame]	1523	* Scan pfn range [start,end) to find movable/migratable pages (LRU pages,
David Hildenbrand	aa21879	2020-05-07 16:01:30 +0200	[diff] [blame]	1524	* non-lru movable pages and hugepages). Will skip over most unmovable
				1525	* pages (esp., pages that can be skipped when offlining), but bail out on
				1526	* definitely unmovable pages.
				1527	*
				1528	* Returns:
				1529	* 0 in case a movable page is found and movable_pfn was updated.
				1530	* -ENOENT in case no movable page was found.
				1531	* -EBUSY in case a definitely unmovable page was found.
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1532	*/
David Hildenbrand	aa21879	2020-05-07 16:01:30 +0200	[diff] [blame]	1533	static int scan_movable_pages(unsigned long start, unsigned long end,
				1534	unsigned long *movable_pfn)
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1535	{
				1536	unsigned long pfn;
Oscar Salvador	eeb0efd	2019-02-01 14:20:47 -0800	[diff] [blame]	1537
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1538	for (pfn = start; pfn < end; pfn++) {
Oscar Salvador	eeb0efd	2019-02-01 14:20:47 -0800	[diff] [blame]	1539	struct page page, head;
				1540	unsigned long skip;
				1541
				1542	if (!pfn_valid(pfn))
				1543	continue;
				1544	page = pfn_to_page(pfn);
				1545	if (PageLRU(page))
David Hildenbrand	aa21879	2020-05-07 16:01:30 +0200	[diff] [blame]	1546	goto found;
Oscar Salvador	eeb0efd	2019-02-01 14:20:47 -0800	[diff] [blame]	1547	if (__PageMovable(page))
David Hildenbrand	aa21879	2020-05-07 16:01:30 +0200	[diff] [blame]	1548	goto found;
				1549
				1550	/*
				1551	* PageOffline() pages that are not marked __PageMovable() and
				1552	* have a reference count > 0 (after MEM_GOING_OFFLINE) are
				1553	* definitely unmovable. If their reference count would be 0,
				1554	* they could at least be skipped when offlining memory.
				1555	*/
				1556	if (PageOffline(page) && page_count(page))
				1557	return -EBUSY;
Oscar Salvador	eeb0efd	2019-02-01 14:20:47 -0800	[diff] [blame]	1558
				1559	if (!PageHuge(page))
				1560	continue;
				1561	head = compound_head(page);
Mike Kravetz	8f251a3	2021-02-24 12:08:56 -0800	[diff] [blame]	1562	/*
				1563	* This test is racy as we hold no reference or lock. The
				1564	* hugetlb page could have been free'ed and head is no longer
				1565	* a hugetlb page before the following check. In such unlikely
				1566	* cases false positives and negatives are possible. Calling
				1567	* code must deal with these scenarios.
				1568	*/
				1569	if (HPageMigratable(head))
David Hildenbrand	aa21879	2020-05-07 16:01:30 +0200	[diff] [blame]	1570	goto found;
Matthew Wilcox (Oracle)	d8c6546	2019-09-23 15:34:30 -0700	[diff] [blame]	1571	skip = compound_nr(head) - (page - head);
Oscar Salvador	eeb0efd	2019-02-01 14:20:47 -0800	[diff] [blame]	1572	pfn += skip - 1;
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1573	}
David Hildenbrand	aa21879	2020-05-07 16:01:30 +0200	[diff] [blame]	1574	return -ENOENT;
				1575	found:
				1576	*movable_pfn = pfn;
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1577	return 0;
				1578	}
				1579
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1580	static int
				1581	do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
				1582	{
				1583	unsigned long pfn;
Matthew Wilcox (Oracle)	6c35784	2020-08-14 17:30:37 -0700	[diff] [blame]	1584	struct page page, head;
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1585	int ret = 0;
				1586	LIST_HEAD(source);
Liam Mark	786dee8	2021-06-30 18:52:43 -0700	[diff] [blame]	1587	static DEFINE_RATELIMIT_STATE(migrate_rs, DEFAULT_RATELIMIT_INTERVAL,
				1588	DEFAULT_RATELIMIT_BURST);
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1589
Michal Hocko	a85009c	2018-12-28 00:38:29 -0800	[diff] [blame]	1590	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1591	if (!pfn_valid(pfn))
				1592	continue;
				1593	page = pfn_to_page(pfn);
Matthew Wilcox (Oracle)	6c35784	2020-08-14 17:30:37 -0700	[diff] [blame]	1594	head = compound_head(page);
Naoya Horiguchi	c8721bb	2013-09-11 14:22:09 -0700	[diff] [blame]	1595
				1596	if (PageHuge(page)) {
Matthew Wilcox (Oracle)	d8c6546	2019-09-23 15:34:30 -0700	[diff] [blame]	1597	pfn = page_to_pfn(head) + compound_nr(head) - 1;
Oscar Salvador	daf3538	2019-03-05 15:48:53 -0800	[diff] [blame]	1598	isolate_huge_page(head, &source);
Naoya Horiguchi	c8721bb	2013-09-11 14:22:09 -0700	[diff] [blame]	1599	continue;
Michal Hocko	94723aa	2018-04-10 16:30:07 -0700	[diff] [blame]	1600	} else if (PageTransHuge(page))
Matthew Wilcox (Oracle)	6c35784	2020-08-14 17:30:37 -0700	[diff] [blame]	1601	pfn = page_to_pfn(head) + thp_nr_pages(page) - 1;
Naoya Horiguchi	c8721bb	2013-09-11 14:22:09 -0700	[diff] [blame]	1602
Michal Hocko	b15c872	2018-12-28 00:38:01 -0800	[diff] [blame]	1603	/*
				1604	* HWPoison pages have elevated reference counts so the migration would
				1605	* fail on them. It also doesn't make any sense to migrate them in the
				1606	* first place. Still try to unmap such a page in case it is still mapped
				1607	* (e.g. current hwpoison implementation doesn't unmap KSM pages but keep
				1608	* the unmap as the catch all safety net).
				1609	*/
				1610	if (PageHWPoison(page)) {
				1611	if (WARN_ON(PageLRU(page)))
				1612	isolate_lru_page(page);
				1613	if (page_mapped(page))
Shakeel Butt	013339d	2020-12-14 19:06:39 -0800	[diff] [blame]	1614	try_to_unmap(page, TTU_IGNORE_MLOCK);
Michal Hocko	b15c872	2018-12-28 00:38:01 -0800	[diff] [blame]	1615	continue;
				1616	}
				1617
Konstantin Khlebnikov	700c2a4	2011-05-24 17:12:19 -0700	[diff] [blame]	1618	if (!get_page_unless_zero(page))
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1619	continue;
				1620	/*
Yisheng Xie	0efadf4	2017-02-24 14:57:39 -0800	[diff] [blame]	1621	* We can skip free pages. And we can deal with pages on
				1622	* LRU and non-lru movable pages.
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1623	*/
Yisheng Xie	0efadf4	2017-02-24 14:57:39 -0800	[diff] [blame]	1624	if (PageLRU(page))
				1625	ret = isolate_lru_page(page);
				1626	else
				1627	ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1628	if (!ret) { /* Success */
Nick Piggin	62695a8	2008-10-18 20:26:09 -0700	[diff] [blame]	1629	list_add_tail(&page->lru, &source);
Yisheng Xie	0efadf4	2017-02-24 14:57:39 -0800	[diff] [blame]	1630	if (!__PageMovable(page))
				1631	inc_node_page_state(page, NR_ISOLATED_ANON +
Huang Ying	9de4f22	2020-04-06 20:04:41 -0700	[diff] [blame]	1632	page_is_file_lru(page));
KOSAKI Motohiro	6d9c285	2009-12-14 17:58:11 -0800	[diff] [blame]	1633
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1634	} else {
Liam Mark	786dee8	2021-06-30 18:52:43 -0700	[diff] [blame]	1635	if (__ratelimit(&migrate_rs)) {
				1636	pr_warn("failed to isolate pfn %lx\n", pfn);
				1637	dump_page(page, "isolation failed");
				1638	}
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1639	}
Oscar Salvador	1723058	2019-02-01 14:19:57 -0800	[diff] [blame]	1640	put_page(page);
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1641	}
Bob Liu	f3ab263	2010-10-26 14:22:10 -0700	[diff] [blame]	1642	if (!list_empty(&source)) {
Joonsoo Kim	203e6e5	2020-10-17 16:14:00 -0700	[diff] [blame]	1643	nodemask_t nmask = node_states[N_MEMORY];
				1644	struct migration_target_control mtc = {
				1645	.nmask = &nmask,
				1646	.gfp_mask = GFP_USER \| __GFP_MOVABLE \| __GFP_RETRY_MAYFAIL,
				1647	};
				1648
				1649	/*
				1650	* We have checked that migration range is on a single zone so
				1651	* we can use the nid of the first page to all the others.
				1652	*/
				1653	mtc.nid = page_to_nid(list_first_entry(&source, struct page, lru));
				1654
				1655	/*
				1656	* try to allocate from a different node but reuse this node
				1657	* if there are no other online nodes to be used (e.g. we are
				1658	* offlining a part of the only existing node)
				1659	*/
				1660	node_clear(mtc.nid, nmask);
				1661	if (nodes_empty(nmask))
				1662	node_set(mtc.nid, nmask);
				1663	ret = migrate_pages(&source, alloc_migration_target, NULL,
				1664	(unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
Michal Hocko	2932c8b	2018-12-28 00:33:53 -0800	[diff] [blame]	1665	if (ret) {
				1666	list_for_each_entry(page, &source, lru) {
Liam Mark	786dee8	2021-06-30 18:52:43 -0700	[diff] [blame]	1667	if (__ratelimit(&migrate_rs)) {
				1668	pr_warn("migrating pfn %lx failed ret:%d\n",
				1669	page_to_pfn(page), ret);
				1670	dump_page(page, "migration failure");
				1671	}
Michal Hocko	2932c8b	2018-12-28 00:33:53 -0800	[diff] [blame]	1672	}
Naoya Horiguchi	c8721bb	2013-09-11 14:22:09 -0700	[diff] [blame]	1673	putback_movable_pages(&source);
Michal Hocko	2932c8b	2018-12-28 00:33:53 -0800	[diff] [blame]	1674	}
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1675	}
Oscar Salvador	1723058	2019-02-01 14:19:57 -0800	[diff] [blame]	1676
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1677	return ret;
				1678	}
				1679
Tang Chen	c532092	2013-11-12 15:08:10 -0800	[diff] [blame]	1680	static int __init cmdline_parse_movable_node(char *p)
				1681	{
Tang Chen	55ac590	2014-01-21 15:49:35 -0800	[diff] [blame]	1682	movable_node_enabled = true;
Tang Chen	c532092	2013-11-12 15:08:10 -0800	[diff] [blame]	1683	return 0;
				1684	}
				1685	early_param("movable_node", cmdline_parse_movable_node);
				1686
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1687	/* check which state of node_states will be changed when offline memory */
				1688	static void node_states_check_changes_offline(unsigned long nr_pages,
				1689	struct zone zone, struct memory_notify arg)
				1690	{
				1691	struct pglist_data *pgdat = zone->zone_pgdat;
				1692	unsigned long present_pages = 0;
Oscar Salvador	86b27be	2018-10-26 15:07:38 -0700	[diff] [blame]	1693	enum zone_type zt;
				1694
Anshuman Khandual	98fa15f	2019-03-05 15:42:58 -0800	[diff] [blame]	1695	arg->status_change_nid = NUMA_NO_NODE;
				1696	arg->status_change_nid_normal = NUMA_NO_NODE;
				1697	arg->status_change_nid_high = NUMA_NO_NODE;
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1698
				1699	/*
Oscar Salvador	86b27be	2018-10-26 15:07:38 -0700	[diff] [blame]	1700	* Check whether node_states[N_NORMAL_MEMORY] will be changed.
				1701	* If the memory to be offline is within the range
				1702	* [0..ZONE_NORMAL], and it is the last present memory there,
				1703	* the zones in that range will become empty after the offlining,
				1704	* thus we can determine that we need to clear the node from
				1705	* node_states[N_NORMAL_MEMORY].
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1706	*/
Oscar Salvador	86b27be	2018-10-26 15:07:38 -0700	[diff] [blame]	1707	for (zt = 0; zt <= ZONE_NORMAL; zt++)
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1708	present_pages += pgdat->node_zones[zt].present_pages;
Oscar Salvador	86b27be	2018-10-26 15:07:38 -0700	[diff] [blame]	1709	if (zone_idx(zone) <= ZONE_NORMAL && nr_pages >= present_pages)
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1710	arg->status_change_nid_normal = zone_to_nid(zone);
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1711
Lai Jiangshan	6715ddf	2012-12-12 13:51:49 -0800	[diff] [blame]	1712	#ifdef CONFIG_HIGHMEM
				1713	/*
Oscar Salvador	86b27be	2018-10-26 15:07:38 -0700	[diff] [blame]	1714	* node_states[N_HIGH_MEMORY] contains nodes which
				1715	* have normal memory or high memory.
				1716	* Here we add the present_pages belonging to ZONE_HIGHMEM.
				1717	* If the zone is within the range of [0..ZONE_HIGHMEM), and
				1718	* we determine that the zones in that range become empty,
				1719	* we need to clear the node for N_HIGH_MEMORY.
Lai Jiangshan	6715ddf	2012-12-12 13:51:49 -0800	[diff] [blame]	1720	*/
Oscar Salvador	86b27be	2018-10-26 15:07:38 -0700	[diff] [blame]	1721	present_pages += pgdat->node_zones[ZONE_HIGHMEM].present_pages;
				1722	if (zone_idx(zone) <= ZONE_HIGHMEM && nr_pages >= present_pages)
Lai Jiangshan	6715ddf	2012-12-12 13:51:49 -0800	[diff] [blame]	1723	arg->status_change_nid_high = zone_to_nid(zone);
Lai Jiangshan	6715ddf	2012-12-12 13:51:49 -0800	[diff] [blame]	1724	#endif
				1725
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1726	/*
Oscar Salvador	86b27be	2018-10-26 15:07:38 -0700	[diff] [blame]	1727	* We have accounted the pages from [0..ZONE_NORMAL), and
				1728	* in case of CONFIG_HIGHMEM the pages from ZONE_HIGHMEM
				1729	* as well.
				1730	* Here we count the possible pages from ZONE_MOVABLE.
				1731	* If after having accounted all the pages, we see that the nr_pages
				1732	* to be offlined is over or equal to the accounted pages,
				1733	* we know that the node will become empty, and so, we can clear
				1734	* it for N_MEMORY as well.
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1735	*/
Oscar Salvador	86b27be	2018-10-26 15:07:38 -0700	[diff] [blame]	1736	present_pages += pgdat->node_zones[ZONE_MOVABLE].present_pages;
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1737
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1738	if (nr_pages >= present_pages)
				1739	arg->status_change_nid = zone_to_nid(zone);
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1740	}
				1741
				1742	static void node_states_clear_node(int node, struct memory_notify *arg)
				1743	{
				1744	if (arg->status_change_nid_normal >= 0)
				1745	node_clear_state(node, N_NORMAL_MEMORY);
				1746
Oscar Salvador	cf01f6f5	2018-10-26 15:07:28 -0700	[diff] [blame]	1747	if (arg->status_change_nid_high >= 0)
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1748	node_clear_state(node, N_HIGH_MEMORY);
Lai Jiangshan	6715ddf	2012-12-12 13:51:49 -0800	[diff] [blame]	1749
Oscar Salvador	cf01f6f5	2018-10-26 15:07:28 -0700	[diff] [blame]	1750	if (arg->status_change_nid >= 0)
Lai Jiangshan	6715ddf	2012-12-12 13:51:49 -0800	[diff] [blame]	1751	node_clear_state(node, N_MEMORY);
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1752	}
				1753
/*
 * Range-walk callback that accumulates page counts.
 *
 * @start_pfn: first pfn of the reported range (unused)
 * @nr_pages:  number of pages in the reported range
 * @data:      points to the unsigned long running total to add to
 *
 * Always returns 0.
 */
static int count_system_ram_pages_cb(unsigned long start_pfn,
				     unsigned long nr_pages, void *data)
{
	unsigned long *total = data;

	*total = *total + nr_pages;

	return 0;
}
				1762
David Hildenbrand	73a11c9	2020-10-15 20:07:46 -0700	[diff] [blame]	1763	int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1764	{
David Hildenbrand	73a11c9	2020-10-15 20:07:46 -0700	[diff] [blame]	1765	const unsigned long end_pfn = start_pfn + nr_pages;
David Hildenbrand	0a1a9a0	2020-10-15 20:07:54 -0700	[diff] [blame]	1766	unsigned long pfn, system_ram_pages = 0;
Cody P Schafer	d702909	2013-07-03 15:02:11 -0700	[diff] [blame]	1767	unsigned long flags;
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1768	struct zone *zone;
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	1769	struct memory_notify arg;
David Hildenbrand	ea15153	2020-10-15 20:08:03 -0700	[diff] [blame]	1770	int ret, node;
Michal Hocko	7960509	2018-12-28 00:33:49 -0800	[diff] [blame]	1771	char *reason;
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1772
Oscar Salvador	dd8e2f2	2021-05-04 18:39:36 -0700	[diff] [blame]	1773	/*
				1774	* {on,off}lining is constrained to full memory sections (or more
Zhen Lei	041711c	2021-06-30 18:53:17 -0700	[diff] [blame]	1775	* precisely to memory blocks from the user space POV).
Oscar Salvador	dd8e2f2	2021-05-04 18:39:36 -0700	[diff] [blame]	1776	* memmap_on_memory is an exception because it reserves initial part
				1777	* of the physical memory space for vmemmaps. That space is pageblock
				1778	* aligned.
				1779	*/
David Hildenbrand	4986fac	2020-10-15 20:07:50 -0700	[diff] [blame]	1780	if (WARN_ON_ONCE(!nr_pages \|\|
Oscar Salvador	dd8e2f2	2021-05-04 18:39:36 -0700	[diff] [blame]	1781	!IS_ALIGNED(start_pfn, pageblock_nr_pages) \|\|
				1782	!IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION)))
David Hildenbrand	4986fac	2020-10-15 20:07:50 -0700	[diff] [blame]	1783	return -EINVAL;
				1784
David Hildenbrand	381eab4	2018-10-30 15:10:29 -0700	[diff] [blame]	1785	mem_hotplug_begin();
				1786
David Hildenbrand	c5e79ef	2019-11-30 17:54:17 -0800	[diff] [blame]	1787	/*
				1788	* Don't allow to offline memory blocks that contain holes.
				1789	* Consequently, memory blocks with holes can never get onlined
				1790	* via the hotplug path - online_pages() - as hotplugged memory has
				1791	* no holes. This way, we e.g., don't have to worry about marking
				1792	* memory holes PG_reserved, don't need pfn_valid() checks, and can
				1793	* avoid using walk_system_ram_range() later.
				1794	*/
David Hildenbrand	73a11c9	2020-10-15 20:07:46 -0700	[diff] [blame]	1795	walk_system_ram_range(start_pfn, nr_pages, &system_ram_pages,
David Hildenbrand	c5e79ef	2019-11-30 17:54:17 -0800	[diff] [blame]	1796	count_system_ram_pages_cb);
David Hildenbrand	73a11c9	2020-10-15 20:07:46 -0700	[diff] [blame]	1797	if (system_ram_pages != nr_pages) {
David Hildenbrand	c5e79ef	2019-11-30 17:54:17 -0800	[diff] [blame]	1798	ret = -EINVAL;
				1799	reason = "memory holes";
				1800	goto failed_removal;
				1801	}
				1802
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1803	/* This makes hotplug much easier...and readable.
				1804	we assume this for now. .*/
David Hildenbrand	9291799	2020-02-03 17:34:26 -0800	[diff] [blame]	1805	zone = test_pages_in_a_zone(start_pfn, end_pfn);
				1806	if (!zone) {
Michal Hocko	7960509	2018-12-28 00:33:49 -0800	[diff] [blame]	1807	ret = -EINVAL;
				1808	reason = "multizone range";
				1809	goto failed_removal;
David Hildenbrand	381eab4	2018-10-30 15:10:29 -0700	[diff] [blame]	1810	}
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	1811	node = zone_to_nid(zone);
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	1812
Vlastimil Babka	ec6e8c7e	2020-12-14 19:10:59 -0800	[diff] [blame]	1813	/*
				1814	* Disable pcplists so that page isolation cannot race with freeing
				1815	* in a way that pages from isolated pageblock are left on pcplists.
				1816	*/
				1817	zone_pcp_disable(zone);
Minchan Kim	d479960e	2021-05-04 18:36:54 -0700	[diff] [blame]	1818	lru_cache_disable();
Vlastimil Babka	ec6e8c7e	2020-12-14 19:10:59 -0800	[diff] [blame]	1819
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1820	/* set above range as isolated */
Wen Congyang	b023f46	2012-12-11 16:00:45 -0800	[diff] [blame]	1821	ret = start_isolate_page_range(start_pfn, end_pfn,
Michal Hocko	d381c54	2018-12-28 00:33:56 -0800	[diff] [blame]	1822	MIGRATE_MOVABLE,
David Hildenbrand	756d25b	2019-11-30 17:54:07 -0800	[diff] [blame]	1823	MEMORY_OFFLINE \| REPORT_FAILURE);
David Hildenbrand	3fa0c7c	2020-10-15 20:08:07 -0700	[diff] [blame]	1824	if (ret) {
Michal Hocko	7960509	2018-12-28 00:33:49 -0800	[diff] [blame]	1825	reason = "failure to isolate range";
Vlastimil Babka	ec6e8c7e	2020-12-14 19:10:59 -0800	[diff] [blame]	1826	goto failed_removal_pcplists_disabled;
David Hildenbrand	381eab4	2018-10-30 15:10:29 -0700	[diff] [blame]	1827	}
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	1828
				1829	arg.start_pfn = start_pfn;
				1830	arg.nr_pages = nr_pages;
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1831	node_states_check_changes_offline(nr_pages, zone, &arg);
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	1832
				1833	ret = memory_notify(MEM_GOING_OFFLINE, &arg);
				1834	ret = notifier_to_errno(ret);
Michal Hocko	7960509	2018-12-28 00:33:49 -0800	[diff] [blame]	1835	if (ret) {
				1836	reason = "notifier failure";
				1837	goto failed_removal_isolated;
				1838	}
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	1839
Michal Hocko	bb8965b	2018-12-28 00:38:32 -0800	[diff] [blame]	1840	do {
David Hildenbrand	aa21879	2020-05-07 16:01:30 +0200	[diff] [blame]	1841	pfn = start_pfn;
				1842	do {
Michal Hocko	bb8965b	2018-12-28 00:38:32 -0800	[diff] [blame]	1843	if (signal_pending(current)) {
				1844	ret = -EINTR;
				1845	reason = "signal backoff";
				1846	goto failed_removal_isolated;
				1847	}
Michal Hocko	72b39cf	2017-11-15 17:33:34 -0800	[diff] [blame]	1848
Michal Hocko	bb8965b	2018-12-28 00:38:32 -0800	[diff] [blame]	1849	cond_resched();
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1850
David Hildenbrand	aa21879	2020-05-07 16:01:30 +0200	[diff] [blame]	1851	ret = scan_movable_pages(pfn, end_pfn, &pfn);
				1852	if (!ret) {
Michal Hocko	bb8965b	2018-12-28 00:38:32 -0800	[diff] [blame]	1853	/*
				1854	* TODO: fatal migration failures should bail
				1855	* out
				1856	*/
				1857	do_migrate_range(pfn, end_pfn);
				1858	}
David Hildenbrand	aa21879	2020-05-07 16:01:30 +0200	[diff] [blame]	1859	} while (!ret);
				1860
				1861	if (ret != -ENOENT) {
				1862	reason = "unmovable page";
				1863	goto failed_removal_isolated;
Michal Hocko	bb8965b	2018-12-28 00:38:32 -0800	[diff] [blame]	1864	}
Michal Hocko	72b39cf	2017-11-15 17:33:34 -0800	[diff] [blame]	1865
Michal Hocko	bb8965b	2018-12-28 00:38:32 -0800	[diff] [blame]	1866	/*
				1867	* Dissolve free hugepages in the memory block before doing
				1868	* offlining actually in order to make hugetlbfs's object
				1869	* counting consistent.
				1870	*/
				1871	ret = dissolve_free_huge_pages(start_pfn, end_pfn);
				1872	if (ret) {
				1873	reason = "failure to dissolve huge pages";
				1874	goto failed_removal_isolated;
				1875	}
David Hildenbrand	0a1a9a0	2020-10-15 20:07:54 -0700	[diff] [blame]	1876
David Hildenbrand	0a1a9a0	2020-10-15 20:07:54 -0700	[diff] [blame]	1877	ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE);
Vlastimil Babka	ec6e8c7e	2020-12-14 19:10:59 -0800	[diff] [blame]	1878
Michal Hocko	5557c76	2019-05-13 17:21:24 -0700	[diff] [blame]	1879	} while (ret);
Michal Hocko	bb8965b	2018-12-28 00:38:32 -0800	[diff] [blame]	1880
David Hildenbrand	0a1a9a0	2020-10-15 20:07:54 -0700	[diff] [blame]	1881	/* Mark all sections offline and remove free pages from the buddy. */
				1882	__offline_isolated_pages(start_pfn, end_pfn);
Laurent Dufour	7c33023	2020-12-15 20:42:26 -0800	[diff] [blame]	1883	pr_debug("Offlined Pages %ld\n", nr_pages);
David Hildenbrand	0a1a9a0	2020-10-15 20:07:54 -0700	[diff] [blame]	1884
Qian Cai	9b7ea46	2019-03-28 20:43:34 -0700	[diff] [blame]	1885	/*
David Hildenbrand	b30c592	2020-10-15 20:08:23 -0700	[diff] [blame]	1886	* The memory sections are marked offline, and the pageblock flags
				1887	* effectively stale; nobody should be touching them. Fixup the number
				1888	* of isolated pageblocks, memory onlining will properly revert this.
Qian Cai	9b7ea46	2019-03-28 20:43:34 -0700	[diff] [blame]	1889	*/
				1890	spin_lock_irqsave(&zone->lock, flags);
David Hildenbrand	ea15153	2020-10-15 20:08:03 -0700	[diff] [blame]	1891	zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages;
Qian Cai	9b7ea46	2019-03-28 20:43:34 -0700	[diff] [blame]	1892	spin_unlock_irqrestore(&zone->lock, flags);
				1893
Minchan Kim	d479960e	2021-05-04 18:36:54 -0700	[diff] [blame]	1894	lru_cache_enable();
Vlastimil Babka	ec6e8c7e	2020-12-14 19:10:59 -0800	[diff] [blame]	1895	zone_pcp_enable(zone);
				1896
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1897	/* removal success */
David Hildenbrand	0a1a9a0	2020-10-15 20:07:54 -0700	[diff] [blame]	1898	adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);
David Hildenbrand	4b09700	2021-09-07 19:55:19 -0700	[diff] [blame]	1899	adjust_present_page_count(pfn_to_page(start_pfn), -nr_pages);
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	1900
Mel Gorman	b92ca18	2021-06-28 19:42:12 -0700	[diff] [blame]	1901	/* reinitialise watermarks and update pcp limits */
KOSAKI Motohiro	1b79acc	2011-05-24 17:11:32 -0700	[diff] [blame]	1902	init_per_zone_wmark_min();
				1903
Xishi Qiu	1e8537b	2012-10-08 16:31:51 -0700	[diff] [blame]	1904	if (!populated_zone(zone)) {
Jiang Liu	340175b	2012-07-31 16:43:32 -0700	[diff] [blame]	1905	zone_pcp_reset(zone);
Michal Hocko	72675e1	2017-09-06 16:20:24 -0700	[diff] [blame]	1906	build_all_zonelists(NULL);
Mel Gorman	b92ca18	2021-06-28 19:42:12 -0700	[diff] [blame]	1907	}
Jiang Liu	340175b	2012-07-31 16:43:32 -0700	[diff] [blame]	1908
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1909	node_states_clear_node(node, &arg);
Vlastimil Babka	698b1b3	2016-03-17 14:18:08 -0700	[diff] [blame]	1910	if (arg.status_change_nid >= 0) {
David Rientjes	8fe23e0	2009-12-14 17:58:33 -0800	[diff] [blame]	1911	kswapd_stop(node);
Vlastimil Babka	698b1b3	2016-03-17 14:18:08 -0700	[diff] [blame]	1912	kcompactd_stop(node);
				1913	}
Minchan Kim	bce7394	2009-06-16 15:32:50 -0700	[diff] [blame]	1914
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1915	writeback_set_ratelimit();
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	1916
				1917	memory_notify(MEM_OFFLINE, &arg);
David Hildenbrand	feee6b2	2020-01-04 12:59:33 -0800	[diff] [blame]	1918	remove_pfn_range_from_zone(zone, start_pfn, nr_pages);
David Hildenbrand	381eab4	2018-10-30 15:10:29 -0700	[diff] [blame]	1919	mem_hotplug_done();
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1920	return 0;
				1921
Michal Hocko	7960509	2018-12-28 00:33:49 -0800	[diff] [blame]	1922	failed_removal_isolated:
				1923	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
Qian Cai	c4efe48	2019-03-28 20:44:16 -0700	[diff] [blame]	1924	memory_notify(MEM_CANCEL_OFFLINE, &arg);
Vlastimil Babka	ec6e8c7e	2020-12-14 19:10:59 -0800	[diff] [blame]	1925	failed_removal_pcplists_disabled:
Miaohe Lin	946746d1	2021-08-25 12:17:55 -0700	[diff] [blame]	1926	lru_cache_enable();
Vlastimil Babka	ec6e8c7e	2020-12-14 19:10:59 -0800	[diff] [blame]	1927	zone_pcp_enable(zone);
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1928	failed_removal:
Michal Hocko	7960509	2018-12-28 00:33:49 -0800	[diff] [blame]	1929	pr_debug("memory offlining [mem %#010llx-%#010llx] failed due to %s\n",
Chen Yucong	e33e33b	2016-03-17 14:19:35 -0700	[diff] [blame]	1930	(unsigned long long) start_pfn << PAGE_SHIFT,
Michal Hocko	7960509	2018-12-28 00:33:49 -0800	[diff] [blame]	1931	((unsigned long long) end_pfn << PAGE_SHIFT) - 1,
				1932	reason);
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1933	/* pushback to free area */
David Hildenbrand	381eab4	2018-10-30 15:10:29 -0700	[diff] [blame]	1934	mem_hotplug_done();
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1935	return ret;
				1936	}
Badari Pulavarty	7108878	2008-10-18 20:25:58 -0700	[diff] [blame]	1937
Xishi Qiu	d6de9d5	2013-11-12 15:07:20 -0800	[diff] [blame]	1938	static int check_memblock_offlined_cb(struct memory_block mem, void arg)
Wen Congyang	bbc76be	2013-02-22 16:32:54 -0800	[diff] [blame]	1939	{
				1940	int ret = !is_memblock_offlined(mem);
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	1941	int *nid = arg;
Wen Congyang	bbc76be	2013-02-22 16:32:54 -0800	[diff] [blame]	1942
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	1943	*nid = mem->nid;
Randy Dunlap	349daa0	2013-04-29 15:08:49 -0700	[diff] [blame]	1944	if (unlikely(ret)) {
				1945	phys_addr_t beginpa, endpa;
				1946
				1947	beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
David Hildenbrand	b6c88d3	2019-09-23 15:35:49 -0700	[diff] [blame]	1948	endpa = beginpa + memory_block_size_bytes() - 1;
Joe Perches	756a025	2016-03-17 14:19:47 -0700	[diff] [blame]	1949	pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n",
Randy Dunlap	349daa0	2013-04-29 15:08:49 -0700	[diff] [blame]	1950	&beginpa, &endpa);
Wen Congyang	bbc76be	2013-02-22 16:32:54 -0800	[diff] [blame]	1951
Pavel Tatashin	eca499a	2019-07-16 16:30:31 -0700	[diff] [blame]	1952	return -EBUSY;
				1953	}
				1954	return 0;
Wen Congyang	bbc76be	2013-02-22 16:32:54 -0800	[diff] [blame]	1955	}
				1956
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	1957	static int get_nr_vmemmap_pages_cb(struct memory_block mem, void arg)
				1958	{
				1959	/*
				1960	* If not set, continue with the next block.
				1961	*/
				1962	return mem->nr_vmemmap_pages;
				1963	}
				1964
Toshi Kani	0f1cfe9	2013-09-11 14:21:50 -0700	[diff] [blame]	1965	static int check_cpu_on_node(pg_data_t *pgdat)
Tang Chen	60a5a19	2013-02-22 16:33:14 -0800	[diff] [blame]	1966	{
Tang Chen	60a5a19	2013-02-22 16:33:14 -0800	[diff] [blame]	1967	int cpu;
				1968
				1969	for_each_present_cpu(cpu) {
				1970	if (cpu_to_node(cpu) == pgdat->node_id)
				1971	/*
				1972	* the cpu on this node isn't removed, and we can't
				1973	* offline this node.
				1974	*/
				1975	return -EBUSY;
				1976	}
				1977
				1978	return 0;
				1979	}
				1980
David Hildenbrand	2c91f8f	2019-11-15 17:34:57 -0800	[diff] [blame]	1981	static int check_no_memblock_for_node_cb(struct memory_block mem, void arg)
				1982	{
				1983	int nid = (int )arg;
				1984
				1985	/*
				1986	* If a memory block belongs to multiple nodes, the stored nid is not
				1987	* reliable. However, such blocks are always online (e.g., cannot get
				1988	* offlined) and, therefore, are still spanned by the node.
				1989	*/
				1990	return mem->nid == nid ? -EEXIST : 0;
				1991	}
				1992
Toshi Kani	0f1cfe9	2013-09-11 14:21:50 -0700	[diff] [blame]	1993	/**
				1994	* try_offline_node
Mike Rapoport	e8b098f	2018-04-05 16:24:57 -0700	[diff] [blame]	1995	* @nid: the node ID
Toshi Kani	0f1cfe9	2013-09-11 14:21:50 -0700	[diff] [blame]	1996	*
				1997	* Offline a node if all memory sections and cpus of the node are removed.
				1998	*
				1999	* NOTE: The caller must call lock_device_hotplug() to serialize hotplug
				2000	* and online/offline operations before this call.
				2001	*/
Wen Congyang	90b30cd	2013-02-22 16:33:27 -0800	[diff] [blame]	2002	void try_offline_node(int nid)
Tang Chen	60a5a19	2013-02-22 16:33:14 -0800	[diff] [blame]	2003	{
Wen Congyang	d822b86	2013-02-22 16:33:16 -0800	[diff] [blame]	2004	pg_data_t *pgdat = NODE_DATA(nid);
David Hildenbrand	2c91f8f	2019-11-15 17:34:57 -0800	[diff] [blame]	2005	int rc;
Tang Chen	60a5a19	2013-02-22 16:33:14 -0800	[diff] [blame]	2006
David Hildenbrand	2c91f8f	2019-11-15 17:34:57 -0800	[diff] [blame]	2007	/*
				2008	* If the node still spans pages (especially ZONE_DEVICE), don't
				2009	* offline it. A node spans memory after move_pfn_range_to_zone(),
				2010	* e.g., after the memory block was onlined.
				2011	*/
				2012	if (pgdat->node_spanned_pages)
Tang Chen	60a5a19	2013-02-22 16:33:14 -0800	[diff] [blame]	2013	return;
David Hildenbrand	2c91f8f	2019-11-15 17:34:57 -0800	[diff] [blame]	2014
				2015	/*
				2016	* Especially offline memory blocks might not be spanned by the
				2017	* node. They will get spanned by the node once they get onlined.
				2018	* However, they link to the node in sysfs and can get onlined later.
				2019	*/
				2020	rc = for_each_memory_block(&nid, check_no_memblock_for_node_cb);
				2021	if (rc)
				2022	return;
Tang Chen	60a5a19	2013-02-22 16:33:14 -0800	[diff] [blame]	2023
Michal Hocko	46a3679	2018-12-28 00:34:13 -0800	[diff] [blame]	2024	if (check_cpu_on_node(pgdat))
Tang Chen	60a5a19	2013-02-22 16:33:14 -0800	[diff] [blame]	2025	return;
				2026
				2027	/*
				2028	* all memory/cpu of this node are removed, we can offline this
				2029	* node now.
				2030	*/
				2031	node_set_offline(nid);
				2032	unregister_one_node(nid);
				2033	}
Wen Congyang	90b30cd	2013-02-22 16:33:27 -0800	[diff] [blame]	2034	EXPORT_SYMBOL(try_offline_node);
Tang Chen	60a5a19	2013-02-22 16:33:14 -0800	[diff] [blame]	2035
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2036	static int __ref try_remove_memory(u64 start, u64 size)
Wen Congyang	bbc76be	2013-02-22 16:32:54 -0800	[diff] [blame]	2037	{
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	2038	struct vmem_altmap mhp_altmap = {};
				2039	struct vmem_altmap *altmap = NULL;
				2040	unsigned long nr_vmemmap_pages;
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2041	int rc = 0, nid = NUMA_NO_NODE;
Wen Congyang	993c1aa	2013-02-22 16:32:50 -0800	[diff] [blame]	2042
Toshi Kani	27356f5	2013-09-11 14:21:49 -0700	[diff] [blame]	2043	BUG_ON(check_hotplug_memory_range(start, size));
				2044
Yasuaki Ishimatsu	6677e3e	2013-02-22 16:32:52 -0800	[diff] [blame]	2045	/*
Rafael J. Wysocki	242831e	2013-05-27 12:58:46 +0200	[diff] [blame]	2046	* All memory blocks must be offlined before removing memory. Check
Pavel Tatashin	eca499a	2019-07-16 16:30:31 -0700	[diff] [blame]	2047	* whether all memory blocks in question are offline and return error
Rafael J. Wysocki	242831e	2013-05-27 12:58:46 +0200	[diff] [blame]	2048	* if this is not the case.
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2049	*
				2050	* While at it, determine the nid. Note that if we'd have mixed nodes,
				2051	* we'd only try to offline the last determined one -- which is good
				2052	* enough for the cases we care about.
Yasuaki Ishimatsu	6677e3e	2013-02-22 16:32:52 -0800	[diff] [blame]	2053	*/
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2054	rc = walk_memory_blocks(start, size, &nid, check_memblock_offlined_cb);
Pavel Tatashin	eca499a	2019-07-16 16:30:31 -0700	[diff] [blame]	2055	if (rc)
Jia He	b4223a5	2020-08-11 18:32:20 -0700	[diff] [blame]	2056	return rc;
Yasuaki Ishimatsu	6677e3e	2013-02-22 16:32:52 -0800	[diff] [blame]	2057
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	2058	/*
				2059	* We only support removing memory added with MHP_MEMMAP_ON_MEMORY in
				2060	* the same granularity it was added - a single memory block.
				2061	*/
				2062	if (memmap_on_memory) {
				2063	nr_vmemmap_pages = walk_memory_blocks(start, size, NULL,
				2064	get_nr_vmemmap_pages_cb);
				2065	if (nr_vmemmap_pages) {
				2066	if (size != memory_block_size_bytes()) {
				2067	pr_warn("Refuse to remove %#llx - %#llx,"
				2068	"wrong granularity\n",
				2069	start, start + size);
				2070	return -EINVAL;
				2071	}
				2072
				2073	/*
				2074	* Let remove_pmd_table->free_hugepage_table do the
				2075	* right thing if we used vmem_altmap when hot-adding
				2076	* the range.
				2077	*/
				2078	mhp_altmap.alloc = nr_vmemmap_pages;
				2079	altmap = &mhp_altmap;
				2080	}
				2081	}
				2082
Yasuaki Ishimatsu	46c66c4	2013-02-22 16:32:56 -0800	[diff] [blame]	2083	/* remove memmap entry */
				2084	firmware_map_remove(start, start + size, "System RAM");
				2085
Dan Williams	f1037ec	2020-01-30 22:11:17 -0800	[diff] [blame]	2086	/*
				2087	* Memory block device removal under the device_hotplug_lock is
				2088	* a barrier against racing online attempts.
				2089	*/
David Hildenbrand	4c4b7f9	2019-07-18 15:57:06 -0700	[diff] [blame]	2090	remove_memory_block_devices(start, size);
				2091
Dan Williams	f1037ec	2020-01-30 22:11:17 -0800	[diff] [blame]	2092	mem_hotplug_begin();
				2093
David Hildenbrand	65a2aa5	2021-09-07 19:55:04 -0700	[diff] [blame]	2094	arch_remove_memory(start, size, altmap);
David Hildenbrand	52219ae	2020-06-04 16:48:38 -0700	[diff] [blame]	2095
				2096	if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
				2097	memblock_free(start, size);
				2098	memblock_remove(start, size);
				2099	}
				2100
David Hildenbrand	cb8e3c8	2020-10-15 20:09:12 -0700	[diff] [blame]	2101	release_mem_region_adjustable(start, size);
Wen Congyang	24d335c	2013-02-22 16:32:58 -0800	[diff] [blame]	2102
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2103	if (nid != NUMA_NO_NODE)
				2104	try_offline_node(nid);
Tang Chen	60a5a19	2013-02-22 16:33:14 -0800	[diff] [blame]	2105
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	2106	mem_hotplug_done();
Jia He	b4223a5	2020-08-11 18:32:20 -0700	[diff] [blame]	2107	return 0;
Badari Pulavarty	7108878	2008-10-18 20:25:58 -0700	[diff] [blame]	2108	}
David Hildenbrand	d15e592	2018-10-30 15:10:18 -0700	[diff] [blame]	2109
Pavel Tatashin	eca499a	2019-07-16 16:30:31 -0700	[diff] [blame]	2110	/**
Mel Gorman	5640c9c	2021-06-30 18:53:38 -0700	[diff] [blame]	2111	* __remove_memory - Remove memory if every memory block is offline
Pavel Tatashin	eca499a	2019-07-16 16:30:31 -0700	[diff] [blame]	2112	* @start: physical address of the region to remove
				2113	* @size: size of the region to remove
				2114	*
				2115	* NOTE: The caller must call lock_device_hotplug() to serialize hotplug
				2116	* and online/offline operations before this call, as required by
				2117	* try_offline_node().
				2118	*/
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2119	void __remove_memory(u64 start, u64 size)
David Hildenbrand	d15e592	2018-10-30 15:10:18 -0700	[diff] [blame]	2120	{
Pavel Tatashin	eca499a	2019-07-16 16:30:31 -0700	[diff] [blame]	2121
				2122	/*
Souptick Joarder	29a90db	2019-09-23 15:36:18 -0700	[diff] [blame]	2123	* trigger BUG() if some memory is not offlined prior to calling this
Pavel Tatashin	eca499a	2019-07-16 16:30:31 -0700	[diff] [blame]	2124	* function
				2125	*/
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2126	if (try_remove_memory(start, size))
Pavel Tatashin	eca499a	2019-07-16 16:30:31 -0700	[diff] [blame]	2127	BUG();
				2128	}
				2129
				2130	/*
				2131	* Remove memory if every memory block is offline, otherwise return -EBUSY is
				2132	* some memory is not offline
				2133	*/
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2134	int remove_memory(u64 start, u64 size)
Pavel Tatashin	eca499a	2019-07-16 16:30:31 -0700	[diff] [blame]	2135	{
				2136	int rc;
				2137
David Hildenbrand	d15e592	2018-10-30 15:10:18 -0700	[diff] [blame]	2138	lock_device_hotplug();
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2139	rc = try_remove_memory(start, size);
David Hildenbrand	d15e592	2018-10-30 15:10:18 -0700	[diff] [blame]	2140	unlock_device_hotplug();
Pavel Tatashin	eca499a	2019-07-16 16:30:31 -0700	[diff] [blame]	2141
				2142	return rc;
David Hildenbrand	d15e592	2018-10-30 15:10:18 -0700	[diff] [blame]	2143	}
Badari Pulavarty	7108878	2008-10-18 20:25:58 -0700	[diff] [blame]	2144	EXPORT_SYMBOL_GPL(remove_memory);
David Hildenbrand	08b3acd	2020-05-07 16:01:32 +0200	[diff] [blame]	2145
David Hildenbrand	8dc4bb5	2020-11-12 14:38:13 +0100	[diff] [blame]	2146	static int try_offline_memory_block(struct memory_block mem, void arg)
				2147	{
				2148	uint8_t online_type = MMOP_ONLINE_KERNEL;
				2149	uint8_t **online_types = arg;
				2150	struct page *page;
				2151	int rc;
				2152
				2153	/*
				2154	* Sense the online_type via the zone of the memory block. Offlining
				2155	* with multiple zones within one memory block will be rejected
				2156	* by offlining code ... so we don't care about that.
				2157	*/
				2158	page = pfn_to_online_page(section_nr_to_pfn(mem->start_section_nr));
				2159	if (page && zone_idx(page_zone(page)) == ZONE_MOVABLE)
				2160	online_type = MMOP_ONLINE_MOVABLE;
				2161
				2162	rc = device_offline(&mem->dev);
				2163	/*
				2164	* Default is MMOP_OFFLINE - change it only if offlining succeeded,
				2165	* so try_reonline_memory_block() can do the right thing.
				2166	*/
				2167	if (!rc)
				2168	**online_types = online_type;
				2169
				2170	(*online_types)++;
				2171	/* Ignore if already offline. */
				2172	return rc < 0 ? rc : 0;
				2173	}
				2174
				2175	static int try_reonline_memory_block(struct memory_block mem, void arg)
				2176	{
				2177	uint8_t **online_types = arg;
				2178	int rc;
				2179
				2180	if (**online_types != MMOP_OFFLINE) {
				2181	mem->online_type = **online_types;
				2182	rc = device_online(&mem->dev);
				2183	if (rc < 0)
				2184	pr_warn("%s: Failed to re-online memory: %d",
				2185	__func__, rc);
				2186	}
				2187
				2188	/* Continue processing all remaining memory blocks. */
				2189	(*online_types)++;
				2190	return 0;
				2191	}
				2192
David Hildenbrand	08b3acd	2020-05-07 16:01:32 +0200	[diff] [blame]	2193	/*
David Hildenbrand	8dc4bb5	2020-11-12 14:38:13 +0100	[diff] [blame]	2194	* Try to offline and remove memory. Might take a long time to finish in case
				2195	* memory is still in use. Primarily useful for memory devices that logically
				2196	* unplugged all memory (so it's no longer in use) and want to offline + remove
				2197	* that memory.
David Hildenbrand	08b3acd	2020-05-07 16:01:32 +0200	[diff] [blame]	2198	*/
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2199	int offline_and_remove_memory(u64 start, u64 size)
David Hildenbrand	08b3acd	2020-05-07 16:01:32 +0200	[diff] [blame]	2200	{
David Hildenbrand	8dc4bb5	2020-11-12 14:38:13 +0100	[diff] [blame]	2201	const unsigned long mb_count = size / memory_block_size_bytes();
				2202	uint8_t online_types, tmp;
				2203	int rc;
David Hildenbrand	08b3acd	2020-05-07 16:01:32 +0200	[diff] [blame]	2204
				2205	if (!IS_ALIGNED(start, memory_block_size_bytes()) \|\|
David Hildenbrand	8dc4bb5	2020-11-12 14:38:13 +0100	[diff] [blame]	2206	!IS_ALIGNED(size, memory_block_size_bytes()) \|\| !size)
				2207	return -EINVAL;
David Hildenbrand	08b3acd	2020-05-07 16:01:32 +0200	[diff] [blame]	2208
				2209	/*
David Hildenbrand	8dc4bb5	2020-11-12 14:38:13 +0100	[diff] [blame]	2210	* We'll remember the old online type of each memory block, so we can
				2211	* try to revert whatever we did when offlining one memory block fails
				2212	* after offlining some others succeeded.
				2213	*/
				2214	online_types = kmalloc_array(mb_count, sizeof(*online_types),
				2215	GFP_KERNEL);
				2216	if (!online_types)
				2217	return -ENOMEM;
				2218	/*
				2219	* Initialize all states to MMOP_OFFLINE, so when we abort processing in
				2220	* try_offline_memory_block(), we'll skip all unprocessed blocks in
				2221	* try_reonline_memory_block().
				2222	*/
				2223	memset(online_types, MMOP_OFFLINE, mb_count);
				2224
				2225	lock_device_hotplug();
				2226
				2227	tmp = online_types;
				2228	rc = walk_memory_blocks(start, size, &tmp, try_offline_memory_block);
				2229
				2230	/*
				2231	* In case we succeeded to offline all memory, remove it.
David Hildenbrand	08b3acd	2020-05-07 16:01:32 +0200	[diff] [blame]	2232	* This cannot fail as it cannot get onlined in the meantime.
				2233	*/
				2234	if (!rc) {
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2235	rc = try_remove_memory(start, size);
David Hildenbrand	8dc4bb5	2020-11-12 14:38:13 +0100	[diff] [blame]	2236	if (rc)
				2237	pr_err("%s: Failed to remove memory: %d", __func__, rc);
				2238	}
				2239
				2240	/*
				2241	* Rollback what we did. While memory onlining might theoretically fail
				2242	* (nacked by a notifier), it barely ever happens.
				2243	*/
				2244	if (rc) {
				2245	tmp = online_types;
				2246	walk_memory_blocks(start, size, &tmp,
				2247	try_reonline_memory_block);
David Hildenbrand	08b3acd	2020-05-07 16:01:32 +0200	[diff] [blame]	2248	}
				2249	unlock_device_hotplug();
				2250
David Hildenbrand	8dc4bb5	2020-11-12 14:38:13 +0100	[diff] [blame]	2251	kfree(online_types);
David Hildenbrand	08b3acd	2020-05-07 16:01:32 +0200	[diff] [blame]	2252	return rc;
				2253	}
				2254	EXPORT_SYMBOL_GPL(offline_and_remove_memory);
Rafael J. Wysocki	aba6efc	2013-06-01 22:24:07 +0200	[diff] [blame]	2255	#endif /* CONFIG_MEMORY_HOTREMOVE */