Blame - mm/memory_hotplug.c - yocto/kernel/common

blob: b80fb8164fb81796500a4efda1e2da6b3451dfb4 [file] [log] [blame]

Thomas Gleixner	457c899	2019-05-19 13:08:55 +0100	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0-only
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	2	/*
				3	* linux/mm/memory_hotplug.c
				4	*
				5	* Copyright (C)
				6	*/
				7
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	8	#include <linux/stddef.h>
				9	#include <linux/mm.h>
Ingo Molnar	174cd4b	2017-02-02 19:15:33 +0100	[diff] [blame]	10	#include <linux/sched/signal.h>
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	11	#include <linux/swap.h>
				12	#include <linux/interrupt.h>
				13	#include <linux/pagemap.h>
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	14	#include <linux/compiler.h>
Paul Gortmaker	b95f1b31	2011-10-16 02:01:52 -0400	[diff] [blame]	15	#include <linux/export.h>
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	16	#include <linux/pagevec.h>
Chandra Seetharaman	2d1d43f	2006-09-29 02:01:25 -0700	[diff] [blame]	17	#include <linux/writeback.h>
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	18	#include <linux/slab.h>
				19	#include <linux/sysctl.h>
				20	#include <linux/cpu.h>
				21	#include <linux/memory.h>
Dan Williams	4b94ffd	2016-01-15 16:56:22 -0800	[diff] [blame]	22	#include <linux/memremap.h>
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	23	#include <linux/memory_hotplug.h>
				24	#include <linux/highmem.h>
				25	#include <linux/vmalloc.h>
KAMEZAWA Hiroyuki	0a54703	2006-06-27 02:53:35 -0700	[diff] [blame]	26	#include <linux/ioport.h>
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	27	#include <linux/delay.h>
				28	#include <linux/migrate.h>
				29	#include <linux/page-isolation.h>
Badari Pulavarty	7108878	2008-10-18 20:25:58 -0700	[diff] [blame]	30	#include <linux/pfn.h>
Andi Kleen	6ad696d	2009-11-17 14:06:22 -0800	[diff] [blame]	31	#include <linux/suspend.h>
KOSAKI Motohiro	6d9c285	2009-12-14 17:58:11 -0800	[diff] [blame]	32	#include <linux/mm_inline.h>
akpm@linux-foundation.org	d96ae53	2010-03-05 13:41:58 -0800	[diff] [blame]	33	#include <linux/firmware-map.h>
Tang Chen	60a5a19	2013-02-22 16:33:14 -0800	[diff] [blame]	34	#include <linux/stop_machine.h>
Naoya Horiguchi	c8721bb	2013-09-11 14:22:09 -0700	[diff] [blame]	35	#include <linux/hugetlb.h>
Tang Chen	c532092	2013-11-12 15:08:10 -0800	[diff] [blame]	36	#include <linux/memblock.h>
Vlastimil Babka	698b1b3	2016-03-17 14:18:08 -0700	[diff] [blame]	37	#include <linux/compaction.h>
Michal Hocko	b15c872	2018-12-28 00:38:01 -0800	[diff] [blame]	38	#include <linux/rmap.h>
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	39
				40	#include <asm/tlbflush.h>
				41
Adrian Bunk	1e5ad9a	2008-04-28 20:40:08 +0300	[diff] [blame]	42	#include "internal.h"
Dan Williams	e900a91	2019-05-14 15:41:28 -0700	[diff] [blame]	43	#include "shuffle.h"
Adrian Bunk	1e5ad9a	2008-04-28 20:40:08 +0300	[diff] [blame]	44
Oscar Salvador	e3a9d9f	2021-05-04 18:39:48 -0700	[diff] [blame]	45
				46	/*
				47	* memory_hotplug.memmap_on_memory parameter
				48	*/
				49	static bool memmap_on_memory __ro_after_init;
				50	#ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
				51	module_param(memmap_on_memory, bool, 0444);
				52	MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug");
				53	#endif
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	54
/* Online policies; the numeric value doubles as index into the name table. */
enum {
	ONLINE_POLICY_CONTIG_ZONES = 0,
	ONLINE_POLICY_AUTO_MOVABLE,
};

/* Policy names, indexed by the ONLINE_POLICY_* constants above. */
const char *online_policy_to_str[] = {
	[ONLINE_POLICY_CONTIG_ZONES] = "contig-zones",
	[ONLINE_POLICY_AUTO_MOVABLE] = "auto-movable",
};
				64
				65	static int set_online_policy(const char val, const struct kernel_param kp)
				66	{
				67	int ret = sysfs_match_string(online_policy_to_str, val);
				68
				69	if (ret < 0)
				70	return ret;
				71	((int )kp->arg) = ret;
				72	return 0;
				73	}
				74
				75	static int get_online_policy(char buffer, const struct kernel_param kp)
				76	{
				77	return sprintf(buffer, "%s\n", online_policy_to_str[((int )kp->arg)]);
				78	}
				79
				80	/*
				81	* memory_hotplug.online_policy: configure online behavior when onlining without
				82	* specifying a zone (MMOP_ONLINE)
				83	*
				84	* "contig-zones": keep zone contiguous
				85	* "auto-movable": online memory to ZONE_MOVABLE if the configuration
				86	* (auto_movable_ratio, auto_movable_numa_aware) allows for it
				87	*/
				88	static int online_policy __read_mostly = ONLINE_POLICY_CONTIG_ZONES;
				89	static const struct kernel_param_ops online_policy_ops = {
				90	.set = set_online_policy,
				91	.get = get_online_policy,
				92	};
				93	module_param_cb(online_policy, &online_policy_ops, &online_policy, 0644);
				94	MODULE_PARM_DESC(online_policy,
				95	"Set the online policy (\"contig-zones\", \"auto-movable\") "
				96	"Default: \"contig-zones\"");
				97
				98	/*
				99	* memory_hotplug.auto_movable_ratio: specify maximum MOVABLE:KERNEL ratio
				100	*
				101	* The ratio represent an upper limit and the kernel might decide to not
				102	* online some memory to ZONE_MOVABLE -- e.g., because hotplugged KERNEL memory
				103	* doesn't allow for more MOVABLE memory.
				104	*/
				105	static unsigned int auto_movable_ratio __read_mostly = 301;
				106	module_param(auto_movable_ratio, uint, 0644);
				107	MODULE_PARM_DESC(auto_movable_ratio,
				108	"Set the maximum ratio of MOVABLE:KERNEL memory in the system "
				109	"in percent for \"auto-movable\" online policy. Default: 301");
				110
				111	/*
				112	* memory_hotplug.auto_movable_numa_aware: consider numa node stats
				113	*/
				114	#ifdef CONFIG_NUMA
				115	static bool auto_movable_numa_aware __read_mostly = true;
				116	module_param(auto_movable_numa_aware, bool, 0644);
				117	MODULE_PARM_DESC(auto_movable_numa_aware,
				118	"Consider numa node stats in addition to global stats in "
				119	"\"auto-movable\" online policy. Default: true");
				120	#endif /* CONFIG_NUMA */
				121
Daniel Kiper	9d0ad8c	2011-07-25 17:12:05 -0700	[diff] [blame]	122	/*
				123	* online_page_callback contains pointer to current page onlining function.
				124	* Initially it is generic_online_page(). If it is required it could be
				125	* changed by calling set_online_page_callback() for callback registration
				126	* and restore_online_page_callback() for generic callback restore.
				127	*/
				128
Daniel Kiper	9d0ad8c	2011-07-25 17:12:05 -0700	[diff] [blame]	129	static online_page_callback_t online_page_callback = generic_online_page;
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	130	static DEFINE_MUTEX(online_page_callback_lock);
Daniel Kiper	9d0ad8c	2011-07-25 17:12:05 -0700	[diff] [blame]	131
Thomas Gleixner	3f906ba	2017-07-10 15:50:09 -0700	[diff] [blame]	132	DEFINE_STATIC_PERCPU_RWSEM(mem_hotplug_lock);
KOSAKI Motohiro	20d6c96	2010-12-02 14:31:19 -0800	[diff] [blame]	133
Thomas Gleixner	3f906ba	2017-07-10 15:50:09 -0700	[diff] [blame]	134	void get_online_mems(void)
				135	{
				136	percpu_down_read(&mem_hotplug_lock);
				137	}
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	138
Thomas Gleixner	3f906ba	2017-07-10 15:50:09 -0700	[diff] [blame]	139	void put_online_mems(void)
				140	{
				141	percpu_up_read(&mem_hotplug_lock);
				142	}
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	143
Michal Hocko	4932381	2017-07-06 15:41:05 -0700	[diff] [blame]	144	bool movable_node_enabled = false;
				145
Vitaly Kuznetsov	8604d9e	2016-05-19 17:13:03 -0700	[diff] [blame]	146	#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
Anshuman Khandual	1adf8b4	2021-02-25 17:17:13 -0800	[diff] [blame]	147	int mhp_default_online_type = MMOP_OFFLINE;
Vitaly Kuznetsov	8604d9e	2016-05-19 17:13:03 -0700	[diff] [blame]	148	#else
Anshuman Khandual	1adf8b4	2021-02-25 17:17:13 -0800	[diff] [blame]	149	int mhp_default_online_type = MMOP_ONLINE;
Vitaly Kuznetsov	8604d9e	2016-05-19 17:13:03 -0700	[diff] [blame]	150	#endif
Vitaly Kuznetsov	31bc385	2016-03-15 14:56:48 -0700	[diff] [blame]	151
Vitaly Kuznetsov	86dd995	2016-05-19 17:13:06 -0700	[diff] [blame]	152	static int __init setup_memhp_default_state(char *str)
				153	{
Anshuman Khandual	1adf8b4	2021-02-25 17:17:13 -0800	[diff] [blame]	154	const int online_type = mhp_online_type_from_str(str);
David Hildenbrand	5f47adf	2020-04-06 20:07:44 -0700	[diff] [blame]	155
				156	if (online_type >= 0)
Anshuman Khandual	1adf8b4	2021-02-25 17:17:13 -0800	[diff] [blame]	157	mhp_default_online_type = online_type;
Vitaly Kuznetsov	86dd995	2016-05-19 17:13:06 -0700	[diff] [blame]	158
				159	return 1;
				160	}
				161	__setup("memhp_default_state=", setup_memhp_default_state);
				162
David Rientjes	30467e0	2015-04-14 15:45:11 -0700	[diff] [blame]	163	void mem_hotplug_begin(void)
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	164	{
Thomas Gleixner	3f906ba	2017-07-10 15:50:09 -0700	[diff] [blame]	165	cpus_read_lock();
				166	percpu_down_write(&mem_hotplug_lock);
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	167	}
				168
David Rientjes	30467e0	2015-04-14 15:45:11 -0700	[diff] [blame]	169	void mem_hotplug_done(void)
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	170	{
Thomas Gleixner	3f906ba	2017-07-10 15:50:09 -0700	[diff] [blame]	171	percpu_up_write(&mem_hotplug_lock);
				172	cpus_read_unlock();
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	173	}
KOSAKI Motohiro	20d6c96	2010-12-02 14:31:19 -0800	[diff] [blame]	174
Juergen Gross	357b4da	2019-02-14 11:42:39 +0100	[diff] [blame]	175	u64 max_mem_size = U64_MAX;
				176
Keith Mannthey	45e0b78	2006-09-30 23:27:09 -0700	[diff] [blame]	177	/* add this memory to iomem resource */
David Hildenbrand	7b7b272	2020-06-04 16:48:41 -0700	[diff] [blame]	178	static struct resource *register_memory_resource(u64 start, u64 size,
				179	const char *resource_name)
Keith Mannthey	45e0b78	2006-09-30 23:27:09 -0700	[diff] [blame]	180	{
Dave Hansen	2794129	2019-02-25 10:57:36 -0800	[diff] [blame]	181	struct resource *res;
				182	unsigned long flags = IORESOURCE_SYSTEM_RAM \| IORESOURCE_BUSY;
David Hildenbrand	7b7b272	2020-06-04 16:48:41 -0700	[diff] [blame]	183
				184	if (strcmp(resource_name, "System RAM"))
David Hildenbrand	7cf603d	2020-10-15 20:08:33 -0700	[diff] [blame]	185	flags \|= IORESOURCE_SYSRAM_DRIVER_MANAGED;
Juergen Gross	357b4da	2019-02-14 11:42:39 +0100	[diff] [blame]	186
Anshuman Khandual	bca3fea	2021-02-25 17:17:33 -0800	[diff] [blame]	187	if (!mhp_range_allowed(start, size, true))
				188	return ERR_PTR(-E2BIG);
				189
Baoquan He	f3cd4c8	2020-04-06 20:06:50 -0700	[diff] [blame]	190	/*
				191	* Make sure value parsed from 'mem=' only restricts memory adding
				192	* while booting, so that memory hotplug won't be impacted. Please
				193	* refer to document of 'mem=' in kernel-parameters.txt for more
				194	* details.
				195	*/
				196	if (start + size > max_mem_size && system_state < SYSTEM_RUNNING)
Juergen Gross	357b4da	2019-02-14 11:42:39 +0100	[diff] [blame]	197	return ERR_PTR(-E2BIG);
				198
Dave Hansen	2794129	2019-02-25 10:57:36 -0800	[diff] [blame]	199	/*
				200	* Request ownership of the new memory range. This might be
				201	* a child of an existing resource that was present but
				202	* not marked as busy.
				203	*/
				204	res = __request_region(&iomem_resource, start, size,
				205	resource_name, flags);
Keith Mannthey	45e0b78	2006-09-30 23:27:09 -0700	[diff] [blame]	206
Dave Hansen	2794129	2019-02-25 10:57:36 -0800	[diff] [blame]	207	if (!res) {
				208	pr_debug("Unable to reserve System RAM region: %016llx->%016llx\n",
				209	start, start + size);
Vitaly Kuznetsov	6f754ba	2016-01-14 15:21:55 -0800	[diff] [blame]	210	return ERR_PTR(-EEXIST);
Keith Mannthey	45e0b78	2006-09-30 23:27:09 -0700	[diff] [blame]	211	}
				212	return res;
				213	}
				214
				215	static void release_memory_resource(struct resource *res)
				216	{
				217	if (!res)
				218	return;
				219	release_resource(res);
				220	kfree(res);
Keith Mannthey	45e0b78	2006-09-30 23:27:09 -0700	[diff] [blame]	221	}
				222
Keith Mannthey	5394702	2006-09-30 23:27:08 -0700	[diff] [blame]	223	#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
Dan Williams	7ea6216	2019-07-18 15:58:22 -0700	[diff] [blame]	224	static int check_pfn_span(unsigned long pfn, unsigned long nr_pages,
				225	const char *reason)
				226	{
				227	/*
				228	* Disallow all operations smaller than a sub-section and only
				229	* allow operations smaller than a section for
				230	* SPARSEMEM_VMEMMAP. Note that check_hotplug_memory_range()
				231	* enforces a larger memory_block_size_bytes() granularity for
				232	* memory that will be marked online, so this check should only
				233	* fire for direct arch_{add,remove}_memory() users outside of
				234	* add_memory_resource().
				235	*/
				236	unsigned long min_align;
				237
				238	if (IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP))
				239	min_align = PAGES_PER_SUBSECTION;
				240	else
				241	min_align = PAGES_PER_SECTION;
				242	if (!IS_ALIGNED(pfn, min_align)
				243	\|\| !IS_ALIGNED(nr_pages, min_align)) {
				244	WARN(1, "Misaligned __%s_pages start: %#lx end: #%lx\n",
				245	reason, pfn, pfn + nr_pages - 1);
				246	return -EINVAL;
				247	}
				248	return 0;
				249	}
				250
David Rientjes	4edd7ce	2013-04-29 15:08:22 -0700	[diff] [blame]	251	/*
Dan Williams	9f605f2	2021-02-25 17:16:57 -0800	[diff] [blame]	252	* Return page for the valid pfn only if the page is online. All pfn
				253	* walkers which rely on the fully initialized page->flags and others
				254	* should use this rather than pfn_valid && pfn_to_page
				255	*/
				256	struct page *pfn_to_online_page(unsigned long pfn)
				257	{
				258	unsigned long nr = pfn_to_section_nr(pfn);
Dan Williams	1f90a34	2021-02-25 17:17:05 -0800	[diff] [blame]	259	struct dev_pagemap *pgmap;
Dan Williams	9f9b02e	2021-02-25 17:17:01 -0800	[diff] [blame]	260	struct mem_section *ms;
Dan Williams	9f605f2	2021-02-25 17:16:57 -0800	[diff] [blame]	261
Dan Williams	9f9b02e	2021-02-25 17:17:01 -0800	[diff] [blame]	262	if (nr >= NR_MEM_SECTIONS)
				263	return NULL;
				264
				265	ms = __nr_to_section(nr);
				266	if (!online_section(ms))
				267	return NULL;
				268
				269	/*
				270	* Save some code text when online_section() +
				271	* pfn_section_valid() are sufficient.
				272	*/
				273	if (IS_ENABLED(CONFIG_HAVE_ARCH_PFN_VALID) && !pfn_valid(pfn))
				274	return NULL;
				275
				276	if (!pfn_section_valid(ms, pfn))
				277	return NULL;
				278
Dan Williams	1f90a34	2021-02-25 17:17:05 -0800	[diff] [blame]	279	if (!online_device_section(ms))
				280	return pfn_to_page(pfn);
				281
				282	/*
				283	* Slowpath: when ZONE_DEVICE collides with
				284	* ZONE_{NORMAL,MOVABLE} within the same section some pfns in
				285	* the section may be 'offline' but 'valid'. Only
				286	* get_dev_pagemap() can determine sub-section online status.
				287	*/
				288	pgmap = get_dev_pagemap(pfn, NULL);
				289	put_dev_pagemap(pgmap);
				290
				291	/* The presence of a pgmap indicates ZONE_DEVICE offline pfn */
				292	if (pgmap)
				293	return NULL;
				294
Dan Williams	9f9b02e	2021-02-25 17:17:01 -0800	[diff] [blame]	295	return pfn_to_page(pfn);
Dan Williams	9f605f2	2021-02-25 17:16:57 -0800	[diff] [blame]	296	}
				297	EXPORT_SYMBOL_GPL(pfn_to_online_page);
				298
				299	/*
David Rientjes	4edd7ce	2013-04-29 15:08:22 -0700	[diff] [blame]	300	* Reasonably generic function for adding memory. It is
				301	* expected that archs that support memory hotplug will
				302	* call this function after deciding the zone to which to
				303	* add the new pages.
				304	*/
Dan Williams	7ea6216	2019-07-18 15:58:22 -0700	[diff] [blame]	305	int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
Logan Gunthorpe	f5637d3	2020-04-10 14:33:21 -0700	[diff] [blame]	306	struct mhp_params *params)
David Rientjes	4edd7ce	2013-04-29 15:08:22 -0700	[diff] [blame]	307	{
David Hildenbrand	6cdd0b3	2020-04-06 20:06:56 -0700	[diff] [blame]	308	const unsigned long end_pfn = pfn + nr_pages;
				309	unsigned long cur_nr_pages;
Dan Williams	9a84503	2019-07-18 15:58:43 -0700	[diff] [blame]	310	int err;
Logan Gunthorpe	f5637d3	2020-04-10 14:33:21 -0700	[diff] [blame]	311	struct vmem_altmap *altmap = params->altmap;
Dan Williams	4b94ffd	2016-01-15 16:56:22 -0800	[diff] [blame]	312
Logan Gunthorpe	bfeb022	2020-04-10 14:33:36 -0700	[diff] [blame]	313	if (WARN_ON_ONCE(!params->pgprot.pgprot))
				314	return -EINVAL;
				315
Anshuman Khandual	bca3fea	2021-02-25 17:17:33 -0800	[diff] [blame]	316	VM_BUG_ON(!mhp_range_allowed(PFN_PHYS(pfn), nr_pages * PAGE_SIZE, false));
Alastair D'Silva	dca4436	2019-11-30 17:53:48 -0800	[diff] [blame]	317
Dan Williams	4b94ffd	2016-01-15 16:56:22 -0800	[diff] [blame]	318	if (altmap) {
				319	/*
				320	* Validate altmap is within bounds of the total request
				321	*/
Dan Williams	7ea6216	2019-07-18 15:58:22 -0700	[diff] [blame]	322	if (altmap->base_pfn != pfn
Dan Williams	4b94ffd	2016-01-15 16:56:22 -0800	[diff] [blame]	323	\|\| vmem_altmap_offset(altmap) > nr_pages) {
				324	pr_warn_once("memory add fail, invalid altmap\n");
Dan Williams	7ea6216	2019-07-18 15:58:22 -0700	[diff] [blame]	325	return -EINVAL;
Dan Williams	4b94ffd	2016-01-15 16:56:22 -0800	[diff] [blame]	326	}
				327	altmap->alloc = 0;
				328	}
				329
Dan Williams	7ea6216	2019-07-18 15:58:22 -0700	[diff] [blame]	330	err = check_pfn_span(pfn, nr_pages, "add");
				331	if (err)
				332	return err;
				333
David Hildenbrand	6cdd0b3	2020-04-06 20:06:56 -0700	[diff] [blame]	334	for (; pfn < end_pfn; pfn += cur_nr_pages) {
				335	/* Select all remaining pages up to the next section boundary */
				336	cur_nr_pages = min(end_pfn - pfn,
				337	SECTION_ALIGN_UP(pfn + 1) - pfn);
				338	err = sparse_add_section(nid, pfn, cur_nr_pages, altmap);
Dan Williams	ba72b4c	2019-07-18 15:58:26 -0700	[diff] [blame]	339	if (err)
				340	break;
Michal Hocko	f64ac5e	2017-10-03 16:16:16 -0700	[diff] [blame]	341	cond_resched();
David Rientjes	4edd7ce	2013-04-29 15:08:22 -0700	[diff] [blame]	342	}
Zhu Guihua	c435a39	2015-06-24 16:58:42 -0700	[diff] [blame]	343	vmemmap_populate_print_last();
David Rientjes	4edd7ce	2013-04-29 15:08:22 -0700	[diff] [blame]	344	return err;
				345	}
David Rientjes	4edd7ce	2013-04-29 15:08:22 -0700	[diff] [blame]	346
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	347	/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
YASUAKI ISHIMATSU	d09b013	2017-10-03 16:16:32 -0700	[diff] [blame]	348	static unsigned long find_smallest_section_pfn(int nid, struct zone *zone,
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	349	unsigned long start_pfn,
				350	unsigned long end_pfn)
				351	{
Dan Williams	49ba3c6	2019-07-18 15:58:07 -0700	[diff] [blame]	352	for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SUBSECTION) {
David Hildenbrand	7ce700b	2019-11-21 17:53:56 -0800	[diff] [blame]	353	if (unlikely(!pfn_to_online_page(start_pfn)))
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	354	continue;
				355
				356	if (unlikely(pfn_to_nid(start_pfn) != nid))
				357	continue;
				358
David Hildenbrand	9b05158	2020-02-03 17:34:12 -0800	[diff] [blame]	359	if (zone != page_zone(pfn_to_page(start_pfn)))
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	360	continue;
				361
				362	return start_pfn;
				363	}
				364
				365	return 0;
				366	}
				367
				368	/* find the biggest valid pfn in the range [start_pfn, end_pfn). */
YASUAKI ISHIMATSU	d09b013	2017-10-03 16:16:32 -0700	[diff] [blame]	369	static unsigned long find_biggest_section_pfn(int nid, struct zone *zone,
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	370	unsigned long start_pfn,
				371	unsigned long end_pfn)
				372	{
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	373	unsigned long pfn;
				374
				375	/* pfn is the end pfn of a memory section. */
				376	pfn = end_pfn - 1;
Dan Williams	49ba3c6	2019-07-18 15:58:07 -0700	[diff] [blame]	377	for (; pfn >= start_pfn; pfn -= PAGES_PER_SUBSECTION) {
David Hildenbrand	7ce700b	2019-11-21 17:53:56 -0800	[diff] [blame]	378	if (unlikely(!pfn_to_online_page(pfn)))
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	379	continue;
				380
				381	if (unlikely(pfn_to_nid(pfn) != nid))
				382	continue;
				383
David Hildenbrand	9b05158	2020-02-03 17:34:12 -0800	[diff] [blame]	384	if (zone != page_zone(pfn_to_page(pfn)))
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	385	continue;
				386
				387	return pfn;
				388	}
				389
				390	return 0;
				391	}
				392
				393	static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
				394	unsigned long end_pfn)
				395	{
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	396	unsigned long pfn;
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	397	int nid = zone_to_nid(zone);
				398
David Hildenbrand	5d12071	2020-02-03 17:34:19 -0800	[diff] [blame]	399	if (zone->zone_start_pfn == start_pfn) {
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	400	/*
				401	* If the section is smallest section in the zone, it need
				402	* shrink zone->zone_start_pfn and zone->zone_spanned_pages.
				403	* In this case, we find second smallest valid mem_section
				404	* for shrinking zone.
				405	*/
				406	pfn = find_smallest_section_pfn(nid, zone, end_pfn,
David Hildenbrand	5d12071	2020-02-03 17:34:19 -0800	[diff] [blame]	407	zone_end_pfn(zone));
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	408	if (pfn) {
David Hildenbrand	5d12071	2020-02-03 17:34:19 -0800	[diff] [blame]	409	zone->spanned_pages = zone_end_pfn(zone) - pfn;
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	410	zone->zone_start_pfn = pfn;
David Hildenbrand	950b68d	2020-02-03 17:34:16 -0800	[diff] [blame]	411	} else {
				412	zone->zone_start_pfn = 0;
				413	zone->spanned_pages = 0;
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	414	}
David Hildenbrand	5d12071	2020-02-03 17:34:19 -0800	[diff] [blame]	415	} else if (zone_end_pfn(zone) == end_pfn) {
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	416	/*
				417	* If the section is biggest section in the zone, it need
				418	* shrink zone->spanned_pages.
				419	* In this case, we find second biggest valid mem_section for
				420	* shrinking zone.
				421	*/
David Hildenbrand	5d12071	2020-02-03 17:34:19 -0800	[diff] [blame]	422	pfn = find_biggest_section_pfn(nid, zone, zone->zone_start_pfn,
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	423	start_pfn);
				424	if (pfn)
David Hildenbrand	5d12071	2020-02-03 17:34:19 -0800	[diff] [blame]	425	zone->spanned_pages = pfn - zone->zone_start_pfn + 1;
David Hildenbrand	950b68d	2020-02-03 17:34:16 -0800	[diff] [blame]	426	else {
				427	zone->zone_start_pfn = 0;
				428	zone->spanned_pages = 0;
				429	}
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	430	}
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	431	}
				432
David Hildenbrand	00d6c01	2019-10-18 20:19:33 -0700	[diff] [blame]	433	static void update_pgdat_span(struct pglist_data *pgdat)
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	434	{
David Hildenbrand	00d6c01	2019-10-18 20:19:33 -0700	[diff] [blame]	435	unsigned long node_start_pfn = 0, node_end_pfn = 0;
				436	struct zone *zone;
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	437
David Hildenbrand	00d6c01	2019-10-18 20:19:33 -0700	[diff] [blame]	438	for (zone = pgdat->node_zones;
				439	zone < pgdat->node_zones + MAX_NR_ZONES; zone++) {
Miaohe Lin	6c922cf	2021-02-25 17:17:21 -0800	[diff] [blame]	440	unsigned long end_pfn = zone_end_pfn(zone);
David Hildenbrand	00d6c01	2019-10-18 20:19:33 -0700	[diff] [blame]	441
				442	/* No need to lock the zones, they can't change. */
David Hildenbrand	656d571	2019-11-05 21:17:10 -0800	[diff] [blame]	443	if (!zone->spanned_pages)
				444	continue;
				445	if (!node_end_pfn) {
				446	node_start_pfn = zone->zone_start_pfn;
Miaohe Lin	6c922cf	2021-02-25 17:17:21 -0800	[diff] [blame]	447	node_end_pfn = end_pfn;
David Hildenbrand	656d571	2019-11-05 21:17:10 -0800	[diff] [blame]	448	continue;
				449	}
				450
Miaohe Lin	6c922cf	2021-02-25 17:17:21 -0800	[diff] [blame]	451	if (end_pfn > node_end_pfn)
				452	node_end_pfn = end_pfn;
David Hildenbrand	00d6c01	2019-10-18 20:19:33 -0700	[diff] [blame]	453	if (zone->zone_start_pfn < node_start_pfn)
				454	node_start_pfn = zone->zone_start_pfn;
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	455	}
				456
David Hildenbrand	00d6c01	2019-10-18 20:19:33 -0700	[diff] [blame]	457	pgdat->node_start_pfn = node_start_pfn;
				458	pgdat->node_spanned_pages = node_end_pfn - node_start_pfn;
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	459	}
				460
David Hildenbrand	feee6b2	2020-01-04 12:59:33 -0800	[diff] [blame]	461	void __ref remove_pfn_range_from_zone(struct zone *zone,
				462	unsigned long start_pfn,
				463	unsigned long nr_pages)
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	464	{
Ben Widawsky	b7e3deb	2020-06-25 20:30:51 -0700	[diff] [blame]	465	const unsigned long end_pfn = start_pfn + nr_pages;
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	466	struct pglist_data *pgdat = zone->zone_pgdat;
Oscar Salvador	27cacaa	2021-06-30 18:52:46 -0700	[diff] [blame]	467	unsigned long pfn, cur_nr_pages;
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	468
David Hildenbrand	d33695b	2020-02-03 17:34:09 -0800	[diff] [blame]	469	/* Poison struct pages because they are now uninitialized again. */
Ben Widawsky	b7e3deb	2020-06-25 20:30:51 -0700	[diff] [blame]	470	for (pfn = start_pfn; pfn < end_pfn; pfn += cur_nr_pages) {
				471	cond_resched();
				472
				473	/* Select all remaining pages up to the next section boundary */
				474	cur_nr_pages =
				475	min(end_pfn - pfn, SECTION_ALIGN_UP(pfn + 1) - pfn);
				476	page_init_poison(pfn_to_page(pfn),
				477	sizeof(struct page) * cur_nr_pages);
				478	}
David Hildenbrand	d33695b	2020-02-03 17:34:09 -0800	[diff] [blame]	479
David Hildenbrand	7ce700b	2019-11-21 17:53:56 -0800	[diff] [blame]	480	#ifdef CONFIG_ZONE_DEVICE
				481	/*
				482	* Zone shrinking code cannot properly deal with ZONE_DEVICE. So
				483	* we will not try to shrink the zones - which is okay as
				484	* set_zone_contiguous() cannot deal with ZONE_DEVICE either way.
				485	*/
				486	if (zone_idx(zone) == ZONE_DEVICE)
				487	return;
				488	#endif
				489
David Hildenbrand	feee6b2	2020-01-04 12:59:33 -0800	[diff] [blame]	490	clear_zone_contiguous(zone);
				491
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	492	shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
David Hildenbrand	00d6c01	2019-10-18 20:19:33 -0700	[diff] [blame]	493	update_pgdat_span(pgdat);
David Hildenbrand	feee6b2	2020-01-04 12:59:33 -0800	[diff] [blame]	494
				495	set_zone_contiguous(zone);
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	496	}
				497
/*
 * Remove the pages [pfn, pfn + nr_pages) from the section containing @pfn.
 * Warns once and does nothing when that section is not valid.
 */
static void __remove_section(unsigned long pfn, unsigned long nr_pages,
			     unsigned long map_offset,
			     struct vmem_altmap *altmap)
{
	struct mem_section *ms = __pfn_to_section(pfn);

	if (!WARN_ON_ONCE(!valid_section(ms)))
		sparse_remove_section(ms, pfn, nr_pages, map_offset, altmap);
}
				509
/**
 * __remove_pages() - remove sections of pages
 * @pfn: starting pageframe (must be aligned to start of a section)
 * @nr_pages: number of pages to remove (must be multiple of section size)
 * @altmap: alternative device page map or %NULL if default memmap is used
 *
 * Generic helper function to remove section mappings and sysfs entries
 * for the section of the memory we are removing. Caller needs to make
 * sure that pages are marked reserved and zones are adjusted properly by
 * calling offline_pages().
 */
void __remove_pages(unsigned long pfn, unsigned long nr_pages,
		    struct vmem_altmap *altmap)
{
	const unsigned long end_pfn = pfn + nr_pages;
	/* The altmap offset is only handed to the first section removed */
	unsigned long map_offset = vmem_altmap_offset(altmap);
	unsigned long chunk;

	if (check_pfn_span(pfn, nr_pages, "remove"))
		return;

	while (pfn < end_pfn) {
		cond_resched();
		/* Select all remaining pages up to the next section boundary */
		chunk = min(end_pfn - pfn,
			    SECTION_ALIGN_UP(pfn + 1) - pfn);
		__remove_section(pfn, chunk, map_offset, altmap);
		map_offset = 0;
		pfn += chunk;
	}
}
Badari Pulavarty	ea01ea9	2008-04-28 02:12:01 -0700	[diff] [blame]	542
Daniel Kiper	9d0ad8c	2011-07-25 17:12:05 -0700	[diff] [blame]	543	int set_online_page_callback(online_page_callback_t callback)
				544	{
				545	int rc = -EINVAL;
				546
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	547	get_online_mems();
				548	mutex_lock(&online_page_callback_lock);
Daniel Kiper	9d0ad8c	2011-07-25 17:12:05 -0700	[diff] [blame]	549
				550	if (online_page_callback == generic_online_page) {
				551	online_page_callback = callback;
				552	rc = 0;
				553	}
				554
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	555	mutex_unlock(&online_page_callback_lock);
				556	put_online_mems();
Daniel Kiper	9d0ad8c	2011-07-25 17:12:05 -0700	[diff] [blame]	557
				558	return rc;
				559	}
				560	EXPORT_SYMBOL_GPL(set_online_page_callback);
				561
				562	int restore_online_page_callback(online_page_callback_t callback)
				563	{
				564	int rc = -EINVAL;
				565
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	566	get_online_mems();
				567	mutex_lock(&online_page_callback_lock);
Daniel Kiper	9d0ad8c	2011-07-25 17:12:05 -0700	[diff] [blame]	568
				569	if (online_page_callback == callback) {
				570	online_page_callback = generic_online_page;
				571	rc = 0;
				572	}
				573
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	574	mutex_unlock(&online_page_callback_lock);
				575	put_online_mems();
Daniel Kiper	9d0ad8c	2011-07-25 17:12:05 -0700	[diff] [blame]	576
				577	return rc;
				578	}
				579	EXPORT_SYMBOL_GPL(restore_online_page_callback);
				580
/*
 * Default online page callback: pass the 2^@order pages starting at @page
 * to __free_pages_core() and add them to the total RAM counter (and to the
 * highmem counter for highmem pages when CONFIG_HIGHMEM is set).
 */
void generic_online_page(struct page *page, unsigned int order)
{
	const unsigned long nr_pages = 1UL << order;

	/*
	 * With debug_pagealloc enabled, freeing the page will try to unmap
	 * it, so map it up front. That beats adding a special case to the
	 * page freeing fast path.
	 */
	debug_pagealloc_map_pages(page, 1 << order);
	__free_pages_core(page, order);
	totalram_pages_add(nr_pages);
#ifdef CONFIG_HIGHMEM
	if (PageHighMem(page))
		totalhigh_pages_add(nr_pages);
#endif
}
EXPORT_SYMBOL_GPL(generic_online_page);
Arun KS	a9cd410	2019-03-05 15:42:14 -0800	[diff] [blame]	597
David Hildenbrand	aac6532	2020-10-15 20:08:11 -0700	[diff] [blame]	598	static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	599	{
David Hildenbrand	b2c2ab2	2019-09-23 15:36:02 -0700	[diff] [blame]	600	const unsigned long end_pfn = start_pfn + nr_pages;
				601	unsigned long pfn;
Michal Hocko	2d070ea	2017-07-06 15:37:56 -0700	[diff] [blame]	602
David Hildenbrand	b2c2ab2	2019-09-23 15:36:02 -0700	[diff] [blame]	603	/*
David Hildenbrand	aac6532	2020-10-15 20:08:11 -0700	[diff] [blame]	604	* Online the pages in MAX_ORDER - 1 aligned chunks. The callback might
				605	* decide to not expose all pages to the buddy (e.g., expose them
				606	* later). We account all pages as being online and belonging to this
				607	* zone ("present").
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	608	* When using memmap_on_memory, the range might not be aligned to
				609	* MAX_ORDER_NR_PAGES - 1, but pageblock aligned. __ffs() will detect
				610	* this and the first chunk to online will be pageblock_nr_pages.
David Hildenbrand	b2c2ab2	2019-09-23 15:36:02 -0700	[diff] [blame]	611	*/
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	612	for (pfn = start_pfn; pfn < end_pfn;) {
				613	int order = min(MAX_ORDER - 1UL, __ffs(pfn));
				614
				615	(*online_page_callback)(pfn_to_page(pfn), order);
				616	pfn += (1UL << order);
				617	}
Michal Hocko	2d070ea	2017-07-06 15:37:56 -0700	[diff] [blame]	618
David Hildenbrand	b2c2ab2	2019-09-23 15:36:02 -0700	[diff] [blame]	619	/* mark all involved sections as online */
				620	online_mem_sections(start_pfn, end_pfn);
KAMEZAWA Hiroyuki	75884fb	2007-10-16 01:26:10 -0700	[diff] [blame]	621	}
				622
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	623	/* check which state of node_states will be changed when online memory */
				624	static void node_states_check_changes_online(unsigned long nr_pages,
				625	struct zone zone, struct memory_notify arg)
				626	{
				627	int nid = zone_to_nid(zone);
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	628
Anshuman Khandual	98fa15f	2019-03-05 15:42:58 -0800	[diff] [blame]	629	arg->status_change_nid = NUMA_NO_NODE;
				630	arg->status_change_nid_normal = NUMA_NO_NODE;
				631	arg->status_change_nid_high = NUMA_NO_NODE;
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	632
Lai Jiangshan	6715ddf	2012-12-12 13:51:49 -0800	[diff] [blame]	633	if (!node_state(nid, N_MEMORY))
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	634	arg->status_change_nid = nid;
Oscar Salvador	8efe33f	2018-10-26 15:07:34 -0700	[diff] [blame]	635	if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY))
				636	arg->status_change_nid_normal = nid;
				637	#ifdef CONFIG_HIGHMEM
Baoquan He	d3ba3ae	2019-05-13 17:17:35 -0700	[diff] [blame]	638	if (zone_idx(zone) <= ZONE_HIGHMEM && !node_state(nid, N_HIGH_MEMORY))
Oscar Salvador	8efe33f	2018-10-26 15:07:34 -0700	[diff] [blame]	639	arg->status_change_nid_high = nid;
				640	#endif
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	641	}
				642
				643	static void node_states_set_node(int node, struct memory_notify *arg)
				644	{
				645	if (arg->status_change_nid_normal >= 0)
				646	node_set_state(node, N_NORMAL_MEMORY);
				647
Lai Jiangshan	6715ddf	2012-12-12 13:51:49 -0800	[diff] [blame]	648	if (arg->status_change_nid_high >= 0)
				649	node_set_state(node, N_HIGH_MEMORY);
				650
Oscar Salvador	83d8361	2018-10-26 15:07:25 -0700	[diff] [blame]	651	if (arg->status_change_nid >= 0)
				652	node_set_state(node, N_MEMORY);
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	653	}
				654
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	655	static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn,
				656	unsigned long nr_pages)
				657	{
				658	unsigned long old_end_pfn = zone_end_pfn(zone);
				659
				660	if (zone_is_empty(zone) \|\| start_pfn < zone->zone_start_pfn)
				661	zone->zone_start_pfn = start_pfn;
				662
				663	zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn;
				664	}
				665
				666	static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn,
				667	unsigned long nr_pages)
				668	{
				669	unsigned long old_end_pfn = pgdat_end_pfn(pgdat);
				670
				671	if (!pgdat->node_spanned_pages \|\| start_pfn < pgdat->node_start_pfn)
				672	pgdat->node_start_pfn = start_pfn;
				673
				674	pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn;
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	675
David Hildenbrand	3fccb74	2019-09-23 15:35:37 -0700	[diff] [blame]	676	}
Dan Williams	1f90a34	2021-02-25 17:17:05 -0800	[diff] [blame]	677
				678	static void section_taint_zone_device(unsigned long pfn)
				679	{
				680	struct mem_section *ms = __pfn_to_section(pfn);
				681
				682	ms->section_mem_map \|= SECTION_TAINT_ZONE_DEVICE;
				683	}
				684
David Hildenbrand	3fccb74	2019-09-23 15:35:37 -0700	[diff] [blame]	685	/*
				686	* Associate the pfn range with the given zone, initializing the memmaps
				687	* and resizing the pgdat/zone data to span the added pages. After this
				688	* call, all affected pages are PG_reserved.
David Hildenbrand	d882c00	2020-10-15 20:08:19 -0700	[diff] [blame]	689	*
				690	* All aligned pageblocks are initialized to the specified migratetype
				691	* (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
				692	* zone stats (e.g., nr_isolate_pageblock) are touched.
David Hildenbrand	3fccb74	2019-09-23 15:35:37 -0700	[diff] [blame]	693	*/
Christoph Hellwig	a99583e	2017-12-29 08:53:57 +0100	[diff] [blame]	694	void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
David Hildenbrand	d882c00	2020-10-15 20:08:19 -0700	[diff] [blame]	695	unsigned long nr_pages,
				696	struct vmem_altmap *altmap, int migratetype)
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	697	{
				698	struct pglist_data *pgdat = zone->zone_pgdat;
				699	int nid = pgdat->node_id;
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	700
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	701	clear_zone_contiguous(zone);
				702
Wei Yang	fa004ab	2018-12-28 00:37:10 -0800	[diff] [blame]	703	if (zone_is_empty(zone))
				704	init_currently_empty_zone(zone, start_pfn, nr_pages);
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	705	resize_zone_range(zone, start_pfn, nr_pages);
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	706	resize_pgdat_range(pgdat, start_pfn, nr_pages);
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	707
				708	/*
Dan Williams	1f90a34	2021-02-25 17:17:05 -0800	[diff] [blame]	709	* Subsection population requires care in pfn_to_online_page().
				710	* Set the taint to enable the slow path detection of
				711	* ZONE_DEVICE pages in an otherwise ZONE_{NORMAL,MOVABLE}
				712	* section.
				713	*/
				714	if (zone_is_zone_device(zone)) {
				715	if (!IS_ALIGNED(start_pfn, PAGES_PER_SECTION))
				716	section_taint_zone_device(start_pfn);
				717	if (!IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION))
				718	section_taint_zone_device(start_pfn + nr_pages);
				719	}
				720
				721	/*
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	722	* TODO now we have a visible range of pages which are not associated
				723	* with their zone properly. Not nice but set_pfnblock_flags_mask
				724	* expects the zone spans the pfn range. All the pages in the range
				725	* are reserved so nobody should be touching them so we should be safe
				726	*/
Baoquan He	ab28cb6	2021-02-24 12:06:14 -0800	[diff] [blame]	727	memmap_init_range(nr_pages, nid, zone_idx(zone), start_pfn, 0,
David Hildenbrand	d882c00	2020-10-15 20:08:19 -0700	[diff] [blame]	728	MEMINIT_HOTPLUG, altmap, migratetype);
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	729
				730	set_zone_contiguous(zone);
				731	}
				732
David Hildenbrand	e83a437	2021-09-07 19:55:23 -0700	[diff] [blame]	733	struct auto_movable_stats {
				734	unsigned long kernel_early_pages;
				735	unsigned long movable_pages;
				736	};
				737
				738	static void auto_movable_stats_account_zone(struct auto_movable_stats *stats,
				739	struct zone *zone)
				740	{
				741	if (zone_idx(zone) == ZONE_MOVABLE) {
				742	stats->movable_pages += zone->present_pages;
				743	} else {
				744	stats->kernel_early_pages += zone->present_early_pages;
				745	#ifdef CONFIG_CMA
				746	/*
				747	* CMA pages (never on hotplugged memory) behave like
				748	* ZONE_MOVABLE.
				749	*/
				750	stats->movable_pages += zone->cma_pages;
				751	stats->kernel_early_pages -= zone->cma_pages;
				752	#endif /* CONFIG_CMA */
				753	}
				754	}
David Hildenbrand	3fcebf9	2021-09-07 19:55:48 -0700	[diff] [blame^]	755	struct auto_movable_group_stats {
				756	unsigned long movable_pages;
				757	unsigned long req_kernel_early_pages;
				758	};
David Hildenbrand	e83a437	2021-09-07 19:55:23 -0700	[diff] [blame]	759
David Hildenbrand	3fcebf9	2021-09-07 19:55:48 -0700	[diff] [blame^]	760	static int auto_movable_stats_account_group(struct memory_group *group,
				761	void *arg)
David Hildenbrand	e83a437	2021-09-07 19:55:23 -0700	[diff] [blame]	762	{
David Hildenbrand	3fcebf9	2021-09-07 19:55:48 -0700	[diff] [blame^]	763	const int ratio = READ_ONCE(auto_movable_ratio);
				764	struct auto_movable_group_stats *stats = arg;
				765	long pages;
				766
				767	/*
				768	* We don't support modifying the config while the auto-movable online
				769	* policy is already enabled. Just avoid the division by zero below.
				770	*/
				771	if (!ratio)
				772	return 0;
				773
				774	/*
				775	* Calculate how many early kernel pages this group requires to
				776	* satisfy the configured zone ratio.
				777	*/
				778	pages = group->present_movable_pages * 100 / ratio;
				779	pages -= group->present_kernel_pages;
				780
				781	if (pages > 0)
				782	stats->req_kernel_early_pages += pages;
				783	stats->movable_pages += group->present_movable_pages;
				784	return 0;
				785	}
				786
				787	static bool auto_movable_can_online_movable(int nid, struct memory_group *group,
				788	unsigned long nr_pages)
				789	{
David Hildenbrand	e83a437	2021-09-07 19:55:23 -0700	[diff] [blame]	790	unsigned long kernel_early_pages, movable_pages;
David Hildenbrand	3fcebf9	2021-09-07 19:55:48 -0700	[diff] [blame^]	791	struct auto_movable_group_stats group_stats = {};
				792	struct auto_movable_stats stats = {};
David Hildenbrand	e83a437	2021-09-07 19:55:23 -0700	[diff] [blame]	793	pg_data_t *pgdat = NODE_DATA(nid);
				794	struct zone *zone;
				795	int i;
				796
				797	/* Walk all relevant zones and collect MOVABLE vs. KERNEL stats. */
				798	if (nid == NUMA_NO_NODE) {
				799	/* TODO: cache values */
				800	for_each_populated_zone(zone)
				801	auto_movable_stats_account_zone(&stats, zone);
				802	} else {
				803	for (i = 0; i < MAX_NR_ZONES; i++) {
				804	zone = pgdat->node_zones + i;
				805	if (populated_zone(zone))
				806	auto_movable_stats_account_zone(&stats, zone);
				807	}
				808	}
				809
				810	kernel_early_pages = stats.kernel_early_pages;
				811	movable_pages = stats.movable_pages;
				812
				813	/*
David Hildenbrand	3fcebf9	2021-09-07 19:55:48 -0700	[diff] [blame^]	814	* Kernel memory inside dynamic memory group allows for more MOVABLE
				815	* memory within the same group. Remove the effect of all but the
				816	* current group from the stats.
				817	*/
				818	walk_dynamic_memory_groups(nid, auto_movable_stats_account_group,
				819	group, &group_stats);
				820	if (kernel_early_pages <= group_stats.req_kernel_early_pages)
				821	return false;
				822	kernel_early_pages -= group_stats.req_kernel_early_pages;
				823	movable_pages -= group_stats.movable_pages;
				824
				825	if (group && group->is_dynamic)
				826	kernel_early_pages += group->present_kernel_pages;
				827
				828	/*
David Hildenbrand	e83a437	2021-09-07 19:55:23 -0700	[diff] [blame]	829	* Test if we could online the given number of pages to ZONE_MOVABLE
				830	* and still stay in the configured ratio.
				831	*/
				832	movable_pages += nr_pages;
				833	return movable_pages <= (auto_movable_ratio * kernel_early_pages) / 100;
				834	}
				835
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	836	/*
Michal Hocko	c246a21	2017-07-06 15:38:18 -0700	[diff] [blame]	837	* Returns a default kernel memory zone for the given pfn range.
				838	* If no kernel zone covers this pfn range it will automatically go
				839	* to the ZONE_NORMAL.
				840	*/
Michal Hocko	c6f03e2	2017-09-06 16:19:40 -0700	[diff] [blame]	841	static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn,
Michal Hocko	c246a21	2017-07-06 15:38:18 -0700	[diff] [blame]	842	unsigned long nr_pages)
				843	{
				844	struct pglist_data *pgdat = NODE_DATA(nid);
				845	int zid;
				846
				847	for (zid = 0; zid <= ZONE_NORMAL; zid++) {
				848	struct zone *zone = &pgdat->node_zones[zid];
				849
				850	if (zone_intersects(zone, start_pfn, nr_pages))
				851	return zone;
				852	}
				853
				854	return &pgdat->node_zones[ZONE_NORMAL];
				855	}
				856
David Hildenbrand	e83a437	2021-09-07 19:55:23 -0700	[diff] [blame]	857	/*
				858	* Determine to which zone to online memory dynamically based on user
				859	* configuration and system stats. We care about the following ratio:
				860	*
				861	* MOVABLE : KERNEL
				862	*
				863	* Whereby MOVABLE is memory in ZONE_MOVABLE and KERNEL is memory in
				864	* one of the kernel zones. CMA pages inside one of the kernel zones really
				865	* behaves like ZONE_MOVABLE, so we treat them accordingly.
				866	*
				867	* We don't allow for hotplugged memory in a KERNEL zone to increase the
				868	* amount of MOVABLE memory we can have, so we end up with:
				869	*
				870	* MOVABLE : KERNEL_EARLY
				871	*
				872	* Whereby KERNEL_EARLY is memory in one of the kernel zones, available since
				873	* boot. We base our calculation on KERNEL_EARLY internally, because:
				874	*
				875	* a) Hotplugged memory in one of the kernel zones can sometimes still get
				876	* hotunplugged, especially when hot(un)plugging individual memory blocks.
				877	* There is no coordination across memory devices, therefore "automatic"
				878	* hotunplugging, as implemented in hypervisors, could result in zone
				879	* imbalances.
				880	* b) Early/boot memory in one of the kernel zones can usually not get
				881	* hotunplugged again (e.g., no firmware interface to unplug, fragmented
				882	* with unmovable allocations). While there are corner cases where it might
				883	* still work, it is barely relevant in practice.
				884	*
David Hildenbrand	3fcebf9	2021-09-07 19:55:48 -0700	[diff] [blame^]	885	* Exceptions are dynamic memory groups, which allow for more MOVABLE
				886	* memory within the same memory group -- because in that case, there is
				887	* coordination within the single memory device managed by a single driver.
				888	*
David Hildenbrand	e83a437	2021-09-07 19:55:23 -0700	[diff] [blame]	889	* We rely on "present pages" instead of "managed pages", as the latter is
				890	* highly unreliable and dynamic in virtualized environments, and does not
				891	* consider boot time allocations. For example, memory ballooning adjusts the
				892	* managed pages when inflating/deflating the balloon, and balloon compaction
				893	* can even migrate inflated pages between zones.
				894	*
				895	* Using "present pages" is better but some things to keep in mind are:
				896	*
				897	* a) Some memblock allocations, such as for the crashkernel area, are
				898	* effectively unused by the kernel, yet they account to "present pages".
				899	* Fortunately, these allocations are comparatively small in relevant setups
				900	* (e.g., fraction of system memory).
				901	* b) Some hotplugged memory blocks in virtualized environments, especially
				902	* hotplugged by virtio-mem, look like they are completely present, however,
				903	* only parts of the memory block are actually currently usable.
				904	* "present pages" is an upper limit that can get reached at runtime. As
				905	* we base our calculations on KERNEL_EARLY, this is not an issue.
				906	*/
David Hildenbrand	445fcf7	2021-09-07 19:55:45 -0700	[diff] [blame]	907	static struct zone *auto_movable_zone_for_pfn(int nid,
				908	struct memory_group *group,
				909	unsigned long pfn,
David Hildenbrand	e83a437	2021-09-07 19:55:23 -0700	[diff] [blame]	910	unsigned long nr_pages)
				911	{
David Hildenbrand	445fcf7	2021-09-07 19:55:45 -0700	[diff] [blame]	912	unsigned long online_pages = 0, max_pages, end_pfn;
				913	struct page *page;
				914
David Hildenbrand	e83a437	2021-09-07 19:55:23 -0700	[diff] [blame]	915	if (!auto_movable_ratio)
				916	goto kernel_zone;
				917
David Hildenbrand	445fcf7	2021-09-07 19:55:45 -0700	[diff] [blame]	918	if (group && !group->is_dynamic) {
				919	max_pages = group->s.max_pages;
				920	online_pages = group->present_movable_pages;
				921
				922	/* If anything is !MOVABLE online the rest !MOVABLE. */
				923	if (group->present_kernel_pages)
				924	goto kernel_zone;
				925	} else if (!group \|\| group->d.unit_pages == nr_pages) {
				926	max_pages = nr_pages;
				927	} else {
				928	max_pages = group->d.unit_pages;
				929	/*
				930	* Take a look at all online sections in the current unit.
				931	* We can safely assume that all pages within a section belong
				932	* to the same zone, because dynamic memory groups only deal
				933	* with hotplugged memory.
				934	*/
				935	pfn = ALIGN_DOWN(pfn, group->d.unit_pages);
				936	end_pfn = pfn + group->d.unit_pages;
				937	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
				938	page = pfn_to_online_page(pfn);
				939	if (!page)
				940	continue;
				941	/* If anything is !MOVABLE online the rest !MOVABLE. */
				942	if (page_zonenum(page) != ZONE_MOVABLE)
				943	goto kernel_zone;
				944	online_pages += PAGES_PER_SECTION;
				945	}
				946	}
				947
				948	/*
				949	* Online MOVABLE if we could currently online all remaining parts
				950	* MOVABLE. We expect to (add+) online them immediately next, so if
				951	* nobody interferes, all will be MOVABLE if possible.
				952	*/
				953	nr_pages = max_pages - online_pages;
David Hildenbrand	3fcebf9	2021-09-07 19:55:48 -0700	[diff] [blame^]	954	if (!auto_movable_can_online_movable(NUMA_NO_NODE, group, nr_pages))
David Hildenbrand	e83a437	2021-09-07 19:55:23 -0700	[diff] [blame]	955	goto kernel_zone;
				956
				957	#ifdef CONFIG_NUMA
				958	if (auto_movable_numa_aware &&
David Hildenbrand	3fcebf9	2021-09-07 19:55:48 -0700	[diff] [blame^]	959	!auto_movable_can_online_movable(nid, group, nr_pages))
David Hildenbrand	e83a437	2021-09-07 19:55:23 -0700	[diff] [blame]	960	goto kernel_zone;
				961	#endif /* CONFIG_NUMA */
				962
				963	return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];
				964	kernel_zone:
				965	return default_kernel_zone_for_pfn(nid, pfn, nr_pages);
				966	}
				967
Michal Hocko	c6f03e2	2017-09-06 16:19:40 -0700	[diff] [blame]	968	static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn,
				969	unsigned long nr_pages)
Michal Hocko	e5e6893	2017-09-06 16:19:37 -0700	[diff] [blame]	970	{
Michal Hocko	c6f03e2	2017-09-06 16:19:40 -0700	[diff] [blame]	971	struct zone *kernel_zone = default_kernel_zone_for_pfn(nid, start_pfn,
				972	nr_pages);
				973	struct zone *movable_zone = &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];
				974	bool in_kernel = zone_intersects(kernel_zone, start_pfn, nr_pages);
				975	bool in_movable = zone_intersects(movable_zone, start_pfn, nr_pages);
Michal Hocko	e5e6893	2017-09-06 16:19:37 -0700	[diff] [blame]	976
				977	/*
Michal Hocko	c6f03e2	2017-09-06 16:19:40 -0700	[diff] [blame]	978	* We inherit the existing zone in a simple case where zones do not
				979	* overlap in the given range
Michal Hocko	e5e6893	2017-09-06 16:19:37 -0700	[diff] [blame]	980	*/
Michal Hocko	c6f03e2	2017-09-06 16:19:40 -0700	[diff] [blame]	981	if (in_kernel ^ in_movable)
				982	return (in_kernel) ? kernel_zone : movable_zone;
Michal Hocko	e5e6893	2017-09-06 16:19:37 -0700	[diff] [blame]	983
Michal Hocko	c6f03e2	2017-09-06 16:19:40 -0700	[diff] [blame]	984	/*
				985	* If the range doesn't belong to any zone or two zones overlap in the
				986	* given range then we use movable zone only if movable_node is
				987	* enabled because we always online to a kernel zone by default.
				988	*/
				989	return movable_node_enabled ? movable_zone : kernel_zone;
Michal Hocko	9f123ab	2017-07-10 15:48:37 -0700	[diff] [blame]	990	}
				991
David Hildenbrand	7cf209b	2021-09-07 19:54:59 -0700	[diff] [blame]	992	struct zone *zone_for_pfn_range(int online_type, int nid,
David Hildenbrand	445fcf7	2021-09-07 19:55:45 -0700	[diff] [blame]	993	struct memory_group *group, unsigned long start_pfn,
				994	unsigned long nr_pages)
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	995	{
Michal Hocko	c6f03e2	2017-09-06 16:19:40 -0700	[diff] [blame]	996	if (online_type == MMOP_ONLINE_KERNEL)
				997	return default_kernel_zone_for_pfn(nid, start_pfn, nr_pages);
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	998
Michal Hocko	c6f03e2	2017-09-06 16:19:40 -0700	[diff] [blame]	999	if (online_type == MMOP_ONLINE_MOVABLE)
				1000	return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];
Reza Arbab	df429ac	2016-07-26 15:22:23 -0700	[diff] [blame]	1001
David Hildenbrand	e83a437	2021-09-07 19:55:23 -0700	[diff] [blame]	1002	if (online_policy == ONLINE_POLICY_AUTO_MOVABLE)
David Hildenbrand	445fcf7	2021-09-07 19:55:45 -0700	[diff] [blame]	1003	return auto_movable_zone_for_pfn(nid, group, start_pfn, nr_pages);
David Hildenbrand	e83a437	2021-09-07 19:55:23 -0700	[diff] [blame]	1004
Michal Hocko	c6f03e2	2017-09-06 16:19:40 -0700	[diff] [blame]	1005	return default_zone_for_pfn(nid, start_pfn, nr_pages);
Michal Hocko	e5e6893	2017-09-06 16:19:37 -0700	[diff] [blame]	1006	}
				1007
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	1008	/*
				1009	* This function should only be called by memory_block_{online,offline},
				1010	* and {online,offline}_pages.
				1011	*/
David Hildenbrand	836809e	2021-09-07 19:55:30 -0700	[diff] [blame]	1012	void adjust_present_page_count(struct page page, struct memory_group group,
				1013	long nr_pages)
David Hildenbrand	f990114	2021-05-04 18:39:39 -0700	[diff] [blame]	1014	{
David Hildenbrand	4b09700	2021-09-07 19:55:19 -0700	[diff] [blame]	1015	struct zone *zone = page_zone(page);
David Hildenbrand	836809e	2021-09-07 19:55:30 -0700	[diff] [blame]	1016	const bool movable = zone_idx(zone) == ZONE_MOVABLE;
David Hildenbrand	4b09700	2021-09-07 19:55:19 -0700	[diff] [blame]	1017
				1018	/*
				1019	* We only support onlining/offlining/adding/removing of complete
				1020	* memory blocks; therefore, either all is either early or hotplugged.
				1021	*/
				1022	if (early_section(__pfn_to_section(page_to_pfn(page))))
				1023	zone->present_early_pages += nr_pages;
David Hildenbrand	f990114	2021-05-04 18:39:39 -0700	[diff] [blame]	1024	zone->present_pages += nr_pages;
David Hildenbrand	f990114	2021-05-04 18:39:39 -0700	[diff] [blame]	1025	zone->zone_pgdat->node_present_pages += nr_pages;
David Hildenbrand	836809e	2021-09-07 19:55:30 -0700	[diff] [blame]	1026
				1027	if (group && movable)
				1028	group->present_movable_pages += nr_pages;
				1029	else if (group && !movable)
				1030	group->present_kernel_pages += nr_pages;
David Hildenbrand	f990114	2021-05-04 18:39:39 -0700	[diff] [blame]	1031	}
				1032
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	1033	int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
				1034	struct zone *zone)
				1035	{
				1036	unsigned long end_pfn = pfn + nr_pages;
				1037	int ret;
				1038
				1039	ret = kasan_add_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
				1040	if (ret)
				1041	return ret;
				1042
				1043	move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE);
				1044
				1045	/*
				1046	* It might be that the vmemmap_pages fully span sections. If that is
				1047	* the case, mark those sections online here as otherwise they will be
				1048	* left offline.
				1049	*/
				1050	if (nr_pages >= PAGES_PER_SECTION)
				1051	online_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION));
				1052
				1053	return ret;
				1054	}
				1055
				1056	void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages)
				1057	{
				1058	unsigned long end_pfn = pfn + nr_pages;
				1059
				1060	/*
				1061	* It might be that the vmemmap_pages fully span sections. If that is
				1062	* the case, mark those sections offline here as otherwise they will be
				1063	* left online.
				1064	*/
				1065	if (nr_pages >= PAGES_PER_SECTION)
				1066	offline_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION));
				1067
				1068	/*
				1069	* The pages associated with this vmemmap have been offlined, so
				1070	* we can reset its state here.
				1071	*/
				1072	remove_pfn_range_from_zone(page_zone(pfn_to_page(pfn)), pfn, nr_pages);
				1073	kasan_remove_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
				1074	}
				1075
David Hildenbrand	836809e	2021-09-07 19:55:30 -0700	[diff] [blame]	1076	int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
				1077	struct zone zone, struct memory_group group)
KAMEZAWA Hiroyuki	75884fb	2007-10-16 01:26:10 -0700	[diff] [blame]	1078	{
Cody P Schafer	aa47228	2013-07-03 15:02:10 -0700	[diff] [blame]	1079	unsigned long flags;
Yasunori Goto	6811378	2006-06-23 02:03:11 -0700	[diff] [blame]	1080	int need_zonelists_rebuild = 0;
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	1081	const int nid = zone_to_nid(zone);
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	1082	int ret;
				1083	struct memory_notify arg;
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	1084
Oscar Salvador	dd8e2f2	2021-05-04 18:39:36 -0700	[diff] [blame]	1085	/*
				1086	* {on,off}lining is constrained to full memory sections (or more
Zhen Lei	041711c	2021-06-30 18:53:17 -0700	[diff] [blame]	1087	* precisely to memory blocks from the user space POV).
Oscar Salvador	dd8e2f2	2021-05-04 18:39:36 -0700	[diff] [blame]	1088	* memmap_on_memory is an exception because it reserves initial part
				1089	* of the physical memory space for vmemmaps. That space is pageblock
				1090	* aligned.
				1091	*/
David Hildenbrand	4986fac	2020-10-15 20:07:50 -0700	[diff] [blame]	1092	if (WARN_ON_ONCE(!nr_pages \|\|
Oscar Salvador	dd8e2f2	2021-05-04 18:39:36 -0700	[diff] [blame]	1093	!IS_ALIGNED(pfn, pageblock_nr_pages) \|\|
				1094	!IS_ALIGNED(pfn + nr_pages, PAGES_PER_SECTION)))
David Hildenbrand	4986fac	2020-10-15 20:07:50 -0700	[diff] [blame]	1095	return -EINVAL;
				1096
David Hildenbrand	381eab4	2018-10-30 15:10:29 -0700	[diff] [blame]	1097	mem_hotplug_begin();
				1098
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	1099	/* associate pfn range with the zone */
David Hildenbrand	b30c592	2020-10-15 20:08:23 -0700	[diff] [blame]	1100	move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE);
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	1101
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	1102	arg.start_pfn = pfn;
				1103	arg.nr_pages = nr_pages;
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1104	node_states_check_changes_online(nr_pages, zone, &arg);
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	1105
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	1106	ret = memory_notify(MEM_GOING_ONLINE, &arg);
				1107	ret = notifier_to_errno(ret);
Chen Yucong	e33e33b	2016-03-17 14:19:35 -0700	[diff] [blame]	1108	if (ret)
				1109	goto failed_addition;
				1110
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	1111	/*
David Hildenbrand	b30c592	2020-10-15 20:08:23 -0700	[diff] [blame]	1112	* Fixup the number of isolated pageblocks before marking the sections
				1113	* onlining, such that undo_isolate_page_range() works correctly.
				1114	*/
				1115	spin_lock_irqsave(&zone->lock, flags);
				1116	zone->nr_isolate_pageblock += nr_pages / pageblock_nr_pages;
				1117	spin_unlock_irqrestore(&zone->lock, flags);
				1118
				1119	/*
Yasunori Goto	6811378	2006-06-23 02:03:11 -0700	[diff] [blame]	1120	* If this zone is not populated, then it is not in zonelist.
				1121	* This means the page allocator ignores this zone.
				1122	* So, zonelist must be updated after online.
				1123	*/
Wen Congyang	6dcd73d	2012-12-11 16:01:01 -0800	[diff] [blame]	1124	if (!populated_zone(zone)) {
Yasunori Goto	6811378	2006-06-23 02:03:11 -0700	[diff] [blame]	1125	need_zonelists_rebuild = 1;
Michal Hocko	72675e1	2017-09-06 16:20:24 -0700	[diff] [blame]	1126	setup_zone_pageset(zone);
Wen Congyang	6dcd73d	2012-12-11 16:01:01 -0800	[diff] [blame]	1127	}
Yasunori Goto	6811378	2006-06-23 02:03:11 -0700	[diff] [blame]	1128
David Hildenbrand	aac6532	2020-10-15 20:08:11 -0700	[diff] [blame]	1129	online_pages_range(pfn, nr_pages);
David Hildenbrand	836809e	2021-09-07 19:55:30 -0700	[diff] [blame]	1130	adjust_present_page_count(pfn_to_page(pfn), group, nr_pages);
Cody P Schafer	aa47228	2013-07-03 15:02:10 -0700	[diff] [blame]	1131
David Hildenbrand	b30c592	2020-10-15 20:08:23 -0700	[diff] [blame]	1132	node_states_set_node(nid, &arg);
				1133	if (need_zonelists_rebuild)
				1134	build_all_zonelists(NULL);
David Hildenbrand	b30c592	2020-10-15 20:08:23 -0700	[diff] [blame]	1135
				1136	/* Basic onlining is complete, allow allocation of onlined pages. */
				1137	undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE);
				1138
David Hildenbrand	93146d9	2020-08-06 23:25:35 -0700	[diff] [blame]	1139	/*
David Hildenbrand	b86c5fc	2020-10-15 20:09:39 -0700	[diff] [blame]	1140	* Freshly onlined pages aren't shuffled (e.g., all pages are placed to
				1141	* the tail of the freelist when undoing isolation). Shuffle the whole
				1142	* zone to make sure the just onlined pages are properly distributed
				1143	* across the whole freelist - to create an initial shuffle.
David Hildenbrand	93146d9	2020-08-06 23:25:35 -0700	[diff] [blame]	1144	*/
Dan Williams	e900a91	2019-05-14 15:41:28 -0700	[diff] [blame]	1145	shuffle_zone(zone);
				1146
Mel Gorman	b92ca18	2021-06-28 19:42:12 -0700	[diff] [blame]	1147	/* reinitialise watermarks and update pcp limits */
KOSAKI Motohiro	1b79acc	2011-05-24 17:11:32 -0700	[diff] [blame]	1148	init_per_zone_wmark_min();
				1149
David Hildenbrand	ca9a46f	2019-09-23 15:36:08 -0700	[diff] [blame]	1150	kswapd_run(nid);
				1151	kcompactd_run(nid);
Dave Hansen	61b1399	2005-10-29 18:16:56 -0700	[diff] [blame]	1152
Chandra Seetharaman	2d1d43f	2006-09-29 02:01:25 -0700	[diff] [blame]	1153	writeback_set_ratelimit();
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	1154
David Hildenbrand	ca9a46f	2019-09-23 15:36:08 -0700	[diff] [blame]	1155	memory_notify(MEM_ONLINE, &arg);
David Hildenbrand	381eab4	2018-10-30 15:10:29 -0700	[diff] [blame]	1156	mem_hotplug_done();
David Rientjes	30467e0	2015-04-14 15:45:11 -0700	[diff] [blame]	1157	return 0;
Chen Yucong	e33e33b	2016-03-17 14:19:35 -0700	[diff] [blame]	1158
				1159	failed_addition:
				1160	pr_debug("online_pages [mem %#010llx-%#010llx] failed\n",
				1161	(unsigned long long) pfn << PAGE_SHIFT,
				1162	(((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1);
				1163	memory_notify(MEM_CANCEL_ONLINE, &arg);
David Hildenbrand	feee6b2	2020-01-04 12:59:33 -0800	[diff] [blame]	1164	remove_pfn_range_from_zone(zone, pfn, nr_pages);
David Hildenbrand	381eab4	2018-10-30 15:10:29 -0700	[diff] [blame]	1165	mem_hotplug_done();
Chen Yucong	e33e33b	2016-03-17 14:19:35 -0700	[diff] [blame]	1166	return ret;
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	1167	}
Keith Mannthey	5394702	2006-09-30 23:27:08 -0700	[diff] [blame]	1168	#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
Yasunori Goto	bc02af9	2006-06-27 02:53:30 -0700	[diff] [blame]	1169
Tang Chen	0bd8542	2014-11-13 15:19:41 -0800	[diff] [blame]	1170	static void reset_node_present_pages(pg_data_t *pgdat)
				1171	{
				1172	struct zone *z;
				1173
				1174	for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
				1175	z->present_pages = 0;
				1176
				1177	pgdat->node_present_pages = 0;
				1178	}
				1179
Hidetoshi Seto	e131933	2009-11-17 14:06:18 -0800	[diff] [blame]	1180	/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
David Hildenbrand	c68ab18	2020-06-04 16:48:35 -0700	[diff] [blame]	1181	static pg_data_t __ref *hotadd_new_pgdat(int nid)
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1182	{
				1183	struct pglist_data *pgdat;
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1184
Tang Chen	a1e565a	2013-02-22 16:33:18 -0800	[diff] [blame]	1185	pgdat = NODE_DATA(nid);
				1186	if (!pgdat) {
				1187	pgdat = arch_alloc_nodedata(nid);
				1188	if (!pgdat)
				1189	return NULL;
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1190
Wei Yang	33fce01	2019-09-23 15:35:52 -0700	[diff] [blame]	1191	pgdat->per_cpu_nodestats =
				1192	alloc_percpu(struct per_cpu_nodestat);
Tang Chen	a1e565a	2013-02-22 16:33:18 -0800	[diff] [blame]	1193	arch_refresh_nodedata(nid, pgdat);
Gu Zheng	b0dc3a3	2015-03-25 15:55:20 -0700	[diff] [blame]	1194	} else {
Wei Yang	33fce01	2019-09-23 15:35:52 -0700	[diff] [blame]	1195	int cpu;
Mel Gorman	e716f2e	2017-05-03 14:53:45 -0700	[diff] [blame]	1196	/*
Joonsoo Kim	97a225e	2020-06-03 15:59:01 -0700	[diff] [blame]	1197	* Reset the nr_zones, order and highest_zoneidx before reuse.
				1198	* Note that kswapd will init kswapd_highest_zoneidx properly
Mel Gorman	e716f2e	2017-05-03 14:53:45 -0700	[diff] [blame]	1199	* when it starts in the near future.
				1200	*/
Gu Zheng	b0dc3a3	2015-03-25 15:55:20 -0700	[diff] [blame]	1201	pgdat->nr_zones = 0;
Mel Gorman	38087d9	2016-07-28 15:45:49 -0700	[diff] [blame]	1202	pgdat->kswapd_order = 0;
Joonsoo Kim	97a225e	2020-06-03 15:59:01 -0700	[diff] [blame]	1203	pgdat->kswapd_highest_zoneidx = 0;
Wei Yang	33fce01	2019-09-23 15:35:52 -0700	[diff] [blame]	1204	for_each_online_cpu(cpu) {
				1205	struct per_cpu_nodestat *p;
				1206
				1207	p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
				1208	memset(p, 0, sizeof(*p));
				1209	}
Tang Chen	a1e565a	2013-02-22 16:33:18 -0800	[diff] [blame]	1210	}
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1211
				1212	/* we can use NODE_DATA(nid) from here */
Oscar Salvador	03e85f9	2018-08-21 21:53:43 -0700	[diff] [blame]	1213	pgdat->node_id = nid;
David Hildenbrand	c68ab18	2020-06-04 16:48:35 -0700	[diff] [blame]	1214	pgdat->node_start_pfn = 0;
Oscar Salvador	03e85f9	2018-08-21 21:53:43 -0700	[diff] [blame]	1215
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1216	/* init node's zones as empty zones, we don't have any present pages.*/
Oscar Salvador	03e85f9	2018-08-21 21:53:43 -0700	[diff] [blame]	1217	free_area_init_core_hotplug(nid);
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1218
KAMEZAWA Hiroyuki	959ecc4	2011-06-15 15:08:38 -0700	[diff] [blame]	1219	/*
				1220	* The node we allocated has no zone fallback lists. For avoiding
				1221	* to access not-initialized zonelist, build here.
				1222	*/
Michal Hocko	72675e1	2017-09-06 16:20:24 -0700	[diff] [blame]	1223	build_all_zonelists(pgdat);
KAMEZAWA Hiroyuki	959ecc4	2011-06-15 15:08:38 -0700	[diff] [blame]	1224
Tang Chen	f784a3f	2014-11-13 15:19:39 -0800	[diff] [blame]	1225	/*
Tang Chen	0bd8542	2014-11-13 15:19:41 -0800	[diff] [blame]	1226	* When memory is hot-added, all the memory is in offline state. So
				1227	* clear all zones' present_pages because they will be updated in
				1228	* online_pages() and offline_pages().
				1229	*/
Oscar Salvador	03e85f9	2018-08-21 21:53:43 -0700	[diff] [blame]	1230	reset_node_managed_pages(pgdat);
Tang Chen	0bd8542	2014-11-13 15:19:41 -0800	[diff] [blame]	1231	reset_node_present_pages(pgdat);
				1232
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1233	return pgdat;
				1234	}
				1235
Oscar Salvador	b9ff036	2018-08-17 15:46:15 -0700	[diff] [blame]	1236	static void rollback_node_hotadd(int nid)
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1237	{
Oscar Salvador	b9ff036	2018-08-17 15:46:15 -0700	[diff] [blame]	1238	pg_data_t *pgdat = NODE_DATA(nid);
				1239
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1240	arch_refresh_nodedata(nid, NULL);
Reza Arbab	5830169	2016-08-11 15:33:12 -0700	[diff] [blame]	1241	free_percpu(pgdat->per_cpu_nodestats);
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1242	arch_free_nodedata(pgdat);
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1243	}
				1244
KAMEZAWA Hiroyuki	0a54703	2006-06-27 02:53:35 -0700	[diff] [blame]	1245
Mel Gorman	ba2d266	2021-06-30 18:53:35 -0700	[diff] [blame]	1246	/*
				1247	* __try_online_node - online a node if offlined
Mike Rapoport	e8b098f	2018-04-05 16:24:57 -0700	[diff] [blame]	1248	* @nid: the node ID
Oscar Salvador	b9ff036	2018-08-17 15:46:15 -0700	[diff] [blame]	1249	* @set_node_online: Whether we want to online the node
minskey guo	cf23422	2010-05-24 14:32:41 -0700	[diff] [blame]	1250	* called by cpu_up() to online a node without onlined memory.
Oscar Salvador	b9ff036	2018-08-17 15:46:15 -0700	[diff] [blame]	1251	*
				1252	* Returns:
				1253	* 1 -> a new node has been allocated
				1254	* 0 -> the node is already online
				1255	* -ENOMEM -> the node could not be allocated
minskey guo	cf23422	2010-05-24 14:32:41 -0700	[diff] [blame]	1256	*/
David Hildenbrand	c68ab18	2020-06-04 16:48:35 -0700	[diff] [blame]	1257	static int __try_online_node(int nid, bool set_node_online)
minskey guo	cf23422	2010-05-24 14:32:41 -0700	[diff] [blame]	1258	{
Oscar Salvador	b9ff036	2018-08-17 15:46:15 -0700	[diff] [blame]	1259	pg_data_t *pgdat;
				1260	int ret = 1;
minskey guo	cf23422	2010-05-24 14:32:41 -0700	[diff] [blame]	1261
Toshi Kani	01b0f19	2013-11-12 15:07:25 -0800	[diff] [blame]	1262	if (node_online(nid))
				1263	return 0;
				1264
David Hildenbrand	c68ab18	2020-06-04 16:48:35 -0700	[diff] [blame]	1265	pgdat = hotadd_new_pgdat(nid);
David Rientjes	7553e8f	2011-06-22 18:13:01 -0700	[diff] [blame]	1266	if (!pgdat) {
Toshi Kani	01b0f19	2013-11-12 15:07:25 -0800	[diff] [blame]	1267	pr_err("Cannot online node %d due to NULL pgdat\n", nid);
minskey guo	cf23422	2010-05-24 14:32:41 -0700	[diff] [blame]	1268	ret = -ENOMEM;
				1269	goto out;
				1270	}
Oscar Salvador	b9ff036	2018-08-17 15:46:15 -0700	[diff] [blame]	1271
				1272	if (set_node_online) {
				1273	node_set_online(nid);
				1274	ret = register_one_node(nid);
				1275	BUG_ON(ret);
				1276	}
minskey guo	cf23422	2010-05-24 14:32:41 -0700	[diff] [blame]	1277	out:
Oscar Salvador	b9ff036	2018-08-17 15:46:15 -0700	[diff] [blame]	1278	return ret;
				1279	}
				1280
				1281	/*
				1282	* Users of this function always want to online/register the node
				1283	*/
				1284	int try_online_node(int nid)
				1285	{
				1286	int ret;
				1287
				1288	mem_hotplug_begin();
David Hildenbrand	c68ab18	2020-06-04 16:48:35 -0700	[diff] [blame]	1289	ret = __try_online_node(nid, true);
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	1290	mem_hotplug_done();
minskey guo	cf23422	2010-05-24 14:32:41 -0700	[diff] [blame]	1291	return ret;
				1292	}
				1293
Toshi Kani	27356f5	2013-09-11 14:21:49 -0700	[diff] [blame]	1294	static int check_hotplug_memory_range(u64 start, u64 size)
				1295	{
Pavel Tatashin	ba32558	2018-04-05 16:22:39 -0700	[diff] [blame]	1296	/* memory range must be block size aligned */
David Hildenbrand	cec3ebd	2019-07-18 15:56:25 -0700	[diff] [blame]	1297	if (!size \|\| !IS_ALIGNED(start, memory_block_size_bytes()) \|\|
				1298	!IS_ALIGNED(size, memory_block_size_bytes())) {
Pavel Tatashin	ba32558	2018-04-05 16:22:39 -0700	[diff] [blame]	1299	pr_err("Block size [%#lx] unaligned hotplug range: start %#llx, size %#llx",
David Hildenbrand	cec3ebd	2019-07-18 15:56:25 -0700	[diff] [blame]	1300	memory_block_size_bytes(), start, size);
Toshi Kani	27356f5	2013-09-11 14:21:49 -0700	[diff] [blame]	1301	return -EINVAL;
				1302	}
				1303
				1304	return 0;
				1305	}
				1306
Vitaly Kuznetsov	31bc385	2016-03-15 14:56:48 -0700	[diff] [blame]	1307	static int online_memory_block(struct memory_block mem, void arg)
				1308	{
Anshuman Khandual	1adf8b4	2021-02-25 17:17:13 -0800	[diff] [blame]	1309	mem->online_type = mhp_default_online_type;
Nathan Fontenot	dc18d70	2017-02-24 15:00:02 -0800	[diff] [blame]	1310	return device_online(&mem->dev);
Vitaly Kuznetsov	31bc385	2016-03-15 14:56:48 -0700	[diff] [blame]	1311	}
				1312
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	1313	bool mhp_supports_memmap_on_memory(unsigned long size)
				1314	{
				1315	unsigned long nr_vmemmap_pages = size / PAGE_SIZE;
				1316	unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
				1317	unsigned long remaining_size = size - vmemmap_size;
				1318
				1319	/*
				1320	* Besides having arch support and the feature enabled at runtime, we
				1321	* need a few more assumptions to hold true:
				1322	*
				1323	* a) We span a single memory block: memory onlining/offlinin;g happens
				1324	* in memory block granularity. We don't want the vmemmap of online
				1325	* memory blocks to reside on offline memory blocks. In the future,
				1326	* we might want to support variable-sized memory blocks to make the
				1327	* feature more versatile.
				1328	*
				1329	* b) The vmemmap pages span complete PMDs: We don't want vmemmap code
				1330	* to populate memory from the altmap for unrelated parts (i.e.,
				1331	* other memory blocks)
				1332	*
				1333	* c) The vmemmap pages (and thereby the pages that will be exposed to
				1334	* the buddy) have to cover full pageblocks: memory onlining/offlining
				1335	* code requires applicable ranges to be page-aligned, for example, to
				1336	* set the migratetypes properly.
				1337	*
				1338	* TODO: Although we have a check here to make sure that vmemmap pages
				1339	* fully populate a PMD, it is not the right place to check for
				1340	* this. A much better solution involves improving vmemmap code
				1341	* to fallback to base pages when trying to populate vmemmap using
				1342	* altmap as an alternative source of memory, and we do not exactly
				1343	* populate a single PMD.
				1344	*/
				1345	return memmap_on_memory &&
Muchun Song	2d7a217	2021-06-30 18:48:25 -0700	[diff] [blame]	1346	!hugetlb_free_vmemmap_enabled &&
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	1347	IS_ENABLED(CONFIG_MHP_MEMMAP_ON_MEMORY) &&
				1348	size == memory_block_size_bytes() &&
				1349	IS_ALIGNED(vmemmap_size, PMD_SIZE) &&
				1350	IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT));
				1351	}
				1352
David Hildenbrand	8df1d0e	2018-10-30 15:10:24 -0700	[diff] [blame]	1353	/*
				1354	* NOTE: The caller must call lock_device_hotplug() to serialize hotplug
				1355	* and online/offline operations (triggered e.g. by sysfs).
				1356	*
				1357	* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG
				1358	*/
David Hildenbrand	b611719	2020-10-15 20:08:44 -0700	[diff] [blame]	1359	int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
Yasunori Goto	bc02af9	2006-06-27 02:53:30 -0700	[diff] [blame]	1360	{
Catalin Marinas	d15dfd3	2021-03-09 12:26:01 +0000	[diff] [blame]	1361	struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	1362	struct vmem_altmap mhp_altmap = {};
David Hildenbrand	028fc57	2021-09-07 19:55:26 -0700	[diff] [blame]	1363	struct memory_group *group = NULL;
David Vrabel	62cedb9	2015-06-25 16:35:49 +0100	[diff] [blame]	1364	u64 start, size;
Oscar Salvador	b9ff036	2018-08-17 15:46:15 -0700	[diff] [blame]	1365	bool new_node = false;
Yasunori Goto	bc02af9	2006-06-27 02:53:30 -0700	[diff] [blame]	1366	int ret;
				1367
David Vrabel	62cedb9	2015-06-25 16:35:49 +0100	[diff] [blame]	1368	start = res->start;
				1369	size = resource_size(res);
				1370
Toshi Kani	27356f5	2013-09-11 14:21:49 -0700	[diff] [blame]	1371	ret = check_hotplug_memory_range(start, size);
				1372	if (ret)
				1373	return ret;
				1374
David Hildenbrand	028fc57	2021-09-07 19:55:26 -0700	[diff] [blame]	1375	if (mhp_flags & MHP_NID_IS_MGID) {
				1376	group = memory_group_find_by_id(nid);
				1377	if (!group)
				1378	return -EINVAL;
				1379	nid = group->nid;
				1380	}
				1381
Vishal Verma	fa6d9ec	2020-06-04 16:48:25 -0700	[diff] [blame]	1382	if (!node_possible(nid)) {
				1383	WARN(1, "node %d was absent from the node_possible_map\n", nid);
				1384	return -EINVAL;
				1385	}
				1386
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	1387	mem_hotplug_begin();
Nathan Zimmer	ac13c46	2014-01-23 15:53:26 -0800	[diff] [blame]	1388
David Hildenbrand	52219ae	2020-06-04 16:48:38 -0700	[diff] [blame]	1389	if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
				1390	memblock_add_node(start, size, nid);
Tang Chen	7f36e3e	2015-09-04 15:42:32 -0700	[diff] [blame]	1391
David Hildenbrand	c68ab18	2020-06-04 16:48:35 -0700	[diff] [blame]	1392	ret = __try_online_node(nid, false);
Oscar Salvador	b9ff036	2018-08-17 15:46:15 -0700	[diff] [blame]	1393	if (ret < 0)
				1394	goto error;
				1395	new_node = ret;
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1396
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	1397	/*
				1398	* Self hosted memmap array
				1399	*/
				1400	if (mhp_flags & MHP_MEMMAP_ON_MEMORY) {
				1401	if (!mhp_supports_memmap_on_memory(size)) {
				1402	ret = -EINVAL;
				1403	goto error;
				1404	}
				1405	mhp_altmap.free = PHYS_PFN(size);
				1406	mhp_altmap.base_pfn = PHYS_PFN(start);
				1407	params.altmap = &mhp_altmap;
				1408	}
				1409
Yasunori Goto	bc02af9	2006-06-27 02:53:30 -0700	[diff] [blame]	1410	/* call arch's memory hotadd */
Logan Gunthorpe	f5637d3	2020-04-10 14:33:21 -0700	[diff] [blame]	1411	ret = arch_add_memory(nid, start, size, &params);
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1412	if (ret < 0)
				1413	goto error;
				1414
David Hildenbrand	db051a0	2019-07-18 15:56:56 -0700	[diff] [blame]	1415	/* create memory block devices after memory was added */
David Hildenbrand	028fc57	2021-09-07 19:55:26 -0700	[diff] [blame]	1416	ret = create_memory_block_devices(start, size, mhp_altmap.alloc,
				1417	group);
David Hildenbrand	db051a0	2019-07-18 15:56:56 -0700	[diff] [blame]	1418	if (ret) {
David Hildenbrand	65a2aa5	2021-09-07 19:55:04 -0700	[diff] [blame]	1419	arch_remove_memory(start, size, NULL);
David Hildenbrand	db051a0	2019-07-18 15:56:56 -0700	[diff] [blame]	1420	goto error;
				1421	}
				1422
Tang Chen	a1e565a	2013-02-22 16:33:18 -0800	[diff] [blame]	1423	if (new_node) {
Oscar Salvador	d5b6f6a3	2018-08-17 15:46:18 -0700	[diff] [blame]	1424	/* If sysfs file of new node can't be created, cpu on the node
Yasunori Goto	0fc4415	2006-06-27 02:53:38 -0700	[diff] [blame]	1425	* can't be hot-added. There is no rollback way now.
				1426	* So, check by BUG_ON() to catch it reluctantly..
Oscar Salvador	d5b6f6a3	2018-08-17 15:46:18 -0700	[diff] [blame]	1427	* We online node here. We can't roll back from here.
Yasunori Goto	0fc4415	2006-06-27 02:53:38 -0700	[diff] [blame]	1428	*/
Oscar Salvador	d5b6f6a3	2018-08-17 15:46:18 -0700	[diff] [blame]	1429	node_set_online(nid);
				1430	ret = __register_one_node(nid);
Yasunori Goto	0fc4415	2006-06-27 02:53:38 -0700	[diff] [blame]	1431	BUG_ON(ret);
				1432	}
				1433
Oscar Salvador	d5b6f6a3	2018-08-17 15:46:18 -0700	[diff] [blame]	1434	/* link memory sections under this node.*/
Laurent Dufour	90c7eae	2020-10-15 20:09:15 -0700	[diff] [blame]	1435	link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1),
				1436	MEMINIT_HOTPLUG);
Oscar Salvador	d5b6f6a3	2018-08-17 15:46:18 -0700	[diff] [blame]	1437
akpm@linux-foundation.org	d96ae53	2010-03-05 13:41:58 -0800	[diff] [blame]	1438	/* create new memmap entry */
David Hildenbrand	7b7b272	2020-06-04 16:48:41 -0700	[diff] [blame]	1439	if (!strcmp(res->name, "System RAM"))
				1440	firmware_map_add_hotplug(start, start + size, "System RAM");
akpm@linux-foundation.org	d96ae53	2010-03-05 13:41:58 -0800	[diff] [blame]	1441
David Hildenbrand	381eab4	2018-10-30 15:10:29 -0700	[diff] [blame]	1442	/* device_online() will take the lock when calling online_pages() */
				1443	mem_hotplug_done();
				1444
David Hildenbrand	9ca6551	2020-10-15 20:08:49 -0700	[diff] [blame]	1445	/*
				1446	* In case we're allowed to merge the resource, flag it and trigger
				1447	* merging now that adding succeeded.
				1448	*/
David Hildenbrand	2601126	2021-02-25 17:17:17 -0800	[diff] [blame]	1449	if (mhp_flags & MHP_MERGE_RESOURCE)
David Hildenbrand	9ca6551	2020-10-15 20:08:49 -0700	[diff] [blame]	1450	merge_system_ram_resource(res);
				1451
Vitaly Kuznetsov	31bc385	2016-03-15 14:56:48 -0700	[diff] [blame]	1452	/* online pages if requested */
Anshuman Khandual	1adf8b4	2021-02-25 17:17:13 -0800	[diff] [blame]	1453	if (mhp_default_online_type != MMOP_OFFLINE)
David Hildenbrand	fbcf73c	2019-07-18 15:57:46 -0700	[diff] [blame]	1454	walk_memory_blocks(start, size, NULL, online_memory_block);
Vitaly Kuznetsov	31bc385	2016-03-15 14:56:48 -0700	[diff] [blame]	1455
David Hildenbrand	381eab4	2018-10-30 15:10:29 -0700	[diff] [blame]	1456	return ret;
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1457	error:
				1458	/* rollback pgdat allocation and others */
Oscar Salvador	b9ff036	2018-08-17 15:46:15 -0700	[diff] [blame]	1459	if (new_node)
				1460	rollback_node_hotadd(nid);
David Hildenbrand	52219ae	2020-06-04 16:48:38 -0700	[diff] [blame]	1461	if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
				1462	memblock_remove(start, size);
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	1463	mem_hotplug_done();
Yasunori Goto	bc02af9	2006-06-27 02:53:30 -0700	[diff] [blame]	1464	return ret;
				1465	}
David Vrabel	62cedb9	2015-06-25 16:35:49 +0100	[diff] [blame]	1466
David Hildenbrand	8df1d0e	2018-10-30 15:10:24 -0700	[diff] [blame]	1467	/* requires device_hotplug_lock, see add_memory_resource() */
David Hildenbrand	b611719	2020-10-15 20:08:44 -0700	[diff] [blame]	1468	int __ref __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags)
David Vrabel	62cedb9	2015-06-25 16:35:49 +0100	[diff] [blame]	1469	{
				1470	struct resource *res;
				1471	int ret;
				1472
David Hildenbrand	7b7b272	2020-06-04 16:48:41 -0700	[diff] [blame]	1473	res = register_memory_resource(start, size, "System RAM");
Vitaly Kuznetsov	6f754ba	2016-01-14 15:21:55 -0800	[diff] [blame]	1474	if (IS_ERR(res))
				1475	return PTR_ERR(res);
David Vrabel	62cedb9	2015-06-25 16:35:49 +0100	[diff] [blame]	1476
David Hildenbrand	b611719	2020-10-15 20:08:44 -0700	[diff] [blame]	1477	ret = add_memory_resource(nid, res, mhp_flags);
David Vrabel	62cedb9	2015-06-25 16:35:49 +0100	[diff] [blame]	1478	if (ret < 0)
				1479	release_memory_resource(res);
				1480	return ret;
				1481	}
David Hildenbrand	8df1d0e	2018-10-30 15:10:24 -0700	[diff] [blame]	1482
David Hildenbrand	b611719	2020-10-15 20:08:44 -0700	[diff] [blame]	1483	int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags)
David Hildenbrand	8df1d0e	2018-10-30 15:10:24 -0700	[diff] [blame]	1484	{
				1485	int rc;
				1486
				1487	lock_device_hotplug();
David Hildenbrand	b611719	2020-10-15 20:08:44 -0700	[diff] [blame]	1488	rc = __add_memory(nid, start, size, mhp_flags);
David Hildenbrand	8df1d0e	2018-10-30 15:10:24 -0700	[diff] [blame]	1489	unlock_device_hotplug();
				1490
				1491	return rc;
				1492	}
Yasunori Goto	bc02af9	2006-06-27 02:53:30 -0700	[diff] [blame]	1493	EXPORT_SYMBOL_GPL(add_memory);
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1494
David Hildenbrand	7b7b272	2020-06-04 16:48:41 -0700	[diff] [blame]	1495	/*
				1496	* Add special, driver-managed memory to the system as system RAM. Such
				1497	* memory is not exposed via the raw firmware-provided memmap as system
				1498	* RAM, instead, it is detected and added by a driver - during cold boot,
				1499	* after a reboot, and after kexec.
				1500	*
				1501	* Reasons why this memory should not be used for the initial memmap of a
				1502	* kexec kernel or for placing kexec images:
				1503	* - The booting kernel is in charge of determining how this memory will be
				1504	* used (e.g., use persistent memory as system RAM)
				1505	* - Coordination with a hypervisor is required before this memory
				1506	* can be used (e.g., inaccessible parts).
				1507	*
				1508	* For this memory, no entries in /sys/firmware/memmap ("raw firmware-provided
				1509	* memory map") are created. Also, the created memory resource is flagged
David Hildenbrand	7cf603d	2020-10-15 20:08:33 -0700	[diff] [blame]	1510	* with IORESOURCE_SYSRAM_DRIVER_MANAGED, so in-kernel users can special-case
David Hildenbrand	7b7b272	2020-06-04 16:48:41 -0700	[diff] [blame]	1511	* this memory as well (esp., not place kexec images onto it).
				1512	*
				1513	* The resource_name (visible via /proc/iomem) has to have the format
				1514	* "System RAM ($DRIVER)".
				1515	*/
				1516	int add_memory_driver_managed(int nid, u64 start, u64 size,
David Hildenbrand	b611719	2020-10-15 20:08:44 -0700	[diff] [blame]	1517	const char *resource_name, mhp_t mhp_flags)
David Hildenbrand	7b7b272	2020-06-04 16:48:41 -0700	[diff] [blame]	1518	{
				1519	struct resource *res;
				1520	int rc;
				1521
				1522	if (!resource_name \|\|
				1523	strstr(resource_name, "System RAM (") != resource_name \|\|
				1524	resource_name[strlen(resource_name) - 1] != ')')
				1525	return -EINVAL;
				1526
				1527	lock_device_hotplug();
				1528
				1529	res = register_memory_resource(start, size, resource_name);
				1530	if (IS_ERR(res)) {
				1531	rc = PTR_ERR(res);
				1532	goto out_unlock;
				1533	}
				1534
David Hildenbrand	b611719	2020-10-15 20:08:44 -0700	[diff] [blame]	1535	rc = add_memory_resource(nid, res, mhp_flags);
David Hildenbrand	7b7b272	2020-06-04 16:48:41 -0700	[diff] [blame]	1536	if (rc < 0)
				1537	release_memory_resource(res);
				1538
				1539	out_unlock:
				1540	unlock_device_hotplug();
				1541	return rc;
				1542	}
				1543	EXPORT_SYMBOL_GPL(add_memory_driver_managed);
				1544
Anshuman Khandual	bca3fea	2021-02-25 17:17:33 -0800	[diff] [blame]	1545	/*
				1546	* Platforms should define arch_get_mappable_range() that provides
				1547	* maximum possible addressable physical memory range for which the
				1548	* linear mapping could be created. The platform returned address
				1549	* range must adhere to these following semantics.
				1550	*
				1551	* - range.start <= range.end
				1552	* - Range includes both end points [range.start..range.end]
				1553	*
				1554	* There is also a fallback definition provided here, allowing the
				1555	* entire possible physical address range in case any platform does
				1556	* not define arch_get_mappable_range().
				1557	*/
				1558	struct range __weak arch_get_mappable_range(void)
				1559	{
				1560	struct range mhp_range = {
				1561	.start = 0UL,
				1562	.end = -1ULL,
				1563	};
				1564	return mhp_range;
				1565	}
				1566
				1567	struct range mhp_get_pluggable_range(bool need_mapping)
				1568	{
				1569	const u64 max_phys = (1ULL << MAX_PHYSMEM_BITS) - 1;
				1570	struct range mhp_range;
				1571
				1572	if (need_mapping) {
				1573	mhp_range = arch_get_mappable_range();
				1574	if (mhp_range.start > max_phys) {
				1575	mhp_range.start = 0;
				1576	mhp_range.end = 0;
				1577	}
				1578	mhp_range.end = min_t(u64, mhp_range.end, max_phys);
				1579	} else {
				1580	mhp_range.start = 0;
				1581	mhp_range.end = max_phys;
				1582	}
				1583	return mhp_range;
				1584	}
				1585	EXPORT_SYMBOL_GPL(mhp_get_pluggable_range);
				1586
				1587	bool mhp_range_allowed(u64 start, u64 size, bool need_mapping)
				1588	{
				1589	struct range mhp_range = mhp_get_pluggable_range(need_mapping);
				1590	u64 end = start + size;
				1591
				1592	if (start < end && start >= mhp_range.start && (end - 1) <= mhp_range.end)
				1593	return true;
				1594
				1595	pr_warn("Hotplug memory [%#llx-%#llx] exceeds maximum addressable range [%#llx-%#llx]\n",
				1596	start, end, mhp_range.start, mhp_range.end);
				1597	return false;
				1598	}
				1599
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1600	#ifdef CONFIG_MEMORY_HOTREMOVE
				1601	/*
David Hildenbrand	9291799	2020-02-03 17:34:26 -0800	[diff] [blame]	1602	* Confirm all pages in a range [start, end) belong to the same zone (skipping
				1603	* memory holes). When true, return the zone.
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1604	*/
David Hildenbrand	9291799	2020-02-03 17:34:26 -0800	[diff] [blame]	1605	struct zone *test_pages_in_a_zone(unsigned long start_pfn,
				1606	unsigned long end_pfn)
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1607	{
Andrew Banman	5f0f288	2015-12-29 14:54:25 -0800	[diff] [blame]	1608	unsigned long pfn, sec_end_pfn;
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1609	struct zone *zone = NULL;
				1610	struct page *page;
Mike Rapoport	673d40c	2021-09-07 19:54:55 -0700	[diff] [blame]	1611
Toshi Kani	deb88a2	2017-02-03 13:13:20 -0800	[diff] [blame]	1612	for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn + 1);
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1613	pfn < end_pfn;
Toshi Kani	deb88a2	2017-02-03 13:13:20 -0800	[diff] [blame]	1614	pfn = sec_end_pfn, sec_end_pfn += PAGES_PER_SECTION) {
Andrew Banman	5f0f288	2015-12-29 14:54:25 -0800	[diff] [blame]	1615	/* Make sure the memory section is present first */
				1616	if (!present_section_nr(pfn_to_section_nr(pfn)))
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1617	continue;
Andrew Banman	5f0f288	2015-12-29 14:54:25 -0800	[diff] [blame]	1618	for (; pfn < sec_end_pfn && pfn < end_pfn;
				1619	pfn += MAX_ORDER_NR_PAGES) {
Mikhail Zaslonko	24feb47	2019-02-01 14:20:38 -0800	[diff] [blame]	1620	/* Check if we got outside of the zone */
Mike Rapoport	673d40c	2021-09-07 19:54:55 -0700	[diff] [blame]	1621	if (zone && !zone_spans_pfn(zone, pfn))
David Hildenbrand	9291799	2020-02-03 17:34:26 -0800	[diff] [blame]	1622	return NULL;
Mike Rapoport	673d40c	2021-09-07 19:54:55 -0700	[diff] [blame]	1623	page = pfn_to_page(pfn);
Andrew Banman	5f0f288	2015-12-29 14:54:25 -0800	[diff] [blame]	1624	if (zone && page_zone(page) != zone)
David Hildenbrand	9291799	2020-02-03 17:34:26 -0800	[diff] [blame]	1625	return NULL;
Andrew Banman	5f0f288	2015-12-29 14:54:25 -0800	[diff] [blame]	1626	zone = page_zone(page);
				1627	}
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1628	}
Toshi Kani	deb88a2	2017-02-03 13:13:20 -0800	[diff] [blame]	1629
David Hildenbrand	9291799	2020-02-03 17:34:26 -0800	[diff] [blame]	1630	return zone;
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1631	}
				1632
				1633	/*
Yisheng Xie	0efadf4	2017-02-24 14:57:39 -0800	[diff] [blame]	1634	* Scan pfn range [start,end) to find movable/migratable pages (LRU pages,
David Hildenbrand	aa21879	2020-05-07 16:01:30 +0200	[diff] [blame]	1635	* non-lru movable pages and hugepages). Will skip over most unmovable
				1636	* pages (esp., pages that can be skipped when offlining), but bail out on
				1637	* definitely unmovable pages.
				1638	*
				1639	* Returns:
				1640	* 0 in case a movable page is found and movable_pfn was updated.
				1641	* -ENOENT in case no movable page was found.
				1642	* -EBUSY in case a definitely unmovable page was found.
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1643	*/
David Hildenbrand	aa21879	2020-05-07 16:01:30 +0200	[diff] [blame]	1644	static int scan_movable_pages(unsigned long start, unsigned long end,
				1645	unsigned long *movable_pfn)
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1646	{
				1647	unsigned long pfn;
Oscar Salvador	eeb0efd	2019-02-01 14:20:47 -0800	[diff] [blame]	1648
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1649	for (pfn = start; pfn < end; pfn++) {
Oscar Salvador	eeb0efd	2019-02-01 14:20:47 -0800	[diff] [blame]	1650	struct page page, head;
				1651	unsigned long skip;
				1652
				1653	if (!pfn_valid(pfn))
				1654	continue;
				1655	page = pfn_to_page(pfn);
				1656	if (PageLRU(page))
David Hildenbrand	aa21879	2020-05-07 16:01:30 +0200	[diff] [blame]	1657	goto found;
Oscar Salvador	eeb0efd	2019-02-01 14:20:47 -0800	[diff] [blame]	1658	if (__PageMovable(page))
David Hildenbrand	aa21879	2020-05-07 16:01:30 +0200	[diff] [blame]	1659	goto found;
				1660
				1661	/*
				1662	* PageOffline() pages that are not marked __PageMovable() and
				1663	* have a reference count > 0 (after MEM_GOING_OFFLINE) are
				1664	* definitely unmovable. If their reference count would be 0,
				1665	* they could at least be skipped when offlining memory.
				1666	*/
				1667	if (PageOffline(page) && page_count(page))
				1668	return -EBUSY;
Oscar Salvador	eeb0efd	2019-02-01 14:20:47 -0800	[diff] [blame]	1669
				1670	if (!PageHuge(page))
				1671	continue;
				1672	head = compound_head(page);
Mike Kravetz	8f251a3	2021-02-24 12:08:56 -0800	[diff] [blame]	1673	/*
				1674	* This test is racy as we hold no reference or lock. The
				1675	* hugetlb page could have been free'ed and head is no longer
				1676	* a hugetlb page before the following check. In such unlikely
				1677	* cases false positives and negatives are possible. Calling
				1678	* code must deal with these scenarios.
				1679	*/
				1680	if (HPageMigratable(head))
David Hildenbrand	aa21879	2020-05-07 16:01:30 +0200	[diff] [blame]	1681	goto found;
Matthew Wilcox (Oracle)	d8c6546	2019-09-23 15:34:30 -0700	[diff] [blame]	1682	skip = compound_nr(head) - (page - head);
Oscar Salvador	eeb0efd	2019-02-01 14:20:47 -0800	[diff] [blame]	1683	pfn += skip - 1;
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1684	}
David Hildenbrand	aa21879	2020-05-07 16:01:30 +0200	[diff] [blame]	1685	return -ENOENT;
				1686	found:
				1687	*movable_pfn = pfn;
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1688	return 0;
				1689	}
				1690
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1691	static int
				1692	do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
				1693	{
				1694	unsigned long pfn;
Matthew Wilcox (Oracle)	6c35784	2020-08-14 17:30:37 -0700	[diff] [blame]	1695	struct page page, head;
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1696	int ret = 0;
				1697	LIST_HEAD(source);
Liam Mark	786dee8	2021-06-30 18:52:43 -0700	[diff] [blame]	1698	static DEFINE_RATELIMIT_STATE(migrate_rs, DEFAULT_RATELIMIT_INTERVAL,
				1699	DEFAULT_RATELIMIT_BURST);
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1700
Michal Hocko	a85009c	2018-12-28 00:38:29 -0800	[diff] [blame]	1701	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1702	if (!pfn_valid(pfn))
				1703	continue;
				1704	page = pfn_to_page(pfn);
Matthew Wilcox (Oracle)	6c35784	2020-08-14 17:30:37 -0700	[diff] [blame]	1705	head = compound_head(page);
Naoya Horiguchi	c8721bb	2013-09-11 14:22:09 -0700	[diff] [blame]	1706
				1707	if (PageHuge(page)) {
Matthew Wilcox (Oracle)	d8c6546	2019-09-23 15:34:30 -0700	[diff] [blame]	1708	pfn = page_to_pfn(head) + compound_nr(head) - 1;
Oscar Salvador	daf3538	2019-03-05 15:48:53 -0800	[diff] [blame]	1709	isolate_huge_page(head, &source);
Naoya Horiguchi	c8721bb	2013-09-11 14:22:09 -0700	[diff] [blame]	1710	continue;
Michal Hocko	94723aa	2018-04-10 16:30:07 -0700	[diff] [blame]	1711	} else if (PageTransHuge(page))
Matthew Wilcox (Oracle)	6c35784	2020-08-14 17:30:37 -0700	[diff] [blame]	1712	pfn = page_to_pfn(head) + thp_nr_pages(page) - 1;
Naoya Horiguchi	c8721bb	2013-09-11 14:22:09 -0700	[diff] [blame]	1713
Michal Hocko	b15c872	2018-12-28 00:38:01 -0800	[diff] [blame]	1714	/*
				1715	* HWPoison pages have elevated reference counts so the migration would
				1716	* fail on them. It also doesn't make any sense to migrate them in the
				1717	* first place. Still try to unmap such a page in case it is still mapped
				1718	* (e.g. current hwpoison implementation doesn't unmap KSM pages but keep
				1719	* the unmap as the catch all safety net).
				1720	*/
				1721	if (PageHWPoison(page)) {
				1722	if (WARN_ON(PageLRU(page)))
				1723	isolate_lru_page(page);
				1724	if (page_mapped(page))
Shakeel Butt	013339d	2020-12-14 19:06:39 -0800	[diff] [blame]	1725	try_to_unmap(page, TTU_IGNORE_MLOCK);
Michal Hocko	b15c872	2018-12-28 00:38:01 -0800	[diff] [blame]	1726	continue;
				1727	}
				1728
Konstantin Khlebnikov	700c2a4	2011-05-24 17:12:19 -0700	[diff] [blame]	1729	if (!get_page_unless_zero(page))
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1730	continue;
				1731	/*
Yisheng Xie	0efadf4	2017-02-24 14:57:39 -0800	[diff] [blame]	1732	* We can skip free pages. And we can deal with pages on
				1733	* LRU and non-lru movable pages.
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1734	*/
Yisheng Xie	0efadf4	2017-02-24 14:57:39 -0800	[diff] [blame]	1735	if (PageLRU(page))
				1736	ret = isolate_lru_page(page);
				1737	else
				1738	ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1739	if (!ret) { /* Success */
Nick Piggin	62695a8	2008-10-18 20:26:09 -0700	[diff] [blame]	1740	list_add_tail(&page->lru, &source);
Yisheng Xie	0efadf4	2017-02-24 14:57:39 -0800	[diff] [blame]	1741	if (!__PageMovable(page))
				1742	inc_node_page_state(page, NR_ISOLATED_ANON +
Huang Ying	9de4f22	2020-04-06 20:04:41 -0700	[diff] [blame]	1743	page_is_file_lru(page));
KOSAKI Motohiro	6d9c285	2009-12-14 17:58:11 -0800	[diff] [blame]	1744
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1745	} else {
Liam Mark	786dee8	2021-06-30 18:52:43 -0700	[diff] [blame]	1746	if (__ratelimit(&migrate_rs)) {
				1747	pr_warn("failed to isolate pfn %lx\n", pfn);
				1748	dump_page(page, "isolation failed");
				1749	}
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1750	}
Oscar Salvador	1723058	2019-02-01 14:19:57 -0800	[diff] [blame]	1751	put_page(page);
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1752	}
Bob Liu	f3ab263	2010-10-26 14:22:10 -0700	[diff] [blame]	1753	if (!list_empty(&source)) {
Joonsoo Kim	203e6e5	2020-10-17 16:14:00 -0700	[diff] [blame]	1754	nodemask_t nmask = node_states[N_MEMORY];
				1755	struct migration_target_control mtc = {
				1756	.nmask = &nmask,
				1757	.gfp_mask = GFP_USER \| __GFP_MOVABLE \| __GFP_RETRY_MAYFAIL,
				1758	};
				1759
				1760	/*
				1761	* We have checked that migration range is on a single zone so
				1762	* we can use the nid of the first page to all the others.
				1763	*/
				1764	mtc.nid = page_to_nid(list_first_entry(&source, struct page, lru));
				1765
				1766	/*
				1767	* try to allocate from a different node but reuse this node
				1768	* if there are no other online nodes to be used (e.g. we are
				1769	* offlining a part of the only existing node)
				1770	*/
				1771	node_clear(mtc.nid, nmask);
				1772	if (nodes_empty(nmask))
				1773	node_set(mtc.nid, nmask);
				1774	ret = migrate_pages(&source, alloc_migration_target, NULL,
				1775	(unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
Michal Hocko	2932c8b	2018-12-28 00:33:53 -0800	[diff] [blame]	1776	if (ret) {
				1777	list_for_each_entry(page, &source, lru) {
Liam Mark	786dee8	2021-06-30 18:52:43 -0700	[diff] [blame]	1778	if (__ratelimit(&migrate_rs)) {
				1779	pr_warn("migrating pfn %lx failed ret:%d\n",
				1780	page_to_pfn(page), ret);
				1781	dump_page(page, "migration failure");
				1782	}
Michal Hocko	2932c8b	2018-12-28 00:33:53 -0800	[diff] [blame]	1783	}
Naoya Horiguchi	c8721bb	2013-09-11 14:22:09 -0700	[diff] [blame]	1784	putback_movable_pages(&source);
Michal Hocko	2932c8b	2018-12-28 00:33:53 -0800	[diff] [blame]	1785	}
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1786	}
Oscar Salvador	1723058	2019-02-01 14:19:57 -0800	[diff] [blame]	1787
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1788	return ret;
				1789	}
				1790
Tang Chen	c532092	2013-11-12 15:08:10 -0800	[diff] [blame]	1791	static int __init cmdline_parse_movable_node(char *p)
				1792	{
Tang Chen	55ac590	2014-01-21 15:49:35 -0800	[diff] [blame]	1793	movable_node_enabled = true;
Tang Chen	c532092	2013-11-12 15:08:10 -0800	[diff] [blame]	1794	return 0;
				1795	}
				1796	early_param("movable_node", cmdline_parse_movable_node);
				1797
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1798	/* check which state of node_states will be changed when offline memory */
				1799	static void node_states_check_changes_offline(unsigned long nr_pages,
				1800	struct zone zone, struct memory_notify arg)
				1801	{
				1802	struct pglist_data *pgdat = zone->zone_pgdat;
				1803	unsigned long present_pages = 0;
Oscar Salvador	86b27be	2018-10-26 15:07:38 -0700	[diff] [blame]	1804	enum zone_type zt;
				1805
Anshuman Khandual	98fa15f	2019-03-05 15:42:58 -0800	[diff] [blame]	1806	arg->status_change_nid = NUMA_NO_NODE;
				1807	arg->status_change_nid_normal = NUMA_NO_NODE;
				1808	arg->status_change_nid_high = NUMA_NO_NODE;
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1809
				1810	/*
Oscar Salvador	86b27be	2018-10-26 15:07:38 -0700	[diff] [blame]	1811	* Check whether node_states[N_NORMAL_MEMORY] will be changed.
				1812	* If the memory to be offline is within the range
				1813	* [0..ZONE_NORMAL], and it is the last present memory there,
				1814	* the zones in that range will become empty after the offlining,
				1815	* thus we can determine that we need to clear the node from
				1816	* node_states[N_NORMAL_MEMORY].
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1817	*/
Oscar Salvador	86b27be	2018-10-26 15:07:38 -0700	[diff] [blame]	1818	for (zt = 0; zt <= ZONE_NORMAL; zt++)
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1819	present_pages += pgdat->node_zones[zt].present_pages;
Oscar Salvador	86b27be	2018-10-26 15:07:38 -0700	[diff] [blame]	1820	if (zone_idx(zone) <= ZONE_NORMAL && nr_pages >= present_pages)
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1821	arg->status_change_nid_normal = zone_to_nid(zone);
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1822
Lai Jiangshan	6715ddf	2012-12-12 13:51:49 -0800	[diff] [blame]	1823	#ifdef CONFIG_HIGHMEM
				1824	/*
Oscar Salvador	86b27be	2018-10-26 15:07:38 -0700	[diff] [blame]	1825	* node_states[N_HIGH_MEMORY] contains nodes which
				1826	* have normal memory or high memory.
				1827	* Here we add the present_pages belonging to ZONE_HIGHMEM.
				1828	* If the zone is within the range of [0..ZONE_HIGHMEM), and
				1829	* we determine that the zones in that range become empty,
				1830	* we need to clear the node for N_HIGH_MEMORY.
Lai Jiangshan	6715ddf	2012-12-12 13:51:49 -0800	[diff] [blame]	1831	*/
Oscar Salvador	86b27be	2018-10-26 15:07:38 -0700	[diff] [blame]	1832	present_pages += pgdat->node_zones[ZONE_HIGHMEM].present_pages;
				1833	if (zone_idx(zone) <= ZONE_HIGHMEM && nr_pages >= present_pages)
Lai Jiangshan	6715ddf	2012-12-12 13:51:49 -0800	[diff] [blame]	1834	arg->status_change_nid_high = zone_to_nid(zone);
Lai Jiangshan	6715ddf	2012-12-12 13:51:49 -0800	[diff] [blame]	1835	#endif
				1836
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1837	/*
Oscar Salvador	86b27be	2018-10-26 15:07:38 -0700	[diff] [blame]	1838	* We have accounted the pages from [0..ZONE_NORMAL), and
				1839	* in case of CONFIG_HIGHMEM the pages from ZONE_HIGHMEM
				1840	* as well.
				1841	* Here we count the possible pages from ZONE_MOVABLE.
				1842	* If after having accounted all the pages, we see that the nr_pages
				1843	* to be offlined is over or equal to the accounted pages,
				1844	* we know that the node will become empty, and so, we can clear
				1845	* it for N_MEMORY as well.
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1846	*/
Oscar Salvador	86b27be	2018-10-26 15:07:38 -0700	[diff] [blame]	1847	present_pages += pgdat->node_zones[ZONE_MOVABLE].present_pages;
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1848
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1849	if (nr_pages >= present_pages)
				1850	arg->status_change_nid = zone_to_nid(zone);
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1851	}
				1852
				1853	static void node_states_clear_node(int node, struct memory_notify *arg)
				1854	{
				1855	if (arg->status_change_nid_normal >= 0)
				1856	node_clear_state(node, N_NORMAL_MEMORY);
				1857
Oscar Salvador	cf01f6f5	2018-10-26 15:07:28 -0700	[diff] [blame]	1858	if (arg->status_change_nid_high >= 0)
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1859	node_clear_state(node, N_HIGH_MEMORY);
Lai Jiangshan	6715ddf	2012-12-12 13:51:49 -0800	[diff] [blame]	1860
Oscar Salvador	cf01f6f5	2018-10-26 15:07:28 -0700	[diff] [blame]	1861	if (arg->status_change_nid >= 0)
Lai Jiangshan	6715ddf	2012-12-12 13:51:49 -0800	[diff] [blame]	1862	node_clear_state(node, N_MEMORY);
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1863	}
				1864
/*
 * Range-walk callback that accumulates @nr_pages into the unsigned long
 * counter @data points to. @start_pfn is unused. Always returns 0 so the
 * walk is never aborted by this callback.
 */
static int count_system_ram_pages_cb(unsigned long start_pfn,
				     unsigned long nr_pages, void *data)
{
	unsigned long *nr_system_ram_pages = data;

	*nr_system_ram_pages += nr_pages;
	return 0;
}
				1873
David Hildenbrand	836809e	2021-09-07 19:55:30 -0700	[diff] [blame]	1874	int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
				1875	struct memory_group *group)
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1876	{
David Hildenbrand	73a11c9	2020-10-15 20:07:46 -0700	[diff] [blame]	1877	const unsigned long end_pfn = start_pfn + nr_pages;
David Hildenbrand	0a1a9a0	2020-10-15 20:07:54 -0700	[diff] [blame]	1878	unsigned long pfn, system_ram_pages = 0;
Cody P Schafer	d702909	2013-07-03 15:02:11 -0700	[diff] [blame]	1879	unsigned long flags;
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1880	struct zone *zone;
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	1881	struct memory_notify arg;
David Hildenbrand	ea15153	2020-10-15 20:08:03 -0700	[diff] [blame]	1882	int ret, node;
Michal Hocko	7960509	2018-12-28 00:33:49 -0800	[diff] [blame]	1883	char *reason;
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1884
Oscar Salvador	dd8e2f2	2021-05-04 18:39:36 -0700	[diff] [blame]	1885	/*
				1886	* {on,off}lining is constrained to full memory sections (or more
Zhen Lei	041711c	2021-06-30 18:53:17 -0700	[diff] [blame]	1887	* precisely to memory blocks from the user space POV).
Oscar Salvador	dd8e2f2	2021-05-04 18:39:36 -0700	[diff] [blame]	1888	* memmap_on_memory is an exception because it reserves initial part
				1889	* of the physical memory space for vmemmaps. That space is pageblock
				1890	* aligned.
				1891	*/
David Hildenbrand	4986fac	2020-10-15 20:07:50 -0700	[diff] [blame]	1892	if (WARN_ON_ONCE(!nr_pages \|\|
Oscar Salvador	dd8e2f2	2021-05-04 18:39:36 -0700	[diff] [blame]	1893	!IS_ALIGNED(start_pfn, pageblock_nr_pages) \|\|
				1894	!IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION)))
David Hildenbrand	4986fac	2020-10-15 20:07:50 -0700	[diff] [blame]	1895	return -EINVAL;
				1896
David Hildenbrand	381eab4	2018-10-30 15:10:29 -0700	[diff] [blame]	1897	mem_hotplug_begin();
				1898
David Hildenbrand	c5e79ef	2019-11-30 17:54:17 -0800	[diff] [blame]	1899	/*
				1900	* Don't allow to offline memory blocks that contain holes.
				1901	* Consequently, memory blocks with holes can never get onlined
				1902	* via the hotplug path - online_pages() - as hotplugged memory has
				1903	* no holes. This way, we e.g., don't have to worry about marking
				1904	* memory holes PG_reserved, don't need pfn_valid() checks, and can
				1905	* avoid using walk_system_ram_range() later.
				1906	*/
David Hildenbrand	73a11c9	2020-10-15 20:07:46 -0700	[diff] [blame]	1907	walk_system_ram_range(start_pfn, nr_pages, &system_ram_pages,
David Hildenbrand	c5e79ef	2019-11-30 17:54:17 -0800	[diff] [blame]	1908	count_system_ram_pages_cb);
David Hildenbrand	73a11c9	2020-10-15 20:07:46 -0700	[diff] [blame]	1909	if (system_ram_pages != nr_pages) {
David Hildenbrand	c5e79ef	2019-11-30 17:54:17 -0800	[diff] [blame]	1910	ret = -EINVAL;
				1911	reason = "memory holes";
				1912	goto failed_removal;
				1913	}
				1914
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1915	/* This makes hotplug much easier...and readable.
				1916	we assume this for now. .*/
David Hildenbrand	9291799	2020-02-03 17:34:26 -0800	[diff] [blame]	1917	zone = test_pages_in_a_zone(start_pfn, end_pfn);
				1918	if (!zone) {
Michal Hocko	7960509	2018-12-28 00:33:49 -0800	[diff] [blame]	1919	ret = -EINVAL;
				1920	reason = "multizone range";
				1921	goto failed_removal;
David Hildenbrand	381eab4	2018-10-30 15:10:29 -0700	[diff] [blame]	1922	}
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	1923	node = zone_to_nid(zone);
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	1924
Vlastimil Babka	ec6e8c7e	2020-12-14 19:10:59 -0800	[diff] [blame]	1925	/*
				1926	* Disable pcplists so that page isolation cannot race with freeing
				1927	* in a way that pages from isolated pageblock are left on pcplists.
				1928	*/
				1929	zone_pcp_disable(zone);
Minchan Kim	d479960e	2021-05-04 18:36:54 -0700	[diff] [blame]	1930	lru_cache_disable();
Vlastimil Babka	ec6e8c7e	2020-12-14 19:10:59 -0800	[diff] [blame]	1931
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1932	/* set above range as isolated */
Wen Congyang	b023f46	2012-12-11 16:00:45 -0800	[diff] [blame]	1933	ret = start_isolate_page_range(start_pfn, end_pfn,
Michal Hocko	d381c54	2018-12-28 00:33:56 -0800	[diff] [blame]	1934	MIGRATE_MOVABLE,
David Hildenbrand	756d25b	2019-11-30 17:54:07 -0800	[diff] [blame]	1935	MEMORY_OFFLINE \| REPORT_FAILURE);
David Hildenbrand	3fa0c7c	2020-10-15 20:08:07 -0700	[diff] [blame]	1936	if (ret) {
Michal Hocko	7960509	2018-12-28 00:33:49 -0800	[diff] [blame]	1937	reason = "failure to isolate range";
Vlastimil Babka	ec6e8c7e	2020-12-14 19:10:59 -0800	[diff] [blame]	1938	goto failed_removal_pcplists_disabled;
David Hildenbrand	381eab4	2018-10-30 15:10:29 -0700	[diff] [blame]	1939	}
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	1940
				1941	arg.start_pfn = start_pfn;
				1942	arg.nr_pages = nr_pages;
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1943	node_states_check_changes_offline(nr_pages, zone, &arg);
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	1944
				1945	ret = memory_notify(MEM_GOING_OFFLINE, &arg);
				1946	ret = notifier_to_errno(ret);
Michal Hocko	7960509	2018-12-28 00:33:49 -0800	[diff] [blame]	1947	if (ret) {
				1948	reason = "notifier failure";
				1949	goto failed_removal_isolated;
				1950	}
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	1951
Michal Hocko	bb8965b	2018-12-28 00:38:32 -0800	[diff] [blame]	1952	do {
David Hildenbrand	aa21879	2020-05-07 16:01:30 +0200	[diff] [blame]	1953	pfn = start_pfn;
				1954	do {
Michal Hocko	bb8965b	2018-12-28 00:38:32 -0800	[diff] [blame]	1955	if (signal_pending(current)) {
				1956	ret = -EINTR;
				1957	reason = "signal backoff";
				1958	goto failed_removal_isolated;
				1959	}
Michal Hocko	72b39cf	2017-11-15 17:33:34 -0800	[diff] [blame]	1960
Michal Hocko	bb8965b	2018-12-28 00:38:32 -0800	[diff] [blame]	1961	cond_resched();
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1962
David Hildenbrand	aa21879	2020-05-07 16:01:30 +0200	[diff] [blame]	1963	ret = scan_movable_pages(pfn, end_pfn, &pfn);
				1964	if (!ret) {
Michal Hocko	bb8965b	2018-12-28 00:38:32 -0800	[diff] [blame]	1965	/*
				1966	* TODO: fatal migration failures should bail
				1967	* out
				1968	*/
				1969	do_migrate_range(pfn, end_pfn);
				1970	}
David Hildenbrand	aa21879	2020-05-07 16:01:30 +0200	[diff] [blame]	1971	} while (!ret);
				1972
				1973	if (ret != -ENOENT) {
				1974	reason = "unmovable page";
				1975	goto failed_removal_isolated;
Michal Hocko	bb8965b	2018-12-28 00:38:32 -0800	[diff] [blame]	1976	}
Michal Hocko	72b39cf	2017-11-15 17:33:34 -0800	[diff] [blame]	1977
Michal Hocko	bb8965b	2018-12-28 00:38:32 -0800	[diff] [blame]	1978	/*
				1979	* Dissolve free hugepages in the memory block before doing
				1980	* offlining actually in order to make hugetlbfs's object
				1981	* counting consistent.
				1982	*/
				1983	ret = dissolve_free_huge_pages(start_pfn, end_pfn);
				1984	if (ret) {
				1985	reason = "failure to dissolve huge pages";
				1986	goto failed_removal_isolated;
				1987	}
David Hildenbrand	0a1a9a0	2020-10-15 20:07:54 -0700	[diff] [blame]	1988
David Hildenbrand	0a1a9a0	2020-10-15 20:07:54 -0700	[diff] [blame]	1989	ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE);
Vlastimil Babka	ec6e8c7e	2020-12-14 19:10:59 -0800	[diff] [blame]	1990
Michal Hocko	5557c76	2019-05-13 17:21:24 -0700	[diff] [blame]	1991	} while (ret);
Michal Hocko	bb8965b	2018-12-28 00:38:32 -0800	[diff] [blame]	1992
David Hildenbrand	0a1a9a0	2020-10-15 20:07:54 -0700	[diff] [blame]	1993	/* Mark all sections offline and remove free pages from the buddy. */
				1994	__offline_isolated_pages(start_pfn, end_pfn);
Laurent Dufour	7c33023	2020-12-15 20:42:26 -0800	[diff] [blame]	1995	pr_debug("Offlined Pages %ld\n", nr_pages);
David Hildenbrand	0a1a9a0	2020-10-15 20:07:54 -0700	[diff] [blame]	1996
Qian Cai	9b7ea46	2019-03-28 20:43:34 -0700	[diff] [blame]	1997	/*
David Hildenbrand	b30c592	2020-10-15 20:08:23 -0700	[diff] [blame]	1998	* The memory sections are marked offline, and the pageblock flags
				1999	* effectively stale; nobody should be touching them. Fixup the number
				2000	* of isolated pageblocks, memory onlining will properly revert this.
Qian Cai	9b7ea46	2019-03-28 20:43:34 -0700	[diff] [blame]	2001	*/
				2002	spin_lock_irqsave(&zone->lock, flags);
David Hildenbrand	ea15153	2020-10-15 20:08:03 -0700	[diff] [blame]	2003	zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages;
Qian Cai	9b7ea46	2019-03-28 20:43:34 -0700	[diff] [blame]	2004	spin_unlock_irqrestore(&zone->lock, flags);
				2005
Minchan Kim	d479960e	2021-05-04 18:36:54 -0700	[diff] [blame]	2006	lru_cache_enable();
Vlastimil Babka	ec6e8c7e	2020-12-14 19:10:59 -0800	[diff] [blame]	2007	zone_pcp_enable(zone);
				2008
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	2009	/* removal success */
David Hildenbrand	0a1a9a0	2020-10-15 20:07:54 -0700	[diff] [blame]	2010	adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);
David Hildenbrand	836809e	2021-09-07 19:55:30 -0700	[diff] [blame]	2011	adjust_present_page_count(pfn_to_page(start_pfn), group, -nr_pages);
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	2012
Mel Gorman	b92ca18	2021-06-28 19:42:12 -0700	[diff] [blame]	2013	/* reinitialise watermarks and update pcp limits */
KOSAKI Motohiro	1b79acc	2011-05-24 17:11:32 -0700	[diff] [blame]	2014	init_per_zone_wmark_min();
				2015
Xishi Qiu	1e8537b	2012-10-08 16:31:51 -0700	[diff] [blame]	2016	if (!populated_zone(zone)) {
Jiang Liu	340175b	2012-07-31 16:43:32 -0700	[diff] [blame]	2017	zone_pcp_reset(zone);
Michal Hocko	72675e1	2017-09-06 16:20:24 -0700	[diff] [blame]	2018	build_all_zonelists(NULL);
Mel Gorman	b92ca18	2021-06-28 19:42:12 -0700	[diff] [blame]	2019	}
Jiang Liu	340175b	2012-07-31 16:43:32 -0700	[diff] [blame]	2020
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	2021	node_states_clear_node(node, &arg);
Vlastimil Babka	698b1b3	2016-03-17 14:18:08 -0700	[diff] [blame]	2022	if (arg.status_change_nid >= 0) {
David Rientjes	8fe23e0	2009-12-14 17:58:33 -0800	[diff] [blame]	2023	kswapd_stop(node);
Vlastimil Babka	698b1b3	2016-03-17 14:18:08 -0700	[diff] [blame]	2024	kcompactd_stop(node);
				2025	}
Minchan Kim	bce7394	2009-06-16 15:32:50 -0700	[diff] [blame]	2026
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	2027	writeback_set_ratelimit();
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	2028
				2029	memory_notify(MEM_OFFLINE, &arg);
David Hildenbrand	feee6b2	2020-01-04 12:59:33 -0800	[diff] [blame]	2030	remove_pfn_range_from_zone(zone, start_pfn, nr_pages);
David Hildenbrand	381eab4	2018-10-30 15:10:29 -0700	[diff] [blame]	2031	mem_hotplug_done();
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	2032	return 0;
				2033
Michal Hocko	7960509	2018-12-28 00:33:49 -0800	[diff] [blame]	2034	failed_removal_isolated:
				2035	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
Qian Cai	c4efe48	2019-03-28 20:44:16 -0700	[diff] [blame]	2036	memory_notify(MEM_CANCEL_OFFLINE, &arg);
Vlastimil Babka	ec6e8c7e	2020-12-14 19:10:59 -0800	[diff] [blame]	2037	failed_removal_pcplists_disabled:
Miaohe Lin	946746d1	2021-08-25 12:17:55 -0700	[diff] [blame]	2038	lru_cache_enable();
Vlastimil Babka	ec6e8c7e	2020-12-14 19:10:59 -0800	[diff] [blame]	2039	zone_pcp_enable(zone);
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	2040	failed_removal:
Michal Hocko	7960509	2018-12-28 00:33:49 -0800	[diff] [blame]	2041	pr_debug("memory offlining [mem %#010llx-%#010llx] failed due to %s\n",
Chen Yucong	e33e33b	2016-03-17 14:19:35 -0700	[diff] [blame]	2042	(unsigned long long) start_pfn << PAGE_SHIFT,
Michal Hocko	7960509	2018-12-28 00:33:49 -0800	[diff] [blame]	2043	((unsigned long long) end_pfn << PAGE_SHIFT) - 1,
				2044	reason);
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	2045	/* pushback to free area */
David Hildenbrand	381eab4	2018-10-30 15:10:29 -0700	[diff] [blame]	2046	mem_hotplug_done();
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	2047	return ret;
				2048	}
Badari Pulavarty	7108878	2008-10-18 20:25:58 -0700	[diff] [blame]	2049
Xishi Qiu	d6de9d5	2013-11-12 15:07:20 -0800	[diff] [blame]	2050	static int check_memblock_offlined_cb(struct memory_block mem, void arg)
Wen Congyang	bbc76be	2013-02-22 16:32:54 -0800	[diff] [blame]	2051	{
				2052	int ret = !is_memblock_offlined(mem);
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2053	int *nid = arg;
Wen Congyang	bbc76be	2013-02-22 16:32:54 -0800	[diff] [blame]	2054
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2055	*nid = mem->nid;
Randy Dunlap	349daa0	2013-04-29 15:08:49 -0700	[diff] [blame]	2056	if (unlikely(ret)) {
				2057	phys_addr_t beginpa, endpa;
				2058
				2059	beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
David Hildenbrand	b6c88d3	2019-09-23 15:35:49 -0700	[diff] [blame]	2060	endpa = beginpa + memory_block_size_bytes() - 1;
Joe Perches	756a025	2016-03-17 14:19:47 -0700	[diff] [blame]	2061	pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n",
Randy Dunlap	349daa0	2013-04-29 15:08:49 -0700	[diff] [blame]	2062	&beginpa, &endpa);
Wen Congyang	bbc76be	2013-02-22 16:32:54 -0800	[diff] [blame]	2063
Pavel Tatashin	eca499a	2019-07-16 16:30:31 -0700	[diff] [blame]	2064	return -EBUSY;
				2065	}
				2066	return 0;
Wen Congyang	bbc76be	2013-02-22 16:32:54 -0800	[diff] [blame]	2067	}
				2068
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	2069	static int get_nr_vmemmap_pages_cb(struct memory_block mem, void arg)
				2070	{
				2071	/*
				2072	* If not set, continue with the next block.
				2073	*/
				2074	return mem->nr_vmemmap_pages;
				2075	}
				2076
Toshi Kani	0f1cfe9	2013-09-11 14:21:50 -0700	[diff] [blame]	2077	static int check_cpu_on_node(pg_data_t *pgdat)
Tang Chen	60a5a19	2013-02-22 16:33:14 -0800	[diff] [blame]	2078	{
Tang Chen	60a5a19	2013-02-22 16:33:14 -0800	[diff] [blame]	2079	int cpu;
				2080
				2081	for_each_present_cpu(cpu) {
				2082	if (cpu_to_node(cpu) == pgdat->node_id)
				2083	/*
				2084	* the cpu on this node isn't removed, and we can't
				2085	* offline this node.
				2086	*/
				2087	return -EBUSY;
				2088	}
				2089
				2090	return 0;
				2091	}
				2092
David Hildenbrand	2c91f8f	2019-11-15 17:34:57 -0800	[diff] [blame]	2093	static int check_no_memblock_for_node_cb(struct memory_block mem, void arg)
				2094	{
				2095	int nid = (int )arg;
				2096
				2097	/*
				2098	* If a memory block belongs to multiple nodes, the stored nid is not
				2099	* reliable. However, such blocks are always online (e.g., cannot get
				2100	* offlined) and, therefore, are still spanned by the node.
				2101	*/
				2102	return mem->nid == nid ? -EEXIST : 0;
				2103	}
				2104
Toshi Kani	0f1cfe9	2013-09-11 14:21:50 -0700	[diff] [blame]	2105	/**
				2106	* try_offline_node
Mike Rapoport	e8b098f	2018-04-05 16:24:57 -0700	[diff] [blame]	2107	* @nid: the node ID
Toshi Kani	0f1cfe9	2013-09-11 14:21:50 -0700	[diff] [blame]	2108	*
				2109	* Offline a node if all memory sections and cpus of the node are removed.
				2110	*
				2111	* NOTE: The caller must call lock_device_hotplug() to serialize hotplug
				2112	* and online/offline operations before this call.
				2113	*/
Wen Congyang	90b30cd	2013-02-22 16:33:27 -0800	[diff] [blame]	2114	void try_offline_node(int nid)
Tang Chen	60a5a19	2013-02-22 16:33:14 -0800	[diff] [blame]	2115	{
Wen Congyang	d822b86	2013-02-22 16:33:16 -0800	[diff] [blame]	2116	pg_data_t *pgdat = NODE_DATA(nid);
David Hildenbrand	2c91f8f	2019-11-15 17:34:57 -0800	[diff] [blame]	2117	int rc;
Tang Chen	60a5a19	2013-02-22 16:33:14 -0800	[diff] [blame]	2118
David Hildenbrand	2c91f8f	2019-11-15 17:34:57 -0800	[diff] [blame]	2119	/*
				2120	* If the node still spans pages (especially ZONE_DEVICE), don't
				2121	* offline it. A node spans memory after move_pfn_range_to_zone(),
				2122	* e.g., after the memory block was onlined.
				2123	*/
				2124	if (pgdat->node_spanned_pages)
Tang Chen	60a5a19	2013-02-22 16:33:14 -0800	[diff] [blame]	2125	return;
David Hildenbrand	2c91f8f	2019-11-15 17:34:57 -0800	[diff] [blame]	2126
				2127	/*
				2128	* Especially offline memory blocks might not be spanned by the
				2129	* node. They will get spanned by the node once they get onlined.
				2130	* However, they link to the node in sysfs and can get onlined later.
				2131	*/
				2132	rc = for_each_memory_block(&nid, check_no_memblock_for_node_cb);
				2133	if (rc)
				2134	return;
Tang Chen	60a5a19	2013-02-22 16:33:14 -0800	[diff] [blame]	2135
Michal Hocko	46a3679	2018-12-28 00:34:13 -0800	[diff] [blame]	2136	if (check_cpu_on_node(pgdat))
Tang Chen	60a5a19	2013-02-22 16:33:14 -0800	[diff] [blame]	2137	return;
				2138
				2139	/*
				2140	* all memory/cpu of this node are removed, we can offline this
				2141	* node now.
				2142	*/
				2143	node_set_offline(nid);
				2144	unregister_one_node(nid);
				2145	}
Wen Congyang	90b30cd	2013-02-22 16:33:27 -0800	[diff] [blame]	2146	EXPORT_SYMBOL(try_offline_node);
Tang Chen	60a5a19	2013-02-22 16:33:14 -0800	[diff] [blame]	2147
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2148	static int __ref try_remove_memory(u64 start, u64 size)
Wen Congyang	bbc76be	2013-02-22 16:32:54 -0800	[diff] [blame]	2149	{
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	2150	struct vmem_altmap mhp_altmap = {};
				2151	struct vmem_altmap *altmap = NULL;
				2152	unsigned long nr_vmemmap_pages;
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2153	int rc = 0, nid = NUMA_NO_NODE;
Wen Congyang	993c1aa	2013-02-22 16:32:50 -0800	[diff] [blame]	2154
Toshi Kani	27356f5	2013-09-11 14:21:49 -0700	[diff] [blame]	2155	BUG_ON(check_hotplug_memory_range(start, size));
				2156
Yasuaki Ishimatsu	6677e3e	2013-02-22 16:32:52 -0800	[diff] [blame]	2157	/*
Rafael J. Wysocki	242831e	2013-05-27 12:58:46 +0200	[diff] [blame]	2158	* All memory blocks must be offlined before removing memory. Check
Pavel Tatashin	eca499a	2019-07-16 16:30:31 -0700	[diff] [blame]	2159	* whether all memory blocks in question are offline and return error
Rafael J. Wysocki	242831e	2013-05-27 12:58:46 +0200	[diff] [blame]	2160	* if this is not the case.
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2161	*
				2162	* While at it, determine the nid. Note that if we'd have mixed nodes,
				2163	* we'd only try to offline the last determined one -- which is good
				2164	* enough for the cases we care about.
Yasuaki Ishimatsu	6677e3e	2013-02-22 16:32:52 -0800	[diff] [blame]	2165	*/
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2166	rc = walk_memory_blocks(start, size, &nid, check_memblock_offlined_cb);
Pavel Tatashin	eca499a	2019-07-16 16:30:31 -0700	[diff] [blame]	2167	if (rc)
Jia He	b4223a5	2020-08-11 18:32:20 -0700	[diff] [blame]	2168	return rc;
Yasuaki Ishimatsu	6677e3e	2013-02-22 16:32:52 -0800	[diff] [blame]	2169
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	2170	/*
				2171	* We only support removing memory added with MHP_MEMMAP_ON_MEMORY in
				2172	* the same granularity it was added - a single memory block.
				2173	*/
				2174	if (memmap_on_memory) {
				2175	nr_vmemmap_pages = walk_memory_blocks(start, size, NULL,
				2176	get_nr_vmemmap_pages_cb);
				2177	if (nr_vmemmap_pages) {
				2178	if (size != memory_block_size_bytes()) {
				2179	pr_warn("Refuse to remove %#llx - %#llx,"
				2180	"wrong granularity\n",
				2181	start, start + size);
				2182	return -EINVAL;
				2183	}
				2184
				2185	/*
				2186	* Let remove_pmd_table->free_hugepage_table do the
				2187	* right thing if we used vmem_altmap when hot-adding
				2188	* the range.
				2189	*/
				2190	mhp_altmap.alloc = nr_vmemmap_pages;
				2191	altmap = &mhp_altmap;
				2192	}
				2193	}
				2194
Yasuaki Ishimatsu	46c66c4	2013-02-22 16:32:56 -0800	[diff] [blame]	2195	/* remove memmap entry */
				2196	firmware_map_remove(start, start + size, "System RAM");
				2197
Dan Williams	f1037ec	2020-01-30 22:11:17 -0800	[diff] [blame]	2198	/*
				2199	* Memory block device removal under the device_hotplug_lock is
				2200	* a barrier against racing online attempts.
				2201	*/
David Hildenbrand	4c4b7f9	2019-07-18 15:57:06 -0700	[diff] [blame]	2202	remove_memory_block_devices(start, size);
				2203
Dan Williams	f1037ec	2020-01-30 22:11:17 -0800	[diff] [blame]	2204	mem_hotplug_begin();
				2205
David Hildenbrand	65a2aa5	2021-09-07 19:55:04 -0700	[diff] [blame]	2206	arch_remove_memory(start, size, altmap);
David Hildenbrand	52219ae	2020-06-04 16:48:38 -0700	[diff] [blame]	2207
				2208	if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
				2209	memblock_free(start, size);
				2210	memblock_remove(start, size);
				2211	}
				2212
David Hildenbrand	cb8e3c8	2020-10-15 20:09:12 -0700	[diff] [blame]	2213	release_mem_region_adjustable(start, size);
Wen Congyang	24d335c	2013-02-22 16:32:58 -0800	[diff] [blame]	2214
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2215	if (nid != NUMA_NO_NODE)
				2216	try_offline_node(nid);
Tang Chen	60a5a19	2013-02-22 16:33:14 -0800	[diff] [blame]	2217
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	2218	mem_hotplug_done();
Jia He	b4223a5	2020-08-11 18:32:20 -0700	[diff] [blame]	2219	return 0;
Badari Pulavarty	7108878	2008-10-18 20:25:58 -0700	[diff] [blame]	2220	}
David Hildenbrand	d15e592	2018-10-30 15:10:18 -0700	[diff] [blame]	2221
Pavel Tatashin	eca499a	2019-07-16 16:30:31 -0700	[diff] [blame]	2222	/**
Mel Gorman	5640c9c	2021-06-30 18:53:38 -0700	[diff] [blame]	2223	* __remove_memory - Remove memory if every memory block is offline
Pavel Tatashin	eca499a	2019-07-16 16:30:31 -0700	[diff] [blame]	2224	* @start: physical address of the region to remove
				2225	* @size: size of the region to remove
				2226	*
				2227	* NOTE: The caller must call lock_device_hotplug() to serialize hotplug
				2228	* and online/offline operations before this call, as required by
				2229	* try_offline_node().
				2230	*/
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2231	void __remove_memory(u64 start, u64 size)
David Hildenbrand	d15e592	2018-10-30 15:10:18 -0700	[diff] [blame]	2232	{
Pavel Tatashin	eca499a	2019-07-16 16:30:31 -0700	[diff] [blame]	2233
				2234	/*
Souptick Joarder	29a90db	2019-09-23 15:36:18 -0700	[diff] [blame]	2235	* trigger BUG() if some memory is not offlined prior to calling this
Pavel Tatashin	eca499a	2019-07-16 16:30:31 -0700	[diff] [blame]	2236	* function
				2237	*/
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2238	if (try_remove_memory(start, size))
Pavel Tatashin	eca499a	2019-07-16 16:30:31 -0700	[diff] [blame]	2239	BUG();
				2240	}
				2241
				2242	/*
				2243	* Remove memory if every memory block is offline, otherwise return -EBUSY is
				2244	* some memory is not offline
				2245	*/
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2246	int remove_memory(u64 start, u64 size)
Pavel Tatashin	eca499a	2019-07-16 16:30:31 -0700	[diff] [blame]	2247	{
				2248	int rc;
				2249
David Hildenbrand	d15e592	2018-10-30 15:10:18 -0700	[diff] [blame]	2250	lock_device_hotplug();
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2251	rc = try_remove_memory(start, size);
David Hildenbrand	d15e592	2018-10-30 15:10:18 -0700	[diff] [blame]	2252	unlock_device_hotplug();
Pavel Tatashin	eca499a	2019-07-16 16:30:31 -0700	[diff] [blame]	2253
				2254	return rc;
David Hildenbrand	d15e592	2018-10-30 15:10:18 -0700	[diff] [blame]	2255	}
Badari Pulavarty	7108878	2008-10-18 20:25:58 -0700	[diff] [blame]	2256	EXPORT_SYMBOL_GPL(remove_memory);
David Hildenbrand	08b3acd	2020-05-07 16:01:32 +0200	[diff] [blame]	2257
David Hildenbrand	8dc4bb5	2020-11-12 14:38:13 +0100	[diff] [blame]	2258	static int try_offline_memory_block(struct memory_block mem, void arg)
				2259	{
				2260	uint8_t online_type = MMOP_ONLINE_KERNEL;
				2261	uint8_t **online_types = arg;
				2262	struct page *page;
				2263	int rc;
				2264
				2265	/*
				2266	* Sense the online_type via the zone of the memory block. Offlining
				2267	* with multiple zones within one memory block will be rejected
				2268	* by offlining code ... so we don't care about that.
				2269	*/
				2270	page = pfn_to_online_page(section_nr_to_pfn(mem->start_section_nr));
				2271	if (page && zone_idx(page_zone(page)) == ZONE_MOVABLE)
				2272	online_type = MMOP_ONLINE_MOVABLE;
				2273
				2274	rc = device_offline(&mem->dev);
				2275	/*
				2276	* Default is MMOP_OFFLINE - change it only if offlining succeeded,
				2277	* so try_reonline_memory_block() can do the right thing.
				2278	*/
				2279	if (!rc)
				2280	**online_types = online_type;
				2281
				2282	(*online_types)++;
				2283	/* Ignore if already offline. */
				2284	return rc < 0 ? rc : 0;
				2285	}
				2286
				2287	static int try_reonline_memory_block(struct memory_block mem, void arg)
				2288	{
				2289	uint8_t **online_types = arg;
				2290	int rc;
				2291
				2292	if (**online_types != MMOP_OFFLINE) {
				2293	mem->online_type = **online_types;
				2294	rc = device_online(&mem->dev);
				2295	if (rc < 0)
				2296	pr_warn("%s: Failed to re-online memory: %d",
				2297	__func__, rc);
				2298	}
				2299
				2300	/* Continue processing all remaining memory blocks. */
				2301	(*online_types)++;
				2302	return 0;
				2303	}
				2304
David Hildenbrand	08b3acd	2020-05-07 16:01:32 +0200	[diff] [blame]	2305	/*
David Hildenbrand	8dc4bb5	2020-11-12 14:38:13 +0100	[diff] [blame]	2306	* Try to offline and remove memory. Might take a long time to finish in case
				2307	* memory is still in use. Primarily useful for memory devices that logically
				2308	* unplugged all memory (so it's no longer in use) and want to offline + remove
				2309	* that memory.
David Hildenbrand	08b3acd	2020-05-07 16:01:32 +0200	[diff] [blame]	2310	*/
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2311	int offline_and_remove_memory(u64 start, u64 size)
David Hildenbrand	08b3acd	2020-05-07 16:01:32 +0200	[diff] [blame]	2312	{
David Hildenbrand	8dc4bb5	2020-11-12 14:38:13 +0100	[diff] [blame]	2313	const unsigned long mb_count = size / memory_block_size_bytes();
				2314	uint8_t online_types, tmp;
				2315	int rc;
David Hildenbrand	08b3acd	2020-05-07 16:01:32 +0200	[diff] [blame]	2316
				2317	if (!IS_ALIGNED(start, memory_block_size_bytes()) \|\|
David Hildenbrand	8dc4bb5	2020-11-12 14:38:13 +0100	[diff] [blame]	2318	!IS_ALIGNED(size, memory_block_size_bytes()) \|\| !size)
				2319	return -EINVAL;
David Hildenbrand	08b3acd	2020-05-07 16:01:32 +0200	[diff] [blame]	2320
				2321	/*
David Hildenbrand	8dc4bb5	2020-11-12 14:38:13 +0100	[diff] [blame]	2322	* We'll remember the old online type of each memory block, so we can
				2323	* try to revert whatever we did when offlining one memory block fails
				2324	* after offlining some others succeeded.
				2325	*/
				2326	online_types = kmalloc_array(mb_count, sizeof(*online_types),
				2327	GFP_KERNEL);
				2328	if (!online_types)
				2329	return -ENOMEM;
				2330	/*
				2331	* Initialize all states to MMOP_OFFLINE, so when we abort processing in
				2332	* try_offline_memory_block(), we'll skip all unprocessed blocks in
				2333	* try_reonline_memory_block().
				2334	*/
				2335	memset(online_types, MMOP_OFFLINE, mb_count);
				2336
				2337	lock_device_hotplug();
				2338
				2339	tmp = online_types;
				2340	rc = walk_memory_blocks(start, size, &tmp, try_offline_memory_block);
				2341
				2342	/*
				2343	* In case we succeeded to offline all memory, remove it.
David Hildenbrand	08b3acd	2020-05-07 16:01:32 +0200	[diff] [blame]	2344	* This cannot fail as it cannot get onlined in the meantime.
				2345	*/
				2346	if (!rc) {
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2347	rc = try_remove_memory(start, size);
David Hildenbrand	8dc4bb5	2020-11-12 14:38:13 +0100	[diff] [blame]	2348	if (rc)
				2349	pr_err("%s: Failed to remove memory: %d", __func__, rc);
				2350	}
				2351
				2352	/*
				2353	* Rollback what we did. While memory onlining might theoretically fail
				2354	* (nacked by a notifier), it barely ever happens.
				2355	*/
				2356	if (rc) {
				2357	tmp = online_types;
				2358	walk_memory_blocks(start, size, &tmp,
				2359	try_reonline_memory_block);
David Hildenbrand	08b3acd	2020-05-07 16:01:32 +0200	[diff] [blame]	2360	}
				2361	unlock_device_hotplug();
				2362
David Hildenbrand	8dc4bb5	2020-11-12 14:38:13 +0100	[diff] [blame]	2363	kfree(online_types);
David Hildenbrand	08b3acd	2020-05-07 16:01:32 +0200	[diff] [blame]	2364	return rc;
				2365	}
				2366	EXPORT_SYMBOL_GPL(offline_and_remove_memory);
Rafael J. Wysocki	aba6efc	2013-06-01 22:24:07 +0200	[diff] [blame]	2367	#endif /* CONFIG_MEMORY_HOTREMOVE */