Blame - mm/memory_hotplug.c - yocto/kernel/common

blob: 337eaee95fb2e12afd3672017f61b4103ff22c18 [file] [log] [blame]

Thomas Gleixner	457c899	2019-05-19 13:08:55 +0100	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0-only
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	2	/*
				3	* linux/mm/memory_hotplug.c
				4	*
				5	* Copyright (C)
				6	*/
				7
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	8	#include <linux/stddef.h>
				9	#include <linux/mm.h>
Ingo Molnar	174cd4b	2017-02-02 19:15:33 +0100	[diff] [blame]	10	#include <linux/sched/signal.h>
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	11	#include <linux/swap.h>
				12	#include <linux/interrupt.h>
				13	#include <linux/pagemap.h>
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	14	#include <linux/compiler.h>
Paul Gortmaker	b95f1b31	2011-10-16 02:01:52 -0400	[diff] [blame]	15	#include <linux/export.h>
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	16	#include <linux/pagevec.h>
Chandra Seetharaman	2d1d43f	2006-09-29 02:01:25 -0700	[diff] [blame]	17	#include <linux/writeback.h>
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	18	#include <linux/slab.h>
				19	#include <linux/sysctl.h>
				20	#include <linux/cpu.h>
				21	#include <linux/memory.h>
Dan Williams	4b94ffd	2016-01-15 16:56:22 -0800	[diff] [blame]	22	#include <linux/memremap.h>
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	23	#include <linux/memory_hotplug.h>
				24	#include <linux/highmem.h>
				25	#include <linux/vmalloc.h>
KAMEZAWA Hiroyuki	0a54703	2006-06-27 02:53:35 -0700	[diff] [blame]	26	#include <linux/ioport.h>
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	27	#include <linux/delay.h>
				28	#include <linux/migrate.h>
				29	#include <linux/page-isolation.h>
Badari Pulavarty	7108878	2008-10-18 20:25:58 -0700	[diff] [blame]	30	#include <linux/pfn.h>
Andi Kleen	6ad696d	2009-11-17 14:06:22 -0800	[diff] [blame]	31	#include <linux/suspend.h>
KOSAKI Motohiro	6d9c285	2009-12-14 17:58:11 -0800	[diff] [blame]	32	#include <linux/mm_inline.h>
akpm@linux-foundation.org	d96ae53	2010-03-05 13:41:58 -0800	[diff] [blame]	33	#include <linux/firmware-map.h>
Tang Chen	60a5a19	2013-02-22 16:33:14 -0800	[diff] [blame]	34	#include <linux/stop_machine.h>
Naoya Horiguchi	c8721bb	2013-09-11 14:22:09 -0700	[diff] [blame]	35	#include <linux/hugetlb.h>
Tang Chen	c532092	2013-11-12 15:08:10 -0800	[diff] [blame]	36	#include <linux/memblock.h>
Vlastimil Babka	698b1b3	2016-03-17 14:18:08 -0700	[diff] [blame]	37	#include <linux/compaction.h>
Michal Hocko	b15c872	2018-12-28 00:38:01 -0800	[diff] [blame]	38	#include <linux/rmap.h>
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	39
				40	#include <asm/tlbflush.h>
				41
Adrian Bunk	1e5ad9a	2008-04-28 20:40:08 +0300	[diff] [blame]	42	#include "internal.h"
Dan Williams	e900a91	2019-05-14 15:41:28 -0700	[diff] [blame]	43	#include "shuffle.h"
Adrian Bunk	1e5ad9a	2008-04-28 20:40:08 +0300	[diff] [blame]	44
Oscar Salvador	e3a9d9f	2021-05-04 18:39:48 -0700	[diff] [blame]	45
				46	/*
				47	* memory_hotplug.memmap_on_memory parameter
				48	*/
				49	static bool memmap_on_memory __ro_after_init;
				50	#ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
				51	module_param(memmap_on_memory, bool, 0444);
				52	MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug");
				53	#endif
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	54
/* Values the memory_hotplug.online_policy parameter can take. */
enum {
	ONLINE_POLICY_CONTIG_ZONES = 0,
	ONLINE_POLICY_AUTO_MOVABLE,
};

/* User-visible name of each policy, indexed by its ONLINE_POLICY_* value. */
const char *online_policy_to_str[] = {
	[ONLINE_POLICY_CONTIG_ZONES] = "contig-zones",
	[ONLINE_POLICY_AUTO_MOVABLE] = "auto-movable",
};
				64
				65	static int set_online_policy(const char val, const struct kernel_param kp)
				66	{
				67	int ret = sysfs_match_string(online_policy_to_str, val);
				68
				69	if (ret < 0)
				70	return ret;
				71	((int )kp->arg) = ret;
				72	return 0;
				73	}
				74
				75	static int get_online_policy(char buffer, const struct kernel_param kp)
				76	{
				77	return sprintf(buffer, "%s\n", online_policy_to_str[((int )kp->arg)]);
				78	}
				79
				80	/*
				81	* memory_hotplug.online_policy: configure online behavior when onlining without
				82	* specifying a zone (MMOP_ONLINE)
				83	*
				84	* "contig-zones": keep zone contiguous
				85	* "auto-movable": online memory to ZONE_MOVABLE if the configuration
				86	* (auto_movable_ratio, auto_movable_numa_aware) allows for it
				87	*/
				88	static int online_policy __read_mostly = ONLINE_POLICY_CONTIG_ZONES;
				89	static const struct kernel_param_ops online_policy_ops = {
				90	.set = set_online_policy,
				91	.get = get_online_policy,
				92	};
				93	module_param_cb(online_policy, &online_policy_ops, &online_policy, 0644);
				94	MODULE_PARM_DESC(online_policy,
				95	"Set the online policy (\"contig-zones\", \"auto-movable\") "
				96	"Default: \"contig-zones\"");
				97
				98	/*
				99	* memory_hotplug.auto_movable_ratio: specify maximum MOVABLE:KERNEL ratio
				100	*
				101	* The ratio represent an upper limit and the kernel might decide to not
				102	* online some memory to ZONE_MOVABLE -- e.g., because hotplugged KERNEL memory
				103	* doesn't allow for more MOVABLE memory.
				104	*/
				105	static unsigned int auto_movable_ratio __read_mostly = 301;
				106	module_param(auto_movable_ratio, uint, 0644);
				107	MODULE_PARM_DESC(auto_movable_ratio,
				108	"Set the maximum ratio of MOVABLE:KERNEL memory in the system "
				109	"in percent for \"auto-movable\" online policy. Default: 301");
				110
				111	/*
				112	* memory_hotplug.auto_movable_numa_aware: consider numa node stats
				113	*/
				114	#ifdef CONFIG_NUMA
				115	static bool auto_movable_numa_aware __read_mostly = true;
				116	module_param(auto_movable_numa_aware, bool, 0644);
				117	MODULE_PARM_DESC(auto_movable_numa_aware,
				118	"Consider numa node stats in addition to global stats in "
				119	"\"auto-movable\" online policy. Default: true");
				120	#endif /* CONFIG_NUMA */
				121
Daniel Kiper	9d0ad8c	2011-07-25 17:12:05 -0700	[diff] [blame]	122	/*
				123	* online_page_callback contains pointer to current page onlining function.
				124	* Initially it is generic_online_page(). If it is required it could be
				125	* changed by calling set_online_page_callback() for callback registration
				126	* and restore_online_page_callback() for generic callback restore.
				127	*/
				128
Daniel Kiper	9d0ad8c	2011-07-25 17:12:05 -0700	[diff] [blame]	129	static online_page_callback_t online_page_callback = generic_online_page;
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	130	static DEFINE_MUTEX(online_page_callback_lock);
Daniel Kiper	9d0ad8c	2011-07-25 17:12:05 -0700	[diff] [blame]	131
Thomas Gleixner	3f906ba	2017-07-10 15:50:09 -0700	[diff] [blame]	132	DEFINE_STATIC_PERCPU_RWSEM(mem_hotplug_lock);
KOSAKI Motohiro	20d6c96	2010-12-02 14:31:19 -0800	[diff] [blame]	133
Thomas Gleixner	3f906ba	2017-07-10 15:50:09 -0700	[diff] [blame]	134	void get_online_mems(void)
				135	{
				136	percpu_down_read(&mem_hotplug_lock);
				137	}
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	138
Thomas Gleixner	3f906ba	2017-07-10 15:50:09 -0700	[diff] [blame]	139	void put_online_mems(void)
				140	{
				141	percpu_up_read(&mem_hotplug_lock);
				142	}
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	143
Michal Hocko	4932381	2017-07-06 15:41:05 -0700	[diff] [blame]	144	bool movable_node_enabled = false;
				145
Vitaly Kuznetsov	8604d9e	2016-05-19 17:13:03 -0700	[diff] [blame]	146	#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
Anshuman Khandual	1adf8b4	2021-02-25 17:17:13 -0800	[diff] [blame]	147	int mhp_default_online_type = MMOP_OFFLINE;
Vitaly Kuznetsov	8604d9e	2016-05-19 17:13:03 -0700	[diff] [blame]	148	#else
Anshuman Khandual	1adf8b4	2021-02-25 17:17:13 -0800	[diff] [blame]	149	int mhp_default_online_type = MMOP_ONLINE;
Vitaly Kuznetsov	8604d9e	2016-05-19 17:13:03 -0700	[diff] [blame]	150	#endif
Vitaly Kuznetsov	31bc385	2016-03-15 14:56:48 -0700	[diff] [blame]	151
Vitaly Kuznetsov	86dd995	2016-05-19 17:13:06 -0700	[diff] [blame]	152	static int __init setup_memhp_default_state(char *str)
				153	{
Anshuman Khandual	1adf8b4	2021-02-25 17:17:13 -0800	[diff] [blame]	154	const int online_type = mhp_online_type_from_str(str);
David Hildenbrand	5f47adf	2020-04-06 20:07:44 -0700	[diff] [blame]	155
				156	if (online_type >= 0)
Anshuman Khandual	1adf8b4	2021-02-25 17:17:13 -0800	[diff] [blame]	157	mhp_default_online_type = online_type;
Vitaly Kuznetsov	86dd995	2016-05-19 17:13:06 -0700	[diff] [blame]	158
				159	return 1;
				160	}
				161	__setup("memhp_default_state=", setup_memhp_default_state);
				162
David Rientjes	30467e0	2015-04-14 15:45:11 -0700	[diff] [blame]	163	void mem_hotplug_begin(void)
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	164	{
Thomas Gleixner	3f906ba	2017-07-10 15:50:09 -0700	[diff] [blame]	165	cpus_read_lock();
				166	percpu_down_write(&mem_hotplug_lock);
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	167	}
				168
David Rientjes	30467e0	2015-04-14 15:45:11 -0700	[diff] [blame]	169	void mem_hotplug_done(void)
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	170	{
Thomas Gleixner	3f906ba	2017-07-10 15:50:09 -0700	[diff] [blame]	171	percpu_up_write(&mem_hotplug_lock);
				172	cpus_read_unlock();
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	173	}
KOSAKI Motohiro	20d6c96	2010-12-02 14:31:19 -0800	[diff] [blame]	174
Juergen Gross	357b4da	2019-02-14 11:42:39 +0100	[diff] [blame]	175	u64 max_mem_size = U64_MAX;
				176
Keith Mannthey	45e0b78	2006-09-30 23:27:09 -0700	[diff] [blame]	177	/* add this memory to iomem resource */
David Hildenbrand	7b7b272	2020-06-04 16:48:41 -0700	[diff] [blame]	178	static struct resource *register_memory_resource(u64 start, u64 size,
				179	const char *resource_name)
Keith Mannthey	45e0b78	2006-09-30 23:27:09 -0700	[diff] [blame]	180	{
Dave Hansen	2794129	2019-02-25 10:57:36 -0800	[diff] [blame]	181	struct resource *res;
				182	unsigned long flags = IORESOURCE_SYSTEM_RAM \| IORESOURCE_BUSY;
David Hildenbrand	7b7b272	2020-06-04 16:48:41 -0700	[diff] [blame]	183
				184	if (strcmp(resource_name, "System RAM"))
David Hildenbrand	7cf603d	2020-10-15 20:08:33 -0700	[diff] [blame]	185	flags \|= IORESOURCE_SYSRAM_DRIVER_MANAGED;
Juergen Gross	357b4da	2019-02-14 11:42:39 +0100	[diff] [blame]	186
Anshuman Khandual	bca3fea	2021-02-25 17:17:33 -0800	[diff] [blame]	187	if (!mhp_range_allowed(start, size, true))
				188	return ERR_PTR(-E2BIG);
				189
Baoquan He	f3cd4c8	2020-04-06 20:06:50 -0700	[diff] [blame]	190	/*
				191	* Make sure value parsed from 'mem=' only restricts memory adding
				192	* while booting, so that memory hotplug won't be impacted. Please
				193	* refer to document of 'mem=' in kernel-parameters.txt for more
				194	* details.
				195	*/
				196	if (start + size > max_mem_size && system_state < SYSTEM_RUNNING)
Juergen Gross	357b4da	2019-02-14 11:42:39 +0100	[diff] [blame]	197	return ERR_PTR(-E2BIG);
				198
Dave Hansen	2794129	2019-02-25 10:57:36 -0800	[diff] [blame]	199	/*
				200	* Request ownership of the new memory range. This might be
				201	* a child of an existing resource that was present but
				202	* not marked as busy.
				203	*/
				204	res = __request_region(&iomem_resource, start, size,
				205	resource_name, flags);
Keith Mannthey	45e0b78	2006-09-30 23:27:09 -0700	[diff] [blame]	206
Dave Hansen	2794129	2019-02-25 10:57:36 -0800	[diff] [blame]	207	if (!res) {
				208	pr_debug("Unable to reserve System RAM region: %016llx->%016llx\n",
				209	start, start + size);
Vitaly Kuznetsov	6f754ba	2016-01-14 15:21:55 -0800	[diff] [blame]	210	return ERR_PTR(-EEXIST);
Keith Mannthey	45e0b78	2006-09-30 23:27:09 -0700	[diff] [blame]	211	}
				212	return res;
				213	}
				214
				215	static void release_memory_resource(struct resource *res)
				216	{
				217	if (!res)
				218	return;
				219	release_resource(res);
				220	kfree(res);
Keith Mannthey	45e0b78	2006-09-30 23:27:09 -0700	[diff] [blame]	221	}
				222
Keith Mannthey	5394702	2006-09-30 23:27:08 -0700	[diff] [blame]	223	#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
Dan Williams	7ea6216	2019-07-18 15:58:22 -0700	[diff] [blame]	224	static int check_pfn_span(unsigned long pfn, unsigned long nr_pages,
				225	const char *reason)
				226	{
				227	/*
				228	* Disallow all operations smaller than a sub-section and only
				229	* allow operations smaller than a section for
				230	* SPARSEMEM_VMEMMAP. Note that check_hotplug_memory_range()
				231	* enforces a larger memory_block_size_bytes() granularity for
				232	* memory that will be marked online, so this check should only
				233	* fire for direct arch_{add,remove}_memory() users outside of
				234	* add_memory_resource().
				235	*/
				236	unsigned long min_align;
				237
				238	if (IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP))
				239	min_align = PAGES_PER_SUBSECTION;
				240	else
				241	min_align = PAGES_PER_SECTION;
				242	if (!IS_ALIGNED(pfn, min_align)
				243	\|\| !IS_ALIGNED(nr_pages, min_align)) {
				244	WARN(1, "Misaligned __%s_pages start: %#lx end: #%lx\n",
				245	reason, pfn, pfn + nr_pages - 1);
				246	return -EINVAL;
				247	}
				248	return 0;
				249	}
				250
David Rientjes	4edd7ce	2013-04-29 15:08:22 -0700	[diff] [blame]	251	/*
Dan Williams	9f605f2	2021-02-25 17:16:57 -0800	[diff] [blame]	252	* Return page for the valid pfn only if the page is online. All pfn
				253	* walkers which rely on the fully initialized page->flags and others
				254	* should use this rather than pfn_valid && pfn_to_page
				255	*/
				256	struct page *pfn_to_online_page(unsigned long pfn)
				257	{
				258	unsigned long nr = pfn_to_section_nr(pfn);
Dan Williams	1f90a34	2021-02-25 17:17:05 -0800	[diff] [blame]	259	struct dev_pagemap *pgmap;
Dan Williams	9f9b02e	2021-02-25 17:17:01 -0800	[diff] [blame]	260	struct mem_section *ms;
Dan Williams	9f605f2	2021-02-25 17:16:57 -0800	[diff] [blame]	261
Dan Williams	9f9b02e	2021-02-25 17:17:01 -0800	[diff] [blame]	262	if (nr >= NR_MEM_SECTIONS)
				263	return NULL;
				264
				265	ms = __nr_to_section(nr);
				266	if (!online_section(ms))
				267	return NULL;
				268
				269	/*
				270	* Save some code text when online_section() +
				271	* pfn_section_valid() are sufficient.
				272	*/
				273	if (IS_ENABLED(CONFIG_HAVE_ARCH_PFN_VALID) && !pfn_valid(pfn))
				274	return NULL;
				275
				276	if (!pfn_section_valid(ms, pfn))
				277	return NULL;
				278
Dan Williams	1f90a34	2021-02-25 17:17:05 -0800	[diff] [blame]	279	if (!online_device_section(ms))
				280	return pfn_to_page(pfn);
				281
				282	/*
				283	* Slowpath: when ZONE_DEVICE collides with
				284	* ZONE_{NORMAL,MOVABLE} within the same section some pfns in
				285	* the section may be 'offline' but 'valid'. Only
				286	* get_dev_pagemap() can determine sub-section online status.
				287	*/
				288	pgmap = get_dev_pagemap(pfn, NULL);
				289	put_dev_pagemap(pgmap);
				290
				291	/* The presence of a pgmap indicates ZONE_DEVICE offline pfn */
				292	if (pgmap)
				293	return NULL;
				294
Dan Williams	9f9b02e	2021-02-25 17:17:01 -0800	[diff] [blame]	295	return pfn_to_page(pfn);
Dan Williams	9f605f2	2021-02-25 17:16:57 -0800	[diff] [blame]	296	}
				297	EXPORT_SYMBOL_GPL(pfn_to_online_page);
				298
				299	/*
David Rientjes	4edd7ce	2013-04-29 15:08:22 -0700	[diff] [blame]	300	* Reasonably generic function for adding memory. It is
				301	* expected that archs that support memory hotplug will
				302	* call this function after deciding the zone to which to
				303	* add the new pages.
				304	*/
Dan Williams	7ea6216	2019-07-18 15:58:22 -0700	[diff] [blame]	305	int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
Logan Gunthorpe	f5637d3	2020-04-10 14:33:21 -0700	[diff] [blame]	306	struct mhp_params *params)
David Rientjes	4edd7ce	2013-04-29 15:08:22 -0700	[diff] [blame]	307	{
David Hildenbrand	6cdd0b3	2020-04-06 20:06:56 -0700	[diff] [blame]	308	const unsigned long end_pfn = pfn + nr_pages;
				309	unsigned long cur_nr_pages;
Dan Williams	9a84503	2019-07-18 15:58:43 -0700	[diff] [blame]	310	int err;
Logan Gunthorpe	f5637d3	2020-04-10 14:33:21 -0700	[diff] [blame]	311	struct vmem_altmap *altmap = params->altmap;
Dan Williams	4b94ffd	2016-01-15 16:56:22 -0800	[diff] [blame]	312
Logan Gunthorpe	bfeb022	2020-04-10 14:33:36 -0700	[diff] [blame]	313	if (WARN_ON_ONCE(!params->pgprot.pgprot))
				314	return -EINVAL;
				315
Anshuman Khandual	bca3fea	2021-02-25 17:17:33 -0800	[diff] [blame]	316	VM_BUG_ON(!mhp_range_allowed(PFN_PHYS(pfn), nr_pages * PAGE_SIZE, false));
Alastair D'Silva	dca4436	2019-11-30 17:53:48 -0800	[diff] [blame]	317
Dan Williams	4b94ffd	2016-01-15 16:56:22 -0800	[diff] [blame]	318	if (altmap) {
				319	/*
				320	* Validate altmap is within bounds of the total request
				321	*/
Dan Williams	7ea6216	2019-07-18 15:58:22 -0700	[diff] [blame]	322	if (altmap->base_pfn != pfn
Dan Williams	4b94ffd	2016-01-15 16:56:22 -0800	[diff] [blame]	323	\|\| vmem_altmap_offset(altmap) > nr_pages) {
				324	pr_warn_once("memory add fail, invalid altmap\n");
Dan Williams	7ea6216	2019-07-18 15:58:22 -0700	[diff] [blame]	325	return -EINVAL;
Dan Williams	4b94ffd	2016-01-15 16:56:22 -0800	[diff] [blame]	326	}
				327	altmap->alloc = 0;
				328	}
				329
Dan Williams	7ea6216	2019-07-18 15:58:22 -0700	[diff] [blame]	330	err = check_pfn_span(pfn, nr_pages, "add");
				331	if (err)
				332	return err;
				333
David Hildenbrand	6cdd0b3	2020-04-06 20:06:56 -0700	[diff] [blame]	334	for (; pfn < end_pfn; pfn += cur_nr_pages) {
				335	/* Select all remaining pages up to the next section boundary */
				336	cur_nr_pages = min(end_pfn - pfn,
				337	SECTION_ALIGN_UP(pfn + 1) - pfn);
				338	err = sparse_add_section(nid, pfn, cur_nr_pages, altmap);
Dan Williams	ba72b4c	2019-07-18 15:58:26 -0700	[diff] [blame]	339	if (err)
				340	break;
Michal Hocko	f64ac5e	2017-10-03 16:16:16 -0700	[diff] [blame]	341	cond_resched();
David Rientjes	4edd7ce	2013-04-29 15:08:22 -0700	[diff] [blame]	342	}
Zhu Guihua	c435a39	2015-06-24 16:58:42 -0700	[diff] [blame]	343	vmemmap_populate_print_last();
David Rientjes	4edd7ce	2013-04-29 15:08:22 -0700	[diff] [blame]	344	return err;
				345	}
David Rientjes	4edd7ce	2013-04-29 15:08:22 -0700	[diff] [blame]	346
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	347	/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
YASUAKI ISHIMATSU	d09b013	2017-10-03 16:16:32 -0700	[diff] [blame]	348	static unsigned long find_smallest_section_pfn(int nid, struct zone *zone,
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	349	unsigned long start_pfn,
				350	unsigned long end_pfn)
				351	{
Dan Williams	49ba3c6	2019-07-18 15:58:07 -0700	[diff] [blame]	352	for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SUBSECTION) {
David Hildenbrand	7ce700b	2019-11-21 17:53:56 -0800	[diff] [blame]	353	if (unlikely(!pfn_to_online_page(start_pfn)))
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	354	continue;
				355
				356	if (unlikely(pfn_to_nid(start_pfn) != nid))
				357	continue;
				358
David Hildenbrand	9b05158	2020-02-03 17:34:12 -0800	[diff] [blame]	359	if (zone != page_zone(pfn_to_page(start_pfn)))
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	360	continue;
				361
				362	return start_pfn;
				363	}
				364
				365	return 0;
				366	}
				367
				368	/* find the biggest valid pfn in the range [start_pfn, end_pfn). */
YASUAKI ISHIMATSU	d09b013	2017-10-03 16:16:32 -0700	[diff] [blame]	369	static unsigned long find_biggest_section_pfn(int nid, struct zone *zone,
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	370	unsigned long start_pfn,
				371	unsigned long end_pfn)
				372	{
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	373	unsigned long pfn;
				374
				375	/* pfn is the end pfn of a memory section. */
				376	pfn = end_pfn - 1;
Dan Williams	49ba3c6	2019-07-18 15:58:07 -0700	[diff] [blame]	377	for (; pfn >= start_pfn; pfn -= PAGES_PER_SUBSECTION) {
David Hildenbrand	7ce700b	2019-11-21 17:53:56 -0800	[diff] [blame]	378	if (unlikely(!pfn_to_online_page(pfn)))
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	379	continue;
				380
				381	if (unlikely(pfn_to_nid(pfn) != nid))
				382	continue;
				383
David Hildenbrand	9b05158	2020-02-03 17:34:12 -0800	[diff] [blame]	384	if (zone != page_zone(pfn_to_page(pfn)))
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	385	continue;
				386
				387	return pfn;
				388	}
				389
				390	return 0;
				391	}
				392
				393	static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
				394	unsigned long end_pfn)
				395	{
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	396	unsigned long pfn;
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	397	int nid = zone_to_nid(zone);
				398
David Hildenbrand	5d12071	2020-02-03 17:34:19 -0800	[diff] [blame]	399	if (zone->zone_start_pfn == start_pfn) {
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	400	/*
				401	* If the section is smallest section in the zone, it need
				402	* shrink zone->zone_start_pfn and zone->zone_spanned_pages.
				403	* In this case, we find second smallest valid mem_section
				404	* for shrinking zone.
				405	*/
				406	pfn = find_smallest_section_pfn(nid, zone, end_pfn,
David Hildenbrand	5d12071	2020-02-03 17:34:19 -0800	[diff] [blame]	407	zone_end_pfn(zone));
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	408	if (pfn) {
David Hildenbrand	5d12071	2020-02-03 17:34:19 -0800	[diff] [blame]	409	zone->spanned_pages = zone_end_pfn(zone) - pfn;
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	410	zone->zone_start_pfn = pfn;
David Hildenbrand	950b68d	2020-02-03 17:34:16 -0800	[diff] [blame]	411	} else {
				412	zone->zone_start_pfn = 0;
				413	zone->spanned_pages = 0;
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	414	}
David Hildenbrand	5d12071	2020-02-03 17:34:19 -0800	[diff] [blame]	415	} else if (zone_end_pfn(zone) == end_pfn) {
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	416	/*
				417	* If the section is biggest section in the zone, it need
				418	* shrink zone->spanned_pages.
				419	* In this case, we find second biggest valid mem_section for
				420	* shrinking zone.
				421	*/
David Hildenbrand	5d12071	2020-02-03 17:34:19 -0800	[diff] [blame]	422	pfn = find_biggest_section_pfn(nid, zone, zone->zone_start_pfn,
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	423	start_pfn);
				424	if (pfn)
David Hildenbrand	5d12071	2020-02-03 17:34:19 -0800	[diff] [blame]	425	zone->spanned_pages = pfn - zone->zone_start_pfn + 1;
David Hildenbrand	950b68d	2020-02-03 17:34:16 -0800	[diff] [blame]	426	else {
				427	zone->zone_start_pfn = 0;
				428	zone->spanned_pages = 0;
				429	}
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	430	}
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	431	}
				432
David Hildenbrand	00d6c01	2019-10-18 20:19:33 -0700	[diff] [blame]	433	static void update_pgdat_span(struct pglist_data *pgdat)
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	434	{
David Hildenbrand	00d6c01	2019-10-18 20:19:33 -0700	[diff] [blame]	435	unsigned long node_start_pfn = 0, node_end_pfn = 0;
				436	struct zone *zone;
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	437
David Hildenbrand	00d6c01	2019-10-18 20:19:33 -0700	[diff] [blame]	438	for (zone = pgdat->node_zones;
				439	zone < pgdat->node_zones + MAX_NR_ZONES; zone++) {
Miaohe Lin	6c922cf	2021-02-25 17:17:21 -0800	[diff] [blame]	440	unsigned long end_pfn = zone_end_pfn(zone);
David Hildenbrand	00d6c01	2019-10-18 20:19:33 -0700	[diff] [blame]	441
				442	/* No need to lock the zones, they can't change. */
David Hildenbrand	656d571	2019-11-05 21:17:10 -0800	[diff] [blame]	443	if (!zone->spanned_pages)
				444	continue;
				445	if (!node_end_pfn) {
				446	node_start_pfn = zone->zone_start_pfn;
Miaohe Lin	6c922cf	2021-02-25 17:17:21 -0800	[diff] [blame]	447	node_end_pfn = end_pfn;
David Hildenbrand	656d571	2019-11-05 21:17:10 -0800	[diff] [blame]	448	continue;
				449	}
				450
Miaohe Lin	6c922cf	2021-02-25 17:17:21 -0800	[diff] [blame]	451	if (end_pfn > node_end_pfn)
				452	node_end_pfn = end_pfn;
David Hildenbrand	00d6c01	2019-10-18 20:19:33 -0700	[diff] [blame]	453	if (zone->zone_start_pfn < node_start_pfn)
				454	node_start_pfn = zone->zone_start_pfn;
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	455	}
				456
David Hildenbrand	00d6c01	2019-10-18 20:19:33 -0700	[diff] [blame]	457	pgdat->node_start_pfn = node_start_pfn;
				458	pgdat->node_spanned_pages = node_end_pfn - node_start_pfn;
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	459	}
				460
David Hildenbrand	feee6b2	2020-01-04 12:59:33 -0800	[diff] [blame]	461	void __ref remove_pfn_range_from_zone(struct zone *zone,
				462	unsigned long start_pfn,
				463	unsigned long nr_pages)
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	464	{
Ben Widawsky	b7e3deb	2020-06-25 20:30:51 -0700	[diff] [blame]	465	const unsigned long end_pfn = start_pfn + nr_pages;
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	466	struct pglist_data *pgdat = zone->zone_pgdat;
Oscar Salvador	27cacaa	2021-06-30 18:52:46 -0700	[diff] [blame]	467	unsigned long pfn, cur_nr_pages;
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	468
David Hildenbrand	d33695b	2020-02-03 17:34:09 -0800	[diff] [blame]	469	/* Poison struct pages because they are now uninitialized again. */
Ben Widawsky	b7e3deb	2020-06-25 20:30:51 -0700	[diff] [blame]	470	for (pfn = start_pfn; pfn < end_pfn; pfn += cur_nr_pages) {
				471	cond_resched();
				472
				473	/* Select all remaining pages up to the next section boundary */
				474	cur_nr_pages =
				475	min(end_pfn - pfn, SECTION_ALIGN_UP(pfn + 1) - pfn);
				476	page_init_poison(pfn_to_page(pfn),
				477	sizeof(struct page) * cur_nr_pages);
				478	}
David Hildenbrand	d33695b	2020-02-03 17:34:09 -0800	[diff] [blame]	479
David Hildenbrand	7ce700b	2019-11-21 17:53:56 -0800	[diff] [blame]	480	/*
				481	* Zone shrinking code cannot properly deal with ZONE_DEVICE. So
				482	* we will not try to shrink the zones - which is okay as
				483	* set_zone_contiguous() cannot deal with ZONE_DEVICE either way.
				484	*/
Miaohe Lin	5ef5f81	2021-09-07 19:55:52 -0700	[diff] [blame]	485	if (zone_is_zone_device(zone))
David Hildenbrand	7ce700b	2019-11-21 17:53:56 -0800	[diff] [blame]	486	return;
David Hildenbrand	7ce700b	2019-11-21 17:53:56 -0800	[diff] [blame]	487
David Hildenbrand	feee6b2	2020-01-04 12:59:33 -0800	[diff] [blame]	488	clear_zone_contiguous(zone);
				489
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	490	shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
David Hildenbrand	00d6c01	2019-10-18 20:19:33 -0700	[diff] [blame]	491	update_pgdat_span(pgdat);
David Hildenbrand	feee6b2	2020-01-04 12:59:33 -0800	[diff] [blame]	492
				493	set_zone_contiguous(zone);
Yasuaki Ishimatsu	815121d	2013-02-22 16:33:12 -0800	[diff] [blame]	494	}
				495
/*
 * Remove @nr_pages pages starting at @pfn from the section containing @pfn.
 * Warns once and does nothing if that section is not valid.
 */
static void __remove_section(unsigned long pfn, unsigned long nr_pages,
			     unsigned long map_offset,
			     struct vmem_altmap *altmap)
{
	struct mem_section *section = __pfn_to_section(pfn);

	if (WARN_ON_ONCE(!valid_section(section)))
		return;

	sparse_remove_section(section, pfn, nr_pages, map_offset, altmap);
}
				507
/**
 * __remove_pages() - remove sections of pages
 * @pfn: starting pageframe (must be aligned to start of a section)
 * @nr_pages: number of pages to remove (must be multiple of section size)
 * @altmap: alternative device page map or %NULL if default memmap is used
 *
 * Generic helper function to remove section mappings and sysfs entries
 * for the section of the memory we are removing. Caller needs to make
 * sure that pages are marked reserved and zones are adjusted properly by
 * calling offline_pages().
 *
 * Nothing is removed when check_pfn_span() rejects the range.
 */
void __remove_pages(unsigned long pfn, unsigned long nr_pages,
		    struct vmem_altmap *altmap)
{
	const unsigned long end_pfn = pfn + nr_pages;
	unsigned long map_offset = vmem_altmap_offset(altmap);
	unsigned long cur_nr_pages;

	if (check_pfn_span(pfn, nr_pages, "remove"))
		return;

	while (pfn < end_pfn) {
		cond_resched();
		/* Select all remaining pages up to the next section boundary */
		cur_nr_pages = min(end_pfn - pfn,
				   SECTION_ALIGN_UP(pfn + 1) - pfn);
		__remove_section(pfn, cur_nr_pages, map_offset, altmap);
		/* The altmap offset only applies to the first chunk. */
		map_offset = 0;
		pfn += cur_nr_pages;
	}
}
Badari Pulavarty	ea01ea9	2008-04-28 02:12:01 -0700	[diff] [blame]	540
Daniel Kiper	9d0ad8c	2011-07-25 17:12:05 -0700	[diff] [blame]	541	int set_online_page_callback(online_page_callback_t callback)
				542	{
				543	int rc = -EINVAL;
				544
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	545	get_online_mems();
				546	mutex_lock(&online_page_callback_lock);
Daniel Kiper	9d0ad8c	2011-07-25 17:12:05 -0700	[diff] [blame]	547
				548	if (online_page_callback == generic_online_page) {
				549	online_page_callback = callback;
				550	rc = 0;
				551	}
				552
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	553	mutex_unlock(&online_page_callback_lock);
				554	put_online_mems();
Daniel Kiper	9d0ad8c	2011-07-25 17:12:05 -0700	[diff] [blame]	555
				556	return rc;
				557	}
				558	EXPORT_SYMBOL_GPL(set_online_page_callback);
				559
				560	int restore_online_page_callback(online_page_callback_t callback)
				561	{
				562	int rc = -EINVAL;
				563
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	564	get_online_mems();
				565	mutex_lock(&online_page_callback_lock);
Daniel Kiper	9d0ad8c	2011-07-25 17:12:05 -0700	[diff] [blame]	566
				567	if (online_page_callback == callback) {
				568	online_page_callback = generic_online_page;
				569	rc = 0;
				570	}
				571
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	572	mutex_unlock(&online_page_callback_lock);
				573	put_online_mems();
Daniel Kiper	9d0ad8c	2011-07-25 17:12:05 -0700	[diff] [blame]	574
				575	return rc;
				576	}
				577	EXPORT_SYMBOL_GPL(restore_online_page_callback);
				578
/*
 * Default online-page handler: release the 2^@order pages starting at @page
 * through __free_pages_core() and add them to the total RAM counters.
 */
void generic_online_page(struct page *page, unsigned int order)
{
	const unsigned long nr = 1UL << order;

	/*
	 * With debug_pagealloc enabled, freeing a page tries to unmap it, so
	 * map it up front. Doing it here keeps a special case out of the page
	 * freeing fast path.
	 */
	debug_pagealloc_map_pages(page, 1 << order);
	__free_pages_core(page, order);
	totalram_pages_add(nr);
#ifdef CONFIG_HIGHMEM
	if (PageHighMem(page))
		totalhigh_pages_add(nr);
#endif
}
EXPORT_SYMBOL_GPL(generic_online_page);
Arun KS	a9cd410	2019-03-05 15:42:14 -0800	[diff] [blame]	595
David Hildenbrand	aac6532	2020-10-15 20:08:11 -0700	[diff] [blame]	596	static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	597	{
David Hildenbrand	b2c2ab2	2019-09-23 15:36:02 -0700	[diff] [blame]	598	const unsigned long end_pfn = start_pfn + nr_pages;
				599	unsigned long pfn;
Michal Hocko	2d070ea	2017-07-06 15:37:56 -0700	[diff] [blame]	600
David Hildenbrand	b2c2ab2	2019-09-23 15:36:02 -0700	[diff] [blame]	601	/*
David Hildenbrand	aac6532	2020-10-15 20:08:11 -0700	[diff] [blame]	602	* Online the pages in MAX_ORDER - 1 aligned chunks. The callback might
				603	* decide to not expose all pages to the buddy (e.g., expose them
				604	* later). We account all pages as being online and belonging to this
				605	* zone ("present").
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	606	* When using memmap_on_memory, the range might not be aligned to
				607	* MAX_ORDER_NR_PAGES - 1, but pageblock aligned. __ffs() will detect
				608	* this and the first chunk to online will be pageblock_nr_pages.
David Hildenbrand	b2c2ab2	2019-09-23 15:36:02 -0700	[diff] [blame]	609	*/
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	610	for (pfn = start_pfn; pfn < end_pfn;) {
				611	int order = min(MAX_ORDER - 1UL, __ffs(pfn));
				612
				613	(*online_page_callback)(pfn_to_page(pfn), order);
				614	pfn += (1UL << order);
				615	}
Michal Hocko	2d070ea	2017-07-06 15:37:56 -0700	[diff] [blame]	616
David Hildenbrand	b2c2ab2	2019-09-23 15:36:02 -0700	[diff] [blame]	617	/* mark all involved sections as online */
				618	online_mem_sections(start_pfn, end_pfn);
KAMEZAWA Hiroyuki	75884fb	2007-10-16 01:26:10 -0700	[diff] [blame]	619	}
				620
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	621	/* check which state of node_states will be changed when online memory */
				622	static void node_states_check_changes_online(unsigned long nr_pages,
				623	struct zone zone, struct memory_notify arg)
				624	{
				625	int nid = zone_to_nid(zone);
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	626
Anshuman Khandual	98fa15f	2019-03-05 15:42:58 -0800	[diff] [blame]	627	arg->status_change_nid = NUMA_NO_NODE;
				628	arg->status_change_nid_normal = NUMA_NO_NODE;
				629	arg->status_change_nid_high = NUMA_NO_NODE;
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	630
Lai Jiangshan	6715ddf	2012-12-12 13:51:49 -0800	[diff] [blame]	631	if (!node_state(nid, N_MEMORY))
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	632	arg->status_change_nid = nid;
Oscar Salvador	8efe33f	2018-10-26 15:07:34 -0700	[diff] [blame]	633	if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY))
				634	arg->status_change_nid_normal = nid;
				635	#ifdef CONFIG_HIGHMEM
Baoquan He	d3ba3ae	2019-05-13 17:17:35 -0700	[diff] [blame]	636	if (zone_idx(zone) <= ZONE_HIGHMEM && !node_state(nid, N_HIGH_MEMORY))
Oscar Salvador	8efe33f	2018-10-26 15:07:34 -0700	[diff] [blame]	637	arg->status_change_nid_high = nid;
				638	#endif
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	639	}
				640
				641	static void node_states_set_node(int node, struct memory_notify *arg)
				642	{
				643	if (arg->status_change_nid_normal >= 0)
				644	node_set_state(node, N_NORMAL_MEMORY);
				645
Lai Jiangshan	6715ddf	2012-12-12 13:51:49 -0800	[diff] [blame]	646	if (arg->status_change_nid_high >= 0)
				647	node_set_state(node, N_HIGH_MEMORY);
				648
Oscar Salvador	83d8361	2018-10-26 15:07:25 -0700	[diff] [blame]	649	if (arg->status_change_nid >= 0)
				650	node_set_state(node, N_MEMORY);
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	651	}
				652
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	653	static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn,
				654	unsigned long nr_pages)
				655	{
				656	unsigned long old_end_pfn = zone_end_pfn(zone);
				657
				658	if (zone_is_empty(zone) \|\| start_pfn < zone->zone_start_pfn)
				659	zone->zone_start_pfn = start_pfn;
				660
				661	zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn;
				662	}
				663
				664	static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn,
				665	unsigned long nr_pages)
				666	{
				667	unsigned long old_end_pfn = pgdat_end_pfn(pgdat);
				668
				669	if (!pgdat->node_spanned_pages \|\| start_pfn < pgdat->node_start_pfn)
				670	pgdat->node_start_pfn = start_pfn;
				671
				672	pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn;
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	673
David Hildenbrand	3fccb74	2019-09-23 15:35:37 -0700	[diff] [blame]	674	}
Dan Williams	1f90a34	2021-02-25 17:17:05 -0800	[diff] [blame]	675
				676	static void section_taint_zone_device(unsigned long pfn)
				677	{
				678	struct mem_section *ms = __pfn_to_section(pfn);
				679
				680	ms->section_mem_map \|= SECTION_TAINT_ZONE_DEVICE;
				681	}
				682
David Hildenbrand	3fccb74	2019-09-23 15:35:37 -0700	[diff] [blame]	683	/*
				684	* Associate the pfn range with the given zone, initializing the memmaps
				685	* and resizing the pgdat/zone data to span the added pages. After this
				686	* call, all affected pages are PG_reserved.
David Hildenbrand	d882c00	2020-10-15 20:08:19 -0700	[diff] [blame]	687	*
				688	* All aligned pageblocks are initialized to the specified migratetype
				689	* (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
				690	* zone stats (e.g., nr_isolate_pageblock) are touched.
David Hildenbrand	3fccb74	2019-09-23 15:35:37 -0700	[diff] [blame]	691	*/
Christoph Hellwig	a99583e	2017-12-29 08:53:57 +0100	[diff] [blame]	692	void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
David Hildenbrand	d882c00	2020-10-15 20:08:19 -0700	[diff] [blame]	693	unsigned long nr_pages,
				694	struct vmem_altmap *altmap, int migratetype)
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	695	{
				696	struct pglist_data *pgdat = zone->zone_pgdat;
				697	int nid = pgdat->node_id;
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	698
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	699	clear_zone_contiguous(zone);
				700
Wei Yang	fa004ab	2018-12-28 00:37:10 -0800	[diff] [blame]	701	if (zone_is_empty(zone))
				702	init_currently_empty_zone(zone, start_pfn, nr_pages);
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	703	resize_zone_range(zone, start_pfn, nr_pages);
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	704	resize_pgdat_range(pgdat, start_pfn, nr_pages);
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	705
				706	/*
Dan Williams	1f90a34	2021-02-25 17:17:05 -0800	[diff] [blame]	707	* Subsection population requires care in pfn_to_online_page().
				708	* Set the taint to enable the slow path detection of
				709	* ZONE_DEVICE pages in an otherwise ZONE_{NORMAL,MOVABLE}
				710	* section.
				711	*/
				712	if (zone_is_zone_device(zone)) {
				713	if (!IS_ALIGNED(start_pfn, PAGES_PER_SECTION))
				714	section_taint_zone_device(start_pfn);
				715	if (!IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION))
				716	section_taint_zone_device(start_pfn + nr_pages);
				717	}
				718
				719	/*
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	720	* TODO now we have a visible range of pages which are not associated
				721	* with their zone properly. Not nice but set_pfnblock_flags_mask
				722	* expects the zone spans the pfn range. All the pages in the range
				723	* are reserved so nobody should be touching them so we should be safe
				724	*/
Baoquan He	ab28cb6	2021-02-24 12:06:14 -0800	[diff] [blame]	725	memmap_init_range(nr_pages, nid, zone_idx(zone), start_pfn, 0,
David Hildenbrand	d882c00	2020-10-15 20:08:19 -0700	[diff] [blame]	726	MEMINIT_HOTPLUG, altmap, migratetype);
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	727
				728	set_zone_contiguous(zone);
				729	}
				730
David Hildenbrand	e83a437	2021-09-07 19:55:23 -0700	[diff] [blame]	731	struct auto_movable_stats {
				732	unsigned long kernel_early_pages;
				733	unsigned long movable_pages;
				734	};
				735
				736	static void auto_movable_stats_account_zone(struct auto_movable_stats *stats,
				737	struct zone *zone)
				738	{
				739	if (zone_idx(zone) == ZONE_MOVABLE) {
				740	stats->movable_pages += zone->present_pages;
				741	} else {
				742	stats->kernel_early_pages += zone->present_early_pages;
				743	#ifdef CONFIG_CMA
				744	/*
				745	* CMA pages (never on hotplugged memory) behave like
				746	* ZONE_MOVABLE.
				747	*/
				748	stats->movable_pages += zone->cma_pages;
				749	stats->kernel_early_pages -= zone->cma_pages;
				750	#endif /* CONFIG_CMA */
				751	}
				752	}
David Hildenbrand	3fcebf9	2021-09-07 19:55:48 -0700	[diff] [blame]	753	struct auto_movable_group_stats {
				754	unsigned long movable_pages;
				755	unsigned long req_kernel_early_pages;
				756	};
David Hildenbrand	e83a437	2021-09-07 19:55:23 -0700	[diff] [blame]	757
David Hildenbrand	3fcebf9	2021-09-07 19:55:48 -0700	[diff] [blame]	758	static int auto_movable_stats_account_group(struct memory_group *group,
				759	void *arg)
David Hildenbrand	e83a437	2021-09-07 19:55:23 -0700	[diff] [blame]	760	{
David Hildenbrand	3fcebf9	2021-09-07 19:55:48 -0700	[diff] [blame]	761	const int ratio = READ_ONCE(auto_movable_ratio);
				762	struct auto_movable_group_stats *stats = arg;
				763	long pages;
				764
				765	/*
				766	* We don't support modifying the config while the auto-movable online
				767	* policy is already enabled. Just avoid the division by zero below.
				768	*/
				769	if (!ratio)
				770	return 0;
				771
				772	/*
				773	* Calculate how many early kernel pages this group requires to
				774	* satisfy the configured zone ratio.
				775	*/
				776	pages = group->present_movable_pages * 100 / ratio;
				777	pages -= group->present_kernel_pages;
				778
				779	if (pages > 0)
				780	stats->req_kernel_early_pages += pages;
				781	stats->movable_pages += group->present_movable_pages;
				782	return 0;
				783	}
				784
				785	static bool auto_movable_can_online_movable(int nid, struct memory_group *group,
				786	unsigned long nr_pages)
				787	{
David Hildenbrand	e83a437	2021-09-07 19:55:23 -0700	[diff] [blame]	788	unsigned long kernel_early_pages, movable_pages;
David Hildenbrand	3fcebf9	2021-09-07 19:55:48 -0700	[diff] [blame]	789	struct auto_movable_group_stats group_stats = {};
				790	struct auto_movable_stats stats = {};
David Hildenbrand	e83a437	2021-09-07 19:55:23 -0700	[diff] [blame]	791	pg_data_t *pgdat = NODE_DATA(nid);
				792	struct zone *zone;
				793	int i;
				794
				795	/* Walk all relevant zones and collect MOVABLE vs. KERNEL stats. */
				796	if (nid == NUMA_NO_NODE) {
				797	/* TODO: cache values */
				798	for_each_populated_zone(zone)
				799	auto_movable_stats_account_zone(&stats, zone);
				800	} else {
				801	for (i = 0; i < MAX_NR_ZONES; i++) {
				802	zone = pgdat->node_zones + i;
				803	if (populated_zone(zone))
				804	auto_movable_stats_account_zone(&stats, zone);
				805	}
				806	}
				807
				808	kernel_early_pages = stats.kernel_early_pages;
				809	movable_pages = stats.movable_pages;
				810
				811	/*
David Hildenbrand	3fcebf9	2021-09-07 19:55:48 -0700	[diff] [blame]	812	* Kernel memory inside dynamic memory group allows for more MOVABLE
				813	* memory within the same group. Remove the effect of all but the
				814	* current group from the stats.
				815	*/
				816	walk_dynamic_memory_groups(nid, auto_movable_stats_account_group,
				817	group, &group_stats);
				818	if (kernel_early_pages <= group_stats.req_kernel_early_pages)
				819	return false;
				820	kernel_early_pages -= group_stats.req_kernel_early_pages;
				821	movable_pages -= group_stats.movable_pages;
				822
				823	if (group && group->is_dynamic)
				824	kernel_early_pages += group->present_kernel_pages;
				825
				826	/*
David Hildenbrand	e83a437	2021-09-07 19:55:23 -0700	[diff] [blame]	827	* Test if we could online the given number of pages to ZONE_MOVABLE
				828	* and still stay in the configured ratio.
				829	*/
				830	movable_pages += nr_pages;
				831	return movable_pages <= (auto_movable_ratio * kernel_early_pages) / 100;
				832	}
				833
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	834	/*
Michal Hocko	c246a21	2017-07-06 15:38:18 -0700	[diff] [blame]	835	* Returns a default kernel memory zone for the given pfn range.
				836	* If no kernel zone covers this pfn range it will automatically go
				837	* to the ZONE_NORMAL.
				838	*/
Michal Hocko	c6f03e2	2017-09-06 16:19:40 -0700	[diff] [blame]	839	static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn,
Michal Hocko	c246a21	2017-07-06 15:38:18 -0700	[diff] [blame]	840	unsigned long nr_pages)
				841	{
				842	struct pglist_data *pgdat = NODE_DATA(nid);
				843	int zid;
				844
				845	for (zid = 0; zid <= ZONE_NORMAL; zid++) {
				846	struct zone *zone = &pgdat->node_zones[zid];
				847
				848	if (zone_intersects(zone, start_pfn, nr_pages))
				849	return zone;
				850	}
				851
				852	return &pgdat->node_zones[ZONE_NORMAL];
				853	}
				854
David Hildenbrand	e83a437	2021-09-07 19:55:23 -0700	[diff] [blame]	855	/*
				856	* Determine to which zone to online memory dynamically based on user
				857	* configuration and system stats. We care about the following ratio:
				858	*
				859	* MOVABLE : KERNEL
				860	*
				861	* Whereby MOVABLE is memory in ZONE_MOVABLE and KERNEL is memory in
				862	* one of the kernel zones. CMA pages inside one of the kernel zones really
				863	* behaves like ZONE_MOVABLE, so we treat them accordingly.
				864	*
				865	* We don't allow for hotplugged memory in a KERNEL zone to increase the
				866	* amount of MOVABLE memory we can have, so we end up with:
				867	*
				868	* MOVABLE : KERNEL_EARLY
				869	*
				870	* Whereby KERNEL_EARLY is memory in one of the kernel zones, available sinze
				871	* boot. We base our calculation on KERNEL_EARLY internally, because:
				872	*
				873	* a) Hotplugged memory in one of the kernel zones can sometimes still get
				874	* hotunplugged, especially when hot(un)plugging individual memory blocks.
				875	* There is no coordination across memory devices, therefore "automatic"
				876	* hotunplugging, as implemented in hypervisors, could result in zone
				877	* imbalances.
				878	* b) Early/boot memory in one of the kernel zones can usually not get
				879	* hotunplugged again (e.g., no firmware interface to unplug, fragmented
				880	* with unmovable allocations). While there are corner cases where it might
				881	* still work, it is barely relevant in practice.
				882	*
David Hildenbrand	3fcebf9	2021-09-07 19:55:48 -0700	[diff] [blame]	883	* Exceptions are dynamic memory groups, which allow for more MOVABLE
				884	* memory within the same memory group -- because in that case, there is
				885	* coordination within the single memory device managed by a single driver.
				886	*
David Hildenbrand	e83a437	2021-09-07 19:55:23 -0700	[diff] [blame]	887	* We rely on "present pages" instead of "managed pages", as the latter is
				888	* highly unreliable and dynamic in virtualized environments, and does not
				889	* consider boot time allocations. For example, memory ballooning adjusts the
				890	* managed pages when inflating/deflating the balloon, and balloon compaction
				891	* can even migrate inflated pages between zones.
				892	*
				893	* Using "present pages" is better but some things to keep in mind are:
				894	*
				895	* a) Some memblock allocations, such as for the crashkernel area, are
				896	* effectively unused by the kernel, yet they account to "present pages".
				897	* Fortunately, these allocations are comparatively small in relevant setups
				898	* (e.g., fraction of system memory).
				899	* b) Some hotplugged memory blocks in virtualized environments, esecially
				900	* hotplugged by virtio-mem, look like they are completely present, however,
				901	* only parts of the memory block are actually currently usable.
				902	* "present pages" is an upper limit that can get reached at runtime. As
				903	* we base our calculations on KERNEL_EARLY, this is not an issue.
				904	*/
David Hildenbrand	445fcf7	2021-09-07 19:55:45 -0700	[diff] [blame]	905	static struct zone *auto_movable_zone_for_pfn(int nid,
				906	struct memory_group *group,
				907	unsigned long pfn,
David Hildenbrand	e83a437	2021-09-07 19:55:23 -0700	[diff] [blame]	908	unsigned long nr_pages)
				909	{
David Hildenbrand	445fcf7	2021-09-07 19:55:45 -0700	[diff] [blame]	910	unsigned long online_pages = 0, max_pages, end_pfn;
				911	struct page *page;
				912
David Hildenbrand	e83a437	2021-09-07 19:55:23 -0700	[diff] [blame]	913	if (!auto_movable_ratio)
				914	goto kernel_zone;
				915
David Hildenbrand	445fcf7	2021-09-07 19:55:45 -0700	[diff] [blame]	916	if (group && !group->is_dynamic) {
				917	max_pages = group->s.max_pages;
				918	online_pages = group->present_movable_pages;
				919
				920	/* If anything is !MOVABLE online the rest !MOVABLE. */
				921	if (group->present_kernel_pages)
				922	goto kernel_zone;
				923	} else if (!group \|\| group->d.unit_pages == nr_pages) {
				924	max_pages = nr_pages;
				925	} else {
				926	max_pages = group->d.unit_pages;
				927	/*
				928	* Take a look at all online sections in the current unit.
				929	* We can safely assume that all pages within a section belong
				930	* to the same zone, because dynamic memory groups only deal
				931	* with hotplugged memory.
				932	*/
				933	pfn = ALIGN_DOWN(pfn, group->d.unit_pages);
				934	end_pfn = pfn + group->d.unit_pages;
				935	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
				936	page = pfn_to_online_page(pfn);
				937	if (!page)
				938	continue;
				939	/* If anything is !MOVABLE online the rest !MOVABLE. */
				940	if (page_zonenum(page) != ZONE_MOVABLE)
				941	goto kernel_zone;
				942	online_pages += PAGES_PER_SECTION;
				943	}
				944	}
				945
				946	/*
				947	* Online MOVABLE if we could currently online all remaining parts
				948	* MOVABLE. We expect to (add+) online them immediately next, so if
				949	* nobody interferes, all will be MOVABLE if possible.
				950	*/
				951	nr_pages = max_pages - online_pages;
David Hildenbrand	3fcebf9	2021-09-07 19:55:48 -0700	[diff] [blame]	952	if (!auto_movable_can_online_movable(NUMA_NO_NODE, group, nr_pages))
David Hildenbrand	e83a437	2021-09-07 19:55:23 -0700	[diff] [blame]	953	goto kernel_zone;
				954
				955	#ifdef CONFIG_NUMA
				956	if (auto_movable_numa_aware &&
David Hildenbrand	3fcebf9	2021-09-07 19:55:48 -0700	[diff] [blame]	957	!auto_movable_can_online_movable(nid, group, nr_pages))
David Hildenbrand	e83a437	2021-09-07 19:55:23 -0700	[diff] [blame]	958	goto kernel_zone;
				959	#endif /* CONFIG_NUMA */
				960
				961	return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];
				962	kernel_zone:
				963	return default_kernel_zone_for_pfn(nid, pfn, nr_pages);
				964	}
				965
Michal Hocko	c6f03e2	2017-09-06 16:19:40 -0700	[diff] [blame]	966	static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn,
				967	unsigned long nr_pages)
Michal Hocko	e5e6893	2017-09-06 16:19:37 -0700	[diff] [blame]	968	{
Michal Hocko	c6f03e2	2017-09-06 16:19:40 -0700	[diff] [blame]	969	struct zone *kernel_zone = default_kernel_zone_for_pfn(nid, start_pfn,
				970	nr_pages);
				971	struct zone *movable_zone = &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];
				972	bool in_kernel = zone_intersects(kernel_zone, start_pfn, nr_pages);
				973	bool in_movable = zone_intersects(movable_zone, start_pfn, nr_pages);
Michal Hocko	e5e6893	2017-09-06 16:19:37 -0700	[diff] [blame]	974
				975	/*
Michal Hocko	c6f03e2	2017-09-06 16:19:40 -0700	[diff] [blame]	976	* We inherit the existing zone in a simple case where zones do not
				977	* overlap in the given range
Michal Hocko	e5e6893	2017-09-06 16:19:37 -0700	[diff] [blame]	978	*/
Michal Hocko	c6f03e2	2017-09-06 16:19:40 -0700	[diff] [blame]	979	if (in_kernel ^ in_movable)
				980	return (in_kernel) ? kernel_zone : movable_zone;
Michal Hocko	e5e6893	2017-09-06 16:19:37 -0700	[diff] [blame]	981
Michal Hocko	c6f03e2	2017-09-06 16:19:40 -0700	[diff] [blame]	982	/*
				983	* If the range doesn't belong to any zone or two zones overlap in the
				984	* given range then we use movable zone only if movable_node is
				985	* enabled because we always online to a kernel zone by default.
				986	*/
				987	return movable_node_enabled ? movable_zone : kernel_zone;
Michal Hocko	9f123ab	2017-07-10 15:48:37 -0700	[diff] [blame]	988	}
				989
David Hildenbrand	7cf209b	2021-09-07 19:54:59 -0700	[diff] [blame]	990	struct zone *zone_for_pfn_range(int online_type, int nid,
David Hildenbrand	445fcf7	2021-09-07 19:55:45 -0700	[diff] [blame]	991	struct memory_group *group, unsigned long start_pfn,
Michal Hocko	e5e6893	2017-09-06 16:19:37 -0700	[diff] [blame]	992	unsigned long nr_pages)
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	993	{
Michal Hocko	c6f03e2	2017-09-06 16:19:40 -0700	[diff] [blame]	994	if (online_type == MMOP_ONLINE_KERNEL)
				995	return default_kernel_zone_for_pfn(nid, start_pfn, nr_pages);
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	996
Michal Hocko	c6f03e2	2017-09-06 16:19:40 -0700	[diff] [blame]	997	if (online_type == MMOP_ONLINE_MOVABLE)
				998	return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];
Reza Arbab	df429ac	2016-07-26 15:22:23 -0700	[diff] [blame]	999
David Hildenbrand	e83a437	2021-09-07 19:55:23 -0700	[diff] [blame]	1000	if (online_policy == ONLINE_POLICY_AUTO_MOVABLE)
David Hildenbrand	445fcf7	2021-09-07 19:55:45 -0700	[diff] [blame]	1001	return auto_movable_zone_for_pfn(nid, group, start_pfn, nr_pages);
David Hildenbrand	e83a437	2021-09-07 19:55:23 -0700	[diff] [blame]	1002
Michal Hocko	c6f03e2	2017-09-06 16:19:40 -0700	[diff] [blame]	1003	return default_zone_for_pfn(nid, start_pfn, nr_pages);
Michal Hocko	e5e6893	2017-09-06 16:19:37 -0700	[diff] [blame]	1004	}
				1005
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	1006	/*
				1007	* This function should only be called by memory_block_{online,offline},
				1008	* and {online,offline}_pages.
				1009	*/
David Hildenbrand	836809e	2021-09-07 19:55:30 -0700	[diff] [blame]	1010	void adjust_present_page_count(struct page page, struct memory_group group,
				1011	long nr_pages)
David Hildenbrand	f990114	2021-05-04 18:39:39 -0700	[diff] [blame]	1012	{
David Hildenbrand	4b09700	2021-09-07 19:55:19 -0700	[diff] [blame]	1013	struct zone *zone = page_zone(page);
David Hildenbrand	836809e	2021-09-07 19:55:30 -0700	[diff] [blame]	1014	const bool movable = zone_idx(zone) == ZONE_MOVABLE;
David Hildenbrand	4b09700	2021-09-07 19:55:19 -0700	[diff] [blame]	1015
				1016	/*
				1017	* We only support onlining/offlining/adding/removing of complete
				1018	* memory blocks; therefore, either all is either early or hotplugged.
				1019	*/
				1020	if (early_section(__pfn_to_section(page_to_pfn(page))))
				1021	zone->present_early_pages += nr_pages;
David Hildenbrand	f990114	2021-05-04 18:39:39 -0700	[diff] [blame]	1022	zone->present_pages += nr_pages;
David Hildenbrand	f990114	2021-05-04 18:39:39 -0700	[diff] [blame]	1023	zone->zone_pgdat->node_present_pages += nr_pages;
David Hildenbrand	836809e	2021-09-07 19:55:30 -0700	[diff] [blame]	1024
				1025	if (group && movable)
				1026	group->present_movable_pages += nr_pages;
				1027	else if (group && !movable)
				1028	group->present_kernel_pages += nr_pages;
David Hildenbrand	f990114	2021-05-04 18:39:39 -0700	[diff] [blame]	1029	}
				1030
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	1031	int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
				1032	struct zone *zone)
				1033	{
				1034	unsigned long end_pfn = pfn + nr_pages;
				1035	int ret;
				1036
				1037	ret = kasan_add_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
				1038	if (ret)
				1039	return ret;
				1040
				1041	move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE);
				1042
				1043	/*
				1044	* It might be that the vmemmap_pages fully span sections. If that is
				1045	* the case, mark those sections online here as otherwise they will be
				1046	* left offline.
				1047	*/
				1048	if (nr_pages >= PAGES_PER_SECTION)
				1049	online_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION));
				1050
				1051	return ret;
				1052	}
				1053
				1054	void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages)
				1055	{
				1056	unsigned long end_pfn = pfn + nr_pages;
				1057
				1058	/*
				1059	* It might be that the vmemmap_pages fully span sections. If that is
				1060	* the case, mark those sections offline here as otherwise they will be
				1061	* left online.
				1062	*/
				1063	if (nr_pages >= PAGES_PER_SECTION)
				1064	offline_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION));
				1065
				1066	/*
				1067	* The pages associated with this vmemmap have been offlined, so
				1068	* we can reset its state here.
				1069	*/
				1070	remove_pfn_range_from_zone(page_zone(pfn_to_page(pfn)), pfn, nr_pages);
				1071	kasan_remove_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
				1072	}
				1073
David Hildenbrand	836809e	2021-09-07 19:55:30 -0700	[diff] [blame]	1074	int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
				1075	struct zone zone, struct memory_group group)
KAMEZAWA Hiroyuki	75884fb	2007-10-16 01:26:10 -0700	[diff] [blame]	1076	{
Cody P Schafer	aa47228	2013-07-03 15:02:10 -0700	[diff] [blame]	1077	unsigned long flags;
Yasunori Goto	6811378	2006-06-23 02:03:11 -0700	[diff] [blame]	1078	int need_zonelists_rebuild = 0;
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	1079	const int nid = zone_to_nid(zone);
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	1080	int ret;
				1081	struct memory_notify arg;
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	1082
Oscar Salvador	dd8e2f2	2021-05-04 18:39:36 -0700	[diff] [blame]	1083	/*
				1084	* {on,off}lining is constrained to full memory sections (or more
Zhen Lei	041711c	2021-06-30 18:53:17 -0700	[diff] [blame]	1085	* precisely to memory blocks from the user space POV).
Oscar Salvador	dd8e2f2	2021-05-04 18:39:36 -0700	[diff] [blame]	1086	* memmap_on_memory is an exception because it reserves initial part
				1087	* of the physical memory space for vmemmaps. That space is pageblock
				1088	* aligned.
				1089	*/
David Hildenbrand	4986fac	2020-10-15 20:07:50 -0700	[diff] [blame]	1090	if (WARN_ON_ONCE(!nr_pages \|\|
Oscar Salvador	dd8e2f2	2021-05-04 18:39:36 -0700	[diff] [blame]	1091	!IS_ALIGNED(pfn, pageblock_nr_pages) \|\|
				1092	!IS_ALIGNED(pfn + nr_pages, PAGES_PER_SECTION)))
David Hildenbrand	4986fac	2020-10-15 20:07:50 -0700	[diff] [blame]	1093	return -EINVAL;
				1094
David Hildenbrand	381eab4	2018-10-30 15:10:29 -0700	[diff] [blame]	1095	mem_hotplug_begin();
				1096
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	1097	/* associate pfn range with the zone */
David Hildenbrand	b30c592	2020-10-15 20:08:23 -0700	[diff] [blame]	1098	move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE);
Michal Hocko	f1dd2cd	2017-07-06 15:38:11 -0700	[diff] [blame]	1099
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	1100	arg.start_pfn = pfn;
				1101	arg.nr_pages = nr_pages;
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1102	node_states_check_changes_online(nr_pages, zone, &arg);
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	1103
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	1104	ret = memory_notify(MEM_GOING_ONLINE, &arg);
				1105	ret = notifier_to_errno(ret);
Chen Yucong	e33e33b	2016-03-17 14:19:35 -0700	[diff] [blame]	1106	if (ret)
				1107	goto failed_addition;
				1108
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	1109	/*
David Hildenbrand	b30c592	2020-10-15 20:08:23 -0700	[diff] [blame]	1110	* Fixup the number of isolated pageblocks before marking the sections
				1111	* onlining, such that undo_isolate_page_range() works correctly.
				1112	*/
				1113	spin_lock_irqsave(&zone->lock, flags);
				1114	zone->nr_isolate_pageblock += nr_pages / pageblock_nr_pages;
				1115	spin_unlock_irqrestore(&zone->lock, flags);
				1116
				1117	/*
Yasunori Goto	6811378	2006-06-23 02:03:11 -0700	[diff] [blame]	1118	* If this zone is not populated, then it is not in zonelist.
				1119	* This means the page allocator ignores this zone.
				1120	* So, zonelist must be updated after online.
				1121	*/
Wen Congyang	6dcd73d	2012-12-11 16:01:01 -0800	[diff] [blame]	1122	if (!populated_zone(zone)) {
Yasunori Goto	6811378	2006-06-23 02:03:11 -0700	[diff] [blame]	1123	need_zonelists_rebuild = 1;
Michal Hocko	72675e1	2017-09-06 16:20:24 -0700	[diff] [blame]	1124	setup_zone_pageset(zone);
Wen Congyang	6dcd73d	2012-12-11 16:01:01 -0800	[diff] [blame]	1125	}
Yasunori Goto	6811378	2006-06-23 02:03:11 -0700	[diff] [blame]	1126
David Hildenbrand	aac6532	2020-10-15 20:08:11 -0700	[diff] [blame]	1127	online_pages_range(pfn, nr_pages);
David Hildenbrand	836809e	2021-09-07 19:55:30 -0700	[diff] [blame]	1128	adjust_present_page_count(pfn_to_page(pfn), group, nr_pages);
Cody P Schafer	aa47228	2013-07-03 15:02:10 -0700	[diff] [blame]	1129
David Hildenbrand	b30c592	2020-10-15 20:08:23 -0700	[diff] [blame]	1130	node_states_set_node(nid, &arg);
				1131	if (need_zonelists_rebuild)
				1132	build_all_zonelists(NULL);
David Hildenbrand	b30c592	2020-10-15 20:08:23 -0700	[diff] [blame]	1133
				1134	/* Basic onlining is complete, allow allocation of onlined pages. */
				1135	undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE);
				1136
David Hildenbrand	93146d9	2020-08-06 23:25:35 -0700	[diff] [blame]	1137	/*
David Hildenbrand	b86c5fc	2020-10-15 20:09:39 -0700	[diff] [blame]	1138	* Freshly onlined pages aren't shuffled (e.g., all pages are placed to
				1139	* the tail of the freelist when undoing isolation). Shuffle the whole
				1140	* zone to make sure the just onlined pages are properly distributed
				1141	* across the whole freelist - to create an initial shuffle.
David Hildenbrand	93146d9	2020-08-06 23:25:35 -0700	[diff] [blame]	1142	*/
Dan Williams	e900a91	2019-05-14 15:41:28 -0700	[diff] [blame]	1143	shuffle_zone(zone);
				1144
Mel Gorman	b92ca18	2021-06-28 19:42:12 -0700	[diff] [blame]	1145	/* reinitialise watermarks and update pcp limits */
KOSAKI Motohiro	1b79acc	2011-05-24 17:11:32 -0700	[diff] [blame]	1146	init_per_zone_wmark_min();
				1147
David Hildenbrand	ca9a46f	2019-09-23 15:36:08 -0700	[diff] [blame]	1148	kswapd_run(nid);
				1149	kcompactd_run(nid);
Dave Hansen	61b1399	2005-10-29 18:16:56 -0700	[diff] [blame]	1150
Chandra Seetharaman	2d1d43f	2006-09-29 02:01:25 -0700	[diff] [blame]	1151	writeback_set_ratelimit();
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	1152
David Hildenbrand	ca9a46f	2019-09-23 15:36:08 -0700	[diff] [blame]	1153	memory_notify(MEM_ONLINE, &arg);
David Hildenbrand	381eab4	2018-10-30 15:10:29 -0700	[diff] [blame]	1154	mem_hotplug_done();
David Rientjes	30467e0	2015-04-14 15:45:11 -0700	[diff] [blame]	1155	return 0;
Chen Yucong	e33e33b	2016-03-17 14:19:35 -0700	[diff] [blame]	1156
				1157	failed_addition:
				1158	pr_debug("online_pages [mem %#010llx-%#010llx] failed\n",
				1159	(unsigned long long) pfn << PAGE_SHIFT,
				1160	(((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1);
				1161	memory_notify(MEM_CANCEL_ONLINE, &arg);
David Hildenbrand	feee6b2	2020-01-04 12:59:33 -0800	[diff] [blame]	1162	remove_pfn_range_from_zone(zone, pfn, nr_pages);
David Hildenbrand	381eab4	2018-10-30 15:10:29 -0700	[diff] [blame]	1163	mem_hotplug_done();
Chen Yucong	e33e33b	2016-03-17 14:19:35 -0700	[diff] [blame]	1164	return ret;
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	1165	}
Keith Mannthey	5394702	2006-09-30 23:27:08 -0700	[diff] [blame]	1166	#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
Yasunori Goto	bc02af9	2006-06-27 02:53:30 -0700	[diff] [blame]	1167
Tang Chen	0bd8542	2014-11-13 15:19:41 -0800	[diff] [blame]	1168	static void reset_node_present_pages(pg_data_t *pgdat)
				1169	{
				1170	struct zone *z;
				1171
				1172	for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
				1173	z->present_pages = 0;
				1174
				1175	pgdat->node_present_pages = 0;
				1176	}
				1177
Hidetoshi Seto	e131933	2009-11-17 14:06:18 -0800	[diff] [blame]	1178	/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
David Hildenbrand	c68ab18	2020-06-04 16:48:35 -0700	[diff] [blame]	1179	static pg_data_t __ref *hotadd_new_pgdat(int nid)
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1180	{
				1181	struct pglist_data *pgdat;
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1182
Tang Chen	a1e565a	2013-02-22 16:33:18 -0800	[diff] [blame]	1183	pgdat = NODE_DATA(nid);
				1184	if (!pgdat) {
				1185	pgdat = arch_alloc_nodedata(nid);
				1186	if (!pgdat)
				1187	return NULL;
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1188
Wei Yang	33fce01	2019-09-23 15:35:52 -0700	[diff] [blame]	1189	pgdat->per_cpu_nodestats =
				1190	alloc_percpu(struct per_cpu_nodestat);
Tang Chen	a1e565a	2013-02-22 16:33:18 -0800	[diff] [blame]	1191	arch_refresh_nodedata(nid, pgdat);
Gu Zheng	b0dc3a3	2015-03-25 15:55:20 -0700	[diff] [blame]	1192	} else {
Wei Yang	33fce01	2019-09-23 15:35:52 -0700	[diff] [blame]	1193	int cpu;
Mel Gorman	e716f2e	2017-05-03 14:53:45 -0700	[diff] [blame]	1194	/*
Joonsoo Kim	97a225e	2020-06-03 15:59:01 -0700	[diff] [blame]	1195	* Reset the nr_zones, order and highest_zoneidx before reuse.
				1196	* Note that kswapd will init kswapd_highest_zoneidx properly
Mel Gorman	e716f2e	2017-05-03 14:53:45 -0700	[diff] [blame]	1197	* when it starts in the near future.
				1198	*/
Gu Zheng	b0dc3a3	2015-03-25 15:55:20 -0700	[diff] [blame]	1199	pgdat->nr_zones = 0;
Mel Gorman	38087d9	2016-07-28 15:45:49 -0700	[diff] [blame]	1200	pgdat->kswapd_order = 0;
Joonsoo Kim	97a225e	2020-06-03 15:59:01 -0700	[diff] [blame]	1201	pgdat->kswapd_highest_zoneidx = 0;
Wei Yang	33fce01	2019-09-23 15:35:52 -0700	[diff] [blame]	1202	for_each_online_cpu(cpu) {
				1203	struct per_cpu_nodestat *p;
				1204
				1205	p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
				1206	memset(p, 0, sizeof(*p));
				1207	}
Tang Chen	a1e565a	2013-02-22 16:33:18 -0800	[diff] [blame]	1208	}
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1209
				1210	/* we can use NODE_DATA(nid) from here */
Oscar Salvador	03e85f9	2018-08-21 21:53:43 -0700	[diff] [blame]	1211	pgdat->node_id = nid;
David Hildenbrand	c68ab18	2020-06-04 16:48:35 -0700	[diff] [blame]	1212	pgdat->node_start_pfn = 0;
Oscar Salvador	03e85f9	2018-08-21 21:53:43 -0700	[diff] [blame]	1213
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1214	/* init node's zones as empty zones, we don't have any present pages.*/
Oscar Salvador	03e85f9	2018-08-21 21:53:43 -0700	[diff] [blame]	1215	free_area_init_core_hotplug(nid);
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1216
KAMEZAWA Hiroyuki	959ecc4	2011-06-15 15:08:38 -0700	[diff] [blame]	1217	/*
				1218	* The node we allocated has no zone fallback lists. For avoiding
				1219	* to access not-initialized zonelist, build here.
				1220	*/
Michal Hocko	72675e1	2017-09-06 16:20:24 -0700	[diff] [blame]	1221	build_all_zonelists(pgdat);
KAMEZAWA Hiroyuki	959ecc4	2011-06-15 15:08:38 -0700	[diff] [blame]	1222
Tang Chen	f784a3f	2014-11-13 15:19:39 -0800	[diff] [blame]	1223	/*
Tang Chen	0bd8542	2014-11-13 15:19:41 -0800	[diff] [blame]	1224	* When memory is hot-added, all the memory is in offline state. So
				1225	* clear all zones' present_pages because they will be updated in
				1226	* online_pages() and offline_pages().
				1227	*/
Oscar Salvador	03e85f9	2018-08-21 21:53:43 -0700	[diff] [blame]	1228	reset_node_managed_pages(pgdat);
Tang Chen	0bd8542	2014-11-13 15:19:41 -0800	[diff] [blame]	1229	reset_node_present_pages(pgdat);
				1230
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1231	return pgdat;
				1232	}
				1233
Oscar Salvador	b9ff036	2018-08-17 15:46:15 -0700	[diff] [blame]	1234	static void rollback_node_hotadd(int nid)
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1235	{
Oscar Salvador	b9ff036	2018-08-17 15:46:15 -0700	[diff] [blame]	1236	pg_data_t *pgdat = NODE_DATA(nid);
				1237
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1238	arch_refresh_nodedata(nid, NULL);
Reza Arbab	5830169	2016-08-11 15:33:12 -0700	[diff] [blame]	1239	free_percpu(pgdat->per_cpu_nodestats);
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1240	arch_free_nodedata(pgdat);
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1241	}
				1242
KAMEZAWA Hiroyuki	0a54703	2006-06-27 02:53:35 -0700	[diff] [blame]	1243
Mel Gorman	ba2d266	2021-06-30 18:53:35 -0700	[diff] [blame]	1244	/*
				1245	* __try_online_node - online a node if offlined
Mike Rapoport	e8b098f	2018-04-05 16:24:57 -0700	[diff] [blame]	1246	* @nid: the node ID
Oscar Salvador	b9ff036	2018-08-17 15:46:15 -0700	[diff] [blame]	1247	* @set_node_online: Whether we want to online the node
minskey guo	cf23422	2010-05-24 14:32:41 -0700	[diff] [blame]	1248	* called by cpu_up() to online a node without onlined memory.
Oscar Salvador	b9ff036	2018-08-17 15:46:15 -0700	[diff] [blame]	1249	*
				1250	* Returns:
				1251	* 1 -> a new node has been allocated
				1252	* 0 -> the node is already online
				1253	* -ENOMEM -> the node could not be allocated
minskey guo	cf23422	2010-05-24 14:32:41 -0700	[diff] [blame]	1254	*/
David Hildenbrand	c68ab18	2020-06-04 16:48:35 -0700	[diff] [blame]	1255	static int __try_online_node(int nid, bool set_node_online)
minskey guo	cf23422	2010-05-24 14:32:41 -0700	[diff] [blame]	1256	{
Oscar Salvador	b9ff036	2018-08-17 15:46:15 -0700	[diff] [blame]	1257	pg_data_t *pgdat;
				1258	int ret = 1;
minskey guo	cf23422	2010-05-24 14:32:41 -0700	[diff] [blame]	1259
Toshi Kani	01b0f19	2013-11-12 15:07:25 -0800	[diff] [blame]	1260	if (node_online(nid))
				1261	return 0;
				1262
David Hildenbrand	c68ab18	2020-06-04 16:48:35 -0700	[diff] [blame]	1263	pgdat = hotadd_new_pgdat(nid);
David Rientjes	7553e8f	2011-06-22 18:13:01 -0700	[diff] [blame]	1264	if (!pgdat) {
Toshi Kani	01b0f19	2013-11-12 15:07:25 -0800	[diff] [blame]	1265	pr_err("Cannot online node %d due to NULL pgdat\n", nid);
minskey guo	cf23422	2010-05-24 14:32:41 -0700	[diff] [blame]	1266	ret = -ENOMEM;
				1267	goto out;
				1268	}
Oscar Salvador	b9ff036	2018-08-17 15:46:15 -0700	[diff] [blame]	1269
				1270	if (set_node_online) {
				1271	node_set_online(nid);
				1272	ret = register_one_node(nid);
				1273	BUG_ON(ret);
				1274	}
minskey guo	cf23422	2010-05-24 14:32:41 -0700	[diff] [blame]	1275	out:
Oscar Salvador	b9ff036	2018-08-17 15:46:15 -0700	[diff] [blame]	1276	return ret;
				1277	}
				1278
				1279	/*
				1280	* Users of this function always want to online/register the node
				1281	*/
				1282	int try_online_node(int nid)
				1283	{
				1284	int ret;
				1285
				1286	mem_hotplug_begin();
David Hildenbrand	c68ab18	2020-06-04 16:48:35 -0700	[diff] [blame]	1287	ret = __try_online_node(nid, true);
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	1288	mem_hotplug_done();
minskey guo	cf23422	2010-05-24 14:32:41 -0700	[diff] [blame]	1289	return ret;
				1290	}
				1291
Toshi Kani	27356f5	2013-09-11 14:21:49 -0700	[diff] [blame]	1292	static int check_hotplug_memory_range(u64 start, u64 size)
				1293	{
Pavel Tatashin	ba32558	2018-04-05 16:22:39 -0700	[diff] [blame]	1294	/* memory range must be block size aligned */
David Hildenbrand	cec3ebd	2019-07-18 15:56:25 -0700	[diff] [blame]	1295	if (!size \|\| !IS_ALIGNED(start, memory_block_size_bytes()) \|\|
				1296	!IS_ALIGNED(size, memory_block_size_bytes())) {
Pavel Tatashin	ba32558	2018-04-05 16:22:39 -0700	[diff] [blame]	1297	pr_err("Block size [%#lx] unaligned hotplug range: start %#llx, size %#llx",
David Hildenbrand	cec3ebd	2019-07-18 15:56:25 -0700	[diff] [blame]	1298	memory_block_size_bytes(), start, size);
Toshi Kani	27356f5	2013-09-11 14:21:49 -0700	[diff] [blame]	1299	return -EINVAL;
				1300	}
				1301
				1302	return 0;
				1303	}
				1304
Vitaly Kuznetsov	31bc385	2016-03-15 14:56:48 -0700	[diff] [blame]	1305	static int online_memory_block(struct memory_block mem, void arg)
				1306	{
Anshuman Khandual	1adf8b4	2021-02-25 17:17:13 -0800	[diff] [blame]	1307	mem->online_type = mhp_default_online_type;
Nathan Fontenot	dc18d70	2017-02-24 15:00:02 -0800	[diff] [blame]	1308	return device_online(&mem->dev);
Vitaly Kuznetsov	31bc385	2016-03-15 14:56:48 -0700	[diff] [blame]	1309	}
				1310
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	1311	bool mhp_supports_memmap_on_memory(unsigned long size)
				1312	{
				1313	unsigned long nr_vmemmap_pages = size / PAGE_SIZE;
				1314	unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
				1315	unsigned long remaining_size = size - vmemmap_size;
				1316
				1317	/*
				1318	* Besides having arch support and the feature enabled at runtime, we
				1319	* need a few more assumptions to hold true:
				1320	*
				1321	* a) We span a single memory block: memory onlining/offlinin;g happens
				1322	* in memory block granularity. We don't want the vmemmap of online
				1323	* memory blocks to reside on offline memory blocks. In the future,
				1324	* we might want to support variable-sized memory blocks to make the
				1325	* feature more versatile.
				1326	*
				1327	* b) The vmemmap pages span complete PMDs: We don't want vmemmap code
				1328	* to populate memory from the altmap for unrelated parts (i.e.,
				1329	* other memory blocks)
				1330	*
				1331	* c) The vmemmap pages (and thereby the pages that will be exposed to
				1332	* the buddy) have to cover full pageblocks: memory onlining/offlining
				1333	* code requires applicable ranges to be page-aligned, for example, to
				1334	* set the migratetypes properly.
				1335	*
				1336	* TODO: Although we have a check here to make sure that vmemmap pages
				1337	* fully populate a PMD, it is not the right place to check for
				1338	* this. A much better solution involves improving vmemmap code
				1339	* to fallback to base pages when trying to populate vmemmap using
				1340	* altmap as an alternative source of memory, and we do not exactly
				1341	* populate a single PMD.
				1342	*/
				1343	return memmap_on_memory &&
Muchun Song	2d7a217	2021-06-30 18:48:25 -0700	[diff] [blame]	1344	!hugetlb_free_vmemmap_enabled &&
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	1345	IS_ENABLED(CONFIG_MHP_MEMMAP_ON_MEMORY) &&
				1346	size == memory_block_size_bytes() &&
				1347	IS_ALIGNED(vmemmap_size, PMD_SIZE) &&
				1348	IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT));
				1349	}
				1350
David Hildenbrand	8df1d0e	2018-10-30 15:10:24 -0700	[diff] [blame]	1351	/*
				1352	* NOTE: The caller must call lock_device_hotplug() to serialize hotplug
				1353	* and online/offline operations (triggered e.g. by sysfs).
				1354	*
				1355	* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG
				1356	*/
David Hildenbrand	b611719	2020-10-15 20:08:44 -0700	[diff] [blame]	1357	int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
Yasunori Goto	bc02af9	2006-06-27 02:53:30 -0700	[diff] [blame]	1358	{
Catalin Marinas	d15dfd3	2021-03-09 12:26:01 +0000	[diff] [blame]	1359	struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	1360	struct vmem_altmap mhp_altmap = {};
David Hildenbrand	028fc57	2021-09-07 19:55:26 -0700	[diff] [blame]	1361	struct memory_group *group = NULL;
David Vrabel	62cedb9	2015-06-25 16:35:49 +0100	[diff] [blame]	1362	u64 start, size;
Oscar Salvador	b9ff036	2018-08-17 15:46:15 -0700	[diff] [blame]	1363	bool new_node = false;
Yasunori Goto	bc02af9	2006-06-27 02:53:30 -0700	[diff] [blame]	1364	int ret;
				1365
David Vrabel	62cedb9	2015-06-25 16:35:49 +0100	[diff] [blame]	1366	start = res->start;
				1367	size = resource_size(res);
				1368
Toshi Kani	27356f5	2013-09-11 14:21:49 -0700	[diff] [blame]	1369	ret = check_hotplug_memory_range(start, size);
				1370	if (ret)
				1371	return ret;
				1372
David Hildenbrand	028fc57	2021-09-07 19:55:26 -0700	[diff] [blame]	1373	if (mhp_flags & MHP_NID_IS_MGID) {
				1374	group = memory_group_find_by_id(nid);
				1375	if (!group)
				1376	return -EINVAL;
				1377	nid = group->nid;
				1378	}
				1379
Vishal Verma	fa6d9ec	2020-06-04 16:48:25 -0700	[diff] [blame]	1380	if (!node_possible(nid)) {
				1381	WARN(1, "node %d was absent from the node_possible_map\n", nid);
				1382	return -EINVAL;
				1383	}
				1384
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	1385	mem_hotplug_begin();
Nathan Zimmer	ac13c46	2014-01-23 15:53:26 -0800	[diff] [blame]	1386
David Hildenbrand	49e0fcb	2021-11-05 13:44:42 -0700	[diff] [blame]	1387	if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
David Hildenbrand	d9a9d8e	2021-11-05 13:44:49 -0700	[diff] [blame]	1388	ret = memblock_add_node(start, size, nid, MEMBLOCK_NONE);
David Hildenbrand	49e0fcb	2021-11-05 13:44:42 -0700	[diff] [blame]	1389	if (ret)
				1390	goto error_mem_hotplug_end;
				1391	}
Tang Chen	7f36e3e	2015-09-04 15:42:32 -0700	[diff] [blame]	1392
David Hildenbrand	c68ab18	2020-06-04 16:48:35 -0700	[diff] [blame]	1393	ret = __try_online_node(nid, false);
Oscar Salvador	b9ff036	2018-08-17 15:46:15 -0700	[diff] [blame]	1394	if (ret < 0)
				1395	goto error;
				1396	new_node = ret;
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1397
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	1398	/*
				1399	* Self hosted memmap array
				1400	*/
				1401	if (mhp_flags & MHP_MEMMAP_ON_MEMORY) {
				1402	if (!mhp_supports_memmap_on_memory(size)) {
				1403	ret = -EINVAL;
				1404	goto error;
				1405	}
				1406	mhp_altmap.free = PHYS_PFN(size);
				1407	mhp_altmap.base_pfn = PHYS_PFN(start);
				1408	params.altmap = &mhp_altmap;
				1409	}
				1410
Yasunori Goto	bc02af9	2006-06-27 02:53:30 -0700	[diff] [blame]	1411	/* call arch's memory hotadd */
Logan Gunthorpe	f5637d3	2020-04-10 14:33:21 -0700	[diff] [blame]	1412	ret = arch_add_memory(nid, start, size, &params);
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1413	if (ret < 0)
				1414	goto error;
				1415
David Hildenbrand	db051a0	2019-07-18 15:56:56 -0700	[diff] [blame]	1416	/* create memory block devices after memory was added */
David Hildenbrand	028fc57	2021-09-07 19:55:26 -0700	[diff] [blame]	1417	ret = create_memory_block_devices(start, size, mhp_altmap.alloc,
				1418	group);
David Hildenbrand	db051a0	2019-07-18 15:56:56 -0700	[diff] [blame]	1419	if (ret) {
David Hildenbrand	65a2aa5	2021-09-07 19:55:04 -0700	[diff] [blame]	1420	arch_remove_memory(start, size, NULL);
David Hildenbrand	db051a0	2019-07-18 15:56:56 -0700	[diff] [blame]	1421	goto error;
				1422	}
				1423
Tang Chen	a1e565a	2013-02-22 16:33:18 -0800	[diff] [blame]	1424	if (new_node) {
Oscar Salvador	d5b6f6a3	2018-08-17 15:46:18 -0700	[diff] [blame]	1425	/* If sysfs file of new node can't be created, cpu on the node
Yasunori Goto	0fc4415	2006-06-27 02:53:38 -0700	[diff] [blame]	1426	* can't be hot-added. There is no rollback way now.
				1427	* So, check by BUG_ON() to catch it reluctantly..
Oscar Salvador	d5b6f6a3	2018-08-17 15:46:18 -0700	[diff] [blame]	1428	* We online node here. We can't roll back from here.
Yasunori Goto	0fc4415	2006-06-27 02:53:38 -0700	[diff] [blame]	1429	*/
Oscar Salvador	d5b6f6a3	2018-08-17 15:46:18 -0700	[diff] [blame]	1430	node_set_online(nid);
				1431	ret = __register_one_node(nid);
Yasunori Goto	0fc4415	2006-06-27 02:53:38 -0700	[diff] [blame]	1432	BUG_ON(ret);
				1433	}
				1434
Oscar Salvador	d5b6f6a3	2018-08-17 15:46:18 -0700	[diff] [blame]	1435	/* link memory sections under this node.*/
Laurent Dufour	90c7eae	2020-10-15 20:09:15 -0700	[diff] [blame]	1436	link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1),
				1437	MEMINIT_HOTPLUG);
Oscar Salvador	d5b6f6a3	2018-08-17 15:46:18 -0700	[diff] [blame]	1438
akpm@linux-foundation.org	d96ae53	2010-03-05 13:41:58 -0800	[diff] [blame]	1439	/* create new memmap entry */
David Hildenbrand	7b7b272	2020-06-04 16:48:41 -0700	[diff] [blame]	1440	if (!strcmp(res->name, "System RAM"))
				1441	firmware_map_add_hotplug(start, start + size, "System RAM");
akpm@linux-foundation.org	d96ae53	2010-03-05 13:41:58 -0800	[diff] [blame]	1442
David Hildenbrand	381eab4	2018-10-30 15:10:29 -0700	[diff] [blame]	1443	/* device_online() will take the lock when calling online_pages() */
				1444	mem_hotplug_done();
				1445
David Hildenbrand	9ca6551	2020-10-15 20:08:49 -0700	[diff] [blame]	1446	/*
				1447	* In case we're allowed to merge the resource, flag it and trigger
				1448	* merging now that adding succeeded.
				1449	*/
David Hildenbrand	2601126	2021-02-25 17:17:17 -0800	[diff] [blame]	1450	if (mhp_flags & MHP_MERGE_RESOURCE)
David Hildenbrand	9ca6551	2020-10-15 20:08:49 -0700	[diff] [blame]	1451	merge_system_ram_resource(res);
				1452
Vitaly Kuznetsov	31bc385	2016-03-15 14:56:48 -0700	[diff] [blame]	1453	/* online pages if requested */
Anshuman Khandual	1adf8b4	2021-02-25 17:17:13 -0800	[diff] [blame]	1454	if (mhp_default_online_type != MMOP_OFFLINE)
David Hildenbrand	fbcf73c	2019-07-18 15:57:46 -0700	[diff] [blame]	1455	walk_memory_blocks(start, size, NULL, online_memory_block);
Vitaly Kuznetsov	31bc385	2016-03-15 14:56:48 -0700	[diff] [blame]	1456
David Hildenbrand	381eab4	2018-10-30 15:10:29 -0700	[diff] [blame]	1457	return ret;
Yasunori Goto	9af3c2d	2006-06-27 02:53:34 -0700	[diff] [blame]	1458	error:
				1459	/* rollback pgdat allocation and others */
Oscar Salvador	b9ff036	2018-08-17 15:46:15 -0700	[diff] [blame]	1460	if (new_node)
				1461	rollback_node_hotadd(nid);
David Hildenbrand	52219ae	2020-06-04 16:48:38 -0700	[diff] [blame]	1462	if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
				1463	memblock_remove(start, size);
David Hildenbrand	49e0fcb	2021-11-05 13:44:42 -0700	[diff] [blame]	1464	error_mem_hotplug_end:
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	1465	mem_hotplug_done();
Yasunori Goto	bc02af9	2006-06-27 02:53:30 -0700	[diff] [blame]	1466	return ret;
				1467	}
David Vrabel	62cedb9	2015-06-25 16:35:49 +0100	[diff] [blame]	1468
David Hildenbrand	8df1d0e	2018-10-30 15:10:24 -0700	[diff] [blame]	1469	/* requires device_hotplug_lock, see add_memory_resource() */
David Hildenbrand	b611719	2020-10-15 20:08:44 -0700	[diff] [blame]	1470	int __ref __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags)
David Vrabel	62cedb9	2015-06-25 16:35:49 +0100	[diff] [blame]	1471	{
				1472	struct resource *res;
				1473	int ret;
				1474
David Hildenbrand	7b7b272	2020-06-04 16:48:41 -0700	[diff] [blame]	1475	res = register_memory_resource(start, size, "System RAM");
Vitaly Kuznetsov	6f754ba	2016-01-14 15:21:55 -0800	[diff] [blame]	1476	if (IS_ERR(res))
				1477	return PTR_ERR(res);
David Vrabel	62cedb9	2015-06-25 16:35:49 +0100	[diff] [blame]	1478
David Hildenbrand	b611719	2020-10-15 20:08:44 -0700	[diff] [blame]	1479	ret = add_memory_resource(nid, res, mhp_flags);
David Vrabel	62cedb9	2015-06-25 16:35:49 +0100	[diff] [blame]	1480	if (ret < 0)
				1481	release_memory_resource(res);
				1482	return ret;
				1483	}
David Hildenbrand	8df1d0e	2018-10-30 15:10:24 -0700	[diff] [blame]	1484
David Hildenbrand	b611719	2020-10-15 20:08:44 -0700	[diff] [blame]	1485	int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags)
David Hildenbrand	8df1d0e	2018-10-30 15:10:24 -0700	[diff] [blame]	1486	{
				1487	int rc;
				1488
				1489	lock_device_hotplug();
David Hildenbrand	b611719	2020-10-15 20:08:44 -0700	[diff] [blame]	1490	rc = __add_memory(nid, start, size, mhp_flags);
David Hildenbrand	8df1d0e	2018-10-30 15:10:24 -0700	[diff] [blame]	1491	unlock_device_hotplug();
				1492
				1493	return rc;
				1494	}
Yasunori Goto	bc02af9	2006-06-27 02:53:30 -0700	[diff] [blame]	1495	EXPORT_SYMBOL_GPL(add_memory);
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1496
Sudarshan Rajagopalan	42db6c2	2021-12-07 13:14:46 -0800	[diff] [blame]	1497	int add_memory_subsection(int nid, u64 start, u64 size)
				1498	{
				1499	struct mhp_params params = { .pgprot = PAGE_KERNEL };
				1500	struct resource *res;
				1501	int ret;
				1502
Sudarshan Rajagopalan	42db6c2	2021-12-07 13:14:46 -0800	[diff] [blame]	1503	if (!IS_ALIGNED(start, SUBSECTION_SIZE) \|\|
				1504	!IS_ALIGNED(size, SUBSECTION_SIZE)) {
Suren Baghdasaryan	81a34d6	2021-12-13 11:50:44 -0800	[diff] [blame]	1505	pr_err("%s: start 0x%llx size 0x%llx not aligned to subsection size\n",
Sudarshan Rajagopalan	42db6c2	2021-12-07 13:14:46 -0800	[diff] [blame]	1506	__func__, start, size);
				1507	return -EINVAL;
				1508	}
				1509
				1510	res = register_memory_resource(start, size, "System RAM");
				1511	if (IS_ERR(res))
				1512	return PTR_ERR(res);
				1513
				1514	mem_hotplug_begin();
				1515
				1516	nid = memory_add_physaddr_to_nid(start);
				1517
				1518	if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
Greg Kroah-Hartman	3355035	2024-01-03 16:21:03 +0000	[diff] [blame]	1519	memblock_add_node(start, size, nid, MEMBLOCK_NONE);
Sudarshan Rajagopalan	42db6c2	2021-12-07 13:14:46 -0800	[diff] [blame]	1520
				1521	ret = arch_add_memory(nid, start, size, &params);
				1522	if (ret) {
Suren Baghdasaryan	81a34d6	2021-12-13 11:50:44 -0800	[diff] [blame]	1523	pr_err("%s failed to add subsection start 0x%llx size 0x%llx\n",
Sudarshan Rajagopalan	42db6c2	2021-12-07 13:14:46 -0800	[diff] [blame]	1524	__func__, start, size);
Patrick Daly	d61f6702	2022-08-22 18:00:51 -0700	[diff] [blame]	1525	goto err_add_memory;
Sudarshan Rajagopalan	42db6c2	2021-12-07 13:14:46 -0800	[diff] [blame]	1526	}
				1527	mem_hotplug_done();
				1528
				1529	return ret;
Patrick Daly	d61f6702	2022-08-22 18:00:51 -0700	[diff] [blame]	1530
				1531	err_add_memory:
				1532	if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
				1533	memblock_remove(start, size);
				1534
				1535	mem_hotplug_done();
				1536
				1537	release_memory_resource(res);
				1538	return ret;
Sudarshan Rajagopalan	42db6c2	2021-12-07 13:14:46 -0800	[diff] [blame]	1539	}
				1540	EXPORT_SYMBOL_GPL(add_memory_subsection);
				1541
David Hildenbrand	7b7b272	2020-06-04 16:48:41 -0700	[diff] [blame]	1542	/*
				1543	* Add special, driver-managed memory to the system as system RAM. Such
				1544	* memory is not exposed via the raw firmware-provided memmap as system
				1545	* RAM, instead, it is detected and added by a driver - during cold boot,
				1546	* after a reboot, and after kexec.
				1547	*
				1548	* Reasons why this memory should not be used for the initial memmap of a
				1549	* kexec kernel or for placing kexec images:
				1550	* - The booting kernel is in charge of determining how this memory will be
				1551	* used (e.g., use persistent memory as system RAM)
				1552	* - Coordination with a hypervisor is required before this memory
				1553	* can be used (e.g., inaccessible parts).
				1554	*
				1555	* For this memory, no entries in /sys/firmware/memmap ("raw firmware-provided
				1556	* memory map") are created. Also, the created memory resource is flagged
David Hildenbrand	7cf603d	2020-10-15 20:08:33 -0700	[diff] [blame]	1557	* with IORESOURCE_SYSRAM_DRIVER_MANAGED, so in-kernel users can special-case
David Hildenbrand	7b7b272	2020-06-04 16:48:41 -0700	[diff] [blame]	1558	* this memory as well (esp., not place kexec images onto it).
				1559	*
				1560	* The resource_name (visible via /proc/iomem) has to have the format
				1561	* "System RAM ($DRIVER)".
				1562	*/
				1563	int add_memory_driver_managed(int nid, u64 start, u64 size,
David Hildenbrand	b611719	2020-10-15 20:08:44 -0700	[diff] [blame]	1564	const char *resource_name, mhp_t mhp_flags)
David Hildenbrand	7b7b272	2020-06-04 16:48:41 -0700	[diff] [blame]	1565	{
				1566	struct resource *res;
				1567	int rc;
				1568
				1569	if (!resource_name \|\|
				1570	strstr(resource_name, "System RAM (") != resource_name \|\|
				1571	resource_name[strlen(resource_name) - 1] != ')')
				1572	return -EINVAL;
				1573
				1574	lock_device_hotplug();
				1575
				1576	res = register_memory_resource(start, size, resource_name);
				1577	if (IS_ERR(res)) {
				1578	rc = PTR_ERR(res);
				1579	goto out_unlock;
				1580	}
				1581
David Hildenbrand	b611719	2020-10-15 20:08:44 -0700	[diff] [blame]	1582	rc = add_memory_resource(nid, res, mhp_flags);
David Hildenbrand	7b7b272	2020-06-04 16:48:41 -0700	[diff] [blame]	1583	if (rc < 0)
				1584	release_memory_resource(res);
				1585
				1586	out_unlock:
				1587	unlock_device_hotplug();
				1588	return rc;
				1589	}
				1590	EXPORT_SYMBOL_GPL(add_memory_driver_managed);
				1591
Anshuman Khandual	bca3fea	2021-02-25 17:17:33 -0800	[diff] [blame]	1592	/*
				1593	* Platforms should define arch_get_mappable_range() that provides
				1594	* maximum possible addressable physical memory range for which the
				1595	* linear mapping could be created. The platform returned address
				1596	* range must adhere to these following semantics.
				1597	*
				1598	* - range.start <= range.end
				1599	* - Range includes both end points [range.start..range.end]
				1600	*
				1601	* There is also a fallback definition provided here, allowing the
				1602	* entire possible physical address range in case any platform does
				1603	* not define arch_get_mappable_range().
				1604	*/
				1605	struct range __weak arch_get_mappable_range(void)
				1606	{
				1607	struct range mhp_range = {
				1608	.start = 0UL,
				1609	.end = -1ULL,
				1610	};
				1611	return mhp_range;
				1612	}
				1613
				1614	struct range mhp_get_pluggable_range(bool need_mapping)
				1615	{
				1616	const u64 max_phys = (1ULL << MAX_PHYSMEM_BITS) - 1;
				1617	struct range mhp_range;
				1618
				1619	if (need_mapping) {
				1620	mhp_range = arch_get_mappable_range();
				1621	if (mhp_range.start > max_phys) {
				1622	mhp_range.start = 0;
				1623	mhp_range.end = 0;
				1624	}
				1625	mhp_range.end = min_t(u64, mhp_range.end, max_phys);
				1626	} else {
				1627	mhp_range.start = 0;
				1628	mhp_range.end = max_phys;
				1629	}
				1630	return mhp_range;
				1631	}
				1632	EXPORT_SYMBOL_GPL(mhp_get_pluggable_range);
				1633
				1634	bool mhp_range_allowed(u64 start, u64 size, bool need_mapping)
				1635	{
				1636	struct range mhp_range = mhp_get_pluggable_range(need_mapping);
				1637	u64 end = start + size;
				1638
				1639	if (start < end && start >= mhp_range.start && (end - 1) <= mhp_range.end)
				1640	return true;
				1641
				1642	pr_warn("Hotplug memory [%#llx-%#llx] exceeds maximum addressable range [%#llx-%#llx]\n",
				1643	start, end, mhp_range.start, mhp_range.end);
				1644	return false;
				1645	}
				1646
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1647	#ifdef CONFIG_MEMORY_HOTREMOVE
				1648	/*
David Hildenbrand	9291799	2020-02-03 17:34:26 -0800	[diff] [blame]	1649	* Confirm all pages in a range [start, end) belong to the same zone (skipping
				1650	* memory holes). When true, return the zone.
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1651	*/
David Hildenbrand	9291799	2020-02-03 17:34:26 -0800	[diff] [blame]	1652	struct zone *test_pages_in_a_zone(unsigned long start_pfn,
				1653	unsigned long end_pfn)
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1654	{
Andrew Banman	5f0f288	2015-12-29 14:54:25 -0800	[diff] [blame]	1655	unsigned long pfn, sec_end_pfn;
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1656	struct zone *zone = NULL;
				1657	struct page *page;
Mike Rapoport	673d40c	2021-09-07 19:54:55 -0700	[diff] [blame]	1658
Toshi Kani	deb88a2	2017-02-03 13:13:20 -0800	[diff] [blame]	1659	for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn + 1);
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1660	pfn < end_pfn;
Toshi Kani	deb88a2	2017-02-03 13:13:20 -0800	[diff] [blame]	1661	pfn = sec_end_pfn, sec_end_pfn += PAGES_PER_SECTION) {
Andrew Banman	5f0f288	2015-12-29 14:54:25 -0800	[diff] [blame]	1662	/* Make sure the memory section is present first */
				1663	if (!present_section_nr(pfn_to_section_nr(pfn)))
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1664	continue;
Andrew Banman	5f0f288	2015-12-29 14:54:25 -0800	[diff] [blame]	1665	for (; pfn < sec_end_pfn && pfn < end_pfn;
				1666	pfn += MAX_ORDER_NR_PAGES) {
Mikhail Zaslonko	24feb47	2019-02-01 14:20:38 -0800	[diff] [blame]	1667	/* Check if we got outside of the zone */
Mike Rapoport	673d40c	2021-09-07 19:54:55 -0700	[diff] [blame]	1668	if (zone && !zone_spans_pfn(zone, pfn))
David Hildenbrand	9291799	2020-02-03 17:34:26 -0800	[diff] [blame]	1669	return NULL;
Mike Rapoport	673d40c	2021-09-07 19:54:55 -0700	[diff] [blame]	1670	page = pfn_to_page(pfn);
Andrew Banman	5f0f288	2015-12-29 14:54:25 -0800	[diff] [blame]	1671	if (zone && page_zone(page) != zone)
David Hildenbrand	9291799	2020-02-03 17:34:26 -0800	[diff] [blame]	1672	return NULL;
Andrew Banman	5f0f288	2015-12-29 14:54:25 -0800	[diff] [blame]	1673	zone = page_zone(page);
				1674	}
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1675	}
Toshi Kani	deb88a2	2017-02-03 13:13:20 -0800	[diff] [blame]	1676
David Hildenbrand	9291799	2020-02-03 17:34:26 -0800	[diff] [blame]	1677	return zone;
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1678	}
				1679
				1680	/*
Yisheng Xie	0efadf4	2017-02-24 14:57:39 -0800	[diff] [blame]	1681	* Scan pfn range [start,end) to find movable/migratable pages (LRU pages,
David Hildenbrand	aa21879	2020-05-07 16:01:30 +0200	[diff] [blame]	1682	* non-lru movable pages and hugepages). Will skip over most unmovable
				1683	* pages (esp., pages that can be skipped when offlining), but bail out on
				1684	* definitely unmovable pages.
				1685	*
				1686	* Returns:
				1687	* 0 in case a movable page is found and movable_pfn was updated.
				1688	* -ENOENT in case no movable page was found.
				1689	* -EBUSY in case a definitely unmovable page was found.
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1690	*/
David Hildenbrand	aa21879	2020-05-07 16:01:30 +0200	[diff] [blame]	1691	static int scan_movable_pages(unsigned long start, unsigned long end,
				1692	unsigned long *movable_pfn)
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1693	{
				1694	unsigned long pfn;
Oscar Salvador	eeb0efd	2019-02-01 14:20:47 -0800	[diff] [blame]	1695
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1696	for (pfn = start; pfn < end; pfn++) {
Oscar Salvador	eeb0efd	2019-02-01 14:20:47 -0800	[diff] [blame]	1697	struct page page, head;
				1698	unsigned long skip;
				1699
				1700	if (!pfn_valid(pfn))
				1701	continue;
				1702	page = pfn_to_page(pfn);
				1703	if (PageLRU(page))
David Hildenbrand	aa21879	2020-05-07 16:01:30 +0200	[diff] [blame]	1704	goto found;
Oscar Salvador	eeb0efd	2019-02-01 14:20:47 -0800	[diff] [blame]	1705	if (__PageMovable(page))
David Hildenbrand	aa21879	2020-05-07 16:01:30 +0200	[diff] [blame]	1706	goto found;
				1707
				1708	/*
				1709	* PageOffline() pages that are not marked __PageMovable() and
				1710	* have a reference count > 0 (after MEM_GOING_OFFLINE) are
				1711	* definitely unmovable. If their reference count would be 0,
				1712	* they could at least be skipped when offlining memory.
				1713	*/
				1714	if (PageOffline(page) && page_count(page))
				1715	return -EBUSY;
Oscar Salvador	eeb0efd	2019-02-01 14:20:47 -0800	[diff] [blame]	1716
				1717	if (!PageHuge(page))
				1718	continue;
				1719	head = compound_head(page);
Mike Kravetz	8f251a3	2021-02-24 12:08:56 -0800	[diff] [blame]	1720	/*
				1721	* This test is racy as we hold no reference or lock. The
				1722	* hugetlb page could have been free'ed and head is no longer
				1723	* a hugetlb page before the following check. In such unlikely
				1724	* cases false positives and negatives are possible. Calling
				1725	* code must deal with these scenarios.
				1726	*/
				1727	if (HPageMigratable(head))
David Hildenbrand	aa21879	2020-05-07 16:01:30 +0200	[diff] [blame]	1728	goto found;
Zi Yan	a466808	2023-09-13 16:12:46 -0400	[diff] [blame]	1729	skip = compound_nr(head) - (pfn - page_to_pfn(head));
Oscar Salvador	eeb0efd	2019-02-01 14:20:47 -0800	[diff] [blame]	1730	pfn += skip - 1;
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1731	}
David Hildenbrand	aa21879	2020-05-07 16:01:30 +0200	[diff] [blame]	1732	return -ENOENT;
				1733	found:
				1734	*movable_pfn = pfn;
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1735	return 0;
				1736	}
				1737
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1738	static int
				1739	do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
				1740	{
				1741	unsigned long pfn;
Matthew Wilcox (Oracle)	6c35784	2020-08-14 17:30:37 -0700	[diff] [blame]	1742	struct page page, head;
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1743	int ret = 0;
				1744	LIST_HEAD(source);
Liam Mark	786dee8	2021-06-30 18:52:43 -0700	[diff] [blame]	1745	static DEFINE_RATELIMIT_STATE(migrate_rs, DEFAULT_RATELIMIT_INTERVAL,
				1746	DEFAULT_RATELIMIT_BURST);
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1747
Michal Hocko	a85009c	2018-12-28 00:38:29 -0800	[diff] [blame]	1748	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1749	if (!pfn_valid(pfn))
				1750	continue;
				1751	page = pfn_to_page(pfn);
Matthew Wilcox (Oracle)	6c35784	2020-08-14 17:30:37 -0700	[diff] [blame]	1752	head = compound_head(page);
Naoya Horiguchi	c8721bb	2013-09-11 14:22:09 -0700	[diff] [blame]	1753
				1754	if (PageHuge(page)) {
Matthew Wilcox (Oracle)	d8c6546	2019-09-23 15:34:30 -0700	[diff] [blame]	1755	pfn = page_to_pfn(head) + compound_nr(head) - 1;
Miaohe Lin	072e741	2022-05-30 19:30:15 +0800	[diff] [blame]	1756	isolate_hugetlb(head, &source);
Naoya Horiguchi	c8721bb	2013-09-11 14:22:09 -0700	[diff] [blame]	1757	continue;
Michal Hocko	94723aa	2018-04-10 16:30:07 -0700	[diff] [blame]	1758	} else if (PageTransHuge(page))
Matthew Wilcox (Oracle)	6c35784	2020-08-14 17:30:37 -0700	[diff] [blame]	1759	pfn = page_to_pfn(head) + thp_nr_pages(page) - 1;
Naoya Horiguchi	c8721bb	2013-09-11 14:22:09 -0700	[diff] [blame]	1760
Michal Hocko	b15c872	2018-12-28 00:38:01 -0800	[diff] [blame]	1761	/*
				1762	* HWPoison pages have elevated reference counts so the migration would
				1763	* fail on them. It also doesn't make any sense to migrate them in the
				1764	* first place. Still try to unmap such a page in case it is still mapped
				1765	* (e.g. current hwpoison implementation doesn't unmap KSM pages but keep
				1766	* the unmap as the catch all safety net).
				1767	*/
				1768	if (PageHWPoison(page)) {
				1769	if (WARN_ON(PageLRU(page)))
				1770	isolate_lru_page(page);
				1771	if (page_mapped(page))
Shakeel Butt	013339d	2020-12-14 19:06:39 -0800	[diff] [blame]	1772	try_to_unmap(page, TTU_IGNORE_MLOCK);
Michal Hocko	b15c872	2018-12-28 00:38:01 -0800	[diff] [blame]	1773	continue;
				1774	}
				1775
Konstantin Khlebnikov	700c2a4	2011-05-24 17:12:19 -0700	[diff] [blame]	1776	if (!get_page_unless_zero(page))
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1777	continue;
				1778	/*
Yisheng Xie	0efadf4	2017-02-24 14:57:39 -0800	[diff] [blame]	1779	* We can skip free pages. And we can deal with pages on
				1780	* LRU and non-lru movable pages.
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1781	*/
Yisheng Xie	0efadf4	2017-02-24 14:57:39 -0800	[diff] [blame]	1782	if (PageLRU(page))
				1783	ret = isolate_lru_page(page);
				1784	else
				1785	ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1786	if (!ret) { /* Success */
Nick Piggin	62695a8	2008-10-18 20:26:09 -0700	[diff] [blame]	1787	list_add_tail(&page->lru, &source);
Yisheng Xie	0efadf4	2017-02-24 14:57:39 -0800	[diff] [blame]	1788	if (!__PageMovable(page))
				1789	inc_node_page_state(page, NR_ISOLATED_ANON +
Huang Ying	9de4f22	2020-04-06 20:04:41 -0700	[diff] [blame]	1790	page_is_file_lru(page));
KOSAKI Motohiro	6d9c285	2009-12-14 17:58:11 -0800	[diff] [blame]	1791
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1792	} else {
Liam Mark	786dee8	2021-06-30 18:52:43 -0700	[diff] [blame]	1793	if (__ratelimit(&migrate_rs)) {
				1794	pr_warn("failed to isolate pfn %lx\n", pfn);
				1795	dump_page(page, "isolation failed");
				1796	}
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1797	}
Oscar Salvador	1723058	2019-02-01 14:19:57 -0800	[diff] [blame]	1798	put_page(page);
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1799	}
Bob Liu	f3ab263	2010-10-26 14:22:10 -0700	[diff] [blame]	1800	if (!list_empty(&source)) {
Joonsoo Kim	203e6e5	2020-10-17 16:14:00 -0700	[diff] [blame]	1801	nodemask_t nmask = node_states[N_MEMORY];
				1802	struct migration_target_control mtc = {
				1803	.nmask = &nmask,
				1804	.gfp_mask = GFP_USER \| __GFP_MOVABLE \| __GFP_RETRY_MAYFAIL,
				1805	};
				1806
				1807	/*
				1808	* We have checked that migration range is on a single zone so
				1809	* we can use the nid of the first page to all the others.
				1810	*/
				1811	mtc.nid = page_to_nid(list_first_entry(&source, struct page, lru));
				1812
				1813	/*
				1814	* try to allocate from a different node but reuse this node
				1815	* if there are no other online nodes to be used (e.g. we are
				1816	* offlining a part of the only existing node)
				1817	*/
				1818	node_clear(mtc.nid, nmask);
				1819	if (nodes_empty(nmask))
				1820	node_set(mtc.nid, nmask);
				1821	ret = migrate_pages(&source, alloc_migration_target, NULL,
Yang Shi	5ac9588	2021-09-02 14:59:13 -0700	[diff] [blame]	1822	(unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG, NULL);
Michal Hocko	2932c8b	2018-12-28 00:33:53 -0800	[diff] [blame]	1823	if (ret) {
				1824	list_for_each_entry(page, &source, lru) {
Liam Mark	786dee8	2021-06-30 18:52:43 -0700	[diff] [blame]	1825	if (__ratelimit(&migrate_rs)) {
				1826	pr_warn("migrating pfn %lx failed ret:%d\n",
				1827	page_to_pfn(page), ret);
				1828	dump_page(page, "migration failure");
				1829	}
Michal Hocko	2932c8b	2018-12-28 00:33:53 -0800	[diff] [blame]	1830	}
Naoya Horiguchi	c8721bb	2013-09-11 14:22:09 -0700	[diff] [blame]	1831	putback_movable_pages(&source);
Michal Hocko	2932c8b	2018-12-28 00:33:53 -0800	[diff] [blame]	1832	}
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1833	}
Oscar Salvador	1723058	2019-02-01 14:19:57 -0800	[diff] [blame]	1834
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1835	return ret;
				1836	}
				1837
Tang Chen	c532092	2013-11-12 15:08:10 -0800	[diff] [blame]	1838	static int __init cmdline_parse_movable_node(char *p)
				1839	{
Tang Chen	55ac590	2014-01-21 15:49:35 -0800	[diff] [blame]	1840	movable_node_enabled = true;
Tang Chen	c532092	2013-11-12 15:08:10 -0800	[diff] [blame]	1841	return 0;
				1842	}
				1843	early_param("movable_node", cmdline_parse_movable_node);
				1844
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1845	/* check which state of node_states will be changed when offline memory */
				1846	static void node_states_check_changes_offline(unsigned long nr_pages,
				1847	struct zone zone, struct memory_notify arg)
				1848	{
				1849	struct pglist_data *pgdat = zone->zone_pgdat;
				1850	unsigned long present_pages = 0;
Oscar Salvador	86b27be	2018-10-26 15:07:38 -0700	[diff] [blame]	1851	enum zone_type zt;
				1852
Anshuman Khandual	98fa15f	2019-03-05 15:42:58 -0800	[diff] [blame]	1853	arg->status_change_nid = NUMA_NO_NODE;
				1854	arg->status_change_nid_normal = NUMA_NO_NODE;
				1855	arg->status_change_nid_high = NUMA_NO_NODE;
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1856
				1857	/*
Oscar Salvador	86b27be	2018-10-26 15:07:38 -0700	[diff] [blame]	1858	* Check whether node_states[N_NORMAL_MEMORY] will be changed.
				1859	* If the memory to be offline is within the range
				1860	* [0..ZONE_NORMAL], and it is the last present memory there,
				1861	* the zones in that range will become empty after the offlining,
				1862	* thus we can determine that we need to clear the node from
				1863	* node_states[N_NORMAL_MEMORY].
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1864	*/
Oscar Salvador	86b27be	2018-10-26 15:07:38 -0700	[diff] [blame]	1865	for (zt = 0; zt <= ZONE_NORMAL; zt++)
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1866	present_pages += pgdat->node_zones[zt].present_pages;
Oscar Salvador	86b27be	2018-10-26 15:07:38 -0700	[diff] [blame]	1867	if (zone_idx(zone) <= ZONE_NORMAL && nr_pages >= present_pages)
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1868	arg->status_change_nid_normal = zone_to_nid(zone);
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1869
Lai Jiangshan	6715ddf	2012-12-12 13:51:49 -0800	[diff] [blame]	1870	#ifdef CONFIG_HIGHMEM
				1871	/*
Oscar Salvador	86b27be	2018-10-26 15:07:38 -0700	[diff] [blame]	1872	* node_states[N_HIGH_MEMORY] contains nodes which
				1873	* have normal memory or high memory.
				1874	* Here we add the present_pages belonging to ZONE_HIGHMEM.
				1875	* If the zone is within the range of [0..ZONE_HIGHMEM), and
				1876	* we determine that the zones in that range become empty,
				1877	* we need to clear the node for N_HIGH_MEMORY.
Lai Jiangshan	6715ddf	2012-12-12 13:51:49 -0800	[diff] [blame]	1878	*/
Oscar Salvador	86b27be	2018-10-26 15:07:38 -0700	[diff] [blame]	1879	present_pages += pgdat->node_zones[ZONE_HIGHMEM].present_pages;
				1880	if (zone_idx(zone) <= ZONE_HIGHMEM && nr_pages >= present_pages)
Lai Jiangshan	6715ddf	2012-12-12 13:51:49 -0800	[diff] [blame]	1881	arg->status_change_nid_high = zone_to_nid(zone);
Lai Jiangshan	6715ddf	2012-12-12 13:51:49 -0800	[diff] [blame]	1882	#endif
				1883
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1884	/*
Oscar Salvador	86b27be	2018-10-26 15:07:38 -0700	[diff] [blame]	1885	* We have accounted the pages from [0..ZONE_NORMAL), and
				1886	* in case of CONFIG_HIGHMEM the pages from ZONE_HIGHMEM
				1887	* as well.
				1888	* Here we count the possible pages from ZONE_MOVABLE.
				1889	* If after having accounted all the pages, we see that the nr_pages
				1890	* to be offlined is over or equal to the accounted pages,
				1891	* we know that the node will become empty, and so, we can clear
				1892	* it for N_MEMORY as well.
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1893	*/
Oscar Salvador	86b27be	2018-10-26 15:07:38 -0700	[diff] [blame]	1894	present_pages += pgdat->node_zones[ZONE_MOVABLE].present_pages;
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1895
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1896	if (nr_pages >= present_pages)
				1897	arg->status_change_nid = zone_to_nid(zone);
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1898	}
				1899
				1900	static void node_states_clear_node(int node, struct memory_notify *arg)
				1901	{
				1902	if (arg->status_change_nid_normal >= 0)
				1903	node_clear_state(node, N_NORMAL_MEMORY);
				1904
Oscar Salvador	cf01f6f5	2018-10-26 15:07:28 -0700	[diff] [blame]	1905	if (arg->status_change_nid_high >= 0)
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1906	node_clear_state(node, N_HIGH_MEMORY);
Lai Jiangshan	6715ddf	2012-12-12 13:51:49 -0800	[diff] [blame]	1907
Oscar Salvador	cf01f6f5	2018-10-26 15:07:28 -0700	[diff] [blame]	1908	if (arg->status_change_nid >= 0)
Lai Jiangshan	6715ddf	2012-12-12 13:51:49 -0800	[diff] [blame]	1909	node_clear_state(node, N_MEMORY);
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1910	}
				1911
/*
 * Range-walk callback that accumulates page counts.
 *
 * @start_pfn: first pfn of the reported range (unused)
 * @nr_pages:  number of pages in the reported range
 * @data:      points to the unsigned long running total to add to
 *
 * Always returns 0.
 */
static int count_system_ram_pages_cb(unsigned long start_pfn,
				     unsigned long nr_pages, void *data)
{
	unsigned long *nr_system_ram_pages = data;

	*nr_system_ram_pages += nr_pages;
	return 0;
}
				1920
David Hildenbrand	836809e	2021-09-07 19:55:30 -0700	[diff] [blame]	1921	int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
				1922	struct memory_group *group)
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1923	{
David Hildenbrand	73a11c9	2020-10-15 20:07:46 -0700	[diff] [blame]	1924	const unsigned long end_pfn = start_pfn + nr_pages;
David Hildenbrand	0a1a9a0	2020-10-15 20:07:54 -0700	[diff] [blame]	1925	unsigned long pfn, system_ram_pages = 0;
Cody P Schafer	d702909	2013-07-03 15:02:11 -0700	[diff] [blame]	1926	unsigned long flags;
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1927	struct zone *zone;
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	1928	struct memory_notify arg;
David Hildenbrand	ea15153	2020-10-15 20:08:03 -0700	[diff] [blame]	1929	int ret, node;
Michal Hocko	7960509	2018-12-28 00:33:49 -0800	[diff] [blame]	1930	char *reason;
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1931
Oscar Salvador	dd8e2f2	2021-05-04 18:39:36 -0700	[diff] [blame]	1932	/*
				1933	* {on,off}lining is constrained to full memory sections (or more
Zhen Lei	041711c	2021-06-30 18:53:17 -0700	[diff] [blame]	1934	* precisely to memory blocks from the user space POV).
Oscar Salvador	dd8e2f2	2021-05-04 18:39:36 -0700	[diff] [blame]	1935	* memmap_on_memory is an exception because it reserves initial part
				1936	* of the physical memory space for vmemmaps. That space is pageblock
				1937	* aligned.
				1938	*/
David Hildenbrand	4986fac	2020-10-15 20:07:50 -0700	[diff] [blame]	1939	if (WARN_ON_ONCE(!nr_pages \|\|
Oscar Salvador	dd8e2f2	2021-05-04 18:39:36 -0700	[diff] [blame]	1940	!IS_ALIGNED(start_pfn, pageblock_nr_pages) \|\|
				1941	!IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION)))
David Hildenbrand	4986fac	2020-10-15 20:07:50 -0700	[diff] [blame]	1942	return -EINVAL;
				1943
David Hildenbrand	381eab4	2018-10-30 15:10:29 -0700	[diff] [blame]	1944	mem_hotplug_begin();
				1945
David Hildenbrand	c5e79ef	2019-11-30 17:54:17 -0800	[diff] [blame]	1946	/*
				1947	* Don't allow to offline memory blocks that contain holes.
				1948	* Consequently, memory blocks with holes can never get onlined
				1949	* via the hotplug path - online_pages() - as hotplugged memory has
				1950	* no holes. This way, we e.g., don't have to worry about marking
				1951	* memory holes PG_reserved, don't need pfn_valid() checks, and can
				1952	* avoid using walk_system_ram_range() later.
				1953	*/
David Hildenbrand	73a11c9	2020-10-15 20:07:46 -0700	[diff] [blame]	1954	walk_system_ram_range(start_pfn, nr_pages, &system_ram_pages,
David Hildenbrand	c5e79ef	2019-11-30 17:54:17 -0800	[diff] [blame]	1955	count_system_ram_pages_cb);
David Hildenbrand	73a11c9	2020-10-15 20:07:46 -0700	[diff] [blame]	1956	if (system_ram_pages != nr_pages) {
David Hildenbrand	c5e79ef	2019-11-30 17:54:17 -0800	[diff] [blame]	1957	ret = -EINVAL;
				1958	reason = "memory holes";
				1959	goto failed_removal;
				1960	}
				1961
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1962	/* This makes hotplug much easier...and readable.
				1963	we assume this for now. .*/
David Hildenbrand	9291799	2020-02-03 17:34:26 -0800	[diff] [blame]	1964	zone = test_pages_in_a_zone(start_pfn, end_pfn);
				1965	if (!zone) {
Michal Hocko	7960509	2018-12-28 00:33:49 -0800	[diff] [blame]	1966	ret = -EINVAL;
				1967	reason = "multizone range";
				1968	goto failed_removal;
David Hildenbrand	381eab4	2018-10-30 15:10:29 -0700	[diff] [blame]	1969	}
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	1970	node = zone_to_nid(zone);
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	1971
Vlastimil Babka	ec6e8c7e	2020-12-14 19:10:59 -0800	[diff] [blame]	1972	/*
				1973	* Disable pcplists so that page isolation cannot race with freeing
				1974	* in a way that pages from isolated pageblock are left on pcplists.
				1975	*/
				1976	zone_pcp_disable(zone);
Minchan Kim	d479960e	2021-05-04 18:36:54 -0700	[diff] [blame]	1977	lru_cache_disable();
Vlastimil Babka	ec6e8c7e	2020-12-14 19:10:59 -0800	[diff] [blame]	1978
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	1979	/* set above range as isolated */
Wen Congyang	b023f46	2012-12-11 16:00:45 -0800	[diff] [blame]	1980	ret = start_isolate_page_range(start_pfn, end_pfn,
Michal Hocko	d381c54	2018-12-28 00:33:56 -0800	[diff] [blame]	1981	MIGRATE_MOVABLE,
Minchan Kim	60d2dad	2021-06-29 12:08:44 -0700	[diff] [blame]	1982	MEMORY_OFFLINE \| REPORT_FAILURE, NULL);
David Hildenbrand	3fa0c7c	2020-10-15 20:08:07 -0700	[diff] [blame]	1983	if (ret) {
Michal Hocko	7960509	2018-12-28 00:33:49 -0800	[diff] [blame]	1984	reason = "failure to isolate range";
Vlastimil Babka	ec6e8c7e	2020-12-14 19:10:59 -0800	[diff] [blame]	1985	goto failed_removal_pcplists_disabled;
David Hildenbrand	381eab4	2018-10-30 15:10:29 -0700	[diff] [blame]	1986	}
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	1987
				1988	arg.start_pfn = start_pfn;
				1989	arg.nr_pages = nr_pages;
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	1990	node_states_check_changes_offline(nr_pages, zone, &arg);
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	1991
				1992	ret = memory_notify(MEM_GOING_OFFLINE, &arg);
				1993	ret = notifier_to_errno(ret);
Michal Hocko	7960509	2018-12-28 00:33:49 -0800	[diff] [blame]	1994	if (ret) {
				1995	reason = "notifier failure";
				1996	goto failed_removal_isolated;
				1997	}
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	1998
Michal Hocko	bb8965b	2018-12-28 00:38:32 -0800	[diff] [blame]	1999	do {
David Hildenbrand	aa21879	2020-05-07 16:01:30 +0200	[diff] [blame]	2000	pfn = start_pfn;
				2001	do {
Michal Hocko	bb8965b	2018-12-28 00:38:32 -0800	[diff] [blame]	2002	if (signal_pending(current)) {
				2003	ret = -EINTR;
				2004	reason = "signal backoff";
				2005	goto failed_removal_isolated;
				2006	}
Michal Hocko	72b39cf	2017-11-15 17:33:34 -0800	[diff] [blame]	2007
Michal Hocko	bb8965b	2018-12-28 00:38:32 -0800	[diff] [blame]	2008	cond_resched();
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	2009
David Hildenbrand	aa21879	2020-05-07 16:01:30 +0200	[diff] [blame]	2010	ret = scan_movable_pages(pfn, end_pfn, &pfn);
				2011	if (!ret) {
Michal Hocko	bb8965b	2018-12-28 00:38:32 -0800	[diff] [blame]	2012	/*
				2013	* TODO: fatal migration failures should bail
				2014	* out
				2015	*/
				2016	do_migrate_range(pfn, end_pfn);
				2017	}
David Hildenbrand	aa21879	2020-05-07 16:01:30 +0200	[diff] [blame]	2018	} while (!ret);
				2019
				2020	if (ret != -ENOENT) {
				2021	reason = "unmovable page";
				2022	goto failed_removal_isolated;
Michal Hocko	bb8965b	2018-12-28 00:38:32 -0800	[diff] [blame]	2023	}
Michal Hocko	72b39cf	2017-11-15 17:33:34 -0800	[diff] [blame]	2024
Michal Hocko	bb8965b	2018-12-28 00:38:32 -0800	[diff] [blame]	2025	/*
				2026	* Dissolve free hugepages in the memory block before doing
				2027	* offlining actually in order to make hugetlbfs's object
				2028	* counting consistent.
				2029	*/
				2030	ret = dissolve_free_huge_pages(start_pfn, end_pfn);
				2031	if (ret) {
				2032	reason = "failure to dissolve huge pages";
				2033	goto failed_removal_isolated;
				2034	}
David Hildenbrand	0a1a9a0	2020-10-15 20:07:54 -0700	[diff] [blame]	2035
Minchan Kim	60d2dad	2021-06-29 12:08:44 -0700	[diff] [blame]	2036	ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE, NULL);
Vlastimil Babka	ec6e8c7e	2020-12-14 19:10:59 -0800	[diff] [blame]	2037
Michal Hocko	5557c76	2019-05-13 17:21:24 -0700	[diff] [blame]	2038	} while (ret);
Michal Hocko	bb8965b	2018-12-28 00:38:32 -0800	[diff] [blame]	2039
David Hildenbrand	0a1a9a0	2020-10-15 20:07:54 -0700	[diff] [blame]	2040	/* Mark all sections offline and remove free pages from the buddy. */
				2041	__offline_isolated_pages(start_pfn, end_pfn);
Laurent Dufour	7c33023	2020-12-15 20:42:26 -0800	[diff] [blame]	2042	pr_debug("Offlined Pages %ld\n", nr_pages);
David Hildenbrand	0a1a9a0	2020-10-15 20:07:54 -0700	[diff] [blame]	2043
Qian Cai	9b7ea46	2019-03-28 20:43:34 -0700	[diff] [blame]	2044	/*
David Hildenbrand	b30c592	2020-10-15 20:08:23 -0700	[diff] [blame]	2045	* The memory sections are marked offline, and the pageblock flags
				2046	* effectively stale; nobody should be touching them. Fixup the number
				2047	* of isolated pageblocks, memory onlining will properly revert this.
Qian Cai	9b7ea46	2019-03-28 20:43:34 -0700	[diff] [blame]	2048	*/
				2049	spin_lock_irqsave(&zone->lock, flags);
David Hildenbrand	ea15153	2020-10-15 20:08:03 -0700	[diff] [blame]	2050	zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages;
Qian Cai	9b7ea46	2019-03-28 20:43:34 -0700	[diff] [blame]	2051	spin_unlock_irqrestore(&zone->lock, flags);
				2052
Minchan Kim	d479960e	2021-05-04 18:36:54 -0700	[diff] [blame]	2053	lru_cache_enable();
Vlastimil Babka	ec6e8c7e	2020-12-14 19:10:59 -0800	[diff] [blame]	2054	zone_pcp_enable(zone);
				2055
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	2056	/* removal success */
David Hildenbrand	0a1a9a0	2020-10-15 20:07:54 -0700	[diff] [blame]	2057	adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);
David Hildenbrand	836809e	2021-09-07 19:55:30 -0700	[diff] [blame]	2058	adjust_present_page_count(pfn_to_page(start_pfn), group, -nr_pages);
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	2059
Mel Gorman	b92ca18	2021-06-28 19:42:12 -0700	[diff] [blame]	2060	/* reinitialise watermarks and update pcp limits */
KOSAKI Motohiro	1b79acc	2011-05-24 17:11:32 -0700	[diff] [blame]	2061	init_per_zone_wmark_min();
				2062
Xishi Qiu	1e8537b	2012-10-08 16:31:51 -0700	[diff] [blame]	2063	if (!populated_zone(zone)) {
Jiang Liu	340175b	2012-07-31 16:43:32 -0700	[diff] [blame]	2064	zone_pcp_reset(zone);
Michal Hocko	72675e1	2017-09-06 16:20:24 -0700	[diff] [blame]	2065	build_all_zonelists(NULL);
Mel Gorman	b92ca18	2021-06-28 19:42:12 -0700	[diff] [blame]	2066	}
Jiang Liu	340175b	2012-07-31 16:43:32 -0700	[diff] [blame]	2067
Lai Jiangshan	d971367	2012-12-11 16:01:03 -0800	[diff] [blame]	2068	node_states_clear_node(node, &arg);
Vlastimil Babka	698b1b3	2016-03-17 14:18:08 -0700	[diff] [blame]	2069	if (arg.status_change_nid >= 0) {
David Rientjes	8fe23e0	2009-12-14 17:58:33 -0800	[diff] [blame]	2070	kswapd_stop(node);
Vlastimil Babka	698b1b3	2016-03-17 14:18:08 -0700	[diff] [blame]	2071	kcompactd_stop(node);
				2072	}
Minchan Kim	bce7394	2009-06-16 15:32:50 -0700	[diff] [blame]	2073
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	2074	writeback_set_ratelimit();
Yasunori Goto	7b78d33	2007-10-21 16:41:36 -0700	[diff] [blame]	2075
				2076	memory_notify(MEM_OFFLINE, &arg);
David Hildenbrand	feee6b2	2020-01-04 12:59:33 -0800	[diff] [blame]	2077	remove_pfn_range_from_zone(zone, start_pfn, nr_pages);
David Hildenbrand	381eab4	2018-10-30 15:10:29 -0700	[diff] [blame]	2078	mem_hotplug_done();
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	2079	return 0;
				2080
Michal Hocko	7960509	2018-12-28 00:33:49 -0800	[diff] [blame]	2081	failed_removal_isolated:
				2082	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
Qian Cai	c4efe48	2019-03-28 20:44:16 -0700	[diff] [blame]	2083	memory_notify(MEM_CANCEL_OFFLINE, &arg);
Vlastimil Babka	ec6e8c7e	2020-12-14 19:10:59 -0800	[diff] [blame]	2084	failed_removal_pcplists_disabled:
Miaohe Lin	946746d1	2021-08-25 12:17:55 -0700	[diff] [blame]	2085	lru_cache_enable();
Vlastimil Babka	ec6e8c7e	2020-12-14 19:10:59 -0800	[diff] [blame]	2086	zone_pcp_enable(zone);
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	2087	failed_removal:
Michal Hocko	7960509	2018-12-28 00:33:49 -0800	[diff] [blame]	2088	pr_debug("memory offlining [mem %#010llx-%#010llx] failed due to %s\n",
Chen Yucong	e33e33b	2016-03-17 14:19:35 -0700	[diff] [blame]	2089	(unsigned long long) start_pfn << PAGE_SHIFT,
Michal Hocko	7960509	2018-12-28 00:33:49 -0800	[diff] [blame]	2090	((unsigned long long) end_pfn << PAGE_SHIFT) - 1,
				2091	reason);
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	2092	/* pushback to free area */
David Hildenbrand	381eab4	2018-10-30 15:10:29 -0700	[diff] [blame]	2093	mem_hotplug_done();
KAMEZAWA Hiroyuki	0c0e619	2007-10-16 01:26:12 -0700	[diff] [blame]	2094	return ret;
				2095	}
Badari Pulavarty	7108878	2008-10-18 20:25:58 -0700	[diff] [blame]	2096
Xishi Qiu	d6de9d5	2013-11-12 15:07:20 -0800	[diff] [blame]	2097	static int check_memblock_offlined_cb(struct memory_block mem, void arg)
Wen Congyang	bbc76be	2013-02-22 16:32:54 -0800	[diff] [blame]	2098	{
				2099	int ret = !is_memblock_offlined(mem);
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2100	int *nid = arg;
Wen Congyang	bbc76be	2013-02-22 16:32:54 -0800	[diff] [blame]	2101
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2102	*nid = mem->nid;
Randy Dunlap	349daa0	2013-04-29 15:08:49 -0700	[diff] [blame]	2103	if (unlikely(ret)) {
				2104	phys_addr_t beginpa, endpa;
				2105
				2106	beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
David Hildenbrand	b6c88d3	2019-09-23 15:35:49 -0700	[diff] [blame]	2107	endpa = beginpa + memory_block_size_bytes() - 1;
Joe Perches	756a025	2016-03-17 14:19:47 -0700	[diff] [blame]	2108	pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n",
Randy Dunlap	349daa0	2013-04-29 15:08:49 -0700	[diff] [blame]	2109	&beginpa, &endpa);
Wen Congyang	bbc76be	2013-02-22 16:32:54 -0800	[diff] [blame]	2110
Pavel Tatashin	eca499a	2019-07-16 16:30:31 -0700	[diff] [blame]	2111	return -EBUSY;
				2112	}
				2113	return 0;
Wen Congyang	bbc76be	2013-02-22 16:32:54 -0800	[diff] [blame]	2114	}
				2115
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	2116	static int get_nr_vmemmap_pages_cb(struct memory_block mem, void arg)
				2117	{
				2118	/*
				2119	* If not set, continue with the next block.
				2120	*/
				2121	return mem->nr_vmemmap_pages;
				2122	}
				2123
Toshi Kani	0f1cfe9	2013-09-11 14:21:50 -0700	[diff] [blame]	2124	static int check_cpu_on_node(pg_data_t *pgdat)
Tang Chen	60a5a19	2013-02-22 16:33:14 -0800	[diff] [blame]	2125	{
Tang Chen	60a5a19	2013-02-22 16:33:14 -0800	[diff] [blame]	2126	int cpu;
				2127
				2128	for_each_present_cpu(cpu) {
				2129	if (cpu_to_node(cpu) == pgdat->node_id)
				2130	/*
				2131	* the cpu on this node isn't removed, and we can't
				2132	* offline this node.
				2133	*/
				2134	return -EBUSY;
				2135	}
				2136
				2137	return 0;
				2138	}
				2139
David Hildenbrand	2c91f8f	2019-11-15 17:34:57 -0800	[diff] [blame]	2140	static int check_no_memblock_for_node_cb(struct memory_block mem, void arg)
				2141	{
				2142	int nid = (int )arg;
				2143
				2144	/*
				2145	* If a memory block belongs to multiple nodes, the stored nid is not
				2146	* reliable. However, such blocks are always online (e.g., cannot get
				2147	* offlined) and, therefore, are still spanned by the node.
				2148	*/
				2149	return mem->nid == nid ? -EEXIST : 0;
				2150	}
				2151
Toshi Kani	0f1cfe9	2013-09-11 14:21:50 -0700	[diff] [blame]	2152	/**
				2153	* try_offline_node
Mike Rapoport	e8b098f	2018-04-05 16:24:57 -0700	[diff] [blame]	2154	* @nid: the node ID
Toshi Kani	0f1cfe9	2013-09-11 14:21:50 -0700	[diff] [blame]	2155	*
				2156	* Offline a node if all memory sections and cpus of the node are removed.
				2157	*
				2158	* NOTE: The caller must call lock_device_hotplug() to serialize hotplug
				2159	* and online/offline operations before this call.
				2160	*/
Wen Congyang	90b30cd	2013-02-22 16:33:27 -0800	[diff] [blame]	2161	void try_offline_node(int nid)
Tang Chen	60a5a19	2013-02-22 16:33:14 -0800	[diff] [blame]	2162	{
Wen Congyang	d822b86	2013-02-22 16:33:16 -0800	[diff] [blame]	2163	pg_data_t *pgdat = NODE_DATA(nid);
David Hildenbrand	2c91f8f	2019-11-15 17:34:57 -0800	[diff] [blame]	2164	int rc;
Tang Chen	60a5a19	2013-02-22 16:33:14 -0800	[diff] [blame]	2165
David Hildenbrand	2c91f8f	2019-11-15 17:34:57 -0800	[diff] [blame]	2166	/*
				2167	* If the node still spans pages (especially ZONE_DEVICE), don't
				2168	* offline it. A node spans memory after move_pfn_range_to_zone(),
				2169	* e.g., after the memory block was onlined.
				2170	*/
				2171	if (pgdat->node_spanned_pages)
Tang Chen	60a5a19	2013-02-22 16:33:14 -0800	[diff] [blame]	2172	return;
David Hildenbrand	2c91f8f	2019-11-15 17:34:57 -0800	[diff] [blame]	2173
				2174	/*
				2175	* Especially offline memory blocks might not be spanned by the
				2176	* node. They will get spanned by the node once they get onlined.
				2177	* However, they link to the node in sysfs and can get onlined later.
				2178	*/
				2179	rc = for_each_memory_block(&nid, check_no_memblock_for_node_cb);
				2180	if (rc)
				2181	return;
Tang Chen	60a5a19	2013-02-22 16:33:14 -0800	[diff] [blame]	2182
Michal Hocko	46a3679	2018-12-28 00:34:13 -0800	[diff] [blame]	2183	if (check_cpu_on_node(pgdat))
Tang Chen	60a5a19	2013-02-22 16:33:14 -0800	[diff] [blame]	2184	return;
				2185
				2186	/*
				2187	* all memory/cpu of this node are removed, we can offline this
				2188	* node now.
				2189	*/
				2190	node_set_offline(nid);
				2191	unregister_one_node(nid);
				2192	}
Wen Congyang	90b30cd	2013-02-22 16:33:27 -0800	[diff] [blame]	2193	EXPORT_SYMBOL(try_offline_node);
Tang Chen	60a5a19	2013-02-22 16:33:14 -0800	[diff] [blame]	2194
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2195	static int __ref try_remove_memory(u64 start, u64 size)
Wen Congyang	bbc76be	2013-02-22 16:32:54 -0800	[diff] [blame]	2196	{
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	2197	struct vmem_altmap mhp_altmap = {};
				2198	struct vmem_altmap *altmap = NULL;
				2199	unsigned long nr_vmemmap_pages;
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2200	int rc = 0, nid = NUMA_NO_NODE;
Wen Congyang	993c1aa	2013-02-22 16:32:50 -0800	[diff] [blame]	2201
Toshi Kani	27356f5	2013-09-11 14:21:49 -0700	[diff] [blame]	2202	BUG_ON(check_hotplug_memory_range(start, size));
				2203
Yasuaki Ishimatsu	6677e3e	2013-02-22 16:32:52 -0800	[diff] [blame]	2204	/*
Rafael J. Wysocki	242831e	2013-05-27 12:58:46 +0200	[diff] [blame]	2205	* All memory blocks must be offlined before removing memory. Check
Pavel Tatashin	eca499a	2019-07-16 16:30:31 -0700	[diff] [blame]	2206	* whether all memory blocks in question are offline and return error
Rafael J. Wysocki	242831e	2013-05-27 12:58:46 +0200	[diff] [blame]	2207	* if this is not the case.
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2208	*
				2209	* While at it, determine the nid. Note that if we'd have mixed nodes,
				2210	* we'd only try to offline the last determined one -- which is good
				2211	* enough for the cases we care about.
Yasuaki Ishimatsu	6677e3e	2013-02-22 16:32:52 -0800	[diff] [blame]	2212	*/
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2213	rc = walk_memory_blocks(start, size, &nid, check_memblock_offlined_cb);
Pavel Tatashin	eca499a	2019-07-16 16:30:31 -0700	[diff] [blame]	2214	if (rc)
Jia He	b4223a5	2020-08-11 18:32:20 -0700	[diff] [blame]	2215	return rc;
Yasuaki Ishimatsu	6677e3e	2013-02-22 16:32:52 -0800	[diff] [blame]	2216
Oscar Salvador	a08a2ae	2021-05-04 18:39:42 -0700	[diff] [blame]	2217	/*
				2218	* We only support removing memory added with MHP_MEMMAP_ON_MEMORY in
				2219	* the same granularity it was added - a single memory block.
				2220	*/
				2221	if (memmap_on_memory) {
				2222	nr_vmemmap_pages = walk_memory_blocks(start, size, NULL,
				2223	get_nr_vmemmap_pages_cb);
				2224	if (nr_vmemmap_pages) {
				2225	if (size != memory_block_size_bytes()) {
				2226	pr_warn("Refuse to remove %#llx - %#llx,"
				2227	"wrong granularity\n",
				2228	start, start + size);
				2229	return -EINVAL;
				2230	}
				2231
				2232	/*
				2233	* Let remove_pmd_table->free_hugepage_table do the
				2234	* right thing if we used vmem_altmap when hot-adding
				2235	* the range.
				2236	*/
				2237	mhp_altmap.alloc = nr_vmemmap_pages;
				2238	altmap = &mhp_altmap;
				2239	}
				2240	}
				2241
Yasuaki Ishimatsu	46c66c4	2013-02-22 16:32:56 -0800	[diff] [blame]	2242	/* remove memmap entry */
				2243	firmware_map_remove(start, start + size, "System RAM");
				2244
Dan Williams	f1037ec	2020-01-30 22:11:17 -0800	[diff] [blame]	2245	/*
				2246	* Memory block device removal under the device_hotplug_lock is
				2247	* a barrier against racing online attempts.
				2248	*/
David Hildenbrand	4c4b7f9	2019-07-18 15:57:06 -0700	[diff] [blame]	2249	remove_memory_block_devices(start, size);
				2250
Dan Williams	f1037ec	2020-01-30 22:11:17 -0800	[diff] [blame]	2251	mem_hotplug_begin();
				2252
David Hildenbrand	65a2aa5	2021-09-07 19:55:04 -0700	[diff] [blame]	2253	arch_remove_memory(start, size, altmap);
David Hildenbrand	52219ae	2020-06-04 16:48:38 -0700	[diff] [blame]	2254
				2255	if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
				2256	memblock_free(start, size);
				2257	memblock_remove(start, size);
				2258	}
				2259
David Hildenbrand	cb8e3c8	2020-10-15 20:09:12 -0700	[diff] [blame]	2260	release_mem_region_adjustable(start, size);
Wen Congyang	24d335c	2013-02-22 16:32:58 -0800	[diff] [blame]	2261
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2262	if (nid != NUMA_NO_NODE)
				2263	try_offline_node(nid);
Tang Chen	60a5a19	2013-02-22 16:33:14 -0800	[diff] [blame]	2264
Vladimir Davydov	bfc8c90	2014-06-04 16:07:18 -0700	[diff] [blame]	2265	mem_hotplug_done();
Jia He	b4223a5	2020-08-11 18:32:20 -0700	[diff] [blame]	2266	return 0;
Badari Pulavarty	7108878	2008-10-18 20:25:58 -0700	[diff] [blame]	2267	}
David Hildenbrand	d15e592	2018-10-30 15:10:18 -0700	[diff] [blame]	2268
Pavel Tatashin	eca499a	2019-07-16 16:30:31 -0700	[diff] [blame]	2269	/**
Mel Gorman	5640c9c	2021-06-30 18:53:38 -0700	[diff] [blame]	2270	* __remove_memory - Remove memory if every memory block is offline
Pavel Tatashin	eca499a	2019-07-16 16:30:31 -0700	[diff] [blame]	2271	* @start: physical address of the region to remove
				2272	* @size: size of the region to remove
				2273	*
				2274	* NOTE: The caller must call lock_device_hotplug() to serialize hotplug
				2275	* and online/offline operations before this call, as required by
				2276	* try_offline_node().
				2277	*/
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2278	void __remove_memory(u64 start, u64 size)
David Hildenbrand	d15e592	2018-10-30 15:10:18 -0700	[diff] [blame]	2279	{
Pavel Tatashin	eca499a	2019-07-16 16:30:31 -0700	[diff] [blame]	2280
				2281	/*
Souptick Joarder	29a90db	2019-09-23 15:36:18 -0700	[diff] [blame]	2282	* trigger BUG() if some memory is not offlined prior to calling this
Pavel Tatashin	eca499a	2019-07-16 16:30:31 -0700	[diff] [blame]	2283	* function
				2284	*/
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2285	if (try_remove_memory(start, size))
Pavel Tatashin	eca499a	2019-07-16 16:30:31 -0700	[diff] [blame]	2286	BUG();
				2287	}
				2288
				2289	/*
				2290	* Remove memory if every memory block is offline, otherwise return -EBUSY is
				2291	* some memory is not offline
				2292	*/
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2293	int remove_memory(u64 start, u64 size)
Pavel Tatashin	eca499a	2019-07-16 16:30:31 -0700	[diff] [blame]	2294	{
				2295	int rc;
				2296
David Hildenbrand	d15e592	2018-10-30 15:10:18 -0700	[diff] [blame]	2297	lock_device_hotplug();
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2298	rc = try_remove_memory(start, size);
David Hildenbrand	d15e592	2018-10-30 15:10:18 -0700	[diff] [blame]	2299	unlock_device_hotplug();
Pavel Tatashin	eca499a	2019-07-16 16:30:31 -0700	[diff] [blame]	2300
				2301	return rc;
David Hildenbrand	d15e592	2018-10-30 15:10:18 -0700	[diff] [blame]	2302	}
Badari Pulavarty	7108878	2008-10-18 20:25:58 -0700	[diff] [blame]	2303	EXPORT_SYMBOL_GPL(remove_memory);
David Hildenbrand	08b3acd	2020-05-07 16:01:32 +0200	[diff] [blame]	2304
Sudarshan Rajagopalan	42db6c2	2021-12-07 13:14:46 -0800	[diff] [blame]	2305	int remove_memory_subsection(u64 start, u64 size)
				2306	{
Sudarshan Rajagopalan	42db6c2	2021-12-07 13:14:46 -0800	[diff] [blame]	2307	if (!IS_ALIGNED(start, SUBSECTION_SIZE) \|\|
				2308	!IS_ALIGNED(size, SUBSECTION_SIZE)) {
Suren Baghdasaryan	f36d7e34	2021-12-15 13:01:36 -0800	[diff] [blame]	2309	pr_err("%s: start 0x%llx size 0x%llx not aligned to subsection size\n",
Sudarshan Rajagopalan	42db6c2	2021-12-07 13:14:46 -0800	[diff] [blame]	2310	__func__, start, size);
				2311	return -EINVAL;
				2312	}
				2313
				2314	mem_hotplug_begin();
Sudarshan Rajagopalan	42db6c2	2021-12-07 13:14:46 -0800	[diff] [blame]	2315	arch_remove_memory(start, size, NULL);
				2316
				2317	if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
				2318	memblock_remove(start, size);
				2319
				2320	release_mem_region_adjustable(start, size);
				2321
				2322	mem_hotplug_done();
				2323
				2324	return 0;
				2325	}
				2326	EXPORT_SYMBOL_GPL(remove_memory_subsection);
				2327
David Hildenbrand	8dc4bb5	2020-11-12 14:38:13 +0100	[diff] [blame]	2328	static int try_offline_memory_block(struct memory_block mem, void arg)
				2329	{
				2330	uint8_t online_type = MMOP_ONLINE_KERNEL;
				2331	uint8_t **online_types = arg;
				2332	struct page *page;
				2333	int rc;
				2334
				2335	/*
				2336	* Sense the online_type via the zone of the memory block. Offlining
				2337	* with multiple zones within one memory block will be rejected
				2338	* by offlining code ... so we don't care about that.
				2339	*/
				2340	page = pfn_to_online_page(section_nr_to_pfn(mem->start_section_nr));
				2341	if (page && zone_idx(page_zone(page)) == ZONE_MOVABLE)
				2342	online_type = MMOP_ONLINE_MOVABLE;
				2343
				2344	rc = device_offline(&mem->dev);
				2345	/*
				2346	* Default is MMOP_OFFLINE - change it only if offlining succeeded,
				2347	* so try_reonline_memory_block() can do the right thing.
				2348	*/
				2349	if (!rc)
				2350	**online_types = online_type;
				2351
				2352	(*online_types)++;
				2353	/* Ignore if already offline. */
				2354	return rc < 0 ? rc : 0;
				2355	}
				2356
				2357	static int try_reonline_memory_block(struct memory_block mem, void arg)
				2358	{
				2359	uint8_t **online_types = arg;
				2360	int rc;
				2361
				2362	if (**online_types != MMOP_OFFLINE) {
				2363	mem->online_type = **online_types;
				2364	rc = device_online(&mem->dev);
				2365	if (rc < 0)
				2366	pr_warn("%s: Failed to re-online memory: %d",
				2367	__func__, rc);
				2368	}
				2369
				2370	/* Continue processing all remaining memory blocks. */
				2371	(*online_types)++;
				2372	return 0;
				2373	}
				2374
David Hildenbrand	08b3acd	2020-05-07 16:01:32 +0200	[diff] [blame]	2375	/*
David Hildenbrand	8dc4bb5	2020-11-12 14:38:13 +0100	[diff] [blame]	2376	* Try to offline and remove memory. Might take a long time to finish in case
				2377	* memory is still in use. Primarily useful for memory devices that logically
				2378	* unplugged all memory (so it's no longer in use) and want to offline + remove
				2379	* that memory.
David Hildenbrand	08b3acd	2020-05-07 16:01:32 +0200	[diff] [blame]	2380	*/
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2381	int offline_and_remove_memory(u64 start, u64 size)
David Hildenbrand	08b3acd	2020-05-07 16:01:32 +0200	[diff] [blame]	2382	{
David Hildenbrand	8dc4bb5	2020-11-12 14:38:13 +0100	[diff] [blame]	2383	const unsigned long mb_count = size / memory_block_size_bytes();
				2384	uint8_t online_types, tmp;
				2385	int rc;
David Hildenbrand	08b3acd	2020-05-07 16:01:32 +0200	[diff] [blame]	2386
				2387	if (!IS_ALIGNED(start, memory_block_size_bytes()) \|\|
David Hildenbrand	8dc4bb5	2020-11-12 14:38:13 +0100	[diff] [blame]	2388	!IS_ALIGNED(size, memory_block_size_bytes()) \|\| !size)
				2389	return -EINVAL;
David Hildenbrand	08b3acd	2020-05-07 16:01:32 +0200	[diff] [blame]	2390
				2391	/*
David Hildenbrand	8dc4bb5	2020-11-12 14:38:13 +0100	[diff] [blame]	2392	* We'll remember the old online type of each memory block, so we can
				2393	* try to revert whatever we did when offlining one memory block fails
				2394	* after offlining some others succeeded.
				2395	*/
				2396	online_types = kmalloc_array(mb_count, sizeof(*online_types),
				2397	GFP_KERNEL);
				2398	if (!online_types)
				2399	return -ENOMEM;
				2400	/*
				2401	* Initialize all states to MMOP_OFFLINE, so when we abort processing in
				2402	* try_offline_memory_block(), we'll skip all unprocessed blocks in
				2403	* try_reonline_memory_block().
				2404	*/
				2405	memset(online_types, MMOP_OFFLINE, mb_count);
				2406
				2407	lock_device_hotplug();
				2408
				2409	tmp = online_types;
				2410	rc = walk_memory_blocks(start, size, &tmp, try_offline_memory_block);
				2411
				2412	/*
				2413	* In case we succeeded to offline all memory, remove it.
David Hildenbrand	08b3acd	2020-05-07 16:01:32 +0200	[diff] [blame]	2414	* This cannot fail as it cannot get onlined in the meantime.
				2415	*/
				2416	if (!rc) {
David Hildenbrand	e1c158e	2021-09-07 19:55:09 -0700	[diff] [blame]	2417	rc = try_remove_memory(start, size);
David Hildenbrand	8dc4bb5	2020-11-12 14:38:13 +0100	[diff] [blame]	2418	if (rc)
				2419	pr_err("%s: Failed to remove memory: %d", __func__, rc);
				2420	}
				2421
				2422	/*
				2423	* Rollback what we did. While memory onlining might theoretically fail
				2424	* (nacked by a notifier), it barely ever happens.
				2425	*/
				2426	if (rc) {
				2427	tmp = online_types;
				2428	walk_memory_blocks(start, size, &tmp,
				2429	try_reonline_memory_block);
David Hildenbrand	08b3acd	2020-05-07 16:01:32 +0200	[diff] [blame]	2430	}
				2431	unlock_device_hotplug();
				2432
David Hildenbrand	8dc4bb5	2020-11-12 14:38:13 +0100	[diff] [blame]	2433	kfree(online_types);
David Hildenbrand	08b3acd	2020-05-07 16:01:32 +0200	[diff] [blame]	2434	return rc;
				2435	}
				2436	EXPORT_SYMBOL_GPL(offline_and_remove_memory);
Rafael J. Wysocki	aba6efc	2013-06-01 22:24:07 +0200	[diff] [blame]	2437	#endif /* CONFIG_MEMORY_HOTREMOVE */