// SPDX-License-Identifier: GPL-2.0
/*
 * Memory Migration functionality - linux/mm/migrate.c
 *
 * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
 *
 * Page migration was first developed in the context of the memory hotplug
 * project. The main authors of the migration code are:
 *
 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
 * Hirokazu Takahashi <taka@valinux.co.jp>
 * Dave Hansen <haveblue@us.ibm.com>
 * Christoph Lameter
 */

#include <linux/migrate.h>
#include <linux/export.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagemap.h>
#include <linux/buffer_head.h>
#include <linux/mm_inline.h>
#include <linux/nsproxy.h>
#include <linux/pagevec.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/writeback.h>
#include <linux/mempolicy.h>
#include <linux/vmalloc.h>
#include <linux/security.h>
#include <linux/backing-dev.h>
#include <linux/compaction.h>
#include <linux/syscalls.h>
#include <linux/compat.h>
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/gfp.h>
#include <linux/pagewalk.h>
#include <linux/pfn_t.h>
#include <linux/memremap.h>
#include <linux/userfaultfd_k.h>
#include <linux/balloon_compaction.h>
#include <linux/mmu_notifier.h>
#include <linux/page_idle.h>
#include <linux/page_owner.h>
#include <linux/sched/mm.h>
#include <linux/ptrace.h>
#include <linux/oom.h>
#include <linux/memory.h>

#include <asm/tlbflush.h>

#define CREATE_TRACE_POINTS
#include <trace/events/migrate.h>

#undef CREATE_TRACE_POINTS
#include <trace/hooks/mm.h>

#include "internal.h"

int isolate_movable_page(struct page *page, isolate_mode_t mode)
{
        struct address_space *mapping;

        /*
         * Avoid burning cycles with pages that are still under __free_pages(),
         * or that just got freed under us.
         *
         * In case we 'win' a race for a movable page being freed under us and
         * raise its refcount, preventing __free_pages() from doing its job,
         * the put_page() at the end of this block will take care of
         * releasing this page, thus avoiding a nasty leakage.
         */
        if (unlikely(!get_page_unless_zero(page)))
                goto out;

        /*
         * Check PageMovable before holding a PG_lock because the page's owner
         * assumes that nobody touches the PG_lock of a newly allocated page,
         * so unconditionally grabbing the lock ruins the page owner's side.
         */
        if (unlikely(!__PageMovable(page)))
                goto out_putpage;
        /*
         * As movable pages are not isolated from LRU lists, concurrent
         * compaction threads can race against page migration functions
         * as well as against a page being released.
         *
         * In order to avoid having an already isolated movable page
         * being (wrongly) re-isolated while it is under migration,
         * or to avoid attempting to isolate pages being released,
         * let's be sure we have the page lock
         * before proceeding with the movable page isolation steps.
         */
        if (unlikely(!trylock_page(page)))
                goto out_putpage;

        if (!PageMovable(page) || PageIsolated(page))
                goto out_no_isolated;

        mapping = page_mapping(page);
        VM_BUG_ON_PAGE(!mapping, page);

        if (!mapping->a_ops->isolate_page(page, mode))
                goto out_no_isolated;

        /* Driver shouldn't use PG_isolated bit of page->flags */
        WARN_ON_ONCE(PageIsolated(page));
        SetPageIsolated(page);
        unlock_page(page);

        return 0;

out_no_isolated:
        unlock_page(page);
out_putpage:
        put_page(page);
out:
        return -EBUSY;
}

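/*
 * Hand an isolated non-LRU movable page back to its driver via
 * mapping->a_ops->putback_page() and clear the PG_isolated marker.
 * Called with the page locked.
 */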
static void putback_movable_page(struct page *page)
{
        struct address_space *mapping;

        mapping = page_mapping(page);
        mapping->a_ops->putback_page(page);
        ClearPageIsolated(page);
}

/*
 * Put previously isolated pages back onto the appropriate lists
 * from where they were once taken off for compaction/migration.
 *
 * This function shall be used whenever the isolated pageset has been
 * built from LRU, balloon or hugetlbfs pages. See isolate_migratepages_range()
 * and isolate_hugetlb().
 */
void putback_movable_pages(struct list_head *l)
{
        struct page *page;
        struct page *page2;

        list_for_each_entry_safe(page, page2, l, lru) {
                if (unlikely(PageHuge(page))) {
                        putback_active_hugepage(page);
                        continue;
                }
                list_del(&page->lru);
                /*
                 * We isolated non-lru movable page so here we can use
                 * __PageMovable because LRU page's mapping cannot have
                 * PAGE_MAPPING_MOVABLE.
                 */
                if (unlikely(__PageMovable(page))) {
                        VM_BUG_ON_PAGE(!PageIsolated(page), page);
                        lock_page(page);
                        if (PageMovable(page))
                                putback_movable_page(page);
                        else
                                ClearPageIsolated(page);
                        unlock_page(page);
                        put_page(page);
                } else {
                        mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
                                        page_is_file_lru(page), -thp_nr_pages(page));
                        putback_lru_page(page);
                }
        }
}
EXPORT_SYMBOL_GPL(putback_movable_pages);

/*
 * Restore a potential migration pte to a working pte entry
 */
static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
                 unsigned long addr, void *old)
{
        struct page_vma_mapped_walk pvmw = {
                .page = old,
                .vma = vma,
                .address = addr,
                .flags = PVMW_SYNC | PVMW_MIGRATION,
        };
        struct page *new;
        pte_t pte;
        swp_entry_t entry;

        VM_BUG_ON_PAGE(PageTail(page), page);
        while (page_vma_mapped_walk(&pvmw)) {
                if (PageKsm(page))
                        new = page;
                else
                        new = page - pvmw.page->index +
                                linear_page_index(vma, pvmw.address);

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
                /* PMD-mapped THP migration entry */
                if (!pvmw.pte) {
                        VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page);
                        remove_migration_pmd(&pvmw, new);
                        continue;
                }
#endif

                get_page(new);
                pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot)));
                if (pte_swp_soft_dirty(*pvmw.pte))
                        pte = pte_mksoft_dirty(pte);

                /*
                 * Recheck VMA as permissions can change since migration started
                 */
                entry = pte_to_swp_entry(*pvmw.pte);
                if (is_writable_migration_entry(entry))
                        pte = maybe_mkwrite(pte, vma);
                else if (pte_swp_uffd_wp(*pvmw.pte))
                        pte = pte_mkuffd_wp(pte);

                if (unlikely(is_device_private_page(new))) {
                        if (pte_write(pte))
                                entry = make_writable_device_private_entry(
                                                        page_to_pfn(new));
                        else
                                entry = make_readable_device_private_entry(
                                                        page_to_pfn(new));
                        pte = swp_entry_to_pte(entry);
                        if (pte_swp_soft_dirty(*pvmw.pte))
                                pte = pte_swp_mksoft_dirty(pte);
                        if (pte_swp_uffd_wp(*pvmw.pte))
                                pte = pte_swp_mkuffd_wp(pte);
                }

#ifdef CONFIG_HUGETLB_PAGE
                if (PageHuge(new)) {
                        unsigned int shift = huge_page_shift(hstate_vma(vma));

                        pte = pte_mkhuge(pte);
                        pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
                        set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
                        if (PageAnon(new))
                                hugepage_add_anon_rmap(new, vma, pvmw.address);
                        else
                                page_dup_rmap(new, true);
                } else
#endif
                {
                        set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);

                        if (PageAnon(new))
                                page_add_anon_rmap(new, vma, pvmw.address, false);
                        else
                                page_add_file_rmap(new, false);
                }
                if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new))
                        mlock_vma_page(new);

                if (PageTransHuge(page) && PageMlocked(page))
                        clear_page_mlock(page);

                /* No need to invalidate - it was non-present before */
                update_mmu_cache(vma, pvmw.address, pvmw.pte);
        }

        return true;
}

/*
 * Get rid of all migration entries and replace them by
 * references to the indicated page.
 */
void remove_migration_ptes(struct page *old, struct page *new, bool locked)
{
        struct rmap_walk_control rwc = {
                .rmap_one = remove_migration_pte,
                .arg = old,
        };

        if (locked)
                rmap_walk_locked(new, &rwc);
        else
                rmap_walk(new, &rwc);
}

/*
 * Something used the pte of a page under migration. We need to
 * get to the page and wait until migration is finished.
 * When we return from this function the fault will be retried.
 */
void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
                                spinlock_t *ptl)
{
        pte_t pte;
        swp_entry_t entry;
        struct page *page;

        spin_lock(ptl);
        pte = *ptep;
        if (!is_swap_pte(pte))
                goto out;

        entry = pte_to_swp_entry(pte);
        if (!is_migration_entry(entry))
                goto out;

        page = pfn_swap_entry_to_page(entry);
        page = compound_head(page);

        /*
         * Once page cache replacement of page migration started, page_count
         * is zero; but we must not call put_and_wait_on_page_locked() without
         * a ref. Use get_page_unless_zero(), and just fault again if it fails.
         */
        if (!get_page_unless_zero(page))
                goto out;
        pte_unmap_unlock(ptep, ptl);
        put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
        return;
out:
        pte_unmap_unlock(ptep, ptl);
}

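/*
 * Wrappers around __migration_entry_wait() that look up the page table
 * lock for a regular pte and for a hugetlb pte, respectively.
 */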
void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
                                unsigned long address)
{
        spinlock_t *ptl = pte_lockptr(mm, pmd);
        pte_t *ptep = pte_offset_map(pmd, address);
        __migration_entry_wait(mm, ptep, ptl);
}

void migration_entry_wait_huge(struct vm_area_struct *vma,
                struct mm_struct *mm, pte_t *pte)
{
        spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte);
        __migration_entry_wait(mm, pte, ptl);
}

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
{
        spinlock_t *ptl;
        struct page *page;

        ptl = pmd_lock(mm, pmd);
        if (!is_pmd_migration_entry(*pmd))
                goto unlock;
        page = pfn_swap_entry_to_page(pmd_to_swp_entry(*pmd));
        if (!get_page_unless_zero(page))
                goto unlock;
        spin_unlock(ptl);
        put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
        return;
unlock:
        spin_unlock(ptl);
}
#endif

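/*
 * Reference count a page is expected to have while it is being migrated:
 * one base reference, one extra for device private (ZONE_DEVICE) pages
 * and, when the page has a mapping, one per subpage in the page cache
 * plus one for private data.
 */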
static int expected_page_refs(struct address_space *mapping, struct page *page)
{
        int expected_count = 1;

        /*
         * Device private pages have an extra refcount as they are
         * ZONE_DEVICE pages.
         */
        expected_count += is_device_private_page(page);
        if (mapping)
                expected_count += thp_nr_pages(page) + page_has_private(page);

        return expected_count;
}

/*
 * Replace the page in the mapping.
 *
 * The number of remaining references must be:
 * 1 for anonymous pages without a mapping
 * 2 for pages with a mapping
 * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
 */
int migrate_page_move_mapping(struct address_space *mapping,
                struct page *newpage, struct page *page, int extra_count)
{
        XA_STATE(xas, &mapping->i_pages, page_index(page));
        struct zone *oldzone, *newzone;
        int dirty;
        int expected_count = expected_page_refs(mapping, page) + extra_count;
        int nr = thp_nr_pages(page);

        if (!mapping) {
                /* Anonymous page without mapping */
                if (page_count(page) != expected_count)
                        return -EAGAIN;

                /* No turning back from here */
                newpage->index = page->index;
                newpage->mapping = page->mapping;
                if (PageSwapBacked(page))
                        __SetPageSwapBacked(newpage);

                return MIGRATEPAGE_SUCCESS;
        }

        oldzone = page_zone(page);
        newzone = page_zone(newpage);

        xas_lock_irq(&xas);
        if (page_count(page) != expected_count || xas_load(&xas) != page) {
                xas_unlock_irq(&xas);
                return -EAGAIN;
        }

        if (!page_ref_freeze(page, expected_count)) {
                xas_unlock_irq(&xas);
                return -EAGAIN;
        }

        /*
         * Now we know that no one else is looking at the page:
         * no turning back from here.
         */
        newpage->index = page->index;
        newpage->mapping = page->mapping;
        page_ref_add(newpage, nr); /* add cache reference */
        if (PageSwapBacked(page)) {
                __SetPageSwapBacked(newpage);
                if (PageSwapCache(page)) {
                        SetPageSwapCache(newpage);
                        set_page_private(newpage, page_private(page));
                }
        } else {
                VM_BUG_ON_PAGE(PageSwapCache(page), page);
        }

        /* Move dirty while page refs frozen and newpage not yet exposed */
        dirty = PageDirty(page);
        if (dirty) {
                ClearPageDirty(page);
                SetPageDirty(newpage);
        }

        xas_store(&xas, newpage);
        if (PageTransHuge(page)) {
                int i;

                for (i = 1; i < nr; i++) {
                        xas_next(&xas);
                        xas_store(&xas, newpage);
                }
        }

        /*
         * Drop cache reference from old page by unfreezing
         * to one less reference.
         * We know this isn't the last reference.
         */
        page_ref_unfreeze(page, expected_count - nr);

        xas_unlock(&xas);
        /* Leave irq disabled to prevent preemption while updating stats */

        /*
         * If moved to a different zone then also account
         * the page for that zone. Other VM counters will be
         * taken care of when we establish references to the
         * new page and drop references to the old page.
         *
         * Note that anonymous pages are accounted for
         * via NR_FILE_PAGES and NR_ANON_MAPPED if they
         * are mapped to swap space.
         */
        if (newzone != oldzone) {
                struct lruvec *old_lruvec, *new_lruvec;
                struct mem_cgroup *memcg;

                memcg = page_memcg(page);
                old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat);
                new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat);

                __mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr);
                __mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr);
                if (PageSwapBacked(page) && !PageSwapCache(page)) {
                        __mod_lruvec_state(old_lruvec, NR_SHMEM, -nr);
                        __mod_lruvec_state(new_lruvec, NR_SHMEM, nr);
                }
#ifdef CONFIG_SWAP
                if (PageSwapCache(page)) {
                        __mod_lruvec_state(old_lruvec, NR_SWAPCACHE, -nr);
                        __mod_lruvec_state(new_lruvec, NR_SWAPCACHE, nr);
                }
#endif
                if (dirty && mapping_can_writeback(mapping)) {
                        __mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr);
                        __mod_zone_page_state(oldzone, NR_ZONE_WRITE_PENDING, -nr);
                        __mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr);
                        __mod_zone_page_state(newzone, NR_ZONE_WRITE_PENDING, nr);
                }
        }
        local_irq_enable();

        return MIGRATEPAGE_SUCCESS;
}
EXPORT_SYMBOL(migrate_page_move_mapping);

/*
 * The expected number of remaining references is the same as that
 * of migrate_page_move_mapping().
 */
int migrate_huge_page_move_mapping(struct address_space *mapping,
                                   struct page *newpage, struct page *page)
{
        XA_STATE(xas, &mapping->i_pages, page_index(page));
        int expected_count;

        xas_lock_irq(&xas);
        expected_count = 2 + page_has_private(page);
        if (page_count(page) != expected_count || xas_load(&xas) != page) {
                xas_unlock_irq(&xas);
                return -EAGAIN;
        }

        if (!page_ref_freeze(page, expected_count)) {
                xas_unlock_irq(&xas);
                return -EAGAIN;
        }

        newpage->index = page->index;
        newpage->mapping = page->mapping;

        get_page(newpage);

        xas_store(&xas, newpage);

        page_ref_unfreeze(page, expected_count - 1);

        xas_unlock_irq(&xas);

        return MIGRATEPAGE_SUCCESS;
}

/*
 * Copy the page to its new location
 */
void migrate_page_states(struct page *newpage, struct page *page)
{
        int cpupid;

        if (PageError(page))
                SetPageError(newpage);
        if (PageReferenced(page))
                SetPageReferenced(newpage);
        if (PageUptodate(page))
                SetPageUptodate(newpage);
        if (TestClearPageActive(page)) {
                VM_BUG_ON_PAGE(PageUnevictable(page), page);
                SetPageActive(newpage);
        } else if (TestClearPageUnevictable(page))
                SetPageUnevictable(newpage);
        if (PageWorkingset(page))
                SetPageWorkingset(newpage);
        if (PageChecked(page))
                SetPageChecked(newpage);
        if (PageMappedToDisk(page))
                SetPageMappedToDisk(newpage);

        trace_android_vh_look_around_migrate_page(page, newpage);

        /* Move dirty on pages not done by migrate_page_move_mapping() */
        if (PageDirty(page))
                SetPageDirty(newpage);

        if (page_is_young(page))
                set_page_young(newpage);
        if (page_is_idle(page))
                set_page_idle(newpage);

        /*
         * Copy NUMA information to the new page, to prevent over-eager
         * future migrations of this same page.
         */
        cpupid = page_cpupid_xchg_last(page, -1);
        page_cpupid_xchg_last(newpage, cpupid);

        ksm_migrate_page(newpage, page);
        /*
         * Please do not reorder this without considering how mm/ksm.c's
         * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache().
         */
        if (PageSwapCache(page))
                ClearPageSwapCache(page);
        ClearPagePrivate(page);

        /* page->private contains hugetlb specific flags */
        if (!PageHuge(page))
                set_page_private(page, 0);

        /*
         * If any waiters have accumulated on the new page then
         * wake them up.
         */
        if (PageWriteback(newpage))
                end_page_writeback(newpage);

        /*
         * PG_readahead shares the same bit with PG_reclaim. The above
         * end_page_writeback() may clear PG_readahead mistakenly, so set the
         * bit after that.
         */
        if (PageReadahead(page))
                SetPageReadahead(newpage);

        copy_page_owner(page, newpage);

        if (!PageHuge(page))
                mem_cgroup_migrate(page, newpage);
}
EXPORT_SYMBOL(migrate_page_states);

void migrate_page_copy(struct page *newpage, struct page *page)
{
        if (PageHuge(page) || PageTransHuge(page))
                copy_huge_page(newpage, page);
        else
                copy_highpage(newpage, page);

        migrate_page_states(newpage, page);
}
EXPORT_SYMBOL(migrate_page_copy);

/************************************************************
 * Migration functions
 ***********************************************************/

/*
 * Common logic to directly migrate a single LRU page suitable for
 * pages that do not use PagePrivate/PagePrivate2.
 *
 * Pages are locked upon entry and exit.
 */
int migrate_page(struct address_space *mapping,
                struct page *newpage, struct page *page,
                enum migrate_mode mode)
{
        int rc;

        BUG_ON(PageWriteback(page));    /* Writeback must be complete */

        rc = migrate_page_move_mapping(mapping, newpage, page, 0);

        if (rc != MIGRATEPAGE_SUCCESS)
                return rc;

        if (mode != MIGRATE_SYNC_NO_COPY)
                migrate_page_copy(newpage, page);
        else
                migrate_page_states(newpage, page);
        return MIGRATEPAGE_SUCCESS;
}
EXPORT_SYMBOL(migrate_page);

#ifdef CONFIG_BLOCK
/* Returns true if all buffers are successfully locked */
static bool buffer_migrate_lock_buffers(struct buffer_head *head,
                                                        enum migrate_mode mode)
{
        struct buffer_head *bh = head;

        /* Simple case, sync compaction */
        if (mode != MIGRATE_ASYNC) {
                do {
                        lock_buffer(bh);
                        bh = bh->b_this_page;

                } while (bh != head);

                return true;
        }

        /* async case, we cannot block on lock_buffer so use trylock_buffer */
        do {
                if (!trylock_buffer(bh)) {
                        /*
                         * We failed to lock the buffer and cannot stall in
                         * async migration. Release the taken locks
                         */
                        struct buffer_head *failed_bh = bh;
                        bh = head;
                        while (bh != failed_bh) {
                                unlock_buffer(bh);
                                bh = bh->b_this_page;
                        }
                        return false;
                }

                bh = bh->b_this_page;
        } while (bh != head);
        return true;
}

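/*
 * Common implementation for buffer_migrate_page() and
 * buffer_migrate_page_norefs(). With @check_refs set, the buffer heads must
 * additionally have no extra references (b_count == 0) before the page may
 * be moved.
 */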
static int __buffer_migrate_page(struct address_space *mapping,
                struct page *newpage, struct page *page, enum migrate_mode mode,
                bool check_refs)
{
        struct buffer_head *bh, *head;
        int rc;
        int expected_count;

        if (!page_has_buffers(page))
                return migrate_page(mapping, newpage, page, mode);

        /* Check whether page does not have extra refs before we do more work */
        expected_count = expected_page_refs(mapping, page);
        if (page_count(page) != expected_count)
                return -EAGAIN;

        head = page_buffers(page);
        if (!buffer_migrate_lock_buffers(head, mode))
                return -EAGAIN;

        if (check_refs) {
                bool busy;
                bool invalidated = false;

recheck_buffers:
                busy = false;
                spin_lock(&mapping->private_lock);
                bh = head;
                do {
                        if (atomic_read(&bh->b_count)) {
                                busy = true;
                                break;
                        }
                        bh = bh->b_this_page;
                } while (bh != head);
                if (busy) {
                        if (invalidated) {
                                rc = -EAGAIN;
                                goto unlock_buffers;
                        }
                        spin_unlock(&mapping->private_lock);
                        invalidate_bh_lrus();
                        invalidated = true;
                        goto recheck_buffers;
                }
        }

        rc = migrate_page_move_mapping(mapping, newpage, page, 0);
        if (rc != MIGRATEPAGE_SUCCESS)
                goto unlock_buffers;

        attach_page_private(newpage, detach_page_private(page));

        bh = head;
        do {
                set_bh_page(bh, newpage, bh_offset(bh));
                bh = bh->b_this_page;

        } while (bh != head);

        if (mode != MIGRATE_SYNC_NO_COPY)
                migrate_page_copy(newpage, page);
        else
                migrate_page_states(newpage, page);

        rc = MIGRATEPAGE_SUCCESS;
unlock_buffers:
        if (check_refs)
                spin_unlock(&mapping->private_lock);
        bh = head;
        do {
                unlock_buffer(bh);
                bh = bh->b_this_page;

        } while (bh != head);

        return rc;
}

/*
 * Migration function for pages with buffers. This function can only be used
 * if the underlying filesystem guarantees that no other references to "page"
 * exist. For example attached buffer heads are accessed only under page lock.
 */
int buffer_migrate_page(struct address_space *mapping,
                struct page *newpage, struct page *page, enum migrate_mode mode)
{
        return __buffer_migrate_page(mapping, newpage, page, mode, false);
}
EXPORT_SYMBOL(buffer_migrate_page);

/*
 * Same as above except that this variant is more careful and checks that there
 * are also no buffer head references. This function is the right one for
 * mappings where buffer heads are directly looked up and referenced (such as
 * block device mappings).
 */
int buffer_migrate_page_norefs(struct address_space *mapping,
                struct page *newpage, struct page *page, enum migrate_mode mode)
{
        return __buffer_migrate_page(mapping, newpage, page, mode, true);
}
#endif

/*
 * Writeback a page to clean the dirty state
 */
static int writeout(struct address_space *mapping, struct page *page)
{
        struct writeback_control wbc = {
                .sync_mode = WB_SYNC_NONE,
                .nr_to_write = 1,
                .range_start = 0,
                .range_end = LLONG_MAX,
                .for_reclaim = 1
        };
        int rc;

        if (!mapping->a_ops->writepage)
                /* No write method for the address space */
                return -EINVAL;

        if (!clear_page_dirty_for_io(page))
                /* Someone else already triggered a write */
                return -EAGAIN;

        /*
         * A dirty page may imply that the underlying filesystem has
         * the page on some queue. So the page must be clean for
         * migration. Writeout may mean we lose the lock and the
         * page state is no longer what we checked for earlier.
         * At this point we know that the migration attempt cannot
         * be successful.
         */
        remove_migration_ptes(page, page, false);

        rc = mapping->a_ops->writepage(page, &wbc);

        if (rc != AOP_WRITEPAGE_ACTIVATE)
                /* unlocked. Relock */
                lock_page(page);

        return (rc < 0) ? -EIO : -EAGAIN;
}

/*
 * Default handling if a filesystem does not provide a migration function.
 */
static int fallback_migrate_page(struct address_space *mapping,
        struct page *newpage, struct page *page, enum migrate_mode mode)
{
        if (PageDirty(page)) {
                /* Only writeback pages in full synchronous migration */
                switch (mode) {
                case MIGRATE_SYNC:
                case MIGRATE_SYNC_NO_COPY:
                        break;
                default:
                        return -EBUSY;
                }
                return writeout(mapping, page);
        }

        /*
         * Buffers may be managed in a filesystem specific way.
         * We must have no buffers or drop them.
         */
        if (page_has_private(page) &&
            !try_to_release_page(page, GFP_KERNEL))
                return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY;

        return migrate_page(mapping, newpage, page, mode);
}

/*
 * Move a page to a newly allocated page
 * The page is locked and all ptes have been successfully removed.
 *
 * The new page will have replaced the old page if this function
 * is successful.
 *
 * Return value:
 *   < 0 - error code
 *  MIGRATEPAGE_SUCCESS - success
 */
static int move_to_new_page(struct page *newpage, struct page *page,
                                enum migrate_mode mode)
{
        struct address_space *mapping;
        int rc = -EAGAIN;
        bool is_lru = !__PageMovable(page);

        VM_BUG_ON_PAGE(!PageLocked(page), page);
        VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);

        mapping = page_mapping(page);

        if (likely(is_lru)) {
                if (!mapping)
                        rc = migrate_page(mapping, newpage, page, mode);
                else if (mapping->a_ops->migratepage)
                        /*
                         * Most pages have a mapping and most filesystems
                         * provide a migratepage callback. Anonymous pages
                         * are part of swap space which also has its own
                         * migratepage callback. This is the most common path
                         * for page migration.
                         */
                        rc = mapping->a_ops->migratepage(mapping, newpage,
                                                        page, mode);
                else
                        rc = fallback_migrate_page(mapping, newpage,
                                                        page, mode);
        } else {
                /*
                 * In case of non-lru page, it could be released after
                 * isolation step. In that case, we shouldn't try migration.
                 */
                VM_BUG_ON_PAGE(!PageIsolated(page), page);
                if (!PageMovable(page)) {
                        rc = MIGRATEPAGE_SUCCESS;
                        ClearPageIsolated(page);
                        goto out;
                }

                rc = mapping->a_ops->migratepage(mapping, newpage,
                                                page, mode);
                WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS &&
                        !PageIsolated(page));
        }

        /*
         * When successful, old pagecache page->mapping must be cleared before
         * page is freed; but stats require that PageAnon be left as PageAnon.
         */
        if (rc == MIGRATEPAGE_SUCCESS) {
                if (__PageMovable(page)) {
                        VM_BUG_ON_PAGE(!PageIsolated(page), page);

                        /*
                         * We clear PG_movable under page_lock so any compactor
                         * cannot try to migrate this page.
                         */
                        ClearPageIsolated(page);
                }

                /*
                 * Anonymous and movable page->mapping will be cleared by
                 * free_pages_prepare so don't reset it here; that keeps the
                 * page type checks (PageAnon, for example) working.
                 */
                if (!PageMappingFlags(page))
                        page->mapping = NULL;

                if (likely(!is_zone_device_page(newpage))) {
                        int i, nr = compound_nr(newpage);

                        for (i = 0; i < nr; i++)
                                flush_dcache_page(newpage + i);
                }
        }
out:
        return rc;
}

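/*
 * Lock the old page (and the new one), replace the old page's mappings
 * with migration entries and hand the actual move off to
 * move_to_new_page(). Returns MIGRATEPAGE_SUCCESS or a negative error
 * code such as -EAGAIN (try again later) or -EBUSY.
 */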
static int __unmap_and_move(struct page *page, struct page *newpage,
                                int force, enum migrate_mode mode)
{
        int rc = -EAGAIN;
        bool page_was_mapped = false;
        struct anon_vma *anon_vma = NULL;
        bool is_lru = !__PageMovable(page);

        if (!trylock_page(page)) {
                if (!force || mode == MIGRATE_ASYNC)
                        goto out;

                /*
                 * It's not safe for direct compaction to call lock_page.
                 * For example, during page readahead pages are added locked
                 * to the LRU. Later, when the IO completes the pages are
                 * marked uptodate and unlocked. However, the queueing
                 * could be merging multiple pages for one bio (e.g.
                 * mpage_readahead). If an allocation happens for the
                 * second or third page, the process can end up locking
                 * the same page twice and deadlocking. Rather than
                 * trying to be clever about what pages can be locked,
                 * avoid the use of lock_page for direct compaction
                 * altogether.
                 */
                if (current->flags & PF_MEMALLOC)
                        goto out;

                lock_page(page);
        }

        if (PageWriteback(page)) {
                /*
                 * Only in the case of a full synchronous migration is it
                 * necessary to wait for PageWriteback. In the async case,
                 * the retry loop is too short and in the sync-light case,
                 * the overhead of stalling is too much
                 */
                switch (mode) {
                case MIGRATE_SYNC:
                case MIGRATE_SYNC_NO_COPY:
                        break;
                default:
                        rc = -EBUSY;
                        goto out_unlock;
                }
                if (!force)
                        goto out_unlock;
                wait_on_page_writeback(page);
        }

        /*
         * By try_to_migrate(), page->mapcount goes down to 0 here. In this case,
         * we cannot notice that anon_vma is freed while we migrate a page.
         * This get_anon_vma() delays freeing the anon_vma pointer until the end
         * of migration. File cache pages are no problem because of page_lock():
         * file caches may use write_page() or lock_page() in migration, so we
         * only need to care about anon pages here.
         *
         * Only page_get_anon_vma() understands the subtleties of
         * getting a hold on an anon_vma from outside one of its mms.
         * But if we cannot get anon_vma, then we won't need it anyway,
         * because that implies that the anon page is no longer mapped
         * (and cannot be remapped so long as we hold the page lock).
         */
        if (PageAnon(page) && !PageKsm(page))
                anon_vma = page_get_anon_vma(page);

        /*
         * Block others from accessing the new page when we get around to
         * establishing additional references. We are usually the only one
         * holding a reference to newpage at this point. We used to have a BUG
         * here if trylock_page(newpage) fails, but would like to allow for
         * cases where there might be a race with the previous use of newpage.
         * This is much like races on refcount of oldpage: just don't BUG().
         */
        if (unlikely(!trylock_page(newpage)))
                goto out_unlock;

        if (unlikely(!is_lru)) {
                rc = move_to_new_page(newpage, page, mode);
                goto out_unlock_both;
        }

        /*
         * Corner case handling:
         * 1. When a new swap-cache page is read in, it is added to the LRU
         * and treated as swapcache but it has no rmap yet.
         * Calling try_to_unmap() against a page->mapping==NULL page will
         * trigger a BUG. So handle it here.
         * 2. An orphaned page (see truncate_cleanup_page) might have
         * fs-private metadata. The page can be picked up due to memory
         * offlining. Everywhere else except page reclaim, the page is
         * invisible to the vm, so the page can not be migrated. So try to
         * free the metadata, so the page can be freed.
         */
        if (!page->mapping) {
                VM_BUG_ON_PAGE(PageAnon(page), page);
                if (page_has_private(page)) {
                        try_to_free_buffers(page);
                        goto out_unlock_both;
                }
        } else if (page_mapped(page)) {
                /* Establish migration ptes */
                VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
                                page);
                try_to_migrate(page, 0);
                page_was_mapped = true;
        }

        if (!page_mapped(page))
                rc = move_to_new_page(newpage, page, mode);

        if (page_was_mapped)
                remove_migration_ptes(page,
                        rc == MIGRATEPAGE_SUCCESS ? newpage : page, false);

out_unlock_both:
        unlock_page(newpage);
out_unlock:
        /* Drop an anon_vma reference if we took one */
        if (anon_vma)
                put_anon_vma(anon_vma);
        unlock_page(page);
out:
        /*
         * If migration is successful, decrease refcount of the newpage
         * which will not free the page because new page owner increased
         * refcounter. As well, if it is LRU page, add the page to LRU
         * list in here. Use the old state of the isolated source page to
         * determine if we migrated a LRU page. newpage was already unlocked
         * and possibly modified by its owner - don't rely on the page
         * state.
         */
        if (rc == MIGRATEPAGE_SUCCESS) {
                if (unlikely(!is_lru))
                        put_page(newpage);
                else
                        putback_lru_page(newpage);
        }

        return rc;
}


/*
 * node_demotion[] example:
 *
 * Consider a system with two sockets. Each socket has
 * three classes of memory attached: fast, medium and slow.
 * Each memory class is placed in its own NUMA node. The
 * CPUs are placed in the node with the "fast" memory. The
 * 6 NUMA nodes (0-5) might be split among the sockets like
 * this:
 *
 *	Socket A: 0, 1, 2
 *	Socket B: 3, 4, 5
 *
 * When Node 0 fills up, its memory should be migrated to
 * Node 1. When Node 1 fills up, it should be migrated to
 * Node 2. The migration path starts on the nodes with the
 * processors (since allocations default to this node) and
 * fast memory, progresses through medium and ends with the
 * slow memory:
 *
 *	0 -> 1 -> 2 -> stop
 *	3 -> 4 -> 5 -> stop
 *
 * This is represented in the node_demotion[] like this:
 *
 *	{  1, // Node 0 migrates to 1
 *	   2, // Node 1 migrates to 2
 *	  -1, // Node 2 does not migrate
 *	   4, // Node 3 migrates to 4
 *	   5, // Node 4 migrates to 5
 *	  -1} // Node 5 does not migrate
 */

1146/*
1147 * Writes to this array occur without locking. Cycles are
1148 * not allowed: Node X demotes to Y which demotes to X...
1149 *
1150 * If multiple reads are performed, a single rcu_read_lock()
1151 * must be held over all reads to ensure that no cycles are
1152 * observed.
1153 */
1154static int node_demotion[MAX_NUMNODES] __read_mostly =
1155 {[0 ... MAX_NUMNODES - 1] = NUMA_NO_NODE};
1156
1157/**
1158 * next_demotion_node() - Get the next node in the demotion path
1159 * @node: The starting node to lookup the next node
1160 *
Randy Dunlapc9bd7d12021-09-02 15:00:36 -07001161 * Return: node id for next memory node in the demotion path hierarchy
Dave Hansen79c28a42021-09-02 14:59:06 -07001162 * from @node; NUMA_NO_NODE if @node is terminal. This does not keep
1163 * @node online or guarantee that it *continues* to be the next demotion
1164 * target.
1165 */
1166int next_demotion_node(int node)
1167{
1168 int target;
1169
1170 /*
1171 * node_demotion[] is updated without excluding this
1172 * function from running. RCU doesn't provide any
1173 * compiler barriers, so the READ_ONCE() is required
1174 * to avoid compiler reordering or read merging.
1175 *
1176 * Make sure to use RCU over entire code blocks if
1177 * node_demotion[] reads need to be consistent.
1178 */
1179 rcu_read_lock();
1180 target = READ_ONCE(node_demotion[node]);
1181 rcu_read_unlock();
1182
1183 return target;
1184}
1185
Minchan Kim0dabec92011-10-31 17:06:57 -07001186/*
1187 * Obtain the lock on page, remove all ptes and migrate the page
1188 * to the newly allocated page in newpage.
1189 */
Linus Torvalds6ec44762020-07-08 10:48:35 -07001190static int unmap_and_move(new_page_t get_new_page,
Geert Uytterhoevenef2a5152015-04-14 15:44:22 -07001191 free_page_t put_new_page,
1192 unsigned long private, struct page *page,
Naoya Horiguchiadd05ce2015-06-24 16:56:50 -07001193 int force, enum migrate_mode mode,
Yang Shidd4ae782020-12-14 19:13:06 -08001194 enum migrate_reason reason,
1195 struct list_head *ret)
Minchan Kim0dabec92011-10-31 17:06:57 -07001196{
Hugh Dickins2def7422015-11-05 18:49:46 -08001197 int rc = MIGRATEPAGE_SUCCESS;
Yang Shi74d4a572019-11-30 17:57:12 -08001198 struct page *newpage = NULL;
Minchan Kim0dabec92011-10-31 17:06:57 -07001199
Michal Hocko94723aa2018-04-10 16:30:07 -07001200 if (!thp_migration_supported() && PageTransHuge(page))
Yang Shid532e2e2020-12-14 19:13:16 -08001201 return -ENOSYS;
Michal Hocko94723aa2018-04-10 16:30:07 -07001202
Minchan Kim0dabec92011-10-31 17:06:57 -07001203 if (page_count(page) == 1) {
1204 /* page was freed from under us. So we are done. */
Minchan Kimc6c919e2016-07-26 15:23:02 -07001205 ClearPageActive(page);
1206 ClearPageUnevictable(page);
Minchan Kimbda807d2016-07-26 15:23:05 -07001207 if (unlikely(__PageMovable(page))) {
1208 lock_page(page);
1209 if (!PageMovable(page))
andrew.yang0d8a8362022-03-15 16:58:34 +11001210 ClearPageIsolated(page);
Minchan Kimbda807d2016-07-26 15:23:05 -07001211 unlock_page(page);
1212 }
Minchan Kim0dabec92011-10-31 17:06:57 -07001213 goto out;
1214 }
1215
Yang Shi74d4a572019-11-30 17:57:12 -08001216 newpage = get_new_page(page, private);
1217 if (!newpage)
1218 return -ENOMEM;
1219
Hugh Dickins9c620e22013-02-22 16:35:14 -08001220 rc = __unmap_and_move(page, newpage, force, mode);
Minchan Kimc6c919e2016-07-26 15:23:02 -07001221 if (rc == MIGRATEPAGE_SUCCESS)
Vlastimil Babka7cd12b42016-03-15 14:56:18 -07001222 set_page_owner_migrate_reason(newpage, reason);
Rafael Aquinibf6bddf12012-12-11 16:02:42 -08001223
Minchan Kim0dabec92011-10-31 17:06:57 -07001224out:
Christoph Lametere24f0b82006-06-23 02:03:51 -07001225 if (rc != -EAGAIN) {
Minchan Kim0dabec92011-10-31 17:06:57 -07001226 /*
1227 * A page that has been migrated has all references
1228 * removed and will be freed. A page that has not been
Ralph Campbellc23a0c92020-01-30 22:14:41 -08001229 * migrated will have kept its references and be restored.
Minchan Kim0dabec92011-10-31 17:06:57 -07001230 */
1231 list_del(&page->lru);
Christoph Lametere24f0b82006-06-23 02:03:51 -07001232 }
David Rientjes68711a72014-06-04 16:08:25 -07001233
Christoph Lameter95a402c2006-06-23 02:03:53 -07001234 /*
Minchan Kimc6c919e2016-07-26 15:23:02 -07001235	 * If migration is successful, release the reference grabbed during
1236	 * isolation. Otherwise, restore the page to the right list unless
1237 * we want to retry.
Christoph Lameter95a402c2006-06-23 02:03:53 -07001238 */
Minchan Kimc6c919e2016-07-26 15:23:02 -07001239 if (rc == MIGRATEPAGE_SUCCESS) {
Yang Shidd4ae782020-12-14 19:13:06 -08001240 /*
1241		 * Compaction can also migrate non-LRU pages, which are
1242		 * not accounted in NR_ISOLATED_*. They can be recognized
1243		 * as __PageMovable.
1244 */
1245 if (likely(!__PageMovable(page)))
1246 mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
1247 page_is_file_lru(page), -thp_nr_pages(page));
1248
Oscar Salvador79f5f8f2020-10-15 20:07:09 -07001249 if (reason != MR_MEMORY_FAILURE)
Minchan Kimc6c919e2016-07-26 15:23:02 -07001250 /*
Oscar Salvador79f5f8f2020-10-15 20:07:09 -07001251 * We release the page in page_handle_poison.
Minchan Kimc6c919e2016-07-26 15:23:02 -07001252 */
Oscar Salvador79f5f8f2020-10-15 20:07:09 -07001253 put_page(page);
Minchan Kimc6c919e2016-07-26 15:23:02 -07001254 } else {
Yang Shidd4ae782020-12-14 19:13:06 -08001255 if (rc != -EAGAIN)
1256 list_add_tail(&page->lru, ret);
Minchan Kimbda807d2016-07-26 15:23:05 -07001257
Minchan Kimc6c919e2016-07-26 15:23:02 -07001258 if (put_new_page)
1259 put_new_page(newpage, private);
1260 else
1261 put_page(newpage);
1262 }
David Rientjes68711a72014-06-04 16:08:25 -07001263
Christoph Lametere24f0b82006-06-23 02:03:51 -07001264 return rc;
1265}
1266
1267/*
Naoya Horiguchi290408d2010-09-08 10:19:35 +09001268 * Counterpart of unmap_and_move() for hugepage migration.
1269 *
1270 * This function doesn't wait for the completion of hugepage I/O
1271 * because there is no race between I/O and migration for hugepages.
1272 * Note that currently hugepage I/O occurs only in direct I/O
1273 * where no lock is held and PG_writeback is irrelevant,
1274 * and the writeback status of all subpages is counted in the reference
1275 * count of the head page (i.e. if all subpages of a 2MB hugepage are
1276 * under direct I/O, the refcount of the head page is 512 plus a bit more.)
1277 * This means that when we try to migrate a hugepage whose subpages are
1278 * doing direct I/O, some references remain after try_to_unmap() and
1279 * the hugepage migration fails without data corruption.
1280 *
1281 * There is also no race when direct I/O is issued on the page under migration,
1282 * because then pte is replaced with migration swap entry and direct I/O code
1283 * will wait in the page fault for migration to complete.
1284 */
1285static int unmap_and_move_huge_page(new_page_t get_new_page,
David Rientjes68711a72014-06-04 16:08:25 -07001286 free_page_t put_new_page, unsigned long private,
1287 struct page *hpage, int force,
Yang Shidd4ae782020-12-14 19:13:06 -08001288 enum migrate_mode mode, int reason,
1289 struct list_head *ret)
Naoya Horiguchi290408d2010-09-08 10:19:35 +09001290{
Hugh Dickins2def7422015-11-05 18:49:46 -08001291 int rc = -EAGAIN;
Hugh Dickins2ebba6b2014-12-12 16:56:19 -08001292 int page_was_mapped = 0;
Joonsoo Kim32665f22014-01-21 15:51:15 -08001293 struct page *new_hpage;
Naoya Horiguchi290408d2010-09-08 10:19:35 +09001294 struct anon_vma *anon_vma = NULL;
Mike Kravetzc0d03812020-04-01 21:11:05 -07001295 struct address_space *mapping = NULL;
Naoya Horiguchi290408d2010-09-08 10:19:35 +09001296
Naoya Horiguchi83467ef2013-09-11 14:22:11 -07001297 /*
Anshuman Khandual7ed2c312019-03-05 15:43:44 -08001298	 * Migratability of hugepages depends on the architecture and the hugepage size.
Naoya Horiguchi83467ef2013-09-11 14:22:11 -07001299 * This check is necessary because some callers of hugepage migration
1300 * like soft offline and memory hotremove don't walk through page
1301 * tables or check whether the hugepage is pmd-based or not before
1302 * kicking migration.
1303 */
Naoya Horiguchi100873d2014-06-04 16:10:56 -07001304 if (!hugepage_migration_supported(page_hstate(hpage))) {
Yang Shidd4ae782020-12-14 19:13:06 -08001305 list_move_tail(&hpage->lru, ret);
Naoya Horiguchi83467ef2013-09-11 14:22:11 -07001306 return -ENOSYS;
Joonsoo Kim32665f22014-01-21 15:51:15 -08001307 }
Naoya Horiguchi83467ef2013-09-11 14:22:11 -07001308
Muchun Song71a64f62021-02-04 18:32:17 -08001309 if (page_count(hpage) == 1) {
1310 /* page was freed from under us. So we are done. */
1311 putback_active_hugepage(hpage);
1312 return MIGRATEPAGE_SUCCESS;
1313 }
1314
Michal Hocko666feb22018-04-10 16:30:03 -07001315 new_hpage = get_new_page(hpage, private);
Naoya Horiguchi290408d2010-09-08 10:19:35 +09001316 if (!new_hpage)
1317 return -ENOMEM;
1318
Naoya Horiguchi290408d2010-09-08 10:19:35 +09001319 if (!trylock_page(hpage)) {
Jérôme Glisse2916ecc2017-09-08 16:12:06 -07001320 if (!force)
Naoya Horiguchi290408d2010-09-08 10:19:35 +09001321 goto out;
Jérôme Glisse2916ecc2017-09-08 16:12:06 -07001322 switch (mode) {
1323 case MIGRATE_SYNC:
1324 case MIGRATE_SYNC_NO_COPY:
1325 break;
1326 default:
1327 goto out;
1328 }
Naoya Horiguchi290408d2010-09-08 10:19:35 +09001329 lock_page(hpage);
1330 }
1331
Mike Kravetzcb6acd02019-02-28 16:22:02 -08001332 /*
1333 * Check for pages which are in the process of being freed. Without
1334	 * page_mapping() set, the hugetlbfs-specific move page routine will not
1335 * be called and we could leak usage counts for subpools.
1336 */
Muchun Song6acfb5b2021-06-30 18:51:29 -07001337 if (hugetlb_page_subpool(hpage) && !page_mapping(hpage)) {
Mike Kravetzcb6acd02019-02-28 16:22:02 -08001338 rc = -EBUSY;
1339 goto out_unlock;
1340 }
1341
Peter Zijlstra746b18d2011-05-24 17:12:10 -07001342 if (PageAnon(hpage))
1343 anon_vma = page_get_anon_vma(hpage);
Naoya Horiguchi290408d2010-09-08 10:19:35 +09001344
Hugh Dickins7db76712015-11-05 18:49:49 -08001345 if (unlikely(!trylock_page(new_hpage)))
1346 goto put_anon;
1347
Hugh Dickins2ebba6b2014-12-12 16:56:19 -08001348 if (page_mapped(hpage)) {
Mike Kravetz336bf302020-11-13 22:52:16 -08001349 bool mapping_locked = false;
Alistair Popplea98a2f02021-06-30 18:54:16 -07001350 enum ttu_flags ttu = 0;
Mike Kravetzc0d03812020-04-01 21:11:05 -07001351
Mike Kravetz336bf302020-11-13 22:52:16 -08001352 if (!PageAnon(hpage)) {
1353 /*
1354 * In shared mappings, try_to_unmap could potentially
1355 * call huge_pmd_unshare. Because of this, take
1356			 * the semaphore in write mode here and set TTU_RMAP_LOCKED
1357 * to let lower levels know we have taken the lock.
1358 */
1359 mapping = hugetlb_page_mapping_lock_write(hpage);
1360 if (unlikely(!mapping))
1361 goto unlock_put_anon;
1362
1363 mapping_locked = true;
1364 ttu |= TTU_RMAP_LOCKED;
1365 }
1366
Alistair Popplea98a2f02021-06-30 18:54:16 -07001367 try_to_migrate(hpage, ttu);
Hugh Dickins2ebba6b2014-12-12 16:56:19 -08001368 page_was_mapped = 1;
Mike Kravetz336bf302020-11-13 22:52:16 -08001369
1370 if (mapping_locked)
1371 i_mmap_unlock_write(mapping);
Hugh Dickins2ebba6b2014-12-12 16:56:19 -08001372 }
Naoya Horiguchi290408d2010-09-08 10:19:35 +09001373
1374 if (!page_mapped(hpage))
Hugh Dickins5c3f9a62015-11-05 18:49:53 -08001375 rc = move_to_new_page(new_hpage, hpage, mode);
Naoya Horiguchi290408d2010-09-08 10:19:35 +09001376
Mike Kravetz336bf302020-11-13 22:52:16 -08001377 if (page_was_mapped)
Hugh Dickins5c3f9a62015-11-05 18:49:53 -08001378 remove_migration_ptes(hpage,
Mike Kravetz336bf302020-11-13 22:52:16 -08001379 rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage, false);
Naoya Horiguchi290408d2010-09-08 10:19:35 +09001380
Mike Kravetzc0d03812020-04-01 21:11:05 -07001381unlock_put_anon:
Hugh Dickins7db76712015-11-05 18:49:49 -08001382 unlock_page(new_hpage);
1383
1384put_anon:
Hugh Dickinsfd4a4662011-01-13 15:47:31 -08001385 if (anon_vma)
Peter Zijlstra9e601092011-03-22 16:32:46 -07001386 put_anon_vma(anon_vma);
Aneesh Kumar K.V8e6ac7f2012-07-31 16:42:27 -07001387
Hugh Dickins2def7422015-11-05 18:49:46 -08001388 if (rc == MIGRATEPAGE_SUCCESS) {
Michal Hockoab5ac902018-01-31 16:20:48 -08001389 move_hugetlb_state(hpage, new_hpage, reason);
Hugh Dickins2def7422015-11-05 18:49:46 -08001390 put_new_page = NULL;
1391 }
Aneesh Kumar K.V8e6ac7f2012-07-31 16:42:27 -07001392
Mike Kravetzcb6acd02019-02-28 16:22:02 -08001393out_unlock:
Naoya Horiguchi290408d2010-09-08 10:19:35 +09001394 unlock_page(hpage);
Hillf Danton09761332011-12-08 14:34:20 -08001395out:
Yang Shidd4ae782020-12-14 19:13:06 -08001396 if (rc == MIGRATEPAGE_SUCCESS)
Naoya Horiguchib8ec1ce2013-09-11 14:22:01 -07001397 putback_active_hugepage(hpage);
Miaohe Lina04840c2021-05-04 18:37:07 -07001398 else if (rc != -EAGAIN)
Yang Shidd4ae782020-12-14 19:13:06 -08001399 list_move_tail(&hpage->lru, ret);
David Rientjes68711a72014-06-04 16:08:25 -07001400
1401 /*
1402 * If migration was not successful and there's a freeing callback, use
1403 * it. Otherwise, put_page() will drop the reference grabbed during
1404 * isolation.
1405 */
Hugh Dickins2def7422015-11-05 18:49:46 -08001406 if (put_new_page)
David Rientjes68711a72014-06-04 16:08:25 -07001407 put_new_page(new_hpage, private);
1408 else
Naoya Horiguchi3aaa76e2015-09-22 14:59:14 -07001409 putback_active_hugepage(new_hpage);
David Rientjes68711a72014-06-04 16:08:25 -07001410
Naoya Horiguchi290408d2010-09-08 10:19:35 +09001411 return rc;
1412}
1413
Yang Shid532e2e2020-12-14 19:13:16 -08001414static inline int try_split_thp(struct page *page, struct page **page2,
1415 struct list_head *from)
1416{
1417 int rc = 0;
1418
1419 lock_page(page);
1420 rc = split_huge_page_to_list(page, from);
1421 unlock_page(page);
1422 if (!rc)
1423 list_safe_reset_next(page, *page2, lru);
1424
1425 return rc;
1426}
1427
Naoya Horiguchi290408d2010-09-08 10:19:35 +09001428/*
Srivatsa S. Bhatc73e5c92013-04-29 15:08:16 -07001429 * migrate_pages - migrate the pages specified in a list, to the free pages
1430 * supplied as the target for the page migration
Christoph Lameterb20a3502006-03-22 00:09:12 -08001431 *
Srivatsa S. Bhatc73e5c92013-04-29 15:08:16 -07001432 * @from: The list of pages to be migrated.
1433 * @get_new_page: The function used to allocate free pages to be used
1434 * as the target of the page migration.
David Rientjes68711a72014-06-04 16:08:25 -07001435 * @put_new_page: The function used to free target pages if migration
1436 * fails, or NULL if no special handling is necessary.
Srivatsa S. Bhatc73e5c92013-04-29 15:08:16 -07001437 * @private: Private data to be passed on to get_new_page()
1438 * @mode: The migration mode that specifies the constraints for
1439 * page migration, if any.
1440 * @reason: The reason for page migration.
Yang Shi5ac95882021-09-02 14:59:13 -07001441 * @ret_succeeded: Set to the number of pages migrated successfully if
1442 * the caller passes a non-NULL pointer.
Christoph Lameterb20a3502006-03-22 00:09:12 -08001443 *
Srivatsa S. Bhatc73e5c92013-04-29 15:08:16 -07001444 * The function returns after 10 attempts or if no pages are movable any more
1445 * because the list has become empty or no retryable pages remain.
Yang Shidd4ae782020-12-14 19:13:06 -08001446 * It is the caller's responsibility to call putback_movable_pages() to return pages
1447 * to the LRU or free list only if ret != 0.
Christoph Lameterb20a3502006-03-22 00:09:12 -08001448 *
Srivatsa S. Bhatc73e5c92013-04-29 15:08:16 -07001449 * Returns the number of pages that were not migrated, or an error code.
Christoph Lameterb20a3502006-03-22 00:09:12 -08001450 */
Hugh Dickins9c620e22013-02-22 16:35:14 -08001451int migrate_pages(struct list_head *from, new_page_t get_new_page,
David Rientjes68711a72014-06-04 16:08:25 -07001452 free_page_t put_new_page, unsigned long private,
Yang Shi5ac95882021-09-02 14:59:13 -07001453 enum migrate_mode mode, int reason, unsigned int *ret_succeeded)
Christoph Lameterb20a3502006-03-22 00:09:12 -08001454{
Christoph Lametere24f0b82006-06-23 02:03:51 -07001455 int retry = 1;
Anshuman Khandual1a5bae22020-08-11 18:31:51 -07001456 int thp_retry = 1;
Christoph Lameterb20a3502006-03-22 00:09:12 -08001457 int nr_failed = 0;
Mel Gorman5647bc22012-10-19 10:46:20 +01001458 int nr_succeeded = 0;
Anshuman Khandual1a5bae22020-08-11 18:31:51 -07001459 int nr_thp_succeeded = 0;
1460 int nr_thp_failed = 0;
1461 int nr_thp_split = 0;
Christoph Lameterb20a3502006-03-22 00:09:12 -08001462 int pass = 0;
Anshuman Khandual1a5bae22020-08-11 18:31:51 -07001463 bool is_thp = false;
Christoph Lameterb20a3502006-03-22 00:09:12 -08001464 struct page *page;
1465 struct page *page2;
1466 int swapwrite = current->flags & PF_SWAPWRITE;
Anshuman Khandual1a5bae22020-08-11 18:31:51 -07001467 int rc, nr_subpages;
Yang Shidd4ae782020-12-14 19:13:06 -08001468 LIST_HEAD(ret_pages);
Yang Shib0b515b2021-06-30 18:51:48 -07001469 bool nosplit = (reason == MR_NUMA_MISPLACED);
Christoph Lameterb20a3502006-03-22 00:09:12 -08001470
Liam Mark7bc1aec2021-05-04 18:37:25 -07001471 trace_mm_migrate_pages_start(mode, reason);
1472
Christoph Lameterb20a3502006-03-22 00:09:12 -08001473 if (!swapwrite)
1474 current->flags |= PF_SWAPWRITE;
1475
Anshuman Khandual1a5bae22020-08-11 18:31:51 -07001476 for (pass = 0; pass < 10 && (retry || thp_retry); pass++) {
Christoph Lametere24f0b82006-06-23 02:03:51 -07001477 retry = 0;
Anshuman Khandual1a5bae22020-08-11 18:31:51 -07001478 thp_retry = 0;
Christoph Lameterb20a3502006-03-22 00:09:12 -08001479
Christoph Lametere24f0b82006-06-23 02:03:51 -07001480 list_for_each_entry_safe(page, page2, from, lru) {
Michal Hocko94723aa2018-04-10 16:30:07 -07001481retry:
Anshuman Khandual1a5bae22020-08-11 18:31:51 -07001482 /*
1483			 * THP statistics are based on the source huge page.
1484 * Capture required information that might get lost
1485 * during migration.
1486 */
Zi Yan6c5c7b92020-09-25 21:19:14 -07001487 is_thp = PageTransHuge(page) && !PageHuge(page);
Matthew Wilcox (Oracle)6c357842020-08-14 17:30:37 -07001488 nr_subpages = thp_nr_pages(page);
Christoph Lametere24f0b82006-06-23 02:03:51 -07001489 cond_resched();
Christoph Lameterb20a3502006-03-22 00:09:12 -08001490
Naoya Horiguchi31caf662013-09-11 14:21:59 -07001491 if (PageHuge(page))
1492 rc = unmap_and_move_huge_page(get_new_page,
David Rientjes68711a72014-06-04 16:08:25 -07001493 put_new_page, private, page,
Yang Shidd4ae782020-12-14 19:13:06 -08001494 pass > 2, mode, reason,
1495 &ret_pages);
Naoya Horiguchi31caf662013-09-11 14:21:59 -07001496 else
David Rientjes68711a72014-06-04 16:08:25 -07001497 rc = unmap_and_move(get_new_page, put_new_page,
Naoya Horiguchiadd05ce2015-06-24 16:56:50 -07001498 private, page, pass > 2, mode,
Yang Shidd4ae782020-12-14 19:13:06 -08001499 reason, &ret_pages);
1500 /*
1501 * The rules are:
1502			 * Success: a non-hugetlb page will be freed, a hugetlb
1503			 * page will be put back
1504 * -EAGAIN: stay on the from list
1505 * -ENOMEM: stay on the from list
1506 * Other errno: put on ret_pages list then splice to
1507 * from list
1508 */
Christoph Lametere24f0b82006-06-23 02:03:51 -07001509 switch(rc) {
Yang Shid532e2e2020-12-14 19:13:16 -08001510 /*
1511 * THP migration might be unsupported or the
1512 * allocation could've failed so we should
1513 * retry on the same page with the THP split
1514 * to base pages.
1515 *
1516 * Head page is retried immediately and tail
1517 * pages are added to the tail of the list so
1518 * we encounter them after the rest of the list
1519 * is processed.
1520 */
1521 case -ENOSYS:
1522 /* THP migration is unsupported */
1523 if (is_thp) {
1524 if (!try_split_thp(page, &page2, from)) {
1525 nr_thp_split++;
1526 goto retry;
1527 }
1528
1529 nr_thp_failed++;
1530 nr_failed += nr_subpages;
1531 break;
1532 }
1533
1534 /* Hugetlb migration is unsupported */
1535 nr_failed++;
1536 break;
Christoph Lameter95a402c2006-06-23 02:03:53 -07001537 case -ENOMEM:
Michal Hocko94723aa2018-04-10 16:30:07 -07001538 /*
Yang Shid532e2e2020-12-14 19:13:16 -08001539 * When memory is low, don't bother to try to migrate
1540 * other pages, just exit.
Yang Shib0b515b2021-06-30 18:51:48 -07001541 * THP NUMA faulting doesn't split THP to retry.
Michal Hocko94723aa2018-04-10 16:30:07 -07001542 */
Yang Shib0b515b2021-06-30 18:51:48 -07001543 if (is_thp && !nosplit) {
Yang Shid532e2e2020-12-14 19:13:16 -08001544 if (!try_split_thp(page, &page2, from)) {
Anshuman Khandual1a5bae22020-08-11 18:31:51 -07001545 nr_thp_split++;
Michal Hocko94723aa2018-04-10 16:30:07 -07001546 goto retry;
1547 }
Zi Yan6c5c7b92020-09-25 21:19:14 -07001548
Anshuman Khandual1a5bae22020-08-11 18:31:51 -07001549 nr_thp_failed++;
1550 nr_failed += nr_subpages;
1551 goto out;
1552 }
David Rientjesdfef2ef2016-05-20 16:59:05 -07001553 nr_failed++;
Christoph Lameter95a402c2006-06-23 02:03:53 -07001554 goto out;
Christoph Lametere24f0b82006-06-23 02:03:51 -07001555 case -EAGAIN:
Anshuman Khandual1a5bae22020-08-11 18:31:51 -07001556 if (is_thp) {
1557 thp_retry++;
1558 break;
1559 }
Christoph Lameter2d1db3b2006-06-23 02:03:33 -07001560 retry++;
Christoph Lametere24f0b82006-06-23 02:03:51 -07001561 break;
Rafael Aquini78bd5202012-12-11 16:02:31 -08001562 case MIGRATEPAGE_SUCCESS:
Anshuman Khandual1a5bae22020-08-11 18:31:51 -07001563 if (is_thp) {
1564 nr_thp_succeeded++;
1565 nr_succeeded += nr_subpages;
1566 break;
1567 }
Mel Gorman5647bc22012-10-19 10:46:20 +01001568 nr_succeeded++;
Christoph Lametere24f0b82006-06-23 02:03:51 -07001569 break;
1570 default:
Naoya Horiguchi354a3362014-01-21 15:51:14 -08001571 /*
Yang Shid532e2e2020-12-14 19:13:16 -08001572 * Permanent failure (-EBUSY, etc.):
Naoya Horiguchi354a3362014-01-21 15:51:14 -08001573			 * unlike the -EAGAIN case, the failed page is
1574 * removed from migration page list and not
1575 * retried in the next outer loop.
1576 */
Anshuman Khandual1a5bae22020-08-11 18:31:51 -07001577 if (is_thp) {
1578 nr_thp_failed++;
1579 nr_failed += nr_subpages;
1580 break;
1581 }
Christoph Lameter2d1db3b2006-06-23 02:03:33 -07001582 nr_failed++;
Christoph Lametere24f0b82006-06-23 02:03:51 -07001583 break;
Christoph Lameter2d1db3b2006-06-23 02:03:33 -07001584 }
Christoph Lameterb20a3502006-03-22 00:09:12 -08001585 }
1586 }
Anshuman Khandual1a5bae22020-08-11 18:31:51 -07001587 nr_failed += retry + thp_retry;
1588 nr_thp_failed += thp_retry;
Vlastimil Babkaf2f81fb2015-11-05 18:47:03 -08001589 rc = nr_failed;
Christoph Lameter95a402c2006-06-23 02:03:53 -07001590out:
Yang Shidd4ae782020-12-14 19:13:06 -08001591 /*
1592	 * Put the permanently failed pages back on the migration list;
1593	 * they will be moved to the right list by the caller.
1594 */
1595 list_splice(&ret_pages, from);
1596
Anshuman Khandual1a5bae22020-08-11 18:31:51 -07001597 count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
1598 count_vm_events(PGMIGRATE_FAIL, nr_failed);
1599 count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded);
1600 count_vm_events(THP_MIGRATION_FAIL, nr_thp_failed);
1601 count_vm_events(THP_MIGRATION_SPLIT, nr_thp_split);
1602 trace_mm_migrate_pages(nr_succeeded, nr_failed, nr_thp_succeeded,
1603 nr_thp_failed, nr_thp_split, mode, reason);
Mel Gorman7b2a2d42012-10-19 14:07:31 +01001604
Christoph Lameterb20a3502006-03-22 00:09:12 -08001605 if (!swapwrite)
1606 current->flags &= ~PF_SWAPWRITE;
1607
Yang Shi5ac95882021-09-02 14:59:13 -07001608 if (ret_succeeded)
1609 *ret_succeeded = nr_succeeded;
1610
Rafael Aquini78bd5202012-12-11 16:02:31 -08001611 return rc;
Christoph Lameterb20a3502006-03-22 00:09:12 -08001612}
Charan Teja Reddyf47b8522021-02-16 13:59:45 +05301613EXPORT_SYMBOL_GPL(migrate_pages);
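
/*
 * Usage sketch of the calling convention documented above (illustration
 * only): isolate pages onto a private list, migrate them towards @nid
 * with the generic allocation callback, and put back whatever could not
 * be migrated. @pagelist and @nid are assumed to be set up by the caller.
 *
 *	struct migration_target_control mtc = {
 *		.nid = nid,
 *		.gfp_mask = GFP_HIGHUSER_MOVABLE,
 *	};
 *	int nr_failed;
 *
 *	nr_failed = migrate_pages(&pagelist, alloc_migration_target, NULL,
 *				  (unsigned long)&mtc, MIGRATE_SYNC,
 *				  MR_SYSCALL, NULL);
 *	if (nr_failed)
 *		putback_movable_pages(&pagelist);
 */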
Christoph Lameterb20a3502006-03-22 00:09:12 -08001614
Joonsoo Kim19fc7be2020-08-11 18:37:25 -07001615struct page *alloc_migration_target(struct page *page, unsigned long private)
Joonsoo Kimb4b38222020-08-11 18:37:14 -07001616{
Joonsoo Kim19fc7be2020-08-11 18:37:25 -07001617 struct migration_target_control *mtc;
1618 gfp_t gfp_mask;
Joonsoo Kimb4b38222020-08-11 18:37:14 -07001619 unsigned int order = 0;
1620 struct page *new_page = NULL;
Joonsoo Kim19fc7be2020-08-11 18:37:25 -07001621 int nid;
1622 int zidx;
1623
1624 mtc = (struct migration_target_control *)private;
1625 gfp_mask = mtc->gfp_mask;
1626 nid = mtc->nid;
1627 if (nid == NUMA_NO_NODE)
1628 nid = page_to_nid(page);
Joonsoo Kimb4b38222020-08-11 18:37:14 -07001629
Joonsoo Kimd92bbc22020-08-11 18:37:17 -07001630 if (PageHuge(page)) {
1631 struct hstate *h = page_hstate(compound_head(page));
1632
Joonsoo Kim19fc7be2020-08-11 18:37:25 -07001633 gfp_mask = htlb_modify_alloc_mask(h, gfp_mask);
1634 return alloc_huge_page_nodemask(h, nid, mtc->nmask, gfp_mask);
Joonsoo Kimd92bbc22020-08-11 18:37:17 -07001635 }
Joonsoo Kimb4b38222020-08-11 18:37:14 -07001636
1637 if (PageTransHuge(page)) {
Joonsoo Kim9933a0c2020-08-11 18:37:20 -07001638 /*
1639 * clear __GFP_RECLAIM to make the migration callback
1640 * consistent with regular THP allocations.
1641 */
1642 gfp_mask &= ~__GFP_RECLAIM;
Joonsoo Kimb4b38222020-08-11 18:37:14 -07001643 gfp_mask |= GFP_TRANSHUGE;
1644 order = HPAGE_PMD_ORDER;
1645 }
Joonsoo Kim19fc7be2020-08-11 18:37:25 -07001646 zidx = zone_idx(page_zone(page));
1647 if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE)
Joonsoo Kimb4b38222020-08-11 18:37:14 -07001648 gfp_mask |= __GFP_HIGHMEM;
1649
Matthew Wilcox (Oracle)84172f42021-04-29 23:01:15 -07001650 new_page = __alloc_pages(gfp_mask, order, nid, mtc->nmask);
Joonsoo Kimb4b38222020-08-11 18:37:14 -07001651
1652 if (new_page && PageTransHuge(new_page))
1653 prep_transhuge_page(new_page);
1654
1655 return new_page;
1656}
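
/*
 * Sketch of a target-control setup for the allocator above (illustration
 * only; allowed_mask is assumed to be prepared by the caller). A nid of
 * NUMA_NO_NODE means "allocate on the source page's node", as handled at
 * the top of alloc_migration_target().
 *
 *	static nodemask_t allowed_mask;
 *
 *	struct migration_target_control mtc = {
 *		.nid = NUMA_NO_NODE,
 *		.nmask = &allowed_mask,
 *		.gfp_mask = GFP_HIGHUSER_MOVABLE,
 *	};
 */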
1657
Christoph Lameter742755a2006-06-23 02:03:55 -07001658#ifdef CONFIG_NUMA
Christoph Lameter742755a2006-06-23 02:03:55 -07001659
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001660static int store_status(int __user *status, int start, int value, int nr)
Christoph Lameter742755a2006-06-23 02:03:55 -07001661{
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001662 while (nr-- > 0) {
1663 if (put_user(value, status + start))
1664 return -EFAULT;
1665 start++;
1666 }
Christoph Lameter742755a2006-06-23 02:03:55 -07001667
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001668 return 0;
1669}
Christoph Lameter742755a2006-06-23 02:03:55 -07001670
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001671static int do_move_pages_to_node(struct mm_struct *mm,
1672 struct list_head *pagelist, int node)
1673{
1674 int err;
Joonsoo Kima0976312020-08-11 18:37:28 -07001675 struct migration_target_control mtc = {
1676 .nid = node,
1677 .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
1678 };
Christoph Lameter742755a2006-06-23 02:03:55 -07001679
Joonsoo Kima0976312020-08-11 18:37:28 -07001680 err = migrate_pages(pagelist, alloc_migration_target, NULL,
Yang Shi5ac95882021-09-02 14:59:13 -07001681 (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001682 if (err)
1683 putback_movable_pages(pagelist);
1684 return err;
Christoph Lameter742755a2006-06-23 02:03:55 -07001685}
1686
1687/*
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001688 * Resolves the given address to a struct page, isolates it from the LRU and
1689 * puts it on the given pagelist.
Yang Shie0153fc2020-01-04 12:59:46 -08001690 * Returns:
1691 * errno - if the page cannot be found/isolated
1692 * 0 - when it doesn't have to be migrated because it is already on the
1693 * target node
1694 * 1 - when it has been queued
Christoph Lameter742755a2006-06-23 02:03:55 -07001695 */
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001696static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
1697 int node, struct list_head *pagelist, bool migrate_all)
Christoph Lameter742755a2006-06-23 02:03:55 -07001698{
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001699 struct vm_area_struct *vma;
1700 struct page *page;
1701 unsigned int follflags;
Christoph Lameter742755a2006-06-23 02:03:55 -07001702 int err;
Christoph Lameter742755a2006-06-23 02:03:55 -07001703
Michel Lespinassed8ed45c2020-06-08 21:33:25 -07001704 mmap_read_lock(mm);
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001705 err = -EFAULT;
1706 vma = find_vma(mm, addr);
1707 if (!vma || addr < vma->vm_start || !vma_migratable(vma))
1708 goto out;
Christoph Lameter742755a2006-06-23 02:03:55 -07001709
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001710 /* FOLL_DUMP to ignore special (like zero) pages */
1711 follflags = FOLL_GET | FOLL_DUMP;
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001712 page = follow_page(vma, addr, follflags);
Christoph Lameter742755a2006-06-23 02:03:55 -07001713
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001714 err = PTR_ERR(page);
1715 if (IS_ERR(page))
1716 goto out;
Christoph Lameter742755a2006-06-23 02:03:55 -07001717
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001718 err = -ENOENT;
1719 if (!page)
1720 goto out;
Christoph Lameter742755a2006-06-23 02:03:55 -07001721
Brice Gogline78bbfa2008-10-18 20:27:15 -07001722 err = 0;
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001723 if (page_to_nid(page) == node)
1724 goto out_putpage;
Christoph Lameter742755a2006-06-23 02:03:55 -07001725
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001726 err = -EACCES;
1727 if (page_mapcount(page) > 1 && !migrate_all)
1728 goto out_putpage;
1729
1730 if (PageHuge(page)) {
1731 if (PageHead(page)) {
Miaohe Lin072e7412022-05-30 19:30:15 +08001732 err = isolate_hugetlb(page, pagelist);
1733 if (!err)
1734 err = 1;
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001735 }
1736 } else {
1737 struct page *head;
1738
1739 head = compound_head(page);
1740 err = isolate_lru_page(head);
1741 if (err)
1742 goto out_putpage;
1743
Yang Shie0153fc2020-01-04 12:59:46 -08001744 err = 1;
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001745 list_add_tail(&head->lru, pagelist);
1746 mod_node_page_state(page_pgdat(head),
Huang Ying9de4f222020-04-06 20:04:41 -07001747 NR_ISOLATED_ANON + page_is_file_lru(head),
Matthew Wilcox (Oracle)6c357842020-08-14 17:30:37 -07001748 thp_nr_pages(head));
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001749 }
1750out_putpage:
1751 /*
1752 * Either remove the duplicate refcount from
1753 * isolate_lru_page() or drop the page ref if it was
1754 * not isolated.
1755 */
1756 put_page(page);
1757out:
Michel Lespinassed8ed45c2020-06-08 21:33:25 -07001758 mmap_read_unlock(mm);
Christoph Lameter742755a2006-06-23 02:03:55 -07001759 return err;
1760}
1761
Wei Yang7ca87832020-04-06 20:04:12 -07001762static int move_pages_and_store_status(struct mm_struct *mm, int node,
1763 struct list_head *pagelist, int __user *status,
1764 int start, int i, unsigned long nr_pages)
1765{
1766 int err;
1767
Wei Yang5d7ae892020-04-06 20:04:15 -07001768 if (list_empty(pagelist))
1769 return 0;
1770
Wei Yang7ca87832020-04-06 20:04:12 -07001771 err = do_move_pages_to_node(mm, pagelist, node);
1772 if (err) {
1773 /*
1774		 * A positive err means the number of pages that
1775		 * failed to migrate. Since we are going to
1776		 * abort and return the number of non-migrated
Long Liab9dd4f2020-12-14 19:12:52 -08001777		 * pages, we need to include the rest of the
Wei Yang7ca87832020-04-06 20:04:12 -07001778		 * nr_pages that have not been attempted as
1779		 * well.
1780 */
1781 if (err > 0)
1782 err += nr_pages - i - 1;
1783 return err;
1784 }
1785 return store_status(status, start, node, i - start);
1786}
1787
Christoph Lameter742755a2006-06-23 02:03:55 -07001788/*
Brice Goglin5e9a0f02008-10-18 20:27:17 -07001789 * Migrate an array of page addresses onto an array of nodes and fill
1790 * the corresponding array of status.
1791 */
Christoph Lameter3268c632012-03-21 16:34:06 -07001792static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
Brice Goglin5e9a0f02008-10-18 20:27:17 -07001793 unsigned long nr_pages,
1794 const void __user * __user *pages,
1795 const int __user *nodes,
1796 int __user *status, int flags)
1797{
Gregory Price556b68d2023-10-03 10:48:56 -04001798 compat_uptr_t __user *compat_pages = (void __user *)pages;
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001799 int current_node = NUMA_NO_NODE;
1800 LIST_HEAD(pagelist);
1801 int start, i;
1802 int err = 0, err1;
Brice Goglin35282a22009-06-16 15:32:43 -07001803
Minchan Kim361a2a22021-05-04 18:36:57 -07001804 lru_cache_disable();
Brice Goglin35282a22009-06-16 15:32:43 -07001805
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001806 for (i = start = 0; i < nr_pages; i++) {
1807 const void __user *p;
1808 unsigned long addr;
1809 int node;
Brice Goglin5e9a0f02008-10-18 20:27:17 -07001810
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001811 err = -EFAULT;
Gregory Price556b68d2023-10-03 10:48:56 -04001812 if (in_compat_syscall()) {
1813 compat_uptr_t cp;
1814
1815 if (get_user(cp, compat_pages + i))
1816 goto out_flush;
1817
1818 p = compat_ptr(cp);
1819 } else {
1820 if (get_user(p, pages + i))
1821 goto out_flush;
1822 }
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001823 if (get_user(node, nodes + i))
1824 goto out_flush;
Andrey Konovalov057d33892019-09-25 16:48:30 -07001825 addr = (unsigned long)untagged_addr(p);
Brice Goglin5e9a0f02008-10-18 20:27:17 -07001826
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001827 err = -ENODEV;
1828 if (node < 0 || node >= MAX_NUMNODES)
1829 goto out_flush;
1830 if (!node_state(node, N_MEMORY))
1831 goto out_flush;
Brice Goglin3140a222009-01-06 14:38:57 -08001832
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001833 err = -EACCES;
1834 if (!node_isset(node, task_nodes))
1835 goto out_flush;
Brice Goglin5e9a0f02008-10-18 20:27:17 -07001836
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001837 if (current_node == NUMA_NO_NODE) {
1838 current_node = node;
1839 start = i;
1840 } else if (node != current_node) {
Wei Yang7ca87832020-04-06 20:04:12 -07001841 err = move_pages_and_store_status(mm, current_node,
1842 &pagelist, status, start, i, nr_pages);
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001843 if (err)
1844 goto out;
1845 start = i;
1846 current_node = node;
Brice Goglin3140a222009-01-06 14:38:57 -08001847 }
Brice Goglin5e9a0f02008-10-18 20:27:17 -07001848
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001849 /*
1850 * Errors in the page lookup or isolation are not fatal and we simply
1851 * report them via status
1852 */
1853 err = add_page_for_migration(mm, addr, current_node,
1854 &pagelist, flags & MPOL_MF_MOVE_ALL);
Yang Shie0153fc2020-01-04 12:59:46 -08001855
Wei Yangd08221a2020-04-06 20:04:18 -07001856 if (err > 0) {
Yang Shie0153fc2020-01-04 12:59:46 -08001857 /* The page is successfully queued for migration */
1858 continue;
1859 }
Brice Goglin3140a222009-01-06 14:38:57 -08001860
Wei Yangd08221a2020-04-06 20:04:18 -07001861 /*
1862 * If the page is already on the target node (!err), store the
1863 * node, otherwise, store the err.
1864 */
1865 err = store_status(status, i, err ? : current_node, 1);
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001866 if (err)
1867 goto out_flush;
Brice Goglin3140a222009-01-06 14:38:57 -08001868
Wei Yang7ca87832020-04-06 20:04:12 -07001869 err = move_pages_and_store_status(mm, current_node, &pagelist,
1870 status, start, i, nr_pages);
Wei Yang4afdace2020-04-06 20:04:09 -07001871 if (err)
1872 goto out;
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001873 current_node = NUMA_NO_NODE;
Brice Goglin3140a222009-01-06 14:38:57 -08001874 }
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001875out_flush:
1876 /* Make sure we do not overwrite the existing error */
Wei Yang7ca87832020-04-06 20:04:12 -07001877 err1 = move_pages_and_store_status(mm, current_node, &pagelist,
1878 status, start, i, nr_pages);
Wei Yangdfe9aa22020-01-30 22:11:14 -08001879 if (err >= 0)
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001880 err = err1;
Brice Goglin5e9a0f02008-10-18 20:27:17 -07001881out:
Minchan Kim361a2a22021-05-04 18:36:57 -07001882 lru_cache_enable();
Brice Goglin5e9a0f02008-10-18 20:27:17 -07001883 return err;
1884}
1885
1886/*
Brice Goglin2f007e72008-10-18 20:27:16 -07001887 * Determine the nodes of an array of pages and store them in an array of status.
Christoph Lameter742755a2006-06-23 02:03:55 -07001888 */
Brice Goglin80bba122008-12-09 13:14:23 -08001889static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
1890 const void __user **pages, int *status)
Christoph Lameter742755a2006-06-23 02:03:55 -07001891{
Brice Goglin2f007e72008-10-18 20:27:16 -07001892 unsigned long i;
Brice Goglin2f007e72008-10-18 20:27:16 -07001893
Michel Lespinassed8ed45c2020-06-08 21:33:25 -07001894 mmap_read_lock(mm);
Christoph Lameter742755a2006-06-23 02:03:55 -07001895
Brice Goglin2f007e72008-10-18 20:27:16 -07001896 for (i = 0; i < nr_pages; i++) {
Brice Goglin80bba122008-12-09 13:14:23 -08001897 unsigned long addr = (unsigned long)(*pages);
Christoph Lameter742755a2006-06-23 02:03:55 -07001898 struct vm_area_struct *vma;
1899 struct page *page;
KOSAKI Motohiroc095adb2008-12-16 16:06:43 +09001900 int err = -EFAULT;
Brice Goglin2f007e72008-10-18 20:27:16 -07001901
Liam Howlett059b8b42021-06-28 19:39:44 -07001902 vma = vma_lookup(mm, addr);
1903 if (!vma)
Christoph Lameter742755a2006-06-23 02:03:55 -07001904 goto set_status;
1905
Kirill A. Shutemovd8998442015-09-04 15:47:53 -07001906 /* FOLL_DUMP to ignore special (like zero) pages */
1907 page = follow_page(vma, addr, FOLL_DUMP);
Linus Torvalds89f5b7d2008-06-20 11:18:25 -07001908
1909 err = PTR_ERR(page);
1910 if (IS_ERR(page))
1911 goto set_status;
1912
Kirill A. Shutemovd8998442015-09-04 15:47:53 -07001913 err = page ? page_to_nid(page) : -ENOENT;
Christoph Lameter742755a2006-06-23 02:03:55 -07001914set_status:
Brice Goglin80bba122008-12-09 13:14:23 -08001915 *status = err;
1916
1917 pages++;
1918 status++;
1919 }
1920
Michel Lespinassed8ed45c2020-06-08 21:33:25 -07001921 mmap_read_unlock(mm);
Brice Goglin80bba122008-12-09 13:14:23 -08001922}
1923
Arnd Bergmann5b1b5612021-09-08 15:18:17 -07001924static int get_compat_pages_array(const void __user *chunk_pages[],
1925 const void __user * __user *pages,
1926 unsigned long chunk_nr)
1927{
1928 compat_uptr_t __user *pages32 = (compat_uptr_t __user *)pages;
1929 compat_uptr_t p;
1930 int i;
1931
1932 for (i = 0; i < chunk_nr; i++) {
1933 if (get_user(p, pages32 + i))
1934 return -EFAULT;
1935 chunk_pages[i] = compat_ptr(p);
1936 }
1937
1938 return 0;
1939}
1940
Brice Goglin80bba122008-12-09 13:14:23 -08001941/*
1942 * Determine the nodes of a user array of pages and store them in
1943 * a user array of status.
1944 */
1945static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
1946 const void __user * __user *pages,
1947 int __user *status)
1948{
1949#define DO_PAGES_STAT_CHUNK_NR 16
1950 const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
1951 int chunk_status[DO_PAGES_STAT_CHUNK_NR];
Brice Goglin80bba122008-12-09 13:14:23 -08001952
H. Peter Anvin87b8d1a2010-02-18 16:13:40 -08001953 while (nr_pages) {
1954 unsigned long chunk_nr;
Brice Goglin80bba122008-12-09 13:14:23 -08001955
H. Peter Anvin87b8d1a2010-02-18 16:13:40 -08001956 chunk_nr = nr_pages;
1957 if (chunk_nr > DO_PAGES_STAT_CHUNK_NR)
1958 chunk_nr = DO_PAGES_STAT_CHUNK_NR;
1959
Arnd Bergmann5b1b5612021-09-08 15:18:17 -07001960 if (in_compat_syscall()) {
1961 if (get_compat_pages_array(chunk_pages, pages,
1962 chunk_nr))
1963 break;
1964 } else {
1965 if (copy_from_user(chunk_pages, pages,
1966 chunk_nr * sizeof(*chunk_pages)))
1967 break;
1968 }
Brice Goglin80bba122008-12-09 13:14:23 -08001969
1970 do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
1971
H. Peter Anvin87b8d1a2010-02-18 16:13:40 -08001972 if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
1973 break;
Christoph Lameter742755a2006-06-23 02:03:55 -07001974
H. Peter Anvin87b8d1a2010-02-18 16:13:40 -08001975 pages += chunk_nr;
1976 status += chunk_nr;
1977 nr_pages -= chunk_nr;
1978 }
1979 return nr_pages ? -EFAULT : 0;
Christoph Lameter742755a2006-06-23 02:03:55 -07001980}
1981
Miaohe Lin4dc200c2020-10-17 16:14:03 -07001982static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes)
1983{
1984 struct task_struct *task;
1985 struct mm_struct *mm;
1986
1987 /*
1988	 * There is no need to check if the current process has the right to modify
1989	 * the specified process when they are the same.
1990 */
1991 if (!pid) {
1992 mmget(current->mm);
1993 *mem_nodes = cpuset_mems_allowed(current);
1994 return current->mm;
1995 }
1996
1997 /* Find the mm_struct */
1998 rcu_read_lock();
1999 task = find_task_by_vpid(pid);
2000 if (!task) {
2001 rcu_read_unlock();
2002 return ERR_PTR(-ESRCH);
2003 }
2004 get_task_struct(task);
2005
2006 /*
2007 * Check if this process has the right to modify the specified
2008 * process. Use the regular "ptrace_may_access()" checks.
2009 */
2010 if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
2011 rcu_read_unlock();
2012 mm = ERR_PTR(-EPERM);
2013 goto out;
2014 }
2015 rcu_read_unlock();
2016
2017 mm = ERR_PTR(security_task_movememory(task));
2018 if (IS_ERR(mm))
2019 goto out;
2020 *mem_nodes = cpuset_mems_allowed(task);
2021 mm = get_task_mm(task);
2022out:
2023 put_task_struct(task);
2024 if (!mm)
2025 mm = ERR_PTR(-EINVAL);
2026 return mm;
2027}
2028
Christoph Lameter742755a2006-06-23 02:03:55 -07002029/*
2030 * Move a list of pages in the address space of the currently executing
2031 * process.
2032 */
Dominik Brodowski7addf442018-03-17 16:08:03 +01002033static int kernel_move_pages(pid_t pid, unsigned long nr_pages,
2034 const void __user * __user *pages,
2035 const int __user *nodes,
2036 int __user *status, int flags)
Christoph Lameter742755a2006-06-23 02:03:55 -07002037{
Christoph Lameter742755a2006-06-23 02:03:55 -07002038 struct mm_struct *mm;
Brice Goglin5e9a0f02008-10-18 20:27:17 -07002039 int err;
Christoph Lameter3268c632012-03-21 16:34:06 -07002040 nodemask_t task_nodes;
Christoph Lameter742755a2006-06-23 02:03:55 -07002041
2042 /* Check flags */
2043 if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
2044 return -EINVAL;
2045
2046 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
2047 return -EPERM;
2048
Miaohe Lin4dc200c2020-10-17 16:14:03 -07002049 mm = find_mm_struct(pid, &task_nodes);
2050 if (IS_ERR(mm))
2051 return PTR_ERR(mm);
Sasha Levin6e8b09e2012-04-25 16:01:53 -07002052
2053 if (nodes)
2054 err = do_pages_move(mm, task_nodes, nr_pages, pages,
2055 nodes, status, flags);
2056 else
2057 err = do_pages_stat(mm, nr_pages, pages, status);
Christoph Lameter3268c632012-03-21 16:34:06 -07002058
2059 mmput(mm);
2060 return err;
Christoph Lameter742755a2006-06-23 02:03:55 -07002061}
Christoph Lameter742755a2006-06-23 02:03:55 -07002062
Dominik Brodowski7addf442018-03-17 16:08:03 +01002063SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
2064 const void __user * __user *, pages,
2065 const int __user *, nodes,
2066 int __user *, status, int, flags)
2067{
2068 return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags);
2069}
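
/*
 * Userspace-side sketch of the syscall above (illustration only, error
 * handling omitted): via libnuma's <numaif.h> wrapper, move one page of
 * the calling process to node 1 and read back its per-page status.
 * Passing a NULL nodes array instead only queries current placement.
 *
 *	void *pages[1] = { addr };	/* addr: some mapped address */
 *	int nodes[1] = { 1 };
 *	int status[1];
 *
 *	if (move_pages(0, 1, pages, nodes, status, MPOL_MF_MOVE) == 0)
 *		printf("status[0] = %d\n", status[0]);
 */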
2070
Peter Zijlstra7039e1d2012-10-25 14:16:34 +02002071#ifdef CONFIG_NUMA_BALANCING
2072/*
2073 * Returns true if this is a safe migration target node for misplaced NUMA
2074 * pages. Currently it only checks the watermarks, which is crude.
2075 */
2076static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
Mel Gorman3abef4e2013-02-22 16:34:27 -08002077 unsigned long nr_migrate_pages)
Peter Zijlstra7039e1d2012-10-25 14:16:34 +02002078{
2079 int z;
Mel Gorman599d0c92016-07-28 15:45:31 -07002080
Peter Zijlstra7039e1d2012-10-25 14:16:34 +02002081 for (z = pgdat->nr_zones - 1; z >= 0; z--) {
2082 struct zone *zone = pgdat->node_zones + z;
2083
2084 if (!populated_zone(zone))
2085 continue;
2086
Peter Zijlstra7039e1d2012-10-25 14:16:34 +02002087 /* Avoid waking kswapd by allocating pages_to_migrate pages. */
2088 if (!zone_watermark_ok(zone, 0,
2089 high_wmark_pages(zone) +
2090 nr_migrate_pages,
Huang Yingbfe9d002019-11-30 17:57:28 -08002091 ZONE_MOVABLE, 0))
Peter Zijlstra7039e1d2012-10-25 14:16:34 +02002092 continue;
2093 return true;
2094 }
2095 return false;
2096}
2097
2098static struct page *alloc_misplaced_dst_page(struct page *page,
Michal Hocko666feb22018-04-10 16:30:03 -07002099 unsigned long data)
Peter Zijlstra7039e1d2012-10-25 14:16:34 +02002100{
2101 int nid = (int) data;
2102 struct page *newpage;
2103
Vlastimil Babka96db8002015-09-08 15:03:50 -07002104 newpage = __alloc_pages_node(nid,
Johannes Weinere97ca8e52014-03-10 15:49:43 -07002105 (GFP_HIGHUSER_MOVABLE |
2106 __GFP_THISNODE | __GFP_NOMEMALLOC |
2107 __GFP_NORETRY | __GFP_NOWARN) &
Mel Gorman8479eba2016-02-26 15:19:31 -08002108 ~__GFP_RECLAIM, 0);
Hillf Dantonbac03822012-11-27 14:46:24 +00002109
Peter Zijlstra7039e1d2012-10-25 14:16:34 +02002110 return newpage;
2111}
2112
Yang Shic5b5a3d2021-06-30 18:51:42 -07002113static struct page *alloc_misplaced_dst_page_thp(struct page *page,
2114 unsigned long data)
2115{
2116 int nid = (int) data;
2117 struct page *newpage;
2118
2119 newpage = alloc_pages_node(nid, (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
2120 HPAGE_PMD_ORDER);
2121 if (!newpage)
2122 goto out;
2123
2124 prep_transhuge_page(newpage);
2125
2126out:
2127 return newpage;
2128}
2129
Mel Gorman1c30e012014-01-21 15:50:58 -08002130static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
Mel Gormanb32967f2012-11-19 12:35:47 +00002131{
Hugh Dickins340ef392013-02-22 16:34:33 -08002132 int page_lru;
Baolin Wang2b9b6242021-09-08 15:18:01 -07002133 int nr_pages = thp_nr_pages(page);
Mel Gormanb32967f2012-11-19 12:35:47 +00002134
Sasha Levin309381fea2014-01-23 15:52:54 -08002135 VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
Mel Gorman3abef4e2013-02-22 16:34:27 -08002136
Yang Shi662aeea2021-06-30 18:51:51 -07002137 /* Do not migrate THP mapped by multiple processes */
2138 if (PageTransHuge(page) && total_mapcount(page) > 1)
2139 return 0;
2140
Mel Gormanb32967f2012-11-19 12:35:47 +00002141 /* Avoid migrating to a node that is nearly full */
Baolin Wang2b9b6242021-09-08 15:18:01 -07002142 if (!migrate_balanced_pgdat(pgdat, nr_pages))
Hugh Dickins340ef392013-02-22 16:34:33 -08002143 return 0;
Mel Gormanb32967f2012-11-19 12:35:47 +00002144
Hugh Dickins340ef392013-02-22 16:34:33 -08002145 if (isolate_lru_page(page))
2146 return 0;
Mel Gormanb32967f2012-11-19 12:35:47 +00002147
Huang Ying9de4f222020-04-06 20:04:41 -07002148 page_lru = page_is_file_lru(page);
Mel Gorman599d0c92016-07-28 15:45:31 -07002149 mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru,
Baolin Wang2b9b6242021-09-08 15:18:01 -07002150 nr_pages);
Hugh Dickins340ef392013-02-22 16:34:33 -08002151
2152 /*
2153 * Isolating the page has taken another reference, so the
2154 * caller's reference can be safely dropped without the page
2155 * disappearing underneath us during migration.
Mel Gormanb32967f2012-11-19 12:35:47 +00002156 */
2157 put_page(page);
Hugh Dickins340ef392013-02-22 16:34:33 -08002158 return 1;
Mel Gormanb32967f2012-11-19 12:35:47 +00002159}
2160
Mel Gormana8f60772012-11-14 21:41:46 +00002161/*
Peter Zijlstra7039e1d2012-10-25 14:16:34 +02002162 * Attempt to migrate a misplaced page to the specified destination
2163 * node. The caller is expected to hold an elevated reference count on
2164 * the page, which will be dropped by this function before returning.
2165 */
Mel Gorman1bc115d2013-10-07 11:29:05 +01002166int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
2167 int node)
Peter Zijlstra7039e1d2012-10-25 14:16:34 +02002168{
Mel Gormana8f60772012-11-14 21:41:46 +00002169 pg_data_t *pgdat = NODE_DATA(node);
Hugh Dickins340ef392013-02-22 16:34:33 -08002170 int isolated;
Mel Gormanb32967f2012-11-19 12:35:47 +00002171 int nr_remaining;
Peter Zijlstra7039e1d2012-10-25 14:16:34 +02002172 LIST_HEAD(migratepages);
Yang Shic5b5a3d2021-06-30 18:51:42 -07002173 new_page_t *new;
2174 bool compound;
Aneesh Kumar K.Vb5916c02021-07-29 14:53:47 -07002175 int nr_pages = thp_nr_pages(page);
Yang Shic5b5a3d2021-06-30 18:51:42 -07002176
2177 /*
2178	 * A PTE-mapped THP or HugeTLB page can't reach here, so the page is
2179	 * either a base page or a THP, and it must be a head page if it is
2180	 * a THP.
2181 */
2182 compound = PageTransHuge(page);
2183
2184 if (compound)
2185 new = alloc_misplaced_dst_page_thp;
2186 else
2187 new = alloc_misplaced_dst_page;
Peter Zijlstra7039e1d2012-10-25 14:16:34 +02002188
2189 /*
Mel Gorman1bc115d2013-10-07 11:29:05 +01002190 * Don't migrate file pages that are mapped in multiple processes
2191 * with execute permissions as they are probably shared libraries.
Peter Zijlstra7039e1d2012-10-25 14:16:34 +02002192 */
Miaohe Lin7ee820e2021-05-04 18:37:16 -07002193 if (page_mapcount(page) != 1 && page_is_file_lru(page) &&
2194 (vma->vm_flags & VM_EXEC))
Peter Zijlstra7039e1d2012-10-25 14:16:34 +02002195 goto out;
Peter Zijlstra7039e1d2012-10-25 14:16:34 +02002196
Mel Gormana8f60772012-11-14 21:41:46 +00002197 /*
Mel Gorman09a913a2018-04-10 16:29:20 -07002198 * Also do not migrate dirty pages as not all filesystems can move
2199	 * dirty pages in MIGRATE_ASYNC mode, which is a waste of cycles.
2200 */
Huang Ying9de4f222020-04-06 20:04:41 -07002201 if (page_is_file_lru(page) && PageDirty(page))
Mel Gorman09a913a2018-04-10 16:29:20 -07002202 goto out;
2203
Mel Gormanb32967f2012-11-19 12:35:47 +00002204 isolated = numamigrate_isolate_page(pgdat, page);
2205 if (!isolated)
2206 goto out;
Peter Zijlstra7039e1d2012-10-25 14:16:34 +02002207
Mel Gormanb32967f2012-11-19 12:35:47 +00002208 list_add(&page->lru, &migratepages);
Yang Shic5b5a3d2021-06-30 18:51:42 -07002209 nr_remaining = migrate_pages(&migratepages, *new, NULL, node,
Yang Shi5ac95882021-09-02 14:59:13 -07002210 MIGRATE_ASYNC, MR_NUMA_MISPLACED, NULL);
Mel Gormanb32967f2012-11-19 12:35:47 +00002211 if (nr_remaining) {
Joonsoo Kim59c82b72014-01-21 15:51:17 -08002212 if (!list_empty(&migratepages)) {
2213 list_del(&page->lru);
Yang Shic5fc5c32021-06-30 18:51:45 -07002214 mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
2215 page_is_file_lru(page), -nr_pages);
Joonsoo Kim59c82b72014-01-21 15:51:17 -08002216 putback_lru_page(page);
2217 }
Mel Gormanb32967f2012-11-19 12:35:47 +00002218 isolated = 0;
2219 } else
Yang Shic5fc5c32021-06-30 18:51:45 -07002220 count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_pages);
Peter Zijlstra7039e1d2012-10-25 14:16:34 +02002221 BUG_ON(!list_empty(&migratepages));
Peter Zijlstra7039e1d2012-10-25 14:16:34 +02002222 return isolated;
Hugh Dickins340ef392013-02-22 16:34:33 -08002223
2224out:
2225 put_page(page);
2226 return 0;
Peter Zijlstra7039e1d2012-10-25 14:16:34 +02002227}
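
/*
 * Sketch of how the NUMA hinting fault path is expected to use this
 * (illustration only; the real callers live in mm/memory.c and
 * mm/huge_memory.c):
 *
 *	if (target_nid != NUMA_NO_NODE &&
 *	    migrate_misplaced_page(page, vma, target_nid))
 *		page_nid = target_nid;
 */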
Mel Gorman220018d2012-12-05 09:32:56 +00002228#endif /* CONFIG_NUMA_BALANCING */
Peter Zijlstra7039e1d2012-10-25 14:16:34 +02002229#endif /* CONFIG_NUMA */
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002230
Christoph Hellwig9b2ed9c2019-08-14 09:59:28 +02002231#ifdef CONFIG_DEVICE_PRIVATE
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002232static int migrate_vma_collect_skip(unsigned long start,
2233 unsigned long end,
2234 struct mm_walk *walk)
2235{
2236 struct migrate_vma *migrate = walk->private;
2237 unsigned long addr;
2238
Ralph Campbell872ea702020-01-30 22:14:38 -08002239 for (addr = start; addr < end; addr += PAGE_SIZE) {
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002240 migrate->dst[migrate->npages] = 0;
2241 migrate->src[migrate->npages++] = 0;
2242 }
2243
2244 return 0;
2245}
2246
Miaohe Lin843e1be2021-05-04 18:37:13 -07002247static int migrate_vma_collect_hole(unsigned long start,
2248 unsigned long end,
2249 __always_unused int depth,
2250 struct mm_walk *walk)
2251{
2252 struct migrate_vma *migrate = walk->private;
2253 unsigned long addr;
2254
2255 /* Only allow populating anonymous memory. */
2256 if (!vma_is_anonymous(walk->vma))
2257 return migrate_vma_collect_skip(start, end, walk);
2258
2259 for (addr = start; addr < end; addr += PAGE_SIZE) {
2260 migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
2261 migrate->dst[migrate->npages] = 0;
2262 migrate->npages++;
2263 migrate->cpages++;
2264 }
2265
2266 return 0;
2267}
2268
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002269static int migrate_vma_collect_pmd(pmd_t *pmdp,
2270 unsigned long start,
2271 unsigned long end,
2272 struct mm_walk *walk)
2273{
2274 struct migrate_vma *migrate = walk->private;
2275 struct vm_area_struct *vma = walk->vma;
2276 struct mm_struct *mm = vma->vm_mm;
Jérôme Glisse8c3328f2017-09-08 16:12:13 -07002277 unsigned long addr = start, unmapped = 0;
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002278 spinlock_t *ptl;
2279 pte_t *ptep;
2280
2281again:
2282 if (pmd_none(*pmdp))
Steven Priceb7a16c72020-02-03 17:36:03 -08002283 return migrate_vma_collect_hole(start, end, -1, walk);
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002284
2285 if (pmd_trans_huge(*pmdp)) {
2286 struct page *page;
2287
2288 ptl = pmd_lock(mm, pmdp);
2289 if (unlikely(!pmd_trans_huge(*pmdp))) {
2290 spin_unlock(ptl);
2291 goto again;
2292 }
2293
2294 page = pmd_page(*pmdp);
2295 if (is_huge_zero_page(page)) {
2296 spin_unlock(ptl);
2297 split_huge_pmd(vma, pmdp, addr);
2298 if (pmd_trans_unstable(pmdp))
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002299 return migrate_vma_collect_skip(start, end,
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002300 walk);
2301 } else {
2302 int ret;
2303
2304 get_page(page);
2305 spin_unlock(ptl);
2306 if (unlikely(!trylock_page(page)))
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002307 return migrate_vma_collect_skip(start, end,
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002308 walk);
2309 ret = split_huge_page(page);
2310 unlock_page(page);
2311 put_page(page);
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002312 if (ret)
2313 return migrate_vma_collect_skip(start, end,
2314 walk);
2315 if (pmd_none(*pmdp))
Steven Priceb7a16c72020-02-03 17:36:03 -08002316 return migrate_vma_collect_hole(start, end, -1,
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002317 walk);
2318 }
2319 }
2320
2321 if (unlikely(pmd_bad(*pmdp)))
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002322 return migrate_vma_collect_skip(start, end, walk);
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002323
2324 ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
Jérôme Glisse8c3328f2017-09-08 16:12:13 -07002325 arch_enter_lazy_mmu_mode();
2326
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002327 for (; addr < end; addr += PAGE_SIZE, ptep++) {
Christoph Hellwig800bb1c2020-03-16 20:32:14 +01002328 unsigned long mpfn = 0, pfn;
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002329 struct page *page;
Jérôme Glisse8c3328f2017-09-08 16:12:13 -07002330 swp_entry_t entry;
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002331 pte_t pte;
2332
2333 pte = *ptep;
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002334
Jérôme Glissea5430dd2017-09-08 16:12:17 -07002335 if (pte_none(pte)) {
Ralph Campbell0744f282020-08-11 18:31:41 -07002336 if (vma_is_anonymous(vma)) {
2337 mpfn = MIGRATE_PFN_MIGRATE;
2338 migrate->cpages++;
2339 }
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002340 goto next;
2341 }
2342
Jérôme Glissea5430dd2017-09-08 16:12:17 -07002343 if (!pte_present(pte)) {
Jérôme Glissea5430dd2017-09-08 16:12:17 -07002344 /*
2345 * Only care about unaddressable device page special
2346			 * page table entries. Other special swap entries are not
2347			 * migratable, and we ignore regular swapped pages.
2348 */
2349 entry = pte_to_swp_entry(pte);
2350 if (!is_device_private_entry(entry))
2351 goto next;
2352
Alistair Poppleaf5cdaf2021-06-30 18:54:06 -07002353 page = pfn_swap_entry_to_page(entry);
Ralph Campbell51431922020-07-23 15:30:00 -07002354 if (!(migrate->flags &
2355 MIGRATE_VMA_SELECT_DEVICE_PRIVATE) ||
2356 page->pgmap->owner != migrate->pgmap_owner)
Christoph Hellwig800bb1c2020-03-16 20:32:14 +01002357 goto next;
2358
Christoph Hellwig06d462b2019-08-14 09:59:27 +02002359 mpfn = migrate_pfn(page_to_pfn(page)) |
2360 MIGRATE_PFN_MIGRATE;
Alistair Popple4dd845b2021-06-30 18:54:09 -07002361 if (is_writable_device_private_entry(entry))
Jérôme Glissea5430dd2017-09-08 16:12:17 -07002362 mpfn |= MIGRATE_PFN_WRITE;
2363 } else {
Ralph Campbell51431922020-07-23 15:30:00 -07002364 if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
Christoph Hellwig800bb1c2020-03-16 20:32:14 +01002365 goto next;
Pingfan Liu276f7562019-09-23 15:37:38 -07002366 pfn = pte_pfn(pte);
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002367 if (is_zero_pfn(pfn)) {
2368 mpfn = MIGRATE_PFN_MIGRATE;
2369 migrate->cpages++;
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002370 goto next;
2371 }
Christoph Hellwig25b29952019-06-13 22:50:49 +02002372 page = vm_normal_page(migrate->vma, addr, pte);
Jérôme Glissea5430dd2017-09-08 16:12:17 -07002373 mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
2374 mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
2375 }
2376
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002377 /* FIXME support THP */
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002378 if (!page || !page->mapping || PageTransCompound(page)) {
Pingfan Liu276f7562019-09-23 15:37:38 -07002379 mpfn = 0;
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002380 goto next;
2381 }
2382
2383 /*
2384 * By getting a reference on the page we pin it and that blocks
2385		 * any kind of migration. A side effect is that it "freezes" the
2386 * pte.
2387 *
2388 * We drop this reference after isolating the page from the lru
2389		 * for non-device pages (device pages are not on the lru and thus
2390 * can't be dropped from it).
2391 */
2392 get_page(page);
2393 migrate->cpages++;
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002394
Jérôme Glisse8c3328f2017-09-08 16:12:13 -07002395 /*
2396 * Optimize for the common case where page is only mapped once
2397 * in one process. If we can lock the page, then we can safely
2398 * set up a special migration page table entry now.
2399 */
2400 if (trylock_page(page)) {
2401 pte_t swp_pte;
2402
2403 mpfn |= MIGRATE_PFN_LOCKED;
2404 ptep_get_and_clear(mm, addr, ptep);
2405
2406 /* Setup special migration page table entry */
Alistair Popple4dd845b2021-06-30 18:54:09 -07002407 if (mpfn & MIGRATE_PFN_WRITE)
2408 entry = make_writable_migration_entry(
2409 page_to_pfn(page));
2410 else
2411 entry = make_readable_migration_entry(
2412 page_to_pfn(page));
Jérôme Glisse8c3328f2017-09-08 16:12:13 -07002413 swp_pte = swp_entry_to_pte(entry);
Alistair Popplead7df762020-09-04 16:36:01 -07002414 if (pte_present(pte)) {
2415 if (pte_soft_dirty(pte))
2416 swp_pte = pte_swp_mksoft_dirty(swp_pte);
2417 if (pte_uffd_wp(pte))
2418 swp_pte = pte_swp_mkuffd_wp(swp_pte);
2419 } else {
2420 if (pte_swp_soft_dirty(pte))
2421 swp_pte = pte_swp_mksoft_dirty(swp_pte);
2422 if (pte_swp_uffd_wp(pte))
2423 swp_pte = pte_swp_mkuffd_wp(swp_pte);
2424 }
Jérôme Glisse8c3328f2017-09-08 16:12:13 -07002425 set_pte_at(mm, addr, ptep, swp_pte);
2426
2427 /*
2428 * This is like regular unmap: we remove the rmap and
2429 * drop page refcount. Page won't be freed, as we took
2430 * a reference just above.
2431 */
2432 page_remove_rmap(page, false);
2433 put_page(page);
Jérôme Glissea5430dd2017-09-08 16:12:17 -07002434
2435 if (pte_present(pte))
2436 unmapped++;
Jérôme Glisse8c3328f2017-09-08 16:12:13 -07002437 }
2438
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002439next:
Jérôme Glissea5430dd2017-09-08 16:12:17 -07002440 migrate->dst[migrate->npages] = 0;
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002441 migrate->src[migrate->npages++] = mpfn;
2442 }
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002443
Jérôme Glisse8c3328f2017-09-08 16:12:13 -07002444 /* Only flush the TLB if we actually modified any entries */
2445 if (unmapped)
2446 flush_tlb_range(walk->vma, start, end);
2447
Alistair Popple1299c112022-09-02 10:35:51 +10002448 arch_leave_lazy_mmu_mode();
2449 pte_unmap_unlock(ptep - 1, ptl);
2450
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002451 return 0;
2452}
2453
Christoph Hellwig7b86ac32019-08-28 16:19:54 +02002454static const struct mm_walk_ops migrate_vma_walk_ops = {
2455 .pmd_entry = migrate_vma_collect_pmd,
2456 .pte_hole = migrate_vma_collect_hole,
2457};
2458
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002459/*
2460 * migrate_vma_collect() - collect pages over a range of virtual addresses
2461 * @migrate: migrate struct containing all migration information
2462 *
2463 * This will walk the CPU page table. For each virtual address backed by a
2464 * valid page, it updates the src array and takes a reference on the page, in
2465 * order to pin the page until we lock it and unmap it.
2466 */
2467static void migrate_vma_collect(struct migrate_vma *migrate)
2468{
Jérôme Glisseac46d4f2018-12-28 00:38:09 -08002469 struct mmu_notifier_range range;
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002470
Ralph Campbell998427b2020-07-23 15:30:01 -07002471 /*
2472 * Note that the pgmap_owner is passed to the mmu notifier callback so
2473 * that the registered device driver can skip invalidating device
2474 * private page mappings that won't be migrated.
2475 */
Alistair Popple6b49bf62021-06-30 18:54:19 -07002476 mmu_notifier_range_init_owner(&range, MMU_NOTIFY_MIGRATE, 0,
2477 migrate->vma, migrate->vma->vm_mm, migrate->start, migrate->end,
Ralph Campbellc1a06df2020-08-06 23:17:09 -07002478 migrate->pgmap_owner);
Jérôme Glisseac46d4f2018-12-28 00:38:09 -08002479 mmu_notifier_invalidate_range_start(&range);
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002480
Christoph Hellwig7b86ac32019-08-28 16:19:54 +02002481 walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end,
2482 &migrate_vma_walk_ops, migrate);
2483
2484 mmu_notifier_invalidate_range_end(&range);
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002485 migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
2486}
2487
2488/*
2489 * migrate_vma_check_page() - check if page is pinned or not
2490 * @page: struct page to check
2491 *
2492 * Pinned pages cannot be migrated. This is the same test as in
2493 * migrate_page_move_mapping(), except that here we allow migration of a
2494 * ZONE_DEVICE page.
2495 */
2496static bool migrate_vma_check_page(struct page *page)
2497{
2498 /*
2499 * One extra ref because the caller holds an extra reference, either from
2500 * isolate_lru_page() for a regular page, or migrate_vma_collect() for
2501 * a device page.
2502 */
2503 int extra = 1;
2504
2505 /*
2506 * FIXME support THP (transparent huge page), it is a bit more complex to
2507 * check them than regular pages, because they can be mapped with a pmd
2508 * or with a pte (split pte mapping).
2509 */
2510 if (PageCompound(page))
2511 return false;
2512
Jérôme Glissea5430dd2017-09-08 16:12:17 -07002513 /* Pages from ZONE_DEVICE have one extra reference */
2514 if (is_zone_device_page(page)) {
2515 /*
2516 * Private pages can never be pinned as they have no valid pte and
2517 * GUP will fail for those. Yet if there is a pending migration,
2518 * a thread might try to wait on the pte migration entry and
2519 * will bump the page reference count. Sadly there is no way to
2520 * differentiate a regular pin from a migration wait. Hence, to
2521 * avoid two racing threads trying to migrate back to the CPU and
Haitao Shi8958b242020-12-15 20:47:26 -08002522 * entering an infinite loop (one stopping migration because the other
Jérôme Glissea5430dd2017-09-08 16:12:17 -07002523 * is waiting on a pte migration entry), we always return true here.
2524 *
2525 * FIXME proper solution is to rework migration_entry_wait() so
2526 * it does not need to take a reference on page.
2527 */
Christoph Hellwig25b29952019-06-13 22:50:49 +02002528 return is_device_private_page(page);
Jérôme Glissea5430dd2017-09-08 16:12:17 -07002529 }
2530
Jérôme Glissedf6ad692017-09-08 16:12:24 -07002531 /* For file-backed pages */
2532 if (page_mapping(page))
2533 extra += 1 + page_has_private(page);
2534
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002535 if ((page_count(page) - extra) > page_mapcount(page))
2536 return false;
2537
2538 return true;
2539}
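
/*
 * Worked example for the check above (illustrative, not part of the
 * original file): an anonymous page mapped in exactly one process and
 * referenced only by that mapping plus the caller's reference has
 * page_count() == page_mapcount() + 1, so with extra == 1 the test
 * (page_count(page) - extra) > page_mapcount(page) is false and the page
 * is treated as migratable. Any additional reference, for instance one
 * taken by GUP, pushes page_count() above that and the page is reported
 * as pinned.
 */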
2540
2541/*
2542 * migrate_vma_prepare() - lock pages and isolate them from the lru
2543 * @migrate: migrate struct containing all migration information
2544 *
2545 * This locks pages that have been collected by migrate_vma_collect(). Once each
2546 * page is locked it is isolated from the lru (for non-device pages). Finally,
2547 * the ref taken by migrate_vma_collect() is dropped, as locked pages cannot be
2548 * migrated by concurrent kernel threads.
2549 */
2550static void migrate_vma_prepare(struct migrate_vma *migrate)
2551{
2552 const unsigned long npages = migrate->npages;
Jérôme Glisse8c3328f2017-09-08 16:12:13 -07002553 const unsigned long start = migrate->start;
2554 unsigned long addr, i, restore = 0;
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002555 bool allow_drain = true;
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002556
2557 lru_add_drain();
2558
2559 for (i = 0; (i < npages) && migrate->cpages; i++) {
2560 struct page *page = migrate_pfn_to_page(migrate->src[i]);
Jérôme Glisse8c3328f2017-09-08 16:12:13 -07002561 bool remap = true;
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002562
2563 if (!page)
2564 continue;
2565
Jérôme Glisse8c3328f2017-09-08 16:12:13 -07002566 if (!(migrate->src[i] & MIGRATE_PFN_LOCKED)) {
2567 /*
2568 * Because we are migrating several pages there can be
2569 * a deadlock between two concurrent migrations where each
2570 * is waiting on the other's page lock.
2571 *
2572 * Make migrate_vma() a best-effort operation and back off
2573 * for any page we cannot lock right away.
2574 */
2575 if (!trylock_page(page)) {
2576 migrate->src[i] = 0;
2577 migrate->cpages--;
2578 put_page(page);
2579 continue;
2580 }
2581 remap = false;
2582 migrate->src[i] |= MIGRATE_PFN_LOCKED;
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002583 }
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002584
Jérôme Glissea5430dd2017-09-08 16:12:17 -07002585 /* ZONE_DEVICE pages are not on LRU */
2586 if (!is_zone_device_page(page)) {
2587 if (!PageLRU(page) && allow_drain) {
2588 /* Drain CPU's pagevec */
2589 lru_add_drain_all();
2590 allow_drain = false;
Jérôme Glisse8c3328f2017-09-08 16:12:13 -07002591 }
Jérôme Glissea5430dd2017-09-08 16:12:17 -07002592
2593 if (isolate_lru_page(page)) {
2594 if (remap) {
2595 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
2596 migrate->cpages--;
2597 restore++;
2598 } else {
2599 migrate->src[i] = 0;
2600 unlock_page(page);
2601 migrate->cpages--;
2602 put_page(page);
2603 }
2604 continue;
2605 }
2606
2607 /* Drop the reference we took in collect */
2608 put_page(page);
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002609 }
2610
2611 if (!migrate_vma_check_page(page)) {
Jérôme Glisse8c3328f2017-09-08 16:12:13 -07002612 if (remap) {
2613 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
2614 migrate->cpages--;
2615 restore++;
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002616
Jérôme Glissea5430dd2017-09-08 16:12:17 -07002617 if (!is_zone_device_page(page)) {
2618 get_page(page);
2619 putback_lru_page(page);
2620 }
Jérôme Glisse8c3328f2017-09-08 16:12:13 -07002621 } else {
2622 migrate->src[i] = 0;
2623 unlock_page(page);
2624 migrate->cpages--;
2625
Jérôme Glissea5430dd2017-09-08 16:12:17 -07002626 if (!is_zone_device_page(page))
2627 putback_lru_page(page);
2628 else
2629 put_page(page);
Jérôme Glisse8c3328f2017-09-08 16:12:13 -07002630 }
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002631 }
2632 }
Jérôme Glisse8c3328f2017-09-08 16:12:13 -07002633
2634 for (i = 0, addr = start; i < npages && restore; i++, addr += PAGE_SIZE) {
2635 struct page *page = migrate_pfn_to_page(migrate->src[i]);
2636
2637 if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
2638 continue;
2639
2640 remove_migration_pte(page, migrate->vma, addr, page);
2641
2642 migrate->src[i] = 0;
2643 unlock_page(page);
2644 put_page(page);
2645 restore--;
2646 }
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002647}
2648
2649/*
2650 * migrate_vma_unmap() - replace page mapping with special migration pte entry
2651 * @migrate: migrate struct containing all migration information
2652 *
2653 * Replace page mapping (CPU page table pte) with a special migration pte entry
2654 * and check again if it has been pinned. Pinned pages are restored because we
2655 * cannot migrate them.
2656 *
2657 * This is the last step before we call the device driver callback to allocate
2658 * destination memory and copy the contents of the original page over to the new page.
2659 */
2660static void migrate_vma_unmap(struct migrate_vma *migrate)
2661{
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002662 const unsigned long npages = migrate->npages;
2663 const unsigned long start = migrate->start;
2664 unsigned long addr, i, restore = 0;
2665
2666 for (i = 0; i < npages; i++) {
2667 struct page *page = migrate_pfn_to_page(migrate->src[i]);
2668
2669 if (!page || !(migrate->src[i] & MIGRATE_PFN_MIGRATE))
2670 continue;
2671
Jérôme Glisse8c3328f2017-09-08 16:12:13 -07002672 if (page_mapped(page)) {
Alistair Popplea98a2f02021-06-30 18:54:16 -07002673 try_to_migrate(page, 0);
Jérôme Glisse8c3328f2017-09-08 16:12:13 -07002674 if (page_mapped(page))
2675 goto restore;
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002676 }
Jérôme Glisse8c3328f2017-09-08 16:12:13 -07002677
2678 if (migrate_vma_check_page(page))
2679 continue;
2680
2681restore:
2682 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
2683 migrate->cpages--;
2684 restore++;
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002685 }
2686
2687 for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) {
2688 struct page *page = migrate_pfn_to_page(migrate->src[i]);
2689
2690 if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
2691 continue;
2692
2693 remove_migration_ptes(page, page, false);
2694
2695 migrate->src[i] = 0;
2696 unlock_page(page);
2697 restore--;
2698
Jérôme Glissea5430dd2017-09-08 16:12:17 -07002699 if (is_zone_device_page(page))
2700 put_page(page);
2701 else
2702 putback_lru_page(page);
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002703 }
2704}
2705
Christoph Hellwiga7d1f222019-08-14 09:59:19 +02002706/**
2707 * migrate_vma_setup() - prepare to migrate a range of memory
Randy Dunlapeaf444d2020-08-11 18:33:08 -07002708 * @args: contains the vma, start, and pfns arrays for the migration
Christoph Hellwiga7d1f222019-08-14 09:59:19 +02002709 *
2710 * Returns: negative errno on failures, 0 when 0 or more pages were migrated
2711 * without an error.
2712 *
2713 * Prepare to migrate a range of virtual addresses by collecting all
2714 * the pages backing each virtual address in the range, saving them inside the
2715 * src array. Then lock those pages and unmap them. Once the pages are locked
2716 * and unmapped, check whether each page is pinned or not. Pages that aren't
2717 * pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) in the
2718 * corresponding src array entry. Any pages that are pinned are then restored by
2719 * remapping and unlocking them.
2720 *
2721 * The caller should then allocate destination memory and copy source memory to
2722 * it for all those entries (ie with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE
2723 * flag set). Once these are allocated and copied, the caller must update each
2724 * corresponding entry in the dst array with the pfn value of the destination
2725 * page and with the MIGRATE_PFN_VALID and MIGRATE_PFN_LOCKED flags set
2726 * (destination pages must have their struct pages locked, via lock_page()).
2727 *
2728 * Note that the caller does not have to migrate all the pages that are marked
2729 * with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration from
2730 * device memory to system memory. If the caller cannot migrate a device page
2731 * back to system memory, then it must return VM_FAULT_SIGBUS, which has severe
2732 * consequences for the userspace process, so it must be avoided if at all
2733 * possible.
2734 *
2735 * For empty entries inside the CPU page table (pte_none() or pmd_none() is true) we
2736 * do set the MIGRATE_PFN_MIGRATE flag in the corresponding source array entry, thus
Ingo Molnarf0953a12021-05-06 18:06:47 -07002737 * allowing the caller to allocate device memory for those unbacked virtual
2738 * addresses. For this the caller simply has to allocate device memory and
Christoph Hellwiga7d1f222019-08-14 09:59:19 +02002739 * properly set the destination entry like for regular migration. Note that
Ingo Molnarf0953a12021-05-06 18:06:47 -07002740 * this can still fail, and thus inside the device driver you must check if the
2741 * migration was successful for those entries after calling migrate_vma_pages(),
Christoph Hellwiga7d1f222019-08-14 09:59:19 +02002742 * just like for regular migration.
2743 *
2744 * After that, the callers must call migrate_vma_pages() to go over each entry
2745 * in the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag
2746 * set. If the corresponding entry in the dst array has the MIGRATE_PFN_VALID flag
2747 * set, then migrate_vma_pages() migrates struct page information from the source
2748 * struct page to the destination struct page. If it fails to migrate the
2749 * struct page information, then it clears the MIGRATE_PFN_MIGRATE flag in the
2750 * src array.
2751 *
2752 * At this point all successfully migrated pages have an entry in the src
2753 * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst
2754 * array entry with MIGRATE_PFN_VALID flag set.
2755 *
2756 * Once migrate_vma_pages() returns the caller may inspect which pages were
2757 * successfully migrated, and which were not. Successfully migrated pages will
2758 * have the MIGRATE_PFN_MIGRATE flag set for their src array entry.
2759 *
2760 * It is safe to update the device page table after migrate_vma_pages() because
Michel Lespinassec1e8d7c2020-06-08 21:33:54 -07002761 * both the destination and source pages are still locked, and the mmap_lock is held
Christoph Hellwiga7d1f222019-08-14 09:59:19 +02002762 * in read mode (hence no one can unmap the range being migrated).
2763 *
2764 * Once the caller is done cleaning up things and updating its page table (if it
2765 * chose to do so, this is not an obligation) it finally calls
2766 * migrate_vma_finalize() to update the CPU page table to point to new pages
2767 * for successfully migrated pages or otherwise restore the CPU page table to
2768 * point to the original source pages.
2769 */
2770int migrate_vma_setup(struct migrate_vma *args)
2771{
2772 long nr_pages = (args->end - args->start) >> PAGE_SHIFT;
2773
2774 args->start &= PAGE_MASK;
2775 args->end &= PAGE_MASK;
2776 if (!args->vma || is_vm_hugetlb_page(args->vma) ||
2777 (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma))
2778 return -EINVAL;
2779 if (nr_pages <= 0)
2780 return -EINVAL;
2781 if (args->start < args->vma->vm_start ||
2782 args->start >= args->vma->vm_end)
2783 return -EINVAL;
2784 if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end)
2785 return -EINVAL;
2786 if (!args->src || !args->dst)
2787 return -EINVAL;
2788
2789 memset(args->src, 0, sizeof(*args->src) * nr_pages);
2790 args->cpages = 0;
2791 args->npages = 0;
2792
2793 migrate_vma_collect(args);
2794
2795 if (args->cpages)
2796 migrate_vma_prepare(args);
2797 if (args->cpages)
2798 migrate_vma_unmap(args);
2799
2800 /*
2801 * At this point pages are locked and unmapped, and thus they have
2802 * stable content and can safely be copied to destination memory that
2803 * is allocated by the drivers.
2804 */
2805 return 0;
2806
2807}
2808EXPORT_SYMBOL(migrate_vma_setup);
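
/*
 * Illustrative driver-side sketch of the flow documented above; it is not
 * part of mm/migrate.c. my_alloc_device_page() and my_copy_to_device() are
 * hypothetical driver helpers, unbacked entries are simply skipped, and the
 * caller is assumed to hold the mmap_lock in read mode.
 *
 *	static int my_migrate_to_device(struct vm_area_struct *vma,
 *					unsigned long start, unsigned long end,
 *					void *pgmap_owner)
 *	{
 *		unsigned long npages = (end - start) >> PAGE_SHIFT;
 *		unsigned long *src, *dst;
 *		struct migrate_vma args = {};
 *		unsigned long i;
 *		int ret = -ENOMEM;
 *
 *		src = kcalloc(npages, sizeof(*src), GFP_KERNEL);
 *		dst = kcalloc(npages, sizeof(*dst), GFP_KERNEL);
 *		if (!src || !dst)
 *			goto out;
 *
 *		args.vma = vma;
 *		args.start = start;
 *		args.end = end;
 *		args.src = src;
 *		args.dst = dst;
 *		args.pgmap_owner = pgmap_owner;
 *		args.flags = MIGRATE_VMA_SELECT_SYSTEM;
 *
 *		ret = migrate_vma_setup(&args);
 *		if (ret)
 *			goto out;
 *
 *		for (i = 0; i < args.npages; i++) {
 *			struct page *spage = migrate_pfn_to_page(args.src[i]);
 *			struct page *dpage;
 *
 *			if (!(args.src[i] & MIGRATE_PFN_MIGRATE) || !spage)
 *				continue;
 *
 *			dpage = my_alloc_device_page();
 *			if (!dpage)
 *				continue;
 *
 *			lock_page(dpage);
 *			my_copy_to_device(dpage, spage);
 *			args.dst[i] = migrate_pfn(page_to_pfn(dpage)) |
 *				      MIGRATE_PFN_LOCKED;
 *		}
 *
 *		migrate_vma_pages(&args);
 *		migrate_vma_finalize(&args);
 *		ret = 0;
 *	out:
 *		kfree(dst);
 *		kfree(src);
 *		return ret;
 *	}
 */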
2809
Ralph Campbell34290e22020-01-30 22:14:44 -08002810/*
2811 * This code closely matches the code in:
2812 * __handle_mm_fault()
2813 * handle_pte_fault()
2814 * do_anonymous_page()
2815 * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE
2816 * private page.
2817 */
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002818static void migrate_vma_insert_page(struct migrate_vma *migrate,
2819 unsigned long addr,
2820 struct page *page,
Stephen Zhangd85c6db2020-12-14 19:13:20 -08002821 unsigned long *src)
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002822{
2823 struct vm_area_struct *vma = migrate->vma;
2824 struct mm_struct *mm = vma->vm_mm;
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002825 bool flush = false;
2826 spinlock_t *ptl;
2827 pte_t entry;
2828 pgd_t *pgdp;
2829 p4d_t *p4dp;
2830 pud_t *pudp;
2831 pmd_t *pmdp;
2832 pte_t *ptep;
2833
2834 /* Only allow populating anonymous memory */
2835 if (!vma_is_anonymous(vma))
2836 goto abort;
2837
2838 pgdp = pgd_offset(mm, addr);
2839 p4dp = p4d_alloc(mm, pgdp, addr);
2840 if (!p4dp)
2841 goto abort;
2842 pudp = pud_alloc(mm, p4dp, addr);
2843 if (!pudp)
2844 goto abort;
2845 pmdp = pmd_alloc(mm, pudp, addr);
2846 if (!pmdp)
2847 goto abort;
2848
2849 if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp))
2850 goto abort;
2851
2852 /*
2853 * Use pte_alloc() instead of pte_alloc_map(). We can't run
2854 * pte_offset_map() on pmds where a huge pmd might be created
2855 * from a different thread.
2856 *
Michel Lespinasse3e4e28c2020-06-08 21:33:51 -07002857 * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002858 * parallel threads are excluded by other means.
2859 *
Michel Lespinasse3e4e28c2020-06-08 21:33:51 -07002860 * Here we only have mmap_read_lock(mm).
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002861 */
Joel Fernandes (Google)4cf58922019-01-03 15:28:34 -08002862 if (pte_alloc(mm, pmdp))
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002863 goto abort;
2864
2865 /* See the comment in pte_alloc_one_map() */
2866 if (unlikely(pmd_trans_unstable(pmdp)))
2867 goto abort;
2868
2869 if (unlikely(anon_vma_prepare(vma)))
2870 goto abort;
Johannes Weinerd9eb1ea2020-06-03 16:02:24 -07002871 if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002872 goto abort;
2873
2874 /*
2875 * The memory barrier inside __SetPageUptodate makes sure that
2876 * preceding stores to the page contents become visible before
2877 * the set_pte_at() write.
2878 */
2879 __SetPageUptodate(page);
2880
Jérôme Glissedf6ad692017-09-08 16:12:24 -07002881 if (is_zone_device_page(page)) {
2882 if (is_device_private_page(page)) {
2883 swp_entry_t swp_entry;
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002884
Alistair Popple4dd845b2021-06-30 18:54:09 -07002885 if (vma->vm_flags & VM_WRITE)
2886 swp_entry = make_writable_device_private_entry(
2887 page_to_pfn(page));
2888 else
2889 swp_entry = make_readable_device_private_entry(
2890 page_to_pfn(page));
Jérôme Glissedf6ad692017-09-08 16:12:24 -07002891 entry = swp_entry_to_pte(swp_entry);
Miaohe Lin34f5e9b2021-05-04 18:37:10 -07002892 } else {
2893 /*
2894 * For now we only support migrating to un-addressable
2895 * device memory.
2896 */
2897 pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
2898 goto abort;
Jérôme Glissedf6ad692017-09-08 16:12:24 -07002899 }
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002900 } else {
2901 entry = mk_pte(page, vma->vm_page_prot);
2902 if (vma->vm_flags & VM_WRITE)
2903 entry = pte_mkwrite(pte_mkdirty(entry));
2904 }
2905
2906 ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
2907
Ralph Campbell34290e22020-01-30 22:14:44 -08002908 if (check_stable_address_space(mm))
2909 goto unlock_abort;
2910
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002911 if (pte_present(*ptep)) {
2912 unsigned long pfn = pte_pfn(*ptep);
2913
Ralph Campbellc23a0c92020-01-30 22:14:41 -08002914 if (!is_zero_pfn(pfn))
2915 goto unlock_abort;
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002916 flush = true;
Ralph Campbellc23a0c92020-01-30 22:14:41 -08002917 } else if (!pte_none(*ptep))
2918 goto unlock_abort;
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002919
2920 /*
Ralph Campbellc23a0c92020-01-30 22:14:41 -08002921 * Check for userfaultfd but do not deliver the fault. Instead,
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002922 * just back off.
2923 */
Ralph Campbellc23a0c92020-01-30 22:14:41 -08002924 if (userfaultfd_missing(vma))
2925 goto unlock_abort;
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002926
2927 inc_mm_counter(mm, MM_ANONPAGES);
Johannes Weinerbe5d0a72020-06-03 16:01:57 -07002928 page_add_new_anon_rmap(page, vma, addr, false);
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002929 if (!is_zone_device_page(page))
Joonsoo Kimb5181542020-08-11 18:30:40 -07002930 lru_cache_add_inactive_or_unevictable(page, vma);
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002931 get_page(page);
2932
2933 if (flush) {
2934 flush_cache_page(vma, addr, pte_pfn(*ptep));
2935 ptep_clear_flush_notify(vma, addr, ptep);
2936 set_pte_at_notify(mm, addr, ptep, entry);
2937 update_mmu_cache(vma, addr, ptep);
2938 } else {
2939 /* No need to invalidate - it was non-present before */
2940 set_pte_at(mm, addr, ptep, entry);
2941 update_mmu_cache(vma, addr, ptep);
2942 }
2943
2944 pte_unmap_unlock(ptep, ptl);
2945 *src = MIGRATE_PFN_MIGRATE;
2946 return;
2947
Ralph Campbellc23a0c92020-01-30 22:14:41 -08002948unlock_abort:
2949 pte_unmap_unlock(ptep, ptl);
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002950abort:
2951 *src &= ~MIGRATE_PFN_MIGRATE;
2952}
2953
Christoph Hellwiga7d1f222019-08-14 09:59:19 +02002954/**
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002955 * migrate_vma_pages() - migrate meta-data from src page to dst page
2956 * @migrate: migrate struct containing all migration information
2957 *
2958 * This migrates struct page meta-data from source struct page to destination
2959 * struct page. This effectively finishes the migration from source page to the
2960 * destination page.
2961 */
Christoph Hellwiga7d1f222019-08-14 09:59:19 +02002962void migrate_vma_pages(struct migrate_vma *migrate)
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002963{
2964 const unsigned long npages = migrate->npages;
2965 const unsigned long start = migrate->start;
Jérôme Glisseac46d4f2018-12-28 00:38:09 -08002966 struct mmu_notifier_range range;
2967 unsigned long addr, i;
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002968 bool notified = false;
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002969
2970 for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
2971 struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
2972 struct page *page = migrate_pfn_to_page(migrate->src[i]);
2973 struct address_space *mapping;
2974 int r;
2975
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002976 if (!newpage) {
2977 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002978 continue;
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002979 }
2980
2981 if (!page) {
Ralph Campbellc23a0c92020-01-30 22:14:41 -08002982 if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE))
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002983 continue;
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002984 if (!notified) {
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002985 notified = true;
Jérôme Glisseac46d4f2018-12-28 00:38:09 -08002986
Alistair Popple6b49bf62021-06-30 18:54:19 -07002987 mmu_notifier_range_init_owner(&range,
2988 MMU_NOTIFY_MIGRATE, 0, migrate->vma,
2989 migrate->vma->vm_mm, addr, migrate->end,
Ralph Campbell5e5dda82020-12-14 19:12:55 -08002990 migrate->pgmap_owner);
Jérôme Glisseac46d4f2018-12-28 00:38:09 -08002991 mmu_notifier_invalidate_range_start(&range);
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002992 }
2993 migrate_vma_insert_page(migrate, addr, newpage,
Stephen Zhangd85c6db2020-12-14 19:13:20 -08002994 &migrate->src[i]);
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002995 continue;
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002996 }
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002997
2998 mapping = page_mapping(page);
2999
Jérôme Glissea5430dd2017-09-08 16:12:17 -07003000 if (is_zone_device_page(newpage)) {
3001 if (is_device_private_page(newpage)) {
3002 /*
3003 * For now we only support private anonymous memory when
3004 * migrating to un-addressable device memory.
3005 */
3006 if (mapping) {
3007 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
3008 continue;
3009 }
Christoph Hellwig25b29952019-06-13 22:50:49 +02003010 } else {
Jérôme Glissea5430dd2017-09-08 16:12:17 -07003011 /*
3012 * Other types of ZONE_DEVICE page are not
3013 * supported.
3014 */
3015 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
3016 continue;
3017 }
3018 }
3019
Jérôme Glisse8763cb42017-09-08 16:12:09 -07003020 r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY);
3021 if (r != MIGRATEPAGE_SUCCESS)
3022 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
3023 }
Jérôme Glisse8315ada2017-09-08 16:12:21 -07003024
Jérôme Glisse4645b9f2017-11-15 17:34:11 -08003025 /*
3026 * No need to double call mmu_notifier->invalidate_range() callback as
3027 * the above ptep_clear_flush_notify() inside migrate_vma_insert_page()
3028 * did already call it.
3029 */
Jérôme Glisse8315ada2017-09-08 16:12:21 -07003030 if (notified)
Jérôme Glisseac46d4f2018-12-28 00:38:09 -08003031 mmu_notifier_invalidate_range_only_end(&range);
Jérôme Glisse8763cb42017-09-08 16:12:09 -07003032}
Christoph Hellwiga7d1f222019-08-14 09:59:19 +02003033EXPORT_SYMBOL(migrate_vma_pages);
Jérôme Glisse8763cb42017-09-08 16:12:09 -07003034
Christoph Hellwiga7d1f222019-08-14 09:59:19 +02003035/**
Jérôme Glisse8763cb42017-09-08 16:12:09 -07003036 * migrate_vma_finalize() - restore CPU page table entry
3037 * @migrate: migrate struct containing all migration information
3038 *
3039 * This replaces the special migration pte entry with either a mapping to the
3040 * new page if migration was successful for that page, or to the original page
3041 * otherwise.
3042 *
3043 * This also unlocks the pages and puts them back on the lru, or drops the extra
3044 * refcount, for device pages.
3045 */
Christoph Hellwiga7d1f222019-08-14 09:59:19 +02003046void migrate_vma_finalize(struct migrate_vma *migrate)
Jérôme Glisse8763cb42017-09-08 16:12:09 -07003047{
3048 const unsigned long npages = migrate->npages;
3049 unsigned long i;
3050
3051 for (i = 0; i < npages; i++) {
3052 struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
3053 struct page *page = migrate_pfn_to_page(migrate->src[i]);
3054
Jérôme Glisse8315ada2017-09-08 16:12:21 -07003055 if (!page) {
3056 if (newpage) {
3057 unlock_page(newpage);
3058 put_page(newpage);
3059 }
Jérôme Glisse8763cb42017-09-08 16:12:09 -07003060 continue;
Jérôme Glisse8315ada2017-09-08 16:12:21 -07003061 }
3062
Jérôme Glisse8763cb42017-09-08 16:12:09 -07003063 if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) {
3064 if (newpage) {
3065 unlock_page(newpage);
3066 put_page(newpage);
3067 }
3068 newpage = page;
3069 }
3070
3071 remove_migration_ptes(page, newpage, false);
3072 unlock_page(page);
Jérôme Glisse8763cb42017-09-08 16:12:09 -07003073
Jérôme Glissea5430dd2017-09-08 16:12:17 -07003074 if (is_zone_device_page(page))
3075 put_page(page);
3076 else
3077 putback_lru_page(page);
Jérôme Glisse8763cb42017-09-08 16:12:09 -07003078
3079 if (newpage != page) {
3080 unlock_page(newpage);
Jérôme Glissea5430dd2017-09-08 16:12:17 -07003081 if (is_zone_device_page(newpage))
3082 put_page(newpage);
3083 else
3084 putback_lru_page(newpage);
Jérôme Glisse8763cb42017-09-08 16:12:09 -07003085 }
3086 }
3087}
Christoph Hellwiga7d1f222019-08-14 09:59:19 +02003088EXPORT_SYMBOL(migrate_vma_finalize);
Christoph Hellwig9b2ed9c2019-08-14 09:59:28 +02003089#endif /* CONFIG_DEVICE_PRIVATE */
Dave Hansen79c28a42021-09-02 14:59:06 -07003090
Dave Hansen76af6a02021-10-18 15:15:32 -07003091#if defined(CONFIG_HOTPLUG_CPU)
Dave Hansen79c28a42021-09-02 14:59:06 -07003092/* Disable reclaim-based migration. */
3093static void __disable_all_migrate_targets(void)
3094{
3095 int node;
3096
3097 for_each_online_node(node)
3098 node_demotion[node] = NUMA_NO_NODE;
3099}
3100
3101static void disable_all_migrate_targets(void)
3102{
3103 __disable_all_migrate_targets();
3104
3105 /*
3106 * Ensure that the "disable" is visible across the system.
3107 * Readers will see either a combination of before+disable
3108 * state or disable+after. They will never see before and
3109 * after state together.
3110 *
3111 * The before+after state together might have cycles and
3112 * could cause readers to do things like loop until this
3113 * function finishes. This ensures they can only see a
3114 * single "bad" read and would, for instance, only loop
3115 * once.
3116 */
3117 synchronize_rcu();
3118}
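
/*
 * Reader-side sketch implied by the ordering comment above (illustrative,
 * not part of the original file; the in-tree consumer of node_demotion[]
 * may differ in detail). A reader is expected to sample the array with
 * READ_ONCE() inside an RCU read-side critical section, e.g.:
 *
 *	rcu_read_lock();
 *	target = READ_ONCE(node_demotion[node]);
 *	rcu_read_unlock();
 *
 * so that the synchronize_rcu() above guarantees each reader sees either
 * the pre-disable targets or NUMA_NO_NODE, never a mix of old and new
 * targets.
 */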
3119
3120/*
3121 * Find an automatic demotion target for 'node'.
3122 * Failing here is OK. It might just indicate
3123 * being at the end of a chain.
3124 */
3125static int establish_migrate_target(int node, nodemask_t *used)
3126{
3127 int migration_target;
3128
3129 /*
3130 * Can not set a migration target on a
3131 * node with it already set.
3132 *
3133 * No need for READ_ONCE() here since this
3134 * is in the write path for node_demotion[].
3135 * This should be the only thread writing.
3136 */
3137 if (node_demotion[node] != NUMA_NO_NODE)
3138 return NUMA_NO_NODE;
3139
3140 migration_target = find_next_best_node(node, used);
3141 if (migration_target == NUMA_NO_NODE)
3142 return NUMA_NO_NODE;
3143
3144 node_demotion[node] = migration_target;
3145
3146 return migration_target;
3147}
3148
3149/*
3150 * When memory fills up on a node, memory contents can be
3151 * automatically migrated to another node instead of
3152 * discarded at reclaim.
3153 *
3154 * Establish a "migration path" which will start at nodes
3155 * with CPUs and will follow the priorities used to build the
3156 * page allocator zonelists.
3157 *
3158 * The difference here is that cycles must be avoided. If
3159 * node0 migrates to node1, then neither node1, nor anything
3160 * node1 migrates to can migrate to node0.
3161 *
3162 * This function can run simultaneously with readers of
3163 * node_demotion[]. However, it can not run simultaneously
3164 * with itself. Exclusion is provided by memory hotplug events
3165 * being single-threaded.
3166 */
3167static void __set_migration_target_nodes(void)
3168{
3169 nodemask_t next_pass = NODE_MASK_NONE;
3170 nodemask_t this_pass = NODE_MASK_NONE;
3171 nodemask_t used_targets = NODE_MASK_NONE;
3172 int node;
3173
3174 /*
3175 * Avoid any oddities like cycles that could occur
3176 * from changes in the topology. This will leave
3177 * a momentary gap when migration is disabled.
3178 */
3179 disable_all_migrate_targets();
3180
3181 /*
3182 * Allocations go close to CPUs, first. Assume that
3183 * the migration path starts at the nodes with CPUs.
3184 */
3185 next_pass = node_states[N_CPU];
3186again:
3187 this_pass = next_pass;
3188 next_pass = NODE_MASK_NONE;
3189 /*
3190 * To avoid cycles in the migration "graph", ensure
3191 * that migration sources are not future targets by
3192 * setting them in 'used_targets'. Do this only
3193 * once per pass so that multiple source nodes can
3194 * share a target node.
3195 *
3196 * 'used_targets' will become unavailable in future
3197 * passes. This limits some opportunities for
3198 * multiple source nodes to share a destination.
3199 */
3200 nodes_or(used_targets, used_targets, this_pass);
3201 for_each_node_mask(node, this_pass) {
3202 int target_node = establish_migrate_target(node, &used_targets);
3203
3204 if (target_node == NUMA_NO_NODE)
3205 continue;
3206
3207 /*
3208 * Visit targets from this pass in the next pass.
3209 * Eventually, every node will have been part of
3210 * a pass, and will become set in 'used_targets'.
3211 */
3212 node_set(target_node, next_pass);
3213 }
3214 /*
3215 * 'next_pass' contains nodes which became migration
3216 * targets in this pass. Make additional passes until
3217 * no more migrations targets are available.
3218 */
3219 if (!nodes_empty(next_pass))
3220 goto again;
3221}
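
/*
 * Worked example (hypothetical topology, not taken from the original
 * file): with CPU nodes {0,1} and memory-only nodes {2,3}, the first
 * pass starts from {0,1}; establish_migrate_target() might pick node 2
 * for node 0 and node 3 for node 1, while {0,1} are already in
 * 'used_targets' so later passes cannot point back at them. The second
 * pass starts from {2,3}; by then every node is a used target, so no
 * further targets are found and node_demotion[] ends up as 0->2, 1->3,
 * with nodes 2 and 3 left at NUMA_NO_NODE.
 */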
3222
3223/*
3224 * For callers that do not hold get_online_mems() already.
3225 */
Dave Hansen79c28a42021-09-02 14:59:06 -07003226static void set_migration_target_nodes(void)
3227{
3228 get_online_mems();
3229 __set_migration_target_nodes();
3230 put_online_mems();
3231}
Dave Hansen884a6e52021-09-02 14:59:09 -07003232
3233/*
Dave Hansen884a6e52021-09-02 14:59:09 -07003234 * This leaves migrate-on-reclaim transiently disabled between
3235 * the MEM_GOING_OFFLINE and MEM_OFFLINE events. This runs
3236 * whether reclaim-based migration is enabled or not, which
3237 * ensures that the user can turn reclaim-based migration on at
3238 * any time without needing to recalculate migration targets.
3239 *
3240 * These callbacks already hold get_online_mems(). That is why
3241 * __set_migration_target_nodes() can be used as opposed to
3242 * set_migration_target_nodes().
3243 */
3244static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
Dave Hansen295be912021-10-18 15:15:29 -07003245 unsigned long action, void *_arg)
Dave Hansen884a6e52021-09-02 14:59:09 -07003246{
Dave Hansen295be912021-10-18 15:15:29 -07003247 struct memory_notify *arg = _arg;
3248
3249 /*
3250 * Only update the node migration order when a node is
3251 * changing status, like online->offline. This avoids
3252 * the overhead of synchronize_rcu() in most cases.
3253 */
3254 if (arg->status_change_nid < 0)
3255 return notifier_from_errno(0);
3256
Dave Hansen884a6e52021-09-02 14:59:09 -07003257 switch (action) {
3258 case MEM_GOING_OFFLINE:
3259 /*
3260 * Make sure there are not transient states where
3261 * an offline node is a migration target. This
3262 * will leave migration disabled until the offline
3263 * completes and the MEM_OFFLINE case below runs.
3264 */
3265 disable_all_migrate_targets();
3266 break;
3267 case MEM_OFFLINE:
3268 case MEM_ONLINE:
3269 /*
3270 * Recalculate the target nodes once the node
3271 * reaches its final state (online or offline).
3272 */
3273 __set_migration_target_nodes();
3274 break;
3275 case MEM_CANCEL_OFFLINE:
3276 /*
3277 * MEM_GOING_OFFLINE disabled all the migration
3278 * targets. Reenable them.
3279 */
3280 __set_migration_target_nodes();
3281 break;
3282 case MEM_GOING_ONLINE:
3283 case MEM_CANCEL_ONLINE:
3284 break;
3285 }
3286
3287 return notifier_from_errno(0);
3288}
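
/*
 * Illustrative event sequence (not from the original file): offlining
 * memory on a node with status_change_nid >= 0 first delivers
 * MEM_GOING_OFFLINE, which disables all demotion targets. If the offline
 * succeeds, MEM_OFFLINE recomputes the targets without that node; if it
 * is aborted, MEM_CANCEL_OFFLINE recomputes them with the node still
 * present. Either way, demotion is only transiently disabled.
 */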
3289
Dave Hansen76af6a02021-10-18 15:15:32 -07003290/*
3291 * React to hotplug events that might affect the migration targets,
3292 * like events that online or offline NUMA nodes.
3293 *
3294 * The ordering is also currently dependent on which nodes have
3295 * CPUs. That means we need CPU on/offline notification too.
3296 */
3297static int migration_online_cpu(unsigned int cpu)
3298{
3299 set_migration_target_nodes();
3300 return 0;
3301}
3302
3303static int migration_offline_cpu(unsigned int cpu)
3304{
3305 set_migration_target_nodes();
3306 return 0;
3307}
3308
Dave Hansen884a6e52021-09-02 14:59:09 -07003309static int __init migrate_on_reclaim_init(void)
3310{
3311 int ret;
3312
Huang Yinga6a02512021-10-18 15:15:35 -07003313 ret = cpuhp_setup_state_nocalls(CPUHP_MM_DEMOTION_DEAD, "mm/demotion:offline",
3314 NULL, migration_offline_cpu);
Dave Hansen884a6e52021-09-02 14:59:09 -07003315 /*
3316 * In the unlikely case that this fails, the automatic
3317 * migration targets may become suboptimal for nodes
3318 * where N_CPU changes. With such a small impact in a
3319 * rare case, do not bother trying to do anything special.
3320 */
3321 WARN_ON(ret < 0);
Huang Yinga6a02512021-10-18 15:15:35 -07003322 ret = cpuhp_setup_state(CPUHP_AP_MM_DEMOTION_ONLINE, "mm/demotion:online",
3323 migration_online_cpu, NULL);
3324 WARN_ON(ret < 0);
Dave Hansen884a6e52021-09-02 14:59:09 -07003325
3326 hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
3327 return 0;
3328}
3329late_initcall(migrate_on_reclaim_init);
Dave Hansen76af6a02021-10-18 15:15:32 -07003330#endif /* CONFIG_HOTPLUG_CPU */