Blame - mm/migrate.c - yocto/kernel/common

blob: 57aeb9b491da65ec27ef2d0f6c76ca08b89929a1 [file] [log] [blame]

Greg Kroah-Hartman	b244131	2017-11-01 15:07:57 +0100	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	2	/*
Hugh Dickins	14e0f9b	2015-11-05 18:49:43 -0800	[diff] [blame]	3	* Memory Migration functionality - linux/mm/migrate.c
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	4	*
				5	* Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
				6	*
				7	* Page migration was first developed in the context of the memory hotplug
				8	* project. The main authors of the migration code are:
				9	*
				10	* IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
				11	* Hirokazu Takahashi <taka@valinux.co.jp>
				12	* Dave Hansen <haveblue@us.ibm.com>
Christoph Lameter	cde5353	2008-07-04 09:59:22 -0700	[diff] [blame]	13	* Christoph Lameter
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	14	*/
				15
				16	#include <linux/migrate.h>
Paul Gortmaker	b95f1b31	2011-10-16 02:01:52 -0400	[diff] [blame]	17	#include <linux/export.h>
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	18	#include <linux/swap.h>
Christoph Lameter	0697212	2006-06-23 02:03:35 -0700	[diff] [blame]	19	#include <linux/swapops.h>
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	20	#include <linux/pagemap.h>
Christoph Lameter	e23ca00	2006-04-10 22:52:57 -0700	[diff] [blame]	21	#include <linux/buffer_head.h>
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	22	#include <linux/mm_inline.h>
Pavel Emelyanov	b488893	2007-10-18 23:40:14 -0700	[diff] [blame]	23	#include <linux/nsproxy.h>
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	24	#include <linux/pagevec.h>
Hugh Dickins	e9995ef	2009-12-14 17:59:31 -0800	[diff] [blame]	25	#include <linux/ksm.h>
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	26	#include <linux/rmap.h>
				27	#include <linux/topology.h>
				28	#include <linux/cpu.h>
				29	#include <linux/cpuset.h>
Christoph Lameter	04e62a2	2006-06-23 02:03:38 -0700	[diff] [blame]	30	#include <linux/writeback.h>
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	31	#include <linux/mempolicy.h>
				32	#include <linux/vmalloc.h>
David Quigley	86c3a76	2006-06-23 02:04:02 -0700	[diff] [blame]	33	#include <linux/security.h>
Hugh Dickins	42cb14b	2015-11-05 18:50:05 -0800	[diff] [blame]	34	#include <linux/backing-dev.h>
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	35	#include <linux/compaction.h>
Adrian Bunk	4f5ca26	2008-07-23 21:27:02 -0700	[diff] [blame]	36	#include <linux/syscalls.h>
Dominik Brodowski	7addf44	2018-03-17 16:08:03 +0100	[diff] [blame]	37	#include <linux/compat.h>
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	38	#include <linux/hugetlb.h>
Aneesh Kumar K.V	8e6ac7f	2012-07-31 16:42:27 -0700	[diff] [blame]	39	#include <linux/hugetlb_cgroup.h>
Tejun Heo	5a0e3ad	2010-03-24 17:04:11 +0900	[diff] [blame]	40	#include <linux/gfp.h>
Christoph Hellwig	a520110	2019-08-28 16:19:53 +0200	[diff] [blame]	41	#include <linux/pagewalk.h>
Jérôme Glisse	df6ad69	2017-09-08 16:12:24 -0700	[diff] [blame]	42	#include <linux/pfn_t.h>
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	43	#include <linux/memremap.h>
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	44	#include <linux/userfaultfd_k.h>
Rafael Aquini	bf6bddf1	2012-12-11 16:02:42 -0800	[diff] [blame]	45	#include <linux/balloon_compaction.h>
Mel Gorman	f714f4f	2013-12-18 17:08:33 -0800	[diff] [blame]	46	#include <linux/mmu_notifier.h>
Vladimir Davydov	33c3fc7	2015-09-09 15:35:45 -0700	[diff] [blame]	47	#include <linux/page_idle.h>
Vlastimil Babka	d435edc	2016-03-15 14:56:15 -0700	[diff] [blame]	48	#include <linux/page_owner.h>
Ingo Molnar	6e84f31	2017-02-08 18:51:29 +0100	[diff] [blame]	49	#include <linux/sched/mm.h>
Linus Torvalds	197e7e5	2017-08-20 13:26:27 -0700	[diff] [blame]	50	#include <linux/ptrace.h>
Ralph Campbell	34290e2	2020-01-30 22:14:44 -0800	[diff] [blame]	51	#include <linux/oom.h>
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	52
Michal Nazarewicz	0d1836c	2010-12-21 17:24:26 -0800	[diff] [blame]	53	#include <asm/tlbflush.h>
				54
Mel Gorman	7b2a2d4	2012-10-19 14:07:31 +0100	[diff] [blame]	55	#define CREATE_TRACE_POINTS
				56	#include <trace/events/migrate.h>
				57
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	58	#include "internal.h"
				59
Yisheng Xie	9e5bcd6	2017-02-24 14:57:29 -0800	[diff] [blame]	60	int isolate_movable_page(struct page *page, isolate_mode_t mode)
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	61	{
				62	struct address_space *mapping;
				63
				64	/*
				65	* Avoid burning cycles with pages that are yet under __free_pages(),
				66	* or just got freed under us.
				67	*
				68	* In case we 'win' a race for a movable page being freed under us and
				69	* raise its refcount preventing __free_pages() from doing its job
				70	* the put_page() at the end of this block will take care of
				71	* releasing this page, thus avoiding a nasty leakage.
				72	*/
				73	if (unlikely(!get_page_unless_zero(page)))
				74	goto out;
				75
				76	/*
				77	* Check PageMovable before holding a PG_lock because page's owner
				78	* assumes nobody touches PG_lock of a newly allocated page
Wei Yang	8bb4e7a	2019-03-05 15:46:22 -0800	[diff] [blame]	79	* so unconditionally grabbing the lock ruins page's owner side.
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	80	*/
				81	if (unlikely(!__PageMovable(page)))
				82	goto out_putpage;
				83	/*
				84	* As movable pages are not isolated from LRU lists, concurrent
				85	* compaction threads can race against page migration functions
				86	* as well as race against the release of a page.
				87	*
				88	* In order to avoid having an already isolated movable page
				89	* being (wrongly) re-isolated while it is under migration,
				90	* or to avoid attempting to isolate pages being released,
				91	* let's be sure we have the page lock
				92	* before proceeding with the movable page isolation steps.
				93	*/
				94	if (unlikely(!trylock_page(page)))
				95	goto out_putpage;
				96
				97	if (!PageMovable(page) || PageIsolated(page))
				98	goto out_no_isolated;
				99
				100	mapping = page_mapping(page);
				101	VM_BUG_ON_PAGE(!mapping, page);
				102
				103	if (!mapping->a_ops->isolate_page(page, mode))
				104	goto out_no_isolated;
				105
				106	/* Driver shouldn't use PG_isolated bit of page->flags */
				107	WARN_ON_ONCE(PageIsolated(page));
				108	__SetPageIsolated(page);
				109	unlock_page(page);
				110
Yisheng Xie	9e5bcd6	2017-02-24 14:57:29 -0800	[diff] [blame]	111	return 0;
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	112
				113	out_no_isolated:
				114	unlock_page(page);
				115	out_putpage:
				116	put_page(page);
				117	out:
Yisheng Xie	9e5bcd6	2017-02-24 14:57:29 -0800	[diff] [blame]	118	return -EBUSY;
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	119	}
				120
Miaohe Lin	606a6f7	2021-05-04 18:37:04 -0700	[diff] [blame]	121	static void putback_movable_page(struct page *page)
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	122	{
				123	struct address_space *mapping;
				124
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	125	mapping = page_mapping(page);
				126	mapping->a_ops->putback_page(page);
				127	__ClearPageIsolated(page);
				128	}
				129
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	130	/*
Rafael Aquini	5733c7d	2012-12-11 16:02:47 -0800	[diff] [blame]	131	* Put previously isolated pages back onto the appropriate lists
				132	* from where they were once taken off for compaction/migration.
				133	*
Joonsoo Kim	59c82b7	2014-01-21 15:51:17 -0800	[diff] [blame]	134	* This function shall be used whenever the isolated pageset has been
				135	* built from lru, balloon, hugetlbfs page. See isolate_migratepages_range()
				136	* and isolate_huge_page().
Rafael Aquini	5733c7d	2012-12-11 16:02:47 -0800	[diff] [blame]	137	*/
				138	void putback_movable_pages(struct list_head *l)
				139	{
				140	struct page *page;
				141	struct page *page2;
				142
				143	list_for_each_entry_safe(page, page2, l, lru) {
Naoya Horiguchi	31caf66	2013-09-11 14:21:59 -0700	[diff] [blame]	144	if (unlikely(PageHuge(page))) {
				145	putback_active_hugepage(page);
				146	continue;
				147	}
Rafael Aquini	5733c7d	2012-12-11 16:02:47 -0800	[diff] [blame]	148	list_del(&page->lru);
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	149	/*
				150	* We isolated non-lru movable page so here we can use
				151	* __PageMovable because LRU page's mapping cannot have
				152	* PAGE_MAPPING_MOVABLE.
				153	*/
Minchan Kim	b1123ea6	2016-07-26 15:23:09 -0700	[diff] [blame]	154	if (unlikely(__PageMovable(page))) {
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	155	VM_BUG_ON_PAGE(!PageIsolated(page), page);
				156	lock_page(page);
				157	if (PageMovable(page))
				158	putback_movable_page(page);
				159	else
				160	__ClearPageIsolated(page);
				161	unlock_page(page);
				162	put_page(page);
				163	} else {
Naoya Horiguchi	e8db67e	2017-09-08 16:11:12 -0700	[diff] [blame]	164	mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
Matthew Wilcox (Oracle)	6c35784	2020-08-14 17:30:37 -0700	[diff] [blame]	165	page_is_file_lru(page), -thp_nr_pages(page));
Rabin Vincent	fc280fe	2017-04-20 14:37:46 -0700	[diff] [blame]	166	putback_lru_page(page);
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	167	}
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	168	}
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	169	}
				170
Christoph Lameter	0697212	2006-06-23 02:03:35 -0700	[diff] [blame]	171	/*
				172	* Restore a potential migration pte to a working pte entry
				173	*/
Minchan Kim	e4b8222	2017-05-03 14:54:27 -0700	[diff] [blame]	174	static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
Hugh Dickins	e9995ef	2009-12-14 17:59:31 -0800	[diff] [blame]	175	unsigned long addr, void *old)
Christoph Lameter	0697212	2006-06-23 02:03:35 -0700	[diff] [blame]	176	{
Kirill A. Shutemov	3fe8796	2017-02-24 14:58:16 -0800	[diff] [blame]	177	struct page_vma_mapped_walk pvmw = {
				178	.page = old,
				179	.vma = vma,
				180	.address = addr,
				181	.flags = PVMW_SYNC | PVMW_MIGRATION,
				182	};
				183	struct page *new;
				184	pte_t pte;
Christoph Lameter	0697212	2006-06-23 02:03:35 -0700	[diff] [blame]	185	swp_entry_t entry;
Christoph Lameter	0697212	2006-06-23 02:03:35 -0700	[diff] [blame]	186
Kirill A. Shutemov	3fe8796	2017-02-24 14:58:16 -0800	[diff] [blame]	187	VM_BUG_ON_PAGE(PageTail(page), page);
				188	while (page_vma_mapped_walk(&pvmw)) {
Naoya Horiguchi	4b0ece6	2017-03-31 15:11:44 -0700	[diff] [blame]	189	if (PageKsm(page))
				190	new = page;
				191	else
				192	new = page - pvmw.page->index +
				193	linear_page_index(vma, pvmw.address);
Christoph Lameter	0697212	2006-06-23 02:03:35 -0700	[diff] [blame]	194
Zi Yan	616b837	2017-09-08 16:10:57 -0700	[diff] [blame]	195	#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
				196	/* PMD-mapped THP migration entry */
				197	if (!pvmw.pte) {
				198	VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page);
				199	remove_migration_pmd(&pvmw, new);
				200	continue;
				201	}
				202	#endif
				203
Kirill A. Shutemov	3fe8796	2017-02-24 14:58:16 -0800	[diff] [blame]	204	get_page(new);
				205	pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot)));
				206	if (pte_swp_soft_dirty(*pvmw.pte))
				207	pte = pte_mksoft_dirty(pte);
Christoph Lameter	0697212	2006-06-23 02:03:35 -0700	[diff] [blame]	208
Hugh Dickins	486cf46	2011-10-19 12:50:35 -0700	[diff] [blame]	209	/*
Kirill A. Shutemov	3fe8796	2017-02-24 14:58:16 -0800	[diff] [blame]	210	* Recheck VMA as permissions can change since migration started
Hugh Dickins	486cf46	2011-10-19 12:50:35 -0700	[diff] [blame]	211	*/
Kirill A. Shutemov	3fe8796	2017-02-24 14:58:16 -0800	[diff] [blame]	212	entry = pte_to_swp_entry(*pvmw.pte);
Alistair Popple	4dd845b	2021-06-30 18:54:09 -0700	[diff] [blame]	213	if (is_writable_migration_entry(entry))
Kirill A. Shutemov	3fe8796	2017-02-24 14:58:16 -0800	[diff] [blame]	214	pte = maybe_mkwrite(pte, vma);
Peter Xu	f45ec5f	2020-04-06 20:06:01 -0700	[diff] [blame]	215	else if (pte_swp_uffd_wp(*pvmw.pte))
				216	pte = pte_mkuffd_wp(pte);
Mel Gorman	d3cb8bf	2014-10-02 19:47:41 +0100	[diff] [blame]	217
Ralph Campbell	6128763	2020-09-04 16:36:04 -0700	[diff] [blame]	218	if (unlikely(is_device_private_page(new))) {
Alistair Popple	4dd845b	2021-06-30 18:54:09 -0700	[diff] [blame]	219	if (pte_write(pte))
				220	entry = make_writable_device_private_entry(
				221	page_to_pfn(new));
				222	else
				223	entry = make_readable_device_private_entry(
				224	page_to_pfn(new));
Ralph Campbell	6128763	2020-09-04 16:36:04 -0700	[diff] [blame]	225	pte = swp_entry_to_pte(entry);
Ralph Campbell	3d321bf8	2020-09-04 16:36:07 -0700	[diff] [blame]	226	if (pte_swp_soft_dirty(*pvmw.pte))
				227	pte = pte_swp_mksoft_dirty(pte);
Ralph Campbell	6128763	2020-09-04 16:36:04 -0700	[diff] [blame]	228	if (pte_swp_uffd_wp(*pvmw.pte))
				229	pte = pte_swp_mkuffd_wp(pte);
Lars Persson	d2b2c6dd	2019-03-28 20:44:28 -0700	[diff] [blame]	230	}
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	231
Andi Kleen	3ef8fd7	2010-10-11 16:03:21 +0200	[diff] [blame]	232	#ifdef CONFIG_HUGETLB_PAGE
Kirill A. Shutemov	3fe8796	2017-02-24 14:58:16 -0800	[diff] [blame]	233	if (PageHuge(new)) {
Christophe Leroy	79c1c59	2021-06-30 18:48:00 -0700	[diff] [blame]	234	unsigned int shift = huge_page_shift(hstate_vma(vma));
				235
Kirill A. Shutemov	3fe8796	2017-02-24 14:58:16 -0800	[diff] [blame]	236	pte = pte_mkhuge(pte);
Christophe Leroy	79c1c59	2021-06-30 18:48:00 -0700	[diff] [blame]	237	pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
Aneesh Kumar K.V	383321a	2017-07-06 15:38:41 -0700	[diff] [blame]	238	set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
Kirill A. Shutemov	3fe8796	2017-02-24 14:58:16 -0800	[diff] [blame]	239	if (PageAnon(new))
				240	hugepage_add_anon_rmap(new, vma, pvmw.address);
				241	else
				242	page_dup_rmap(new, true);
Aneesh Kumar K.V	383321a	2017-07-06 15:38:41 -0700	[diff] [blame]	243	} else
				244	#endif
				245	{
				246	set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
Christoph Lameter	04e62a2	2006-06-23 02:03:38 -0700	[diff] [blame]	247
Aneesh Kumar K.V	383321a	2017-07-06 15:38:41 -0700	[diff] [blame]	248	if (PageAnon(new))
				249	page_add_anon_rmap(new, vma, pvmw.address, false);
				250	else
				251	page_add_file_rmap(new, false);
				252	}
Kirill A. Shutemov	3fe8796	2017-02-24 14:58:16 -0800	[diff] [blame]	253	if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new))
				254	mlock_vma_page(new);
Hugh Dickins	51afb12	2015-11-05 18:49:37 -0800	[diff] [blame]	255
Kirill A. Shutemov	e125fe4	2018-10-05 15:51:41 -0700	[diff] [blame]	256	if (PageTransHuge(page) && PageMlocked(page))
				257	clear_page_mlock(page);
				258
Kirill A. Shutemov	3fe8796	2017-02-24 14:58:16 -0800	[diff] [blame]	259	/* No need to invalidate - it was non-present before */
				260	update_mmu_cache(vma, pvmw.address, pvmw.pte);
				261	}
				262
Minchan Kim	e4b8222	2017-05-03 14:54:27 -0700	[diff] [blame]	263	return true;
Christoph Lameter	0697212	2006-06-23 02:03:35 -0700	[diff] [blame]	264	}
				265
				266	/*
Christoph Lameter	04e62a2	2006-06-23 02:03:38 -0700	[diff] [blame]	267	* Get rid of all migration entries and replace them by
				268	* references to the indicated page.
				269	*/
Kirill A. Shutemov	e388466	2016-03-17 14:20:07 -0700	[diff] [blame]	270	void remove_migration_ptes(struct page *old, struct page *new, bool locked)
Christoph Lameter	04e62a2	2006-06-23 02:03:38 -0700	[diff] [blame]	271	{
Joonsoo Kim	051ac83	2014-01-21 15:49:48 -0800	[diff] [blame]	272	struct rmap_walk_control rwc = {
				273	.rmap_one = remove_migration_pte,
				274	.arg = old,
				275	};
				276
Kirill A. Shutemov	e388466	2016-03-17 14:20:07 -0700	[diff] [blame]	277	if (locked)
				278	rmap_walk_locked(new, &rwc);
				279	else
				280	rmap_walk(new, &rwc);
Christoph Lameter	04e62a2	2006-06-23 02:03:38 -0700	[diff] [blame]	281	}
				282
				283	/*
Christoph Lameter	0697212	2006-06-23 02:03:35 -0700	[diff] [blame]	284	* Something used the pte of a page under migration. We need to
				285	* get to the page and wait until migration is finished.
				286	* When we return from this function the fault will be retried.
Christoph Lameter	0697212	2006-06-23 02:03:35 -0700	[diff] [blame]	287	*/
Naoya Horiguchi	e66f17f	2015-02-11 15:25:22 -0800	[diff] [blame]	288	void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
Naoya Horiguchi	30dad30	2013-06-12 14:05:04 -0700	[diff] [blame]	289	spinlock_t *ptl)
Christoph Lameter	0697212	2006-06-23 02:03:35 -0700	[diff] [blame]	290	{
Naoya Horiguchi	30dad30	2013-06-12 14:05:04 -0700	[diff] [blame]	291	pte_t pte;
Christoph Lameter	0697212	2006-06-23 02:03:35 -0700	[diff] [blame]	292	swp_entry_t entry;
				293	struct page *page;
				294
Naoya Horiguchi	30dad30	2013-06-12 14:05:04 -0700	[diff] [blame]	295	spin_lock(ptl);
Christoph Lameter	0697212	2006-06-23 02:03:35 -0700	[diff] [blame]	296	pte = *ptep;
				297	if (!is_swap_pte(pte))
				298	goto out;
				299
				300	entry = pte_to_swp_entry(pte);
				301	if (!is_migration_entry(entry))
				302	goto out;
				303
Alistair Popple	af5cdaf	2021-06-30 18:54:06 -0700	[diff] [blame]	304	page = pfn_swap_entry_to_page(entry);
Xu Yu	ffc90cb	2021-06-15 18:23:42 -0700	[diff] [blame]	305	page = compound_head(page);
Christoph Lameter	0697212	2006-06-23 02:03:35 -0700	[diff] [blame]	306
Nick Piggin	e286781	2008-07-25 19:45:30 -0700	[diff] [blame]	307	/*
Matthew Wilcox	89eb946	2017-12-04 04:35:16 -0500	[diff] [blame]	308	* Once page cache replacement of page migration started, page_count
Hugh Dickins	9a1ea43	2018-12-28 00:36:14 -0800	[diff] [blame]	309	* is zero; but we must not call put_and_wait_on_page_locked() without
				310	* a ref. Use get_page_unless_zero(), and just fault again if it fails.
Nick Piggin	e286781	2008-07-25 19:45:30 -0700	[diff] [blame]	311	*/
				312	if (!get_page_unless_zero(page))
				313	goto out;
Christoph Lameter	0697212	2006-06-23 02:03:35 -0700	[diff] [blame]	314	pte_unmap_unlock(ptep, ptl);
Matthew Wilcox (Oracle)	4805462	2021-02-24 12:02:02 -0800	[diff] [blame]	315	put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
Christoph Lameter	0697212	2006-06-23 02:03:35 -0700	[diff] [blame]	316	return;
				317	out:
				318	pte_unmap_unlock(ptep, ptl);
				319	}
				320
Naoya Horiguchi	30dad30	2013-06-12 14:05:04 -0700	[diff] [blame]	321	void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
				322	unsigned long address)
				323	{
				324	spinlock_t *ptl = pte_lockptr(mm, pmd);
				325	pte_t *ptep = pte_offset_map(pmd, address);
				326	__migration_entry_wait(mm, ptep, ptl);
				327	}
				328
Kirill A. Shutemov	cb900f4	2013-11-14 14:31:02 -0800	[diff] [blame]	329	void migration_entry_wait_huge(struct vm_area_struct *vma,
				330	struct mm_struct *mm, pte_t *pte)
Naoya Horiguchi	30dad30	2013-06-12 14:05:04 -0700	[diff] [blame]	331	{
Kirill A. Shutemov	cb900f4	2013-11-14 14:31:02 -0800	[diff] [blame]	332	spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte);
Naoya Horiguchi	30dad30	2013-06-12 14:05:04 -0700	[diff] [blame]	333	__migration_entry_wait(mm, pte, ptl);
				334	}
				335
Zi Yan	616b837	2017-09-08 16:10:57 -0700	[diff] [blame]	336	#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
				337	void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
				338	{
				339	spinlock_t *ptl;
				340	struct page *page;
				341
				342	ptl = pmd_lock(mm, pmd);
				343	if (!is_pmd_migration_entry(*pmd))
				344	goto unlock;
Alistair Popple	af5cdaf	2021-06-30 18:54:06 -0700	[diff] [blame]	345	page = pfn_swap_entry_to_page(pmd_to_swp_entry(*pmd));
Zi Yan	616b837	2017-09-08 16:10:57 -0700	[diff] [blame]	346	if (!get_page_unless_zero(page))
				347	goto unlock;
				348	spin_unlock(ptl);
Matthew Wilcox (Oracle)	4805462	2021-02-24 12:02:02 -0800	[diff] [blame]	349	put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
Zi Yan	616b837	2017-09-08 16:10:57 -0700	[diff] [blame]	350	return;
				351	unlock:
				352	spin_unlock(ptl);
				353	}
				354	#endif
				355
Jan Kara	f900482	2019-03-05 15:48:46 -0800	[diff] [blame]	356	static int expected_page_refs(struct address_space *mapping, struct page *page)
Jan Kara	0b3901b	2018-12-28 00:39:01 -0800	[diff] [blame]	357	{
				358	int expected_count = 1;
				359
				360	/*
Ralph Campbell	f1f4f3a	2020-10-13 16:58:42 -0700	[diff] [blame]	361	* Device private pages have an extra refcount as they are
Jan Kara	0b3901b	2018-12-28 00:39:01 -0800	[diff] [blame]	362	* ZONE_DEVICE pages.
				363	*/
				364	expected_count += is_device_private_page(page);
Jan Kara	f900482	2019-03-05 15:48:46 -0800	[diff] [blame]	365	if (mapping)
Matthew Wilcox (Oracle)	6c35784	2020-08-14 17:30:37 -0700	[diff] [blame]	366	expected_count += thp_nr_pages(page) + page_has_private(page);
Jan Kara	0b3901b	2018-12-28 00:39:01 -0800	[diff] [blame]	367
				368	return expected_count;
				369	}
				370
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	371	/*
Christoph Lameter	c3fcf8a	2006-06-23 02:03:32 -0700	[diff] [blame]	372	* Replace the page in the mapping.
Christoph Lameter	5b5c712	2006-06-23 02:03:29 -0700	[diff] [blame]	373	*
				374	* The number of remaining references must be:
				375	* 1 for anonymous pages without a mapping
				376	* 2 for pages with a mapping
David Howells	266cf65	2009-04-03 16:42:36 +0100	[diff] [blame]	377	* 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	378	*/
Gu Zheng	36bc08c	2013-07-16 17:56:16 +0800	[diff] [blame]	379	int migrate_page_move_mapping(struct address_space *mapping,
Keith Busch	3710969	2019-07-18 15:58:46 -0700	[diff] [blame]	380	struct page *newpage, struct page *page, int extra_count)
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	381	{
Matthew Wilcox	89eb946	2017-12-04 04:35:16 -0500	[diff] [blame]	382	XA_STATE(xas, &mapping->i_pages, page_index(page));
Hugh Dickins	42cb14b	2015-11-05 18:50:05 -0800	[diff] [blame]	383	struct zone *oldzone, *newzone;
				384	int dirty;
Jan Kara	f900482	2019-03-05 15:48:46 -0800	[diff] [blame]	385	int expected_count = expected_page_refs(mapping, page) + extra_count;
Shakeel Butt	5c447d2	2021-01-23 21:01:15 -0800	[diff] [blame]	386	int nr = thp_nr_pages(page);
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	387
Christoph Lameter	6c5240a	2006-06-23 02:03:37 -0700	[diff] [blame]	388	if (!mapping) {
Christoph Lameter	0e8c7d0	2007-04-23 14:41:09 -0700	[diff] [blame]	389	/* Anonymous page without mapping */
Benjamin LaHaise	8e321fe	2013-12-21 17:56:08 -0500	[diff] [blame]	390	if (page_count(page) != expected_count)
Christoph Lameter	6c5240a	2006-06-23 02:03:37 -0700	[diff] [blame]	391	return -EAGAIN;
Hugh Dickins	cf4b769	2015-11-05 18:50:02 -0800	[diff] [blame]	392
				393	/* No turning back from here */
Hugh Dickins	cf4b769	2015-11-05 18:50:02 -0800	[diff] [blame]	394	newpage->index = page->index;
				395	newpage->mapping = page->mapping;
				396	if (PageSwapBacked(page))
Hugh Dickins	fa9949d	2016-05-19 17:12:41 -0700	[diff] [blame]	397	__SetPageSwapBacked(newpage);
Hugh Dickins	cf4b769	2015-11-05 18:50:02 -0800	[diff] [blame]	398
Rafael Aquini	78bd520	2012-12-11 16:02:31 -0800	[diff] [blame]	399	return MIGRATEPAGE_SUCCESS;
Christoph Lameter	6c5240a	2006-06-23 02:03:37 -0700	[diff] [blame]	400	}
				401
Hugh Dickins	42cb14b	2015-11-05 18:50:05 -0800	[diff] [blame]	402	oldzone = page_zone(page);
				403	newzone = page_zone(newpage);
				404
Matthew Wilcox	89eb946	2017-12-04 04:35:16 -0500	[diff] [blame]	405	xas_lock_irq(&xas);
Matthew Wilcox	89eb946	2017-12-04 04:35:16 -0500	[diff] [blame]	406	if (page_count(page) != expected_count || xas_load(&xas) != page) {
				407	xas_unlock_irq(&xas);
Christoph Lameter	e23ca00	2006-04-10 22:52:57 -0700	[diff] [blame]	408	return -EAGAIN;
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	409	}
				410
Joonsoo Kim	fe896d1	2016-03-17 14:19:26 -0700	[diff] [blame]	411	if (!page_ref_freeze(page, expected_count)) {
Matthew Wilcox	89eb946	2017-12-04 04:35:16 -0500	[diff] [blame]	412	xas_unlock_irq(&xas);
Nick Piggin	e286781	2008-07-25 19:45:30 -0700	[diff] [blame]	413	return -EAGAIN;
				414	}
				415
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	416	/*
Hugh Dickins	cf4b769	2015-11-05 18:50:02 -0800	[diff] [blame]	417	* Now we know that no one else is looking at the page:
				418	* no turning back from here.
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	419	*/
Hugh Dickins	cf4b769	2015-11-05 18:50:02 -0800	[diff] [blame]	420	newpage->index = page->index;
				421	newpage->mapping = page->mapping;
Shakeel Butt	5c447d2	2021-01-23 21:01:15 -0800	[diff] [blame]	422	page_ref_add(newpage, nr); /* add cache reference */
Nicholas Piggin	6326fec	2016-12-25 13:00:29 +1000	[diff] [blame]	423	if (PageSwapBacked(page)) {
				424	__SetPageSwapBacked(newpage);
				425	if (PageSwapCache(page)) {
				426	SetPageSwapCache(newpage);
				427	set_page_private(newpage, page_private(page));
				428	}
				429	} else {
				430	VM_BUG_ON_PAGE(PageSwapCache(page), page);
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	431	}
				432
Hugh Dickins	42cb14b	2015-11-05 18:50:05 -0800	[diff] [blame]	433	/* Move dirty while page refs frozen and newpage not yet exposed */
				434	dirty = PageDirty(page);
				435	if (dirty) {
				436	ClearPageDirty(page);
				437	SetPageDirty(newpage);
				438	}
				439
Matthew Wilcox	89eb946	2017-12-04 04:35:16 -0500	[diff] [blame]	440	xas_store(&xas, newpage);
Naoya Horiguchi	e71769a	2018-04-20 14:55:45 -0700	[diff] [blame]	441	if (PageTransHuge(page)) {
				442	int i;
Naoya Horiguchi	e71769a	2018-04-20 14:55:45 -0700	[diff] [blame]	443
Shakeel Butt	5c447d2	2021-01-23 21:01:15 -0800	[diff] [blame]	444	for (i = 1; i < nr; i++) {
Matthew Wilcox	89eb946	2017-12-04 04:35:16 -0500	[diff] [blame]	445	xas_next(&xas);
Matthew Wilcox (Oracle)	4101196	2019-09-23 15:34:52 -0700	[diff] [blame]	446	xas_store(&xas, newpage);
Naoya Horiguchi	e71769a	2018-04-20 14:55:45 -0700	[diff] [blame]	447	}
Naoya Horiguchi	e71769a	2018-04-20 14:55:45 -0700	[diff] [blame]	448	}
Nick Piggin	7cf9c2c	2006-12-06 20:33:44 -0800	[diff] [blame]	449
				450	/*
Jacobo Giralt	937a94c	2012-01-10 15:07:11 -0800	[diff] [blame]	451	* Drop cache reference from old page by unfreezing
				452	* to one less reference.
Nick Piggin	7cf9c2c	2006-12-06 20:33:44 -0800	[diff] [blame]	453	* We know this isn't the last reference.
				454	*/
Shakeel Butt	5c447d2	2021-01-23 21:01:15 -0800	[diff] [blame]	455	page_ref_unfreeze(page, expected_count - nr);
Nick Piggin	7cf9c2c	2006-12-06 20:33:44 -0800	[diff] [blame]	456
Matthew Wilcox	89eb946	2017-12-04 04:35:16 -0500	[diff] [blame]	457	xas_unlock(&xas);
Hugh Dickins	42cb14b	2015-11-05 18:50:05 -0800	[diff] [blame]	458	/* Leave irq disabled to prevent preemption while updating stats */
				459
Christoph Lameter	0e8c7d0	2007-04-23 14:41:09 -0700	[diff] [blame]	460	/*
				461	* If moved to a different zone then also account
				462	* the page for that zone. Other VM counters will be
				463	* taken care of when we establish references to the
				464	* new page and drop references to the old page.
				465	*
				466	* Note that anonymous pages are accounted for
Mel Gorman	4b9d0fa	2016-07-28 15:46:17 -0700	[diff] [blame]	467	* via NR_FILE_PAGES and NR_ANON_MAPPED if they
Christoph Lameter	0e8c7d0	2007-04-23 14:41:09 -0700	[diff] [blame]	468	* are mapped to swap space.
				469	*/
Hugh Dickins	42cb14b	2015-11-05 18:50:05 -0800	[diff] [blame]	470	if (newzone != oldzone) {
Johannes Weiner	0d1c207	2020-06-03 16:01:54 -0700	[diff] [blame]	471	struct lruvec *old_lruvec, *new_lruvec;
				472	struct mem_cgroup *memcg;
				473
				474	memcg = page_memcg(page);
				475	old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat);
				476	new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat);
				477
Shakeel Butt	5c447d2	2021-01-23 21:01:15 -0800	[diff] [blame]	478	__mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr);
				479	__mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr);
Hugh Dickins	42cb14b	2015-11-05 18:50:05 -0800	[diff] [blame]	480	if (PageSwapBacked(page) && !PageSwapCache(page)) {
Shakeel Butt	5c447d2	2021-01-23 21:01:15 -0800	[diff] [blame]	481	__mod_lruvec_state(old_lruvec, NR_SHMEM, -nr);
				482	__mod_lruvec_state(new_lruvec, NR_SHMEM, nr);
Hugh Dickins	42cb14b	2015-11-05 18:50:05 -0800	[diff] [blame]	483	}
Shakeel Butt	b603894	2021-02-24 12:03:55 -0800	[diff] [blame]	484	#ifdef CONFIG_SWAP
				485	if (PageSwapCache(page)) {
				486	__mod_lruvec_state(old_lruvec, NR_SWAPCACHE, -nr);
				487	__mod_lruvec_state(new_lruvec, NR_SWAPCACHE, nr);
				488	}
				489	#endif
Christoph Hellwig	f56753a	2020-09-24 08:51:40 +0200	[diff] [blame]	490	if (dirty && mapping_can_writeback(mapping)) {
Shakeel Butt	5c447d2	2021-01-23 21:01:15 -0800	[diff] [blame]	491	__mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr);
				492	__mod_zone_page_state(oldzone, NR_ZONE_WRITE_PENDING, -nr);
				493	__mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr);
				494	__mod_zone_page_state(newzone, NR_ZONE_WRITE_PENDING, nr);
Hugh Dickins	42cb14b	2015-11-05 18:50:05 -0800	[diff] [blame]	495	}
KOSAKI Motohiro	4b02108	2009-09-21 17:01:33 -0700	[diff] [blame]	496	}
Hugh Dickins	42cb14b	2015-11-05 18:50:05 -0800	[diff] [blame]	497	local_irq_enable();
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	498
Rafael Aquini	78bd520	2012-12-11 16:02:31 -0800	[diff] [blame]	499	return MIGRATEPAGE_SUCCESS;
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	500	}
Richard Weinberger	1118dce	2016-06-16 23:26:14 +0200	[diff] [blame]	501	EXPORT_SYMBOL(migrate_page_move_mapping);
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	502
				503	/*
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	504	* The expected number of remaining references is the same as that
				505	* of migrate_page_move_mapping().
				506	*/
				507	int migrate_huge_page_move_mapping(struct address_space *mapping,
				508	struct page *newpage, struct page *page)
				509	{
Matthew Wilcox	89eb946	2017-12-04 04:35:16 -0500	[diff] [blame]	510	XA_STATE(xas, &mapping->i_pages, page_index(page));
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	511	int expected_count;
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	512
Matthew Wilcox	89eb946	2017-12-04 04:35:16 -0500	[diff] [blame]	513	xas_lock_irq(&xas);
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	514	expected_count = 2 + page_has_private(page);
Matthew Wilcox	89eb946	2017-12-04 04:35:16 -0500	[diff] [blame]	515	if (page_count(page) != expected_count || xas_load(&xas) != page) {
				516	xas_unlock_irq(&xas);
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	517	return -EAGAIN;
				518	}
				519
Joonsoo Kim	fe896d1	2016-03-17 14:19:26 -0700	[diff] [blame]	520	if (!page_ref_freeze(page, expected_count)) {
Matthew Wilcox	89eb946	2017-12-04 04:35:16 -0500	[diff] [blame]	521	xas_unlock_irq(&xas);
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	522	return -EAGAIN;
				523	}
				524
Hugh Dickins	cf4b769	2015-11-05 18:50:02 -0800	[diff] [blame]	525	newpage->index = page->index;
				526	newpage->mapping = page->mapping;
Johannes Weiner	6a93ca8	2016-03-15 14:57:19 -0700	[diff] [blame]	527
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	528	get_page(newpage);
				529
Matthew Wilcox	89eb946	2017-12-04 04:35:16 -0500	[diff] [blame]	530	xas_store(&xas, newpage);
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	531
Joonsoo Kim	fe896d1	2016-03-17 14:19:26 -0700	[diff] [blame]	532	page_ref_unfreeze(page, expected_count - 1);
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	533
Matthew Wilcox	89eb946	2017-12-04 04:35:16 -0500	[diff] [blame]	534	xas_unlock_irq(&xas);
Johannes Weiner	6a93ca8	2016-03-15 14:57:19 -0700	[diff] [blame]	535
Rafael Aquini	78bd520	2012-12-11 16:02:31 -0800	[diff] [blame]	536	return MIGRATEPAGE_SUCCESS;
Naoya Horiguchi	290408d	2010-09-08 10:19:35 +0900	[diff] [blame]	537	}
				538
				539	/*
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	540	* Copy the page to its new location
				541	*/
Jérôme Glisse	2916ecc	2017-09-08 16:12:06 -0700	[diff] [blame]	542	void migrate_page_states(struct page newpage, struct page page)
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	543	{
Rik van Riel	7851a45	2013-10-07 11:29:23 +0100	[diff] [blame]	544	int cpupid;
				545
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	546	if (PageError(page))
				547	SetPageError(newpage);
				548	if (PageReferenced(page))
				549	SetPageReferenced(newpage);
				550	if (PageUptodate(page))
				551	SetPageUptodate(newpage);
Lee Schermerhorn	894bc31	2008-10-18 20:26:39 -0700	[diff] [blame]	552	if (TestClearPageActive(page)) {
Sasha Levin	309381fea	2014-01-23 15:52:54 -0800	[diff] [blame]	553	VM_BUG_ON_PAGE(PageUnevictable(page), page);
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	554	SetPageActive(newpage);
Lee Schermerhorn	418b27e	2009-12-14 17:59:54 -0800	[diff] [blame]	555	} else if (TestClearPageUnevictable(page))
				556	SetPageUnevictable(newpage);
Johannes Weiner	1899ad1	2018-10-26 15:06:04 -0700	[diff] [blame]	557	if (PageWorkingset(page))
				558	SetPageWorkingset(newpage);
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	559	if (PageChecked(page))
				560	SetPageChecked(newpage);
				561	if (PageMappedToDisk(page))
				562	SetPageMappedToDisk(newpage);
				563
Hugh Dickins	42cb14b	2015-11-05 18:50:05 -0800	[diff] [blame]	564	/* Move dirty on pages not done by migrate_page_move_mapping() */
				565	if (PageDirty(page))
				566	SetPageDirty(newpage);
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	567
Vladimir Davydov	33c3fc7	2015-09-09 15:35:45 -0700	[diff] [blame]	568	if (page_is_young(page))
				569	set_page_young(newpage);
				570	if (page_is_idle(page))
				571	set_page_idle(newpage);
				572
Rik van Riel	7851a45	2013-10-07 11:29:23 +0100	[diff] [blame]	573	/*
				574	* Copy NUMA information to the new page, to prevent over-eager
				575	* future migrations of this same page.
				576	*/
				577	cpupid = page_cpupid_xchg_last(page, -1);
				578	page_cpupid_xchg_last(newpage, cpupid);
				579
Hugh Dickins	e9995ef	2009-12-14 17:59:31 -0800	[diff] [blame]	580	ksm_migrate_page(newpage, page);
Hugh Dickins	c8d6553	2013-02-22 16:35:10 -0800	[diff] [blame]	581	/*
				582	* Please do not reorder this without considering how mm/ksm.c's
				583	* get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache().
				584	*/
Naoya Horiguchi	b3b3a99	2015-04-15 16:13:15 -0700	[diff] [blame]	585	if (PageSwapCache(page))
				586	ClearPageSwapCache(page);
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	587	ClearPagePrivate(page);
Muchun Song	ad2fa37	2021-06-30 18:47:21 -0700	[diff] [blame]	588
				589	/* page->private contains hugetlb specific flags */
				590	if (!PageHuge(page))
				591	set_page_private(page, 0);
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	592
				593	/*
				594	* If any waiters have accumulated on the new page then
				595	* wake them up.
				596	*/
				597	if (PageWriteback(newpage))
				598	end_page_writeback(newpage);
Vlastimil Babka	d435edc	2016-03-15 14:56:15 -0700	[diff] [blame]	599
Yang Shi	6aeff24	2020-04-06 20:04:21 -0700	[diff] [blame]	600	/*
				601	* PG_readahead shares the same bit with PG_reclaim. The above
				602	* end_page_writeback() may clear PG_readahead mistakenly, so set the
				603	* bit after that.
				604	*/
				605	if (PageReadahead(page))
				606	SetPageReadahead(newpage);
				607
Vlastimil Babka	d435edc	2016-03-15 14:56:15 -0700	[diff] [blame]	608	copy_page_owner(page, newpage);
Johannes Weiner	74485cf	2016-03-15 14:57:54 -0700	[diff] [blame]	609
Hugh Dickins	a333e3e	2020-09-18 21:20:06 -0700	[diff] [blame]	610	if (!PageHuge(page))
				611	mem_cgroup_migrate(page, newpage);
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	612	}
Jérôme Glisse	2916ecc	2017-09-08 16:12:06 -0700	[diff] [blame]	613	EXPORT_SYMBOL(migrate_page_states);
				614
				615	void migrate_page_copy(struct page newpage, struct page page)
				616	{
				617	if (PageHuge(page) \|\| PageTransHuge(page))
				618	copy_huge_page(newpage, page);
				619	else
				620	copy_highpage(newpage, page);
				621
				622	migrate_page_states(newpage, page);
				623	}
Richard Weinberger	1118dce	2016-06-16 23:26:14 +0200	[diff] [blame]	624	EXPORT_SYMBOL(migrate_page_copy);
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	625
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	626	/************************************************************
				627	* Migration functions
				628	***********************************************************/
				629
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	630	/*
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	631	* Common logic to directly migrate a single LRU page suitable for
David Howells	266cf65	2009-04-03 16:42:36 +0100	[diff] [blame]	632	* pages that do not use PagePrivate/PagePrivate2.
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	633	*
				634	* Pages are locked upon entry and exit.
				635	*/
Christoph Lameter	2d1db3b	2006-06-23 02:03:33 -0700	[diff] [blame]	636	int migrate_page(struct address_space *mapping,
Mel Gorman	a6bc32b	2012-01-12 17:19:43 -0800	[diff] [blame]	637	struct page newpage, struct page page,
				638	enum migrate_mode mode)
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	639	{
				640	int rc;
				641
				642	BUG_ON(PageWriteback(page)); /* Writeback must be complete */
				643
Keith Busch	3710969	2019-07-18 15:58:46 -0700	[diff] [blame]	644	rc = migrate_page_move_mapping(mapping, newpage, page, 0);
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	645
Rafael Aquini	78bd520	2012-12-11 16:02:31 -0800	[diff] [blame]	646	if (rc != MIGRATEPAGE_SUCCESS)
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	647	return rc;
				648
Jérôme Glisse	2916ecc	2017-09-08 16:12:06 -0700	[diff] [blame]	649	if (mode != MIGRATE_SYNC_NO_COPY)
				650	migrate_page_copy(newpage, page);
				651	else
				652	migrate_page_states(newpage, page);
Rafael Aquini	78bd520	2012-12-11 16:02:31 -0800	[diff] [blame]	653	return MIGRATEPAGE_SUCCESS;
Christoph Lameter	b20a350	2006-03-22 00:09:12 -0800	[diff] [blame]	654	}
				655	EXPORT_SYMBOL(migrate_page);
				656
David Howells	9361401	2006-09-30 20:45:40 +0200	[diff] [blame]	657	#ifdef CONFIG_BLOCK
Jan Kara	84ade7c	2018-12-28 00:39:09 -0800	[diff] [blame]	658	/* Returns true if all buffers are successfully locked */
				659	static bool buffer_migrate_lock_buffers(struct buffer_head *head,
				660	enum migrate_mode mode)
				661	{
				662	struct buffer_head *bh = head;
				663
				664	/* Simple case, sync compaction */
				665	if (mode != MIGRATE_ASYNC) {
				666	do {
Jan Kara	84ade7c	2018-12-28 00:39:09 -0800	[diff] [blame]	667	lock_buffer(bh);
				668	bh = bh->b_this_page;
				669
				670	} while (bh != head);
				671
				672	return true;
				673	}
				674
				675	/* async case, we cannot block on lock_buffer so use trylock_buffer */
				676	do {
Jan Kara	84ade7c	2018-12-28 00:39:09 -0800	[diff] [blame]	677	if (!trylock_buffer(bh)) {
				678	/*
				679	* We failed to lock the buffer and cannot stall in
				680	* async migration. Release the taken locks
				681	*/
				682	struct buffer_head *failed_bh = bh;
Jan Kara	84ade7c	2018-12-28 00:39:09 -0800	[diff] [blame]	683	bh = head;
				684	while (bh != failed_bh) {
				685	unlock_buffer(bh);
Jan Kara	84ade7c	2018-12-28 00:39:09 -0800	[diff] [blame]	686	bh = bh->b_this_page;
				687	}
				688	return false;
				689	}
				690
				691	bh = bh->b_this_page;
				692	} while (bh != head);
				693	return true;
				694	}
				695
Jan Kara	89cb088	2018-12-28 00:39:12 -0800	[diff] [blame]	696	static int __buffer_migrate_page(struct address_space *mapping,
				697	struct page newpage, struct page page, enum migrate_mode mode,
				698	bool check_refs)
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	699	{
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	700	struct buffer_head bh, head;
				701	int rc;
Jan Kara	cc4f11e	2018-12-28 00:39:05 -0800	[diff] [blame]	702	int expected_count;
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	703
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	704	if (!page_has_buffers(page))
Mel Gorman	a6bc32b	2012-01-12 17:19:43 -0800	[diff] [blame]	705	return migrate_page(mapping, newpage, page, mode);
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	706
Jan Kara	cc4f11e	2018-12-28 00:39:05 -0800	[diff] [blame]	707	/* Check whether page does not have extra refs before we do more work */
Jan Kara	f900482	2019-03-05 15:48:46 -0800	[diff] [blame]	708	expected_count = expected_page_refs(mapping, page);
Jan Kara	cc4f11e	2018-12-28 00:39:05 -0800	[diff] [blame]	709	if (page_count(page) != expected_count)
				710	return -EAGAIN;
				711
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	712	head = page_buffers(page);
Jan Kara	cc4f11e	2018-12-28 00:39:05 -0800	[diff] [blame]	713	if (!buffer_migrate_lock_buffers(head, mode))
				714	return -EAGAIN;
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	715
Jan Kara	89cb088	2018-12-28 00:39:12 -0800	[diff] [blame]	716	if (check_refs) {
				717	bool busy;
				718	bool invalidated = false;
				719
				720	recheck_buffers:
				721	busy = false;
				722	spin_lock(&mapping->private_lock);
				723	bh = head;
				724	do {
				725	if (atomic_read(&bh->b_count)) {
				726	busy = true;
				727	break;
				728	}
				729	bh = bh->b_this_page;
				730	} while (bh != head);
Jan Kara	89cb088	2018-12-28 00:39:12 -0800	[diff] [blame]	731	if (busy) {
				732	if (invalidated) {
				733	rc = -EAGAIN;
				734	goto unlock_buffers;
				735	}
Jan Kara	ebdf4de	2019-08-02 21:48:47 -0700	[diff] [blame]	736	spin_unlock(&mapping->private_lock);
Jan Kara	89cb088	2018-12-28 00:39:12 -0800	[diff] [blame]	737	invalidate_bh_lrus();
				738	invalidated = true;
				739	goto recheck_buffers;
				740	}
				741	}
				742
Keith Busch	3710969	2019-07-18 15:58:46 -0700	[diff] [blame]	743	rc = migrate_page_move_mapping(mapping, newpage, page, 0);
Rafael Aquini	78bd520	2012-12-11 16:02:31 -0800	[diff] [blame]	744	if (rc != MIGRATEPAGE_SUCCESS)
Jan Kara	cc4f11e	2018-12-28 00:39:05 -0800	[diff] [blame]	745	goto unlock_buffers;
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	746
Guoqing Jiang	cd0f371	2020-06-01 21:48:06 -0700	[diff] [blame]	747	attach_page_private(newpage, detach_page_private(page));
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	748
				749	bh = head;
				750	do {
				751	set_bh_page(bh, newpage, bh_offset(bh));
				752	bh = bh->b_this_page;
				753
				754	} while (bh != head);
				755
Jérôme Glisse	2916ecc	2017-09-08 16:12:06 -0700	[diff] [blame]	756	if (mode != MIGRATE_SYNC_NO_COPY)
				757	migrate_page_copy(newpage, page);
				758	else
				759	migrate_page_states(newpage, page);
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	760
Jan Kara	cc4f11e	2018-12-28 00:39:05 -0800	[diff] [blame]	761	rc = MIGRATEPAGE_SUCCESS;
				762	unlock_buffers:
Jan Kara	ebdf4de	2019-08-02 21:48:47 -0700	[diff] [blame]	763	if (check_refs)
				764	spin_unlock(&mapping->private_lock);
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	765	bh = head;
				766	do {
				767	unlock_buffer(bh);
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	768	bh = bh->b_this_page;
				769
				770	} while (bh != head);
				771
Jan Kara	cc4f11e	2018-12-28 00:39:05 -0800	[diff] [blame]	772	return rc;
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	773	}
Jan Kara	89cb088	2018-12-28 00:39:12 -0800	[diff] [blame]	774
				775	/*
				776	* Migration function for pages with buffers. This function can only be used
				777	* if the underlying filesystem guarantees that no other references to "page"
				778	* exist. For example attached buffer heads are accessed only under page lock.
				779	*/
				780	int buffer_migrate_page(struct address_space *mapping,
				781	struct page newpage, struct page page, enum migrate_mode mode)
				782	{
				783	return __buffer_migrate_page(mapping, newpage, page, mode, false);
				784	}
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	785	EXPORT_SYMBOL(buffer_migrate_page);
Jan Kara	89cb088	2018-12-28 00:39:12 -0800	[diff] [blame]	786
				787	/*
				788	* Same as above except that this variant is more careful and checks that there
				789	* are also no buffer head references. This function is the right one for
				790	* mappings where buffer heads are directly looked up and referenced (such as
				791	* block device mappings).
				792	*/
				793	int buffer_migrate_page_norefs(struct address_space *mapping,
				794	struct page newpage, struct page page, enum migrate_mode mode)
				795	{
				796	return __buffer_migrate_page(mapping, newpage, page, mode, true);
				797	}
David Howells	9361401	2006-09-30 20:45:40 +0200	[diff] [blame]	798	#endif
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	799
Christoph Lameter	04e62a2	2006-06-23 02:03:38 -0700	[diff] [blame]	800	/*
				801	* Writeback a page to clean the dirty state
				802	*/
				803	static int writeout(struct address_space mapping, struct page page)
				804	{
				805	struct writeback_control wbc = {
				806	.sync_mode = WB_SYNC_NONE,
				807	.nr_to_write = 1,
				808	.range_start = 0,
				809	.range_end = LLONG_MAX,
Christoph Lameter	04e62a2	2006-06-23 02:03:38 -0700	[diff] [blame]	810	.for_reclaim = 1
				811	};
				812	int rc;
				813
				814	if (!mapping->a_ops->writepage)
				815	/* No write method for the address space */
				816	return -EINVAL;
				817
				818	if (!clear_page_dirty_for_io(page))
				819	/* Someone else already triggered a write */
				820	return -EAGAIN;
				821
				822	/*
				823	* A dirty page may imply that the underlying filesystem has
				824	* the page on some queue. So the page must be clean for
				825	* migration. Writeout may mean we loose the lock and the
				826	* page state is no longer what we checked for earlier.
				827	* At this point we know that the migration attempt cannot
				828	* be successful.
				829	*/
Kirill A. Shutemov	e388466	2016-03-17 14:20:07 -0700	[diff] [blame]	830	remove_migration_ptes(page, page, false);
Christoph Lameter	04e62a2	2006-06-23 02:03:38 -0700	[diff] [blame]	831
				832	rc = mapping->a_ops->writepage(page, &wbc);
Christoph Lameter	04e62a2	2006-06-23 02:03:38 -0700	[diff] [blame]	833
				834	if (rc != AOP_WRITEPAGE_ACTIVATE)
				835	/* unlocked. Relock */
				836	lock_page(page);
				837
Hugh Dickins	bda8550	2008-11-19 15:36:36 -0800	[diff] [blame]	838	return (rc < 0) ? -EIO : -EAGAIN;
Christoph Lameter	04e62a2	2006-06-23 02:03:38 -0700	[diff] [blame]	839	}
				840
				841	/*
				842	* Default handling if a filesystem does not provide a migration function.
				843	*/
Christoph Lameter	8351a6e	2006-06-23 02:03:33 -0700	[diff] [blame]	844	static int fallback_migrate_page(struct address_space *mapping,
Mel Gorman	a6bc32b	2012-01-12 17:19:43 -0800	[diff] [blame]	845	struct page newpage, struct page page, enum migrate_mode mode)
Christoph Lameter	8351a6e	2006-06-23 02:03:33 -0700	[diff] [blame]	846	{
Mel Gorman	b969c4ab	2012-01-12 17:19:34 -0800	[diff] [blame]	847	if (PageDirty(page)) {
Mel Gorman	a6bc32b	2012-01-12 17:19:43 -0800	[diff] [blame]	848	/* Only writeback pages in full synchronous migration */
Jérôme Glisse	2916ecc	2017-09-08 16:12:06 -0700	[diff] [blame]	849	switch (mode) {
				850	case MIGRATE_SYNC:
				851	case MIGRATE_SYNC_NO_COPY:
				852	break;
				853	default:
Mel Gorman	b969c4ab	2012-01-12 17:19:34 -0800	[diff] [blame]	854	return -EBUSY;
Jérôme Glisse	2916ecc	2017-09-08 16:12:06 -0700	[diff] [blame]	855	}
Christoph Lameter	04e62a2	2006-06-23 02:03:38 -0700	[diff] [blame]	856	return writeout(mapping, page);
Mel Gorman	b969c4ab	2012-01-12 17:19:34 -0800	[diff] [blame]	857	}
Christoph Lameter	8351a6e	2006-06-23 02:03:33 -0700	[diff] [blame]	858
				859	/*
				860	* Buffers may be managed in a filesystem specific way.
				861	* We must have no buffers or drop them.
				862	*/
David Howells	266cf65	2009-04-03 16:42:36 +0100	[diff] [blame]	863	if (page_has_private(page) &&
Christoph Lameter	8351a6e	2006-06-23 02:03:33 -0700	[diff] [blame]	864	!try_to_release_page(page, GFP_KERNEL))
Mel Gorman	806031b	2019-03-05 15:44:43 -0800	[diff] [blame]	865	return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY;
Christoph Lameter	8351a6e	2006-06-23 02:03:33 -0700	[diff] [blame]	866
Mel Gorman	a6bc32b	2012-01-12 17:19:43 -0800	[diff] [blame]	867	return migrate_page(mapping, newpage, page, mode);
Christoph Lameter	8351a6e	2006-06-23 02:03:33 -0700	[diff] [blame]	868	}
				869
Christoph Lameter	1d8b85c	2006-06-23 02:03:28 -0700	[diff] [blame]	870	/*
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	871	* Move a page to a newly allocated page
				872	* The page is locked and all ptes have been successfully removed.
				873	*
				874	* The new page will have replaced the old page if this function
				875	* is successful.
Lee Schermerhorn	894bc31	2008-10-18 20:26:39 -0700	[diff] [blame]	876	*
				877	* Return value:
				878	* < 0 - error code
Rafael Aquini	78bd520	2012-12-11 16:02:31 -0800	[diff] [blame]	879	* MIGRATEPAGE_SUCCESS - success
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	880	*/
Mel Gorman	3fe2011	2010-05-24 14:32:20 -0700	[diff] [blame]	881	static int move_to_new_page(struct page newpage, struct page page,
Hugh Dickins	5c3f9a6	2015-11-05 18:49:53 -0800	[diff] [blame]	882	enum migrate_mode mode)
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	883	{
				884	struct address_space *mapping;
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	885	int rc = -EAGAIN;
				886	bool is_lru = !__PageMovable(page);
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	887
Hugh Dickins	7db7671	2015-11-05 18:49:49 -0800	[diff] [blame]	888	VM_BUG_ON_PAGE(!PageLocked(page), page);
				889	VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	890
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	891	mapping = page_mapping(page);
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	892
				893	if (likely(is_lru)) {
				894	if (!mapping)
				895	rc = migrate_page(mapping, newpage, page, mode);
				896	else if (mapping->a_ops->migratepage)
				897	/*
				898	* Most pages have a mapping and most filesystems
				899	* provide a migratepage callback. Anonymous pages
				900	* are part of swap space which also has its own
				901	* migratepage callback. This is the most common path
				902	* for page migration.
				903	*/
				904	rc = mapping->a_ops->migratepage(mapping, newpage,
				905	page, mode);
				906	else
				907	rc = fallback_migrate_page(mapping, newpage,
				908	page, mode);
				909	} else {
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	910	/*
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	911	* In case of non-lru page, it could be released after
				912	* isolation step. In that case, we shouldn't try migration.
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	913	*/
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	914	VM_BUG_ON_PAGE(!PageIsolated(page), page);
				915	if (!PageMovable(page)) {
				916	rc = MIGRATEPAGE_SUCCESS;
				917	__ClearPageIsolated(page);
				918	goto out;
				919	}
				920
				921	rc = mapping->a_ops->migratepage(mapping, newpage,
				922	page, mode);
				923	WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS &&
				924	!PageIsolated(page));
				925	}
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	926
Hugh Dickins	5c3f9a6	2015-11-05 18:49:53 -0800	[diff] [blame]	927	/*
				928	* When successful, old pagecache page->mapping must be cleared before
				929	* page is freed; but stats require that PageAnon be left as PageAnon.
				930	*/
				931	if (rc == MIGRATEPAGE_SUCCESS) {
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	932	if (__PageMovable(page)) {
				933	VM_BUG_ON_PAGE(!PageIsolated(page), page);
				934
				935	/*
				936	* We clear PG_movable under page_lock so any compactor
				937	* cannot try to migrate this page.
				938	*/
				939	__ClearPageIsolated(page);
				940	}
				941
				942	/*
Ralph Campbell	c23a0c9	2020-01-30 22:14:41 -0800	[diff] [blame]	943	* Anonymous and movable page->mapping will be cleared by
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	944	* free_pages_prepare so don't reset it here for keeping
				945	* the type to work PageAnon, for example.
				946	*/
				947	if (!PageMappingFlags(page))
Hugh Dickins	5c3f9a6	2015-11-05 18:49:53 -0800	[diff] [blame]	948	page->mapping = NULL;
Lars Persson	d2b2c6dd	2019-03-28 20:44:28 -0700	[diff] [blame]	949
Christoph Hellwig	25b2995	2019-06-13 22:50:49 +0200	[diff] [blame]	950	if (likely(!is_zone_device_page(newpage)))
Lars Persson	d2b2c6dd	2019-03-28 20:44:28 -0700	[diff] [blame]	951	flush_dcache_page(newpage);
				952
Mel Gorman	3fe2011	2010-05-24 14:32:20 -0700	[diff] [blame]	953	}
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	954	out:
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	955	return rc;
				956	}
				957
Minchan Kim	0dabec9	2011-10-31 17:06:57 -0700	[diff] [blame]	958	static int __unmap_and_move(struct page page, struct page newpage,
Hugh Dickins	9c620e2	2013-02-22 16:35:14 -0800	[diff] [blame]	959	int force, enum migrate_mode mode)
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	960	{
Minchan Kim	0dabec9	2011-10-31 17:06:57 -0700	[diff] [blame]	961	int rc = -EAGAIN;
Hugh Dickins	2ebba6b	2014-12-12 16:56:19 -0800	[diff] [blame]	962	int page_was_mapped = 0;
Mel Gorman	3f6c827	2010-05-24 14:32:17 -0700	[diff] [blame]	963	struct anon_vma *anon_vma = NULL;
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	964	bool is_lru = !__PageMovable(page);
Christoph Lameter	95a402c	2006-06-23 02:03:53 -0700	[diff] [blame]	965
Nick Piggin	529ae9a	2008-08-02 12:01:03 +0200	[diff] [blame]	966	if (!trylock_page(page)) {
Mel Gorman	a6bc32b	2012-01-12 17:19:43 -0800	[diff] [blame]	967	if (!force \|\| mode == MIGRATE_ASYNC)
Minchan Kim	0dabec9	2011-10-31 17:06:57 -0700	[diff] [blame]	968	goto out;
Mel Gorman	3e7d344	2011-01-13 15:45:56 -0800	[diff] [blame]	969
				970	/*
				971	* It's not safe for direct compaction to call lock_page.
				972	* For example, during page readahead pages are added locked
				973	* to the LRU. Later, when the IO completes the pages are
				974	* marked uptodate and unlocked. However, the queueing
				975	* could be merging multiple pages for one bio (e.g.
Matthew Wilcox (Oracle)	d438834	2020-06-01 21:47:02 -0700	[diff] [blame]	976	* mpage_readahead). If an allocation happens for the
Mel Gorman	3e7d344	2011-01-13 15:45:56 -0800	[diff] [blame]	977	* second or third page, the process can end up locking
				978	* the same page twice and deadlocking. Rather than
				979	* trying to be clever about what pages can be locked,
				980	* avoid the use of lock_page for direct compaction
				981	* altogether.
				982	*/
				983	if (current->flags & PF_MEMALLOC)
Minchan Kim	0dabec9	2011-10-31 17:06:57 -0700	[diff] [blame]	984	goto out;
Mel Gorman	3e7d344	2011-01-13 15:45:56 -0800	[diff] [blame]	985
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	986	lock_page(page);
				987	}
				988
				989	if (PageWriteback(page)) {
Andrea Arcangeli	11bc82d	2011-03-22 16:33:11 -0700	[diff] [blame]	990	/*
Jianguo Wu	fed5b64	2013-04-29 15:07:58 -0700	[diff] [blame]	991	* Only in the case of a full synchronous migration is it
Mel Gorman	a6bc32b	2012-01-12 17:19:43 -0800	[diff] [blame]	992	* necessary to wait for PageWriteback. In the async case,
				993	* the retry loop is too short and in the sync-light case,
				994	* the overhead of stalling is too much
Andrea Arcangeli	11bc82d	2011-03-22 16:33:11 -0700	[diff] [blame]	995	*/
Jérôme Glisse	2916ecc	2017-09-08 16:12:06 -0700	[diff] [blame]	996	switch (mode) {
				997	case MIGRATE_SYNC:
				998	case MIGRATE_SYNC_NO_COPY:
				999	break;
				1000	default:
Andrea Arcangeli	11bc82d	2011-03-22 16:33:11 -0700	[diff] [blame]	1001	rc = -EBUSY;
Johannes Weiner	0a31bc9	2014-08-08 14:19:22 -0700	[diff] [blame]	1002	goto out_unlock;
Andrea Arcangeli	11bc82d	2011-03-22 16:33:11 -0700	[diff] [blame]	1003	}
				1004	if (!force)
Johannes Weiner	0a31bc9	2014-08-08 14:19:22 -0700	[diff] [blame]	1005	goto out_unlock;
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	1006	wait_on_page_writeback(page);
				1007	}
Hugh Dickins	03f15c8	2015-11-05 18:49:56 -0800	[diff] [blame]	1008
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	1009	/*
KAMEZAWA Hiroyuki	dc386d4	2007-07-26 10:41:07 -0700	[diff] [blame]	1010	* By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
				1011	* we cannot notice that anon_vma is freed while we migrates a page.
Hugh Dickins	1ce82b6	2011-01-13 15:47:30 -0800	[diff] [blame]	1012	* This get_anon_vma() delays freeing anon_vma pointer until the end
KAMEZAWA Hiroyuki	dc386d4	2007-07-26 10:41:07 -0700	[diff] [blame]	1013	* of migration. File cache pages are no problem because of page_lock()
KAMEZAWA Hiroyuki	989f89c	2007-08-30 23:56:21 -0700	[diff] [blame]	1014	* File Caches may use write_page() or lock_page() in migration, then,
				1015	* just care Anon page here.
Hugh Dickins	03f15c8	2015-11-05 18:49:56 -0800	[diff] [blame]	1016	*
				1017	* Only page_get_anon_vma() understands the subtleties of
				1018	* getting a hold on an anon_vma from outside one of its mms.
				1019	* But if we cannot get anon_vma, then we won't need it anyway,
				1020	* because that implies that the anon page is no longer mapped
				1021	* (and cannot be remapped so long as we hold the page lock).
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	1022	*/
Hugh Dickins	03f15c8	2015-11-05 18:49:56 -0800	[diff] [blame]	1023	if (PageAnon(page) && !PageKsm(page))
Peter Zijlstra	746b18d	2011-05-24 17:12:10 -0700	[diff] [blame]	1024	anon_vma = page_get_anon_vma(page);
Shaohua Li	62e1c55	2008-02-04 22:29:33 -0800	[diff] [blame]	1025
Hugh Dickins	7db7671	2015-11-05 18:49:49 -0800	[diff] [blame]	1026	/*
				1027	* Block others from accessing the new page when we get around to
				1028	* establishing additional references. We are usually the only one
				1029	* holding a reference to newpage at this point. We used to have a BUG
				1030	* here if trylock_page(newpage) fails, but would like to allow for
				1031	* cases where there might be a race with the previous use of newpage.
				1032	* This is much like races on refcount of oldpage: just don't BUG().
				1033	*/
				1034	if (unlikely(!trylock_page(newpage)))
				1035	goto out_unlock;
				1036
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	1037	if (unlikely(!is_lru)) {
				1038	rc = move_to_new_page(newpage, page, mode);
				1039	goto out_unlock_both;
				1040	}
				1041
KAMEZAWA Hiroyuki	dc386d4	2007-07-26 10:41:07 -0700	[diff] [blame]	1042	/*
Shaohua Li	62e1c55	2008-02-04 22:29:33 -0800	[diff] [blame]	1043	* Corner case handling:
				1044	* 1. When a new swap-cache page is read into, it is added to the LRU
				1045	* and treated as swapcache but it has no rmap yet.
				1046	* Calling try_to_unmap() against a page->mapping==NULL page will
				1047	* trigger a BUG. So handle it here.
Yang Shi	d12b895	2020-12-14 19:13:02 -0800	[diff] [blame]	1048	* 2. An orphaned page (see truncate_cleanup_page) might have
Shaohua Li	62e1c55	2008-02-04 22:29:33 -0800	[diff] [blame]	1049	* fs-private metadata. The page can be picked up due to memory
				1050	* offlining. Everywhere else except page reclaim, the page is
				1051	* invisible to the vm, so the page can not be migrated. So try to
				1052	* free the metadata, so the page can be freed.
KAMEZAWA Hiroyuki	dc386d4	2007-07-26 10:41:07 -0700	[diff] [blame]	1053	*/
Shaohua Li	62e1c55	2008-02-04 22:29:33 -0800	[diff] [blame]	1054	if (!page->mapping) {
Sasha Levin	309381fea	2014-01-23 15:52:54 -0800	[diff] [blame]	1055	VM_BUG_ON_PAGE(PageAnon(page), page);
Hugh Dickins	1ce82b6	2011-01-13 15:47:30 -0800	[diff] [blame]	1056	if (page_has_private(page)) {
Shaohua Li	62e1c55	2008-02-04 22:29:33 -0800	[diff] [blame]	1057	try_to_free_buffers(page);
Hugh Dickins	7db7671	2015-11-05 18:49:49 -0800	[diff] [blame]	1058	goto out_unlock_both;
Shaohua Li	62e1c55	2008-02-04 22:29:33 -0800	[diff] [blame]	1059	}
Hugh Dickins	7db7671	2015-11-05 18:49:49 -0800	[diff] [blame]	1060	} else if (page_mapped(page)) {
				1061	/* Establish migration ptes */
Hugh Dickins	03f15c8	2015-11-05 18:49:56 -0800	[diff] [blame]	1062	VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
				1063	page);
Alistair Popple	a98a2f0	2021-06-30 18:54:16 -0700	[diff] [blame]	1064	try_to_migrate(page, 0);
Hugh Dickins	2ebba6b	2014-12-12 16:56:19 -0800	[diff] [blame]	1065	page_was_mapped = 1;
				1066	}
KAMEZAWA Hiroyuki	dc386d4	2007-07-26 10:41:07 -0700	[diff] [blame]	1067
Christoph Lameter	e6a1530	2006-06-25 05:46:49 -0700	[diff] [blame]	1068	if (!page_mapped(page))
Hugh Dickins	5c3f9a6	2015-11-05 18:49:53 -0800	[diff] [blame]	1069	rc = move_to_new_page(newpage, page, mode);
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	1070
Hugh Dickins	5c3f9a6	2015-11-05 18:49:53 -0800	[diff] [blame]	1071	if (page_was_mapped)
				1072	remove_migration_ptes(page,
Kirill A. Shutemov	e388466	2016-03-17 14:20:07 -0700	[diff] [blame]	1073	rc == MIGRATEPAGE_SUCCESS ? newpage : page, false);
Mel Gorman	3f6c827	2010-05-24 14:32:17 -0700	[diff] [blame]	1074
Hugh Dickins	7db7671	2015-11-05 18:49:49 -0800	[diff] [blame]	1075	out_unlock_both:
				1076	unlock_page(newpage);
				1077	out_unlock:
Mel Gorman	3f6c827	2010-05-24 14:32:17 -0700	[diff] [blame]	1078	/* Drop an anon_vma reference if we took one */
Rik van Riel	7654506	2010-08-09 17:18:41 -0700	[diff] [blame]	1079	if (anon_vma)
Peter Zijlstra	9e60109	2011-03-22 16:32:46 -0700	[diff] [blame]	1080	put_anon_vma(anon_vma);
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	1081	unlock_page(page);
Minchan Kim	0dabec9	2011-10-31 17:06:57 -0700	[diff] [blame]	1082	out:
Minchan Kim	c6c919e	2016-07-26 15:23:02 -0700	[diff] [blame]	1083	/*
				1084	* If migration is successful, decrease refcount of the newpage
				1085	* which will not free the page because new page owner increased
				1086	* refcounter. As well, if it is LRU page, add the page to LRU
David Hildenbrand	e0a352f	2019-02-01 14:21:19 -0800	[diff] [blame]	1087	* list in here. Use the old state of the isolated source page to
				1088	* determine if we migrated a LRU page. newpage was already unlocked
				1089	* and possibly modified by its owner - don't rely on the page
				1090	* state.
Minchan Kim	c6c919e	2016-07-26 15:23:02 -0700	[diff] [blame]	1091	*/
				1092	if (rc == MIGRATEPAGE_SUCCESS) {
David Hildenbrand	e0a352f	2019-02-01 14:21:19 -0800	[diff] [blame]	1093	if (unlikely(!is_lru))
Minchan Kim	c6c919e	2016-07-26 15:23:02 -0700	[diff] [blame]	1094	put_page(newpage);
				1095	else
				1096	putback_lru_page(newpage);
				1097	}
				1098
Minchan Kim	0dabec9	2011-10-31 17:06:57 -0700	[diff] [blame]	1099	return rc;
				1100	}
Christoph Lameter	95a402c	2006-06-23 02:03:53 -0700	[diff] [blame]	1101
Dave Hansen	79c28a4	2021-09-02 14:59:06 -0700	[diff] [blame^]	1102
				1103	/*
				1104	* node_demotion[] example:
				1105	*
				1106	* Consider a system with two sockets. Each socket has
				1107	* three classes of memory attached: fast, medium and slow.
				1108	* Each memory class is placed in its own NUMA node. The
				1109	* CPUs are placed in the node with the "fast" memory. The
				1110	* 6 NUMA nodes (0-5) might be split among the sockets like
				1111	* this:
				1112	*
				1113	* Socket A: 0, 1, 2
				1114	* Socket B: 3, 4, 5
				1115	*
				1116	* When Node 0 fills up, its memory should be migrated to
				1117	* Node 1. When Node 1 fills up, it should be migrated to
				1118	* Node 2. The migration paths start on the nodes with the
				1119	* processors (since allocations default to this node) and
				1120	* fast memory, progress through medium and end with the
				1121	* slow memory:
				1122	*
				1123	* 0 -> 1 -> 2 -> stop
				1124	* 3 -> 4 -> 5 -> stop
				1125	*
				1126	* This is represented in the node_demotion[] like this:
				1127	*
				1128	* { 1, // Node 0 migrates to 1
				1129	* 2, // Node 1 migrates to 2
				1130	* -1, // Node 2 does not migrate
				1131	* 4, // Node 3 migrates to 4
				1132	* 5, // Node 4 migrates to 5
				1133	* -1} // Node 5 does not migrate
				1134	*/
				1135
				1136	/*
				1137	* Writes to this array occur without locking. Cycles are
				1138	* not allowed: Node X demotes to Y which demotes to X...
				1139	*
				1140	* If multiple reads are performed, a single rcu_read_lock()
				1141	* must be held over all reads to ensure that no cycles are
				1142	* observed.
				1143	*/
				1144	static int node_demotion[MAX_NUMNODES] __read_mostly =
				1145	{[0 ... MAX_NUMNODES - 1] = NUMA_NO_NODE};
				1146
				1147	/**
				1148	* next_demotion_node() - Get the next node in the demotion path
				1149	* @node: The starting node to lookup the next node
				1150	*
				1151	* @returns: node id for next memory node in the demotion path hierarchy
				1152	* from @node; NUMA_NO_NODE if @node is terminal. This does not keep
				1153	* @node online or guarantee that it continues to be the next demotion
				1154	* target.
				1155	*/
				1156	int next_demotion_node(int node)
				1157	{
				1158	int target;
				1159
				1160	/*
				1161	* node_demotion[] is updated without excluding this
				1162	* function from running. RCU doesn't provide any
				1163	* compiler barriers, so the READ_ONCE() is required
				1164	* to avoid compiler reordering or read merging.
				1165	*
				1166	* Make sure to use RCU over entire code blocks if
				1167	* node_demotion[] reads need to be consistent.
				1168	*/
				1169	rcu_read_lock();
				1170	target = READ_ONCE(node_demotion[node]);
				1171	rcu_read_unlock();
				1172
				1173	return target;
				1174	}
				1175
Minchan Kim	0dabec9	2011-10-31 17:06:57 -0700	[diff] [blame]	1176	/*
				1177	* Obtain the lock on page, remove all ptes and migrate the page
				1178	* to the newly allocated page in newpage.
				1179	*/
Linus Torvalds	6ec4476	2020-07-08 10:48:35 -0700	[diff] [blame]	1180	static int unmap_and_move(new_page_t get_new_page,
Geert Uytterhoeven	ef2a515	2015-04-14 15:44:22 -0700	[diff] [blame]	1181	free_page_t put_new_page,
				1182	unsigned long private, struct page *page,
Naoya Horiguchi	add05ce	2015-06-24 16:56:50 -0700	[diff] [blame]	1183	int force, enum migrate_mode mode,
Yang Shi	dd4ae78	2020-12-14 19:13:06 -0800	[diff] [blame]	1184	enum migrate_reason reason,
				1185	struct list_head *ret)
Minchan Kim	0dabec9	2011-10-31 17:06:57 -0700	[diff] [blame]	1186	{
Hugh Dickins	2def742	2015-11-05 18:49:46 -0800	[diff] [blame]	1187	int rc = MIGRATEPAGE_SUCCESS;
Yang Shi	74d4a57	2019-11-30 17:57:12 -0800	[diff] [blame]	1188	struct page *newpage = NULL;
Minchan Kim	0dabec9	2011-10-31 17:06:57 -0700	[diff] [blame]	1189
Michal Hocko	94723aa	2018-04-10 16:30:07 -0700	[diff] [blame]	1190	if (!thp_migration_supported() && PageTransHuge(page))
Yang Shi	d532e2e	2020-12-14 19:13:16 -0800	[diff] [blame]	1191	return -ENOSYS;
Michal Hocko	94723aa	2018-04-10 16:30:07 -0700	[diff] [blame]	1192
Minchan Kim	0dabec9	2011-10-31 17:06:57 -0700	[diff] [blame]	1193	if (page_count(page) == 1) {
				1194	/* page was freed from under us. So we are done. */
Minchan Kim	c6c919e	2016-07-26 15:23:02 -0700	[diff] [blame]	1195	ClearPageActive(page);
				1196	ClearPageUnevictable(page);
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	1197	if (unlikely(__PageMovable(page))) {
				1198	lock_page(page);
				1199	if (!PageMovable(page))
				1200	__ClearPageIsolated(page);
				1201	unlock_page(page);
				1202	}
Minchan Kim	0dabec9	2011-10-31 17:06:57 -0700	[diff] [blame]	1203	goto out;
				1204	}
				1205
Yang Shi	74d4a57	2019-11-30 17:57:12 -0800	[diff] [blame]	1206	newpage = get_new_page(page, private);
				1207	if (!newpage)
				1208	return -ENOMEM;
				1209
Hugh Dickins	9c620e2	2013-02-22 16:35:14 -0800	[diff] [blame]	1210	rc = __unmap_and_move(page, newpage, force, mode);
Minchan Kim	c6c919e	2016-07-26 15:23:02 -0700	[diff] [blame]	1211	if (rc == MIGRATEPAGE_SUCCESS)
Vlastimil Babka	7cd12b4	2016-03-15 14:56:18 -0700	[diff] [blame]	1212	set_page_owner_migrate_reason(newpage, reason);
Rafael Aquini	bf6bddf1	2012-12-11 16:02:42 -0800	[diff] [blame]	1213
Minchan Kim	0dabec9	2011-10-31 17:06:57 -0700	[diff] [blame]	1214	out:
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	1215	if (rc != -EAGAIN) {
Minchan Kim	0dabec9	2011-10-31 17:06:57 -0700	[diff] [blame]	1216	/*
				1217	* A page that has been migrated has all references
				1218	* removed and will be freed. A page that has not been
Ralph Campbell	c23a0c9	2020-01-30 22:14:41 -0800	[diff] [blame]	1219	* migrated will have kept its references and be restored.
Minchan Kim	0dabec9	2011-10-31 17:06:57 -0700	[diff] [blame]	1220	*/
				1221	list_del(&page->lru);
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	1222	}
David Rientjes	68711a7	2014-06-04 16:08:25 -0700	[diff] [blame]	1223
Christoph Lameter	95a402c	2006-06-23 02:03:53 -0700	[diff] [blame]	1224	/*
Minchan Kim	c6c919e	2016-07-26 15:23:02 -0700	[diff] [blame]	1225	* If migration is successful, releases reference grabbed during
				1226	* isolation. Otherwise, restore the page to right list unless
				1227	* we want to retry.
Christoph Lameter	95a402c	2006-06-23 02:03:53 -0700	[diff] [blame]	1228	*/
Minchan Kim	c6c919e	2016-07-26 15:23:02 -0700	[diff] [blame]	1229	if (rc == MIGRATEPAGE_SUCCESS) {
Yang Shi	dd4ae78	2020-12-14 19:13:06 -0800	[diff] [blame]	1230	/*
				1231	* Compaction can migrate also non-LRU pages which are
				1232	* not accounted to NR_ISOLATED_*. They can be recognized
				1233	* as __PageMovable
				1234	*/
				1235	if (likely(!__PageMovable(page)))
				1236	mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
				1237	page_is_file_lru(page), -thp_nr_pages(page));
				1238
Oscar Salvador	79f5f8f	2020-10-15 20:07:09 -0700	[diff] [blame]	1239	if (reason != MR_MEMORY_FAILURE)
Minchan Kim	c6c919e	2016-07-26 15:23:02 -0700	[diff] [blame]	1240	/*
Oscar Salvador	79f5f8f	2020-10-15 20:07:09 -0700	[diff] [blame]	1241	* We release the page in page_handle_poison.
Minchan Kim	c6c919e	2016-07-26 15:23:02 -0700	[diff] [blame]	1242	*/
Oscar Salvador	79f5f8f	2020-10-15 20:07:09 -0700	[diff] [blame]	1243	put_page(page);
Minchan Kim	c6c919e	2016-07-26 15:23:02 -0700	[diff] [blame]	1244	} else {
Yang Shi	dd4ae78	2020-12-14 19:13:06 -0800	[diff] [blame]	1245	if (rc != -EAGAIN)
				1246	list_add_tail(&page->lru, ret);
Minchan Kim	bda807d	2016-07-26 15:23:05 -0700	[diff] [blame]	1247
Minchan Kim	c6c919e	2016-07-26 15:23:02 -0700	[diff] [blame]	1248	if (put_new_page)
				1249	put_new_page(newpage, private);
				1250	else
				1251	put_page(newpage);
				1252	}
David Rientjes	68711a7	2014-06-04 16:08:25 -0700	[diff] [blame]	1253
Christoph Lameter	e24f0b8	2006-06-23 02:03:51 -0700	[diff] [blame]	1254	return rc;
				1255	}
				1256
/*
 * Counterpart of unmap_and_move() for hugepage migration.
 *
 * This function doesn't wait the completion of hugepage I/O
 * because there is no race between I/O and migration for hugepage.
 * Note that currently hugepage I/O occurs only in direct I/O
 * where no lock is held and PG_writeback is irrelevant,
 * and writeback status of all subpages are counted in the reference
 * count of the head page (i.e. if all subpages of a 2MB hugepage are
 * under direct I/O, the reference of the head page is 512 and a bit more.)
 * This means that when we try to migrate hugepage whose subpages are
 * doing direct I/O, some references remain after try_to_unmap() and
 * hugepage migration fails without data corruption.
 *
 * There is also no race when direct I/O is issued on the page under migration,
 * because then pte is replaced with migration swap entry and direct I/O code
 * will wait in the page fault for migration to complete.
 *
 * Result handling done here:
 *   MIGRATEPAGE_SUCCESS - @hpage is handed to putback_active_hugepage()
 *   -EAGAIN             - @hpage is left on its list (also the initial value
 *                         of rc, so it is what lock contention returns)
 *   -ENOMEM             - no destination page, @hpage is left on its list
 *   other errors        - @hpage is moved to the tail of @ret
 */
static int unmap_and_move_huge_page(new_page_t get_new_page,
				free_page_t put_new_page, unsigned long private,
				struct page *hpage, int force,
				enum migrate_mode mode, int reason,
				struct list_head *ret)
{
	int rc = -EAGAIN;
	int page_was_mapped = 0;
	struct page *new_hpage;
	struct anon_vma *anon_vma = NULL;
	struct address_space *mapping = NULL;

	/*
	 * Migratability of hugepages depends on architectures and their size.
	 * This check is necessary because some callers of hugepage migration
	 * like soft offline and memory hotremove don't walk through page
	 * tables or check whether the hugepage is pmd-based or not before
	 * kicking migration.
	 */
	if (!hugepage_migration_supported(page_hstate(hpage))) {
		list_move_tail(&hpage->lru, ret);
		return -ENOSYS;
	}

	if (page_count(hpage) == 1) {
		/* page was freed from under us. So we are done. */
		putback_active_hugepage(hpage);
		return MIGRATEPAGE_SUCCESS;
	}

	new_hpage = get_new_page(hpage, private);
	if (!new_hpage)
		return -ENOMEM;

	/*
	 * Only block on the page lock when the caller forces it and the
	 * mode is one of the synchronous ones; otherwise bail out with
	 * rc still set to -EAGAIN.
	 */
	if (!trylock_page(hpage)) {
		if (!force)
			goto out;
		switch (mode) {
		case MIGRATE_SYNC:
		case MIGRATE_SYNC_NO_COPY:
			break;
		default:
			goto out;
		}
		lock_page(hpage);
	}

	/*
	 * Check for pages which are in the process of being freed.  Without
	 * page_mapping() set, hugetlbfs specific move page routine will not
	 * be called and we could leak usage counts for subpools.
	 */
	if (hugetlb_page_subpool(hpage) && !page_mapping(hpage)) {
		rc = -EBUSY;
		goto out_unlock;
	}

	/* Reference dropped again at put_anon below. */
	if (PageAnon(hpage))
		anon_vma = page_get_anon_vma(hpage);

	/* Never wait for the destination page lock; rc stays -EAGAIN. */
	if (unlikely(!trylock_page(new_hpage)))
		goto put_anon;

	if (page_mapped(hpage)) {
		bool mapping_locked = false;
		enum ttu_flags ttu = 0;

		if (!PageAnon(hpage)) {
			/*
			 * In shared mappings, try_to_unmap could potentially
			 * call huge_pmd_unshare.  Because of this, take
			 * semaphore in write mode here and set TTU_RMAP_LOCKED
			 * to let lower levels know we have taken the lock.
			 */
			mapping = hugetlb_page_mapping_lock_write(hpage);
			if (unlikely(!mapping))
				goto unlock_put_anon;

			mapping_locked = true;
			ttu |= TTU_RMAP_LOCKED;
		}

		try_to_migrate(hpage, ttu);
		page_was_mapped = 1;

		if (mapping_locked)
			i_mmap_unlock_write(mapping);
	}

	/* Only move the page if it is no longer mapped anywhere. */
	if (!page_mapped(hpage))
		rc = move_to_new_page(new_hpage, hpage, mode);

	/*
	 * Remove the migration ptes again, pointing them at the new page
	 * on success and back at the old page otherwise.
	 */
	if (page_was_mapped)
		remove_migration_ptes(hpage,
			rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage, false);

unlock_put_anon:
	unlock_page(new_hpage);

put_anon:
	if (anon_vma)
		put_anon_vma(anon_vma);

	if (rc == MIGRATEPAGE_SUCCESS) {
		move_hugetlb_state(hpage, new_hpage, reason);
		/* The new page is in use now: do not hand it to the callback. */
		put_new_page = NULL;
	}

out_unlock:
	unlock_page(hpage);
out:
	/*
	 * A page that should be retried (-EAGAIN) is left on its list;
	 * any other failure is collected on @ret.
	 */
	if (rc == MIGRATEPAGE_SUCCESS)
		putback_active_hugepage(hpage);
	else if (rc != -EAGAIN)
		list_move_tail(&hpage->lru, ret);

	/*
	 * If migration was not successful and there's a freeing callback, use
	 * it to release the unused new page (put_new_page was cleared above
	 * on success). In every other case new_hpage is handed to
	 * putback_active_hugepage().
	 */
	if (put_new_page)
		put_new_page(new_hpage, private);
	else
		putback_active_hugepage(new_hpage);

	return rc;
}
				1403
Yang Shi	d532e2e	2020-12-14 19:13:16 -0800	[diff] [blame]	1404	static inline int try_split_thp(struct page page, struct page *page2,
				1405	struct list_head *from)
				1406	{
				1407	int rc = 0;
				1408
				1409	lock_page(page);
				1410	rc = split_huge_page_to_list(page, from);
				1411	unlock_page(page);
				1412	if (!rc)
				1413	list_safe_reset_next(page, *page2, lru);
				1414
				1415	return rc;
				1416	}
				1417
/*
 * migrate_pages - migrate the pages specified in a list, to the free pages
 *		   supplied as the target for the page migration
 *
 * @from:		The list of pages to be migrated.
 * @get_new_page:	The function used to allocate free pages to be used
 *			as the target of the page migration.
 * @put_new_page:	The function used to free target pages if migration
 *			fails, or NULL if no special handling is necessary.
 * @private:		Private data to be passed on to get_new_page()
 * @mode:		The migration mode that specifies the constraints for
 *			page migration, if any.
 * @reason:		The reason for page migration.
 *
 * The function returns after 10 attempts or if no pages are movable any more
 * because the list has become empty or no retryable pages exist any more.
 * It is caller's responsibility to call putback_movable_pages() to return pages
 * to the LRU or free list only if the return value is != 0.
 *
 * Pages that failed permanently are collected on a local list while the
 * passes run and are spliced back onto @from before returning, so on a
 * non-zero return @from holds every page that was not migrated.
 *
 * Returns the number of pages that were not migrated, or an error code
 * (-ENOMEM when a target page could not be allocated, which also stops
 * the migration of the remaining pages).
 */
int migrate_pages(struct list_head *from, new_page_t get_new_page,
		free_page_t put_new_page, unsigned long private,
		enum migrate_mode mode, int reason)
{
	/* Start at 1 so that the first pass is entered. */
	int retry = 1;
	int thp_retry = 1;
	int nr_failed = 0;
	int nr_succeeded = 0;
	int nr_thp_succeeded = 0;
	int nr_thp_failed = 0;
	int nr_thp_split = 0;
	int pass = 0;
	bool is_thp = false;
	struct page *page;
	struct page *page2;
	int swapwrite = current->flags & PF_SWAPWRITE;
	int rc, nr_subpages;
	/* Permanently failed pages, spliced back onto @from at the end. */
	LIST_HEAD(ret_pages);
	bool nosplit = (reason == MR_NUMA_MISPLACED);

	trace_mm_migrate_pages_start(mode, reason);

	/*
	 * Set PF_SWAPWRITE for the duration of the call unless the task
	 * already had it; it is cleared again before returning.
	 */
	if (!swapwrite)
		current->flags |= PF_SWAPWRITE;

	/* Keep going while the previous pass left something to retry. */
	for (pass = 0; pass < 10 && (retry || thp_retry); pass++) {
		retry = 0;
		thp_retry = 0;

		list_for_each_entry_safe(page, page2, from, lru) {
retry:
			/*
			 * THP statistics is based on the source huge page.
			 * Capture required information that might get lost
			 * during migration.
			 */
			is_thp = PageTransHuge(page) && !PageHuge(page);
			nr_subpages = thp_nr_pages(page);
			cond_resched();

			/* "pass > 2" is the force argument of both helpers. */
			if (PageHuge(page))
				rc = unmap_and_move_huge_page(get_new_page,
						put_new_page, private, page,
						pass > 2, mode, reason,
						&ret_pages);
			else
				rc = unmap_and_move(get_new_page, put_new_page,
						private, page, pass > 2, mode,
						reason, &ret_pages);
			/*
			 * The rules are:
			 *	Success: non hugetlb page will be freed, hugetlb
			 *		 page will be put back
			 *	-EAGAIN: stay on the from list
			 *	-ENOMEM: stay on the from list
			 *	Other errno: put on ret_pages list then splice to
			 *		     from list
			 */
			switch(rc) {
			/*
			 * THP migration might be unsupported or the
			 * allocation could've failed so we should
			 * retry on the same page with the THP split
			 * to base pages.
			 *
			 * Head page is retried immediately and tail
			 * pages are added to the tail of the list so
			 * we encounter them after the rest of the list
			 * is processed.
			 */
			case -ENOSYS:
				/* THP migration is unsupported */
				if (is_thp) {
					if (!try_split_thp(page, &page2, from)) {
						nr_thp_split++;
						goto retry;
					}

					/* Split failed: the whole THP counts as failed. */
					nr_thp_failed++;
					nr_failed += nr_subpages;
					break;
				}

				/* Hugetlb migration is unsupported */
				nr_failed++;
				break;
			case -ENOMEM:
				/*
				 * When memory is low, don't bother to try to migrate
				 * other pages, just exit.
				 * THP NUMA faulting doesn't split THP to retry.
				 */
				if (is_thp && !nosplit) {
					if (!try_split_thp(page, &page2, from)) {
						nr_thp_split++;
						goto retry;
					}

					nr_thp_failed++;
					nr_failed += nr_subpages;
					goto out;
				}
				nr_failed++;
				/* rc is still -ENOMEM and becomes the return value. */
				goto out;
			case -EAGAIN:
				if (is_thp) {
					thp_retry++;
					break;
				}
				retry++;
				break;
			case MIGRATEPAGE_SUCCESS:
				if (is_thp) {
					nr_thp_succeeded++;
					nr_succeeded += nr_subpages;
					break;
				}
				nr_succeeded++;
				break;
			default:
				/*
				 * Permanent failure (-EBUSY, etc.):
				 * unlike -EAGAIN case, the failed page is
				 * removed from migration page list and not
				 * retried in the next outer loop.
				 */
				if (is_thp) {
					nr_thp_failed++;
					nr_failed += nr_subpages;
					break;
				}
				nr_failed++;
				break;
			}
		}
	}
	/*
	 * Whatever was still waiting for a retry when the passes ended is
	 * accounted as failed, one count per list entry.
	 */
	nr_failed += retry + thp_retry;
	nr_thp_failed += thp_retry;
	rc = nr_failed;
out:
	/*
	 * Put the permanent failure page back to migration list, they
	 * will be put back to the right list by the caller.
	 */
	list_splice(&ret_pages, from);

	count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
	count_vm_events(PGMIGRATE_FAIL, nr_failed);
	count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded);
	count_vm_events(THP_MIGRATION_FAIL, nr_thp_failed);
	count_vm_events(THP_MIGRATION_SPLIT, nr_thp_split);
	trace_mm_migrate_pages(nr_succeeded, nr_failed, nr_thp_succeeded,
			       nr_thp_failed, nr_thp_split, mode, reason);

	if (!swapwrite)
		current->flags &= ~PF_SWAPWRITE;

	return rc;
}
				1598
Joonsoo Kim	19fc7be	2020-08-11 18:37:25 -0700	[diff] [blame]	1599	struct page alloc_migration_target(struct page page, unsigned long private)
Joonsoo Kim	b4b3822	2020-08-11 18:37:14 -0700	[diff] [blame]	1600	{
Joonsoo Kim	19fc7be	2020-08-11 18:37:25 -0700	[diff] [blame]	1601	struct migration_target_control *mtc;
				1602	gfp_t gfp_mask;
Joonsoo Kim	b4b3822	2020-08-11 18:37:14 -0700	[diff] [blame]	1603	unsigned int order = 0;
				1604	struct page *new_page = NULL;
Joonsoo Kim	19fc7be	2020-08-11 18:37:25 -0700	[diff] [blame]	1605	int nid;
				1606	int zidx;
				1607
				1608	mtc = (struct migration_target_control *)private;
				1609	gfp_mask = mtc->gfp_mask;
				1610	nid = mtc->nid;
				1611	if (nid == NUMA_NO_NODE)
				1612	nid = page_to_nid(page);
Joonsoo Kim	b4b3822	2020-08-11 18:37:14 -0700	[diff] [blame]	1613
Joonsoo Kim	d92bbc2	2020-08-11 18:37:17 -0700	[diff] [blame]	1614	if (PageHuge(page)) {
				1615	struct hstate *h = page_hstate(compound_head(page));
				1616
Joonsoo Kim	19fc7be	2020-08-11 18:37:25 -0700	[diff] [blame]	1617	gfp_mask = htlb_modify_alloc_mask(h, gfp_mask);
				1618	return alloc_huge_page_nodemask(h, nid, mtc->nmask, gfp_mask);
Joonsoo Kim	d92bbc2	2020-08-11 18:37:17 -0700	[diff] [blame]	1619	}
Joonsoo Kim	b4b3822	2020-08-11 18:37:14 -0700	[diff] [blame]	1620
				1621	if (PageTransHuge(page)) {
Joonsoo Kim	9933a0c	2020-08-11 18:37:20 -0700	[diff] [blame]	1622	/*
				1623	* clear __GFP_RECLAIM to make the migration callback
				1624	* consistent with regular THP allocations.
				1625	*/
				1626	gfp_mask &= ~__GFP_RECLAIM;
Joonsoo Kim	b4b3822	2020-08-11 18:37:14 -0700	[diff] [blame]	1627	gfp_mask \|= GFP_TRANSHUGE;
				1628	order = HPAGE_PMD_ORDER;
				1629	}
Joonsoo Kim	19fc7be	2020-08-11 18:37:25 -0700	[diff] [blame]	1630	zidx = zone_idx(page_zone(page));
				1631	if (is_highmem_idx(zidx) \|\| zidx == ZONE_MOVABLE)
Joonsoo Kim	b4b3822	2020-08-11 18:37:14 -0700	[diff] [blame]	1632	gfp_mask \|= __GFP_HIGHMEM;
				1633
Matthew Wilcox (Oracle)	84172f4	2021-04-29 23:01:15 -0700	[diff] [blame]	1634	new_page = __alloc_pages(gfp_mask, order, nid, mtc->nmask);
Joonsoo Kim	b4b3822	2020-08-11 18:37:14 -0700	[diff] [blame]	1635
				1636	if (new_page && PageTransHuge(new_page))
				1637	prep_transhuge_page(new_page);
				1638
				1639	return new_page;
				1640	}
				1641
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1642	#ifdef CONFIG_NUMA
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1643
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1644	static int store_status(int __user *status, int start, int value, int nr)
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1645	{
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1646	while (nr-- > 0) {
				1647	if (put_user(value, status + start))
				1648	return -EFAULT;
				1649	start++;
				1650	}
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1651
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1652	return 0;
				1653	}
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1654
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1655	static int do_move_pages_to_node(struct mm_struct *mm,
				1656	struct list_head *pagelist, int node)
				1657	{
				1658	int err;
Joonsoo Kim	a097631	2020-08-11 18:37:28 -0700	[diff] [blame]	1659	struct migration_target_control mtc = {
				1660	.nid = node,
				1661	.gfp_mask = GFP_HIGHUSER_MOVABLE \| __GFP_THISNODE,
				1662	};
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1663
Joonsoo Kim	a097631	2020-08-11 18:37:28 -0700	[diff] [blame]	1664	err = migrate_pages(pagelist, alloc_migration_target, NULL,
				1665	(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1666	if (err)
				1667	putback_movable_pages(pagelist);
				1668	return err;
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1669	}
				1670
				1671	/*
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1672	* Resolves the given address to a struct page, isolates it from the LRU and
				1673	* puts it to the given pagelist.
Yang Shi	e0153fc	2020-01-04 12:59:46 -0800	[diff] [blame]	1674	* Returns:
				1675	* errno - if the page cannot be found/isolated
				1676	* 0 - when it doesn't have to be migrated because it is already on the
				1677	* target node
				1678	* 1 - when it has been queued
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1679	*/
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1680	static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
				1681	int node, struct list_head *pagelist, bool migrate_all)
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1682	{
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1683	struct vm_area_struct *vma;
				1684	struct page *page;
				1685	unsigned int follflags;
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1686	int err;
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1687
Michel Lespinasse	d8ed45c	2020-06-08 21:33:25 -0700	[diff] [blame]	1688	mmap_read_lock(mm);
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1689	err = -EFAULT;
				1690	vma = find_vma(mm, addr);
				1691	if (!vma \|\| addr < vma->vm_start \|\| !vma_migratable(vma))
				1692	goto out;
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1693
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1694	/* FOLL_DUMP to ignore special (like zero) pages */
				1695	follflags = FOLL_GET \| FOLL_DUMP;
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1696	page = follow_page(vma, addr, follflags);
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1697
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1698	err = PTR_ERR(page);
				1699	if (IS_ERR(page))
				1700	goto out;
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1701
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1702	err = -ENOENT;
				1703	if (!page)
				1704	goto out;
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1705
Brice Goglin	e78bbfa	2008-10-18 20:27:15 -0700	[diff] [blame]	1706	err = 0;
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1707	if (page_to_nid(page) == node)
				1708	goto out_putpage;
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1709
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1710	err = -EACCES;
				1711	if (page_mapcount(page) > 1 && !migrate_all)
				1712	goto out_putpage;
				1713
				1714	if (PageHuge(page)) {
				1715	if (PageHead(page)) {
				1716	isolate_huge_page(page, pagelist);
Yang Shi	e0153fc	2020-01-04 12:59:46 -0800	[diff] [blame]	1717	err = 1;
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1718	}
				1719	} else {
				1720	struct page *head;
				1721
				1722	head = compound_head(page);
				1723	err = isolate_lru_page(head);
				1724	if (err)
				1725	goto out_putpage;
				1726
Yang Shi	e0153fc	2020-01-04 12:59:46 -0800	[diff] [blame]	1727	err = 1;
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1728	list_add_tail(&head->lru, pagelist);
				1729	mod_node_page_state(page_pgdat(head),
Huang Ying	9de4f22	2020-04-06 20:04:41 -0700	[diff] [blame]	1730	NR_ISOLATED_ANON + page_is_file_lru(head),
Matthew Wilcox (Oracle)	6c35784	2020-08-14 17:30:37 -0700	[diff] [blame]	1731	thp_nr_pages(head));
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1732	}
				1733	out_putpage:
				1734	/*
				1735	* Either remove the duplicate refcount from
				1736	* isolate_lru_page() or drop the page ref if it was
				1737	* not isolated.
				1738	*/
				1739	put_page(page);
				1740	out:
Michel Lespinasse	d8ed45c	2020-06-08 21:33:25 -0700	[diff] [blame]	1741	mmap_read_unlock(mm);
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1742	return err;
				1743	}
				1744
Wei Yang	7ca8783	2020-04-06 20:04:12 -0700	[diff] [blame]	1745	static int move_pages_and_store_status(struct mm_struct *mm, int node,
				1746	struct list_head pagelist, int __user status,
				1747	int start, int i, unsigned long nr_pages)
				1748	{
				1749	int err;
				1750
Wei Yang	5d7ae89	2020-04-06 20:04:15 -0700	[diff] [blame]	1751	if (list_empty(pagelist))
				1752	return 0;
				1753
Wei Yang	7ca8783	2020-04-06 20:04:12 -0700	[diff] [blame]	1754	err = do_move_pages_to_node(mm, pagelist, node);
				1755	if (err) {
				1756	/*
				1757	* Positive err means the number of failed
				1758	* pages to migrate. Since we are going to
				1759	* abort and return the number of non-migrated
Long Li	ab9dd4f	2020-12-14 19:12:52 -0800	[diff] [blame]	1760	* pages, so need to include the rest of the
Wei Yang	7ca8783	2020-04-06 20:04:12 -0700	[diff] [blame]	1761	* nr_pages that have not been attempted as
				1762	* well.
				1763	*/
				1764	if (err > 0)
				1765	err += nr_pages - i - 1;
				1766	return err;
				1767	}
				1768	return store_status(status, start, node, i - start);
				1769	}
				1770
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1771	/*
Brice Goglin	5e9a0f0	2008-10-18 20:27:17 -0700	[diff] [blame]	1772	* Migrate an array of page address onto an array of nodes and fill
				1773	* the corresponding array of status.
				1774	*/
Christoph Lameter	3268c63	2012-03-21 16:34:06 -0700	[diff] [blame]	1775	static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
Brice Goglin	5e9a0f0	2008-10-18 20:27:17 -0700	[diff] [blame]	1776	unsigned long nr_pages,
				1777	const void __user * __user *pages,
				1778	const int __user *nodes,
				1779	int __user *status, int flags)
				1780	{
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1781	int current_node = NUMA_NO_NODE;
				1782	LIST_HEAD(pagelist);
				1783	int start, i;
				1784	int err = 0, err1;
Brice Goglin	35282a2	2009-06-16 15:32:43 -0700	[diff] [blame]	1785
Minchan Kim	361a2a2	2021-05-04 18:36:57 -0700	[diff] [blame]	1786	lru_cache_disable();
Brice Goglin	35282a2	2009-06-16 15:32:43 -0700	[diff] [blame]	1787
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1788	for (i = start = 0; i < nr_pages; i++) {
				1789	const void __user *p;
				1790	unsigned long addr;
				1791	int node;
Brice Goglin	5e9a0f0	2008-10-18 20:27:17 -0700	[diff] [blame]	1792
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1793	err = -EFAULT;
				1794	if (get_user(p, pages + i))
				1795	goto out_flush;
				1796	if (get_user(node, nodes + i))
				1797	goto out_flush;
Andrey Konovalov	057d3389	2019-09-25 16:48:30 -0700	[diff] [blame]	1798	addr = (unsigned long)untagged_addr(p);
Brice Goglin	5e9a0f0	2008-10-18 20:27:17 -0700	[diff] [blame]	1799
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1800	err = -ENODEV;
				1801	if (node < 0 \|\| node >= MAX_NUMNODES)
				1802	goto out_flush;
				1803	if (!node_state(node, N_MEMORY))
				1804	goto out_flush;
Brice Goglin	3140a22	2009-01-06 14:38:57 -0800	[diff] [blame]	1805
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1806	err = -EACCES;
				1807	if (!node_isset(node, task_nodes))
				1808	goto out_flush;
Brice Goglin	5e9a0f0	2008-10-18 20:27:17 -0700	[diff] [blame]	1809
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1810	if (current_node == NUMA_NO_NODE) {
				1811	current_node = node;
				1812	start = i;
				1813	} else if (node != current_node) {
Wei Yang	7ca8783	2020-04-06 20:04:12 -0700	[diff] [blame]	1814	err = move_pages_and_store_status(mm, current_node,
				1815	&pagelist, status, start, i, nr_pages);
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1816	if (err)
				1817	goto out;
				1818	start = i;
				1819	current_node = node;
Brice Goglin	3140a22	2009-01-06 14:38:57 -0800	[diff] [blame]	1820	}
Brice Goglin	5e9a0f0	2008-10-18 20:27:17 -0700	[diff] [blame]	1821
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1822	/*
				1823	* Errors in the page lookup or isolation are not fatal and we simply
				1824	* report them via status
				1825	*/
				1826	err = add_page_for_migration(mm, addr, current_node,
				1827	&pagelist, flags & MPOL_MF_MOVE_ALL);
Yang Shi	e0153fc	2020-01-04 12:59:46 -0800	[diff] [blame]	1828
Wei Yang	d08221a	2020-04-06 20:04:18 -0700	[diff] [blame]	1829	if (err > 0) {
Yang Shi	e0153fc	2020-01-04 12:59:46 -0800	[diff] [blame]	1830	/* The page is successfully queued for migration */
				1831	continue;
				1832	}
Brice Goglin	3140a22	2009-01-06 14:38:57 -0800	[diff] [blame]	1833
Wei Yang	d08221a	2020-04-06 20:04:18 -0700	[diff] [blame]	1834	/*
				1835	* If the page is already on the target node (!err), store the
				1836	* node, otherwise, store the err.
				1837	*/
				1838	err = store_status(status, i, err ? : current_node, 1);
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1839	if (err)
				1840	goto out_flush;
Brice Goglin	3140a22	2009-01-06 14:38:57 -0800	[diff] [blame]	1841
Wei Yang	7ca8783	2020-04-06 20:04:12 -0700	[diff] [blame]	1842	err = move_pages_and_store_status(mm, current_node, &pagelist,
				1843	status, start, i, nr_pages);
Wei Yang	4afdace	2020-04-06 20:04:09 -0700	[diff] [blame]	1844	if (err)
				1845	goto out;
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1846	current_node = NUMA_NO_NODE;
Brice Goglin	3140a22	2009-01-06 14:38:57 -0800	[diff] [blame]	1847	}
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1848	out_flush:
				1849	/* Make sure we do not overwrite the existing error */
Wei Yang	7ca8783	2020-04-06 20:04:12 -0700	[diff] [blame]	1850	err1 = move_pages_and_store_status(mm, current_node, &pagelist,
				1851	status, start, i, nr_pages);
Wei Yang	dfe9aa2	2020-01-30 22:11:14 -0800	[diff] [blame]	1852	if (err >= 0)
Michal Hocko	a49bd4d	2018-04-10 16:29:59 -0700	[diff] [blame]	1853	err = err1;
Brice Goglin	5e9a0f0	2008-10-18 20:27:17 -0700	[diff] [blame]	1854	out:
Minchan Kim	361a2a2	2021-05-04 18:36:57 -0700	[diff] [blame]	1855	lru_cache_enable();
Brice Goglin	5e9a0f0	2008-10-18 20:27:17 -0700	[diff] [blame]	1856	return err;
				1857	}
				1858
				1859	/*
Brice Goglin	2f007e7	2008-10-18 20:27:16 -0700	[diff] [blame]	1860	* Determine the nodes of an array of pages and store it in an array of status.
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1861	*/
Brice Goglin	80bba12	2008-12-09 13:14:23 -0800	[diff] [blame]	1862	static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
				1863	const void __user *pages, int status)
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1864	{
Brice Goglin	2f007e7	2008-10-18 20:27:16 -0700	[diff] [blame]	1865	unsigned long i;
Brice Goglin	2f007e7	2008-10-18 20:27:16 -0700	[diff] [blame]	1866
Michel Lespinasse	d8ed45c	2020-06-08 21:33:25 -0700	[diff] [blame]	1867	mmap_read_lock(mm);
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1868
Brice Goglin	2f007e7	2008-10-18 20:27:16 -0700	[diff] [blame]	1869	for (i = 0; i < nr_pages; i++) {
Brice Goglin	80bba12	2008-12-09 13:14:23 -0800	[diff] [blame]	1870	unsigned long addr = (unsigned long)(*pages);
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1871	struct vm_area_struct *vma;
				1872	struct page *page;
KOSAKI Motohiro	c095adb	2008-12-16 16:06:43 +0900	[diff] [blame]	1873	int err = -EFAULT;
Brice Goglin	2f007e7	2008-10-18 20:27:16 -0700	[diff] [blame]	1874
Liam Howlett	059b8b4	2021-06-28 19:39:44 -0700	[diff] [blame]	1875	vma = vma_lookup(mm, addr);
				1876	if (!vma)
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1877	goto set_status;
				1878
Kirill A. Shutemov	d899844	2015-09-04 15:47:53 -0700	[diff] [blame]	1879	/* FOLL_DUMP to ignore special (like zero) pages */
				1880	page = follow_page(vma, addr, FOLL_DUMP);
Linus Torvalds	89f5b7d	2008-06-20 11:18:25 -0700	[diff] [blame]	1881
				1882	err = PTR_ERR(page);
				1883	if (IS_ERR(page))
				1884	goto set_status;
				1885
Kirill A. Shutemov	d899844	2015-09-04 15:47:53 -0700	[diff] [blame]	1886	err = page ? page_to_nid(page) : -ENOENT;
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1887	set_status:
Brice Goglin	80bba12	2008-12-09 13:14:23 -0800	[diff] [blame]	1888	*status = err;
				1889
				1890	pages++;
				1891	status++;
				1892	}
				1893
Michel Lespinasse	d8ed45c	2020-06-08 21:33:25 -0700	[diff] [blame]	1894	mmap_read_unlock(mm);
Brice Goglin	80bba12	2008-12-09 13:14:23 -0800	[diff] [blame]	1895	}
				1896
				1897	/*
				1898	* Determine the nodes of a user array of pages and store it in
				1899	* a user array of status.
				1900	*/
				1901	static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
				1902	const void __user * __user *pages,
				1903	int __user *status)
				1904	{
				1905	#define DO_PAGES_STAT_CHUNK_NR 16
				1906	const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
				1907	int chunk_status[DO_PAGES_STAT_CHUNK_NR];
Brice Goglin	80bba12	2008-12-09 13:14:23 -0800	[diff] [blame]	1908
H. Peter Anvin	87b8d1a	2010-02-18 16:13:40 -0800	[diff] [blame]	1909	while (nr_pages) {
				1910	unsigned long chunk_nr;
Brice Goglin	80bba12	2008-12-09 13:14:23 -0800	[diff] [blame]	1911
H. Peter Anvin	87b8d1a	2010-02-18 16:13:40 -0800	[diff] [blame]	1912	chunk_nr = nr_pages;
				1913	if (chunk_nr > DO_PAGES_STAT_CHUNK_NR)
				1914	chunk_nr = DO_PAGES_STAT_CHUNK_NR;
				1915
				1916	if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages)))
				1917	break;
Brice Goglin	80bba12	2008-12-09 13:14:23 -0800	[diff] [blame]	1918
				1919	do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
				1920
H. Peter Anvin	87b8d1a	2010-02-18 16:13:40 -0800	[diff] [blame]	1921	if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
				1922	break;
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1923
H. Peter Anvin	87b8d1a	2010-02-18 16:13:40 -0800	[diff] [blame]	1924	pages += chunk_nr;
				1925	status += chunk_nr;
				1926	nr_pages -= chunk_nr;
				1927	}
				1928	return nr_pages ? -EFAULT : 0;
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1929	}
				1930
Miaohe Lin	4dc200c	2020-10-17 16:14:03 -0700	[diff] [blame]	1931	static struct mm_struct find_mm_struct(pid_t pid, nodemask_t mem_nodes)
				1932	{
				1933	struct task_struct *task;
				1934	struct mm_struct *mm;
				1935
				1936	/*
				1937	* There is no need to check if current process has the right to modify
				1938	* the specified process when they are same.
				1939	*/
				1940	if (!pid) {
				1941	mmget(current->mm);
				1942	*mem_nodes = cpuset_mems_allowed(current);
				1943	return current->mm;
				1944	}
				1945
				1946	/* Find the mm_struct */
				1947	rcu_read_lock();
				1948	task = find_task_by_vpid(pid);
				1949	if (!task) {
				1950	rcu_read_unlock();
				1951	return ERR_PTR(-ESRCH);
				1952	}
				1953	get_task_struct(task);
				1954
				1955	/*
				1956	* Check if this process has the right to modify the specified
				1957	* process. Use the regular "ptrace_may_access()" checks.
				1958	*/
				1959	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
				1960	rcu_read_unlock();
				1961	mm = ERR_PTR(-EPERM);
				1962	goto out;
				1963	}
				1964	rcu_read_unlock();
				1965
				1966	mm = ERR_PTR(security_task_movememory(task));
				1967	if (IS_ERR(mm))
				1968	goto out;
				1969	*mem_nodes = cpuset_mems_allowed(task);
				1970	mm = get_task_mm(task);
				1971	out:
				1972	put_task_struct(task);
				1973	if (!mm)
				1974	mm = ERR_PTR(-EINVAL);
				1975	return mm;
				1976	}
				1977
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1978	/*
				1979	* Move a list of pages in the address space of the currently executing
				1980	* process.
				1981	*/
Dominik Brodowski	7addf44	2018-03-17 16:08:03 +0100	[diff] [blame]	1982	static int kernel_move_pages(pid_t pid, unsigned long nr_pages,
				1983	const void __user * __user *pages,
				1984	const int __user *nodes,
				1985	int __user *status, int flags)
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1986	{
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1987	struct mm_struct *mm;
Brice Goglin	5e9a0f0	2008-10-18 20:27:17 -0700	[diff] [blame]	1988	int err;
Christoph Lameter	3268c63	2012-03-21 16:34:06 -0700	[diff] [blame]	1989	nodemask_t task_nodes;
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	1990
				1991	/* Check flags */
				1992	if (flags & ~(MPOL_MF_MOVE\|MPOL_MF_MOVE_ALL))
				1993	return -EINVAL;
				1994
				1995	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
				1996	return -EPERM;
				1997
Miaohe Lin	4dc200c	2020-10-17 16:14:03 -0700	[diff] [blame]	1998	mm = find_mm_struct(pid, &task_nodes);
				1999	if (IS_ERR(mm))
				2000	return PTR_ERR(mm);
Sasha Levin	6e8b09e	2012-04-25 16:01:53 -0700	[diff] [blame]	2001
				2002	if (nodes)
				2003	err = do_pages_move(mm, task_nodes, nr_pages, pages,
				2004	nodes, status, flags);
				2005	else
				2006	err = do_pages_stat(mm, nr_pages, pages, status);
Christoph Lameter	3268c63	2012-03-21 16:34:06 -0700	[diff] [blame]	2007
				2008	mmput(mm);
				2009	return err;
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	2010	}
Christoph Lameter	742755a	2006-06-23 02:03:55 -0700	[diff] [blame]	2011
Dominik Brodowski	7addf44	2018-03-17 16:08:03 +0100	[diff] [blame]	2012	SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
				2013	const void __user * __user *, pages,
				2014	const int __user *, nodes,
				2015	int __user *, status, int, flags)
				2016	{
				2017	return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags);
				2018	}
				2019
				2020	#ifdef CONFIG_COMPAT
				2021	COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages,
				2022	compat_uptr_t __user *, pages32,
				2023	const int __user *, nodes,
				2024	int __user *, status,
				2025	int, flags)
				2026	{
				2027	const void __user * __user *pages;
				2028	int i;
				2029
				2030	pages = compat_alloc_user_space(nr_pages * sizeof(void *));
				2031	for (i = 0; i < nr_pages; i++) {
				2032	compat_uptr_t p;
				2033
				2034	if (get_user(p, pages32 + i) \|\|
				2035	put_user(compat_ptr(p), pages + i))
				2036	return -EFAULT;
				2037	}
				2038	return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags);
				2039	}
				2040	#endif /* CONFIG_COMPAT */
				2041
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2042	#ifdef CONFIG_NUMA_BALANCING
				2043	/*
				2044	* Returns true if this is a safe migration target node for misplaced NUMA
				2045	* pages. Currently it only checks the watermarks which crude
				2046	*/
				2047	static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
Mel Gorman	3abef4e	2013-02-22 16:34:27 -0800	[diff] [blame]	2048	unsigned long nr_migrate_pages)
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2049	{
				2050	int z;
Mel Gorman	599d0c9	2016-07-28 15:45:31 -0700	[diff] [blame]	2051
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2052	for (z = pgdat->nr_zones - 1; z >= 0; z--) {
				2053	struct zone *zone = pgdat->node_zones + z;
				2054
				2055	if (!populated_zone(zone))
				2056	continue;
				2057
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2058	/* Avoid waking kswapd by allocating pages_to_migrate pages. */
				2059	if (!zone_watermark_ok(zone, 0,
				2060	high_wmark_pages(zone) +
				2061	nr_migrate_pages,
Huang Ying	bfe9d00	2019-11-30 17:57:28 -0800	[diff] [blame]	2062	ZONE_MOVABLE, 0))
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2063	continue;
				2064	return true;
				2065	}
				2066	return false;
				2067	}
				2068
				2069	static struct page alloc_misplaced_dst_page(struct page page,
Michal Hocko	666feb2	2018-04-10 16:30:03 -0700	[diff] [blame]	2070	unsigned long data)
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2071	{
				2072	int nid = (int) data;
				2073	struct page *newpage;
				2074
Vlastimil Babka	96db800	2015-09-08 15:03:50 -0700	[diff] [blame]	2075	newpage = __alloc_pages_node(nid,
Johannes Weiner	e97ca8e5	2014-03-10 15:49:43 -0700	[diff] [blame]	2076	(GFP_HIGHUSER_MOVABLE \|
				2077	__GFP_THISNODE \| __GFP_NOMEMALLOC \|
				2078	__GFP_NORETRY \| __GFP_NOWARN) &
Mel Gorman	8479eba	2016-02-26 15:19:31 -0800	[diff] [blame]	2079	~__GFP_RECLAIM, 0);
Hillf Danton	bac0382	2012-11-27 14:46:24 +0000	[diff] [blame]	2080
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2081	return newpage;
				2082	}
				2083
Yang Shi	c5b5a3d	2021-06-30 18:51:42 -0700	[diff] [blame]	2084	static struct page alloc_misplaced_dst_page_thp(struct page page,
				2085	unsigned long data)
				2086	{
				2087	int nid = (int) data;
				2088	struct page *newpage;
				2089
				2090	newpage = alloc_pages_node(nid, (GFP_TRANSHUGE_LIGHT \| __GFP_THISNODE),
				2091	HPAGE_PMD_ORDER);
				2092	if (!newpage)
				2093	goto out;
				2094
				2095	prep_transhuge_page(newpage);
				2096
				2097	out:
				2098	return newpage;
				2099	}
				2100
Mel Gorman	1c30e01	2014-01-21 15:50:58 -0800	[diff] [blame]	2101	static int numamigrate_isolate_page(pg_data_t pgdat, struct page page)
Mel Gorman	b32967f	2012-11-19 12:35:47 +0000	[diff] [blame]	2102	{
Hugh Dickins	340ef39	2013-02-22 16:34:33 -0800	[diff] [blame]	2103	int page_lru;
Mel Gorman	b32967f	2012-11-19 12:35:47 +0000	[diff] [blame]	2104
Sasha Levin	309381fea	2014-01-23 15:52:54 -0800	[diff] [blame]	2105	VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
Mel Gorman	3abef4e	2013-02-22 16:34:27 -0800	[diff] [blame]	2106
Yang Shi	662aeea	2021-06-30 18:51:51 -0700	[diff] [blame]	2107	/* Do not migrate THP mapped by multiple processes */
				2108	if (PageTransHuge(page) && total_mapcount(page) > 1)
				2109	return 0;
				2110
Mel Gorman	b32967f	2012-11-19 12:35:47 +0000	[diff] [blame]	2111	/* Avoid migrating to a node that is nearly full */
Matthew Wilcox (Oracle)	d8c6546	2019-09-23 15:34:30 -0700	[diff] [blame]	2112	if (!migrate_balanced_pgdat(pgdat, compound_nr(page)))
Hugh Dickins	340ef39	2013-02-22 16:34:33 -0800	[diff] [blame]	2113	return 0;
Mel Gorman	b32967f	2012-11-19 12:35:47 +0000	[diff] [blame]	2114
Hugh Dickins	340ef39	2013-02-22 16:34:33 -0800	[diff] [blame]	2115	if (isolate_lru_page(page))
				2116	return 0;
Mel Gorman	b32967f	2012-11-19 12:35:47 +0000	[diff] [blame]	2117
Huang Ying	9de4f22	2020-04-06 20:04:41 -0700	[diff] [blame]	2118	page_lru = page_is_file_lru(page);
Mel Gorman	599d0c9	2016-07-28 15:45:31 -0700	[diff] [blame]	2119	mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru,
Matthew Wilcox (Oracle)	6c35784	2020-08-14 17:30:37 -0700	[diff] [blame]	2120	thp_nr_pages(page));
Hugh Dickins	340ef39	2013-02-22 16:34:33 -0800	[diff] [blame]	2121
				2122	/*
				2123	* Isolating the page has taken another reference, so the
				2124	* caller's reference can be safely dropped without the page
				2125	* disappearing underneath us during migration.
Mel Gorman	b32967f	2012-11-19 12:35:47 +0000	[diff] [blame]	2126	*/
				2127	put_page(page);
Hugh Dickins	340ef39	2013-02-22 16:34:33 -0800	[diff] [blame]	2128	return 1;
Mel Gorman	b32967f	2012-11-19 12:35:47 +0000	[diff] [blame]	2129	}
				2130
Mel Gorman	a8f6077	2012-11-14 21:41:46 +0000	[diff] [blame]	2131	/*
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2132	* Attempt to migrate a misplaced page to the specified destination
				2133	* node. Caller is expected to have an elevated reference count on
				2134	* the page that will be dropped by this function before returning.
				2135	*/
Mel Gorman	1bc115d	2013-10-07 11:29:05 +0100	[diff] [blame]	2136	int migrate_misplaced_page(struct page page, struct vm_area_struct vma,
				2137	int node)
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2138	{
Mel Gorman	a8f6077	2012-11-14 21:41:46 +0000	[diff] [blame]	2139	pg_data_t *pgdat = NODE_DATA(node);
Hugh Dickins	340ef39	2013-02-22 16:34:33 -0800	[diff] [blame]	2140	int isolated;
Mel Gorman	b32967f	2012-11-19 12:35:47 +0000	[diff] [blame]	2141	int nr_remaining;
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2142	LIST_HEAD(migratepages);
Yang Shi	c5b5a3d	2021-06-30 18:51:42 -0700	[diff] [blame]	2143	new_page_t *new;
				2144	bool compound;
Aneesh Kumar K.V	b5916c0	2021-07-29 14:53:47 -0700	[diff] [blame]	2145	int nr_pages = thp_nr_pages(page);
Yang Shi	c5b5a3d	2021-06-30 18:51:42 -0700	[diff] [blame]	2146
				2147	/*
				2148	* PTE mapped THP or HugeTLB page can't reach here so the page could
				2149	* be either base page or THP. And it must be head page if it is
				2150	* THP.
				2151	*/
				2152	compound = PageTransHuge(page);
				2153
				2154	if (compound)
				2155	new = alloc_misplaced_dst_page_thp;
				2156	else
				2157	new = alloc_misplaced_dst_page;
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2158
				2159	/*
Mel Gorman	1bc115d	2013-10-07 11:29:05 +0100	[diff] [blame]	2160	* Don't migrate file pages that are mapped in multiple processes
				2161	* with execute permissions as they are probably shared libraries.
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2162	*/
Miaohe Lin	7ee820e	2021-05-04 18:37:16 -0700	[diff] [blame]	2163	if (page_mapcount(page) != 1 && page_is_file_lru(page) &&
				2164	(vma->vm_flags & VM_EXEC))
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2165	goto out;
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2166
Mel Gorman	a8f6077	2012-11-14 21:41:46 +0000	[diff] [blame]	2167	/*
Mel Gorman	09a913a	2018-04-10 16:29:20 -0700	[diff] [blame]	2168	* Also do not migrate dirty pages as not all filesystems can move
				2169	* dirty pages in MIGRATE_ASYNC mode which is a waste of cycles.
				2170	*/
Huang Ying	9de4f22	2020-04-06 20:04:41 -0700	[diff] [blame]	2171	if (page_is_file_lru(page) && PageDirty(page))
Mel Gorman	09a913a	2018-04-10 16:29:20 -0700	[diff] [blame]	2172	goto out;
				2173
Mel Gorman	b32967f	2012-11-19 12:35:47 +0000	[diff] [blame]	2174	isolated = numamigrate_isolate_page(pgdat, page);
				2175	if (!isolated)
				2176	goto out;
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2177
Mel Gorman	b32967f	2012-11-19 12:35:47 +0000	[diff] [blame]	2178	list_add(&page->lru, &migratepages);
Yang Shi	c5b5a3d	2021-06-30 18:51:42 -0700	[diff] [blame]	2179	nr_remaining = migrate_pages(&migratepages, *new, NULL, node,
				2180	MIGRATE_ASYNC, MR_NUMA_MISPLACED);
Mel Gorman	b32967f	2012-11-19 12:35:47 +0000	[diff] [blame]	2181	if (nr_remaining) {
Joonsoo Kim	59c82b7	2014-01-21 15:51:17 -0800	[diff] [blame]	2182	if (!list_empty(&migratepages)) {
				2183	list_del(&page->lru);
Yang Shi	c5fc5c3	2021-06-30 18:51:45 -0700	[diff] [blame]	2184	mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
				2185	page_is_file_lru(page), -nr_pages);
Joonsoo Kim	59c82b7	2014-01-21 15:51:17 -0800	[diff] [blame]	2186	putback_lru_page(page);
				2187	}
Mel Gorman	b32967f	2012-11-19 12:35:47 +0000	[diff] [blame]	2188	isolated = 0;
				2189	} else
Yang Shi	c5fc5c3	2021-06-30 18:51:45 -0700	[diff] [blame]	2190	count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_pages);
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2191	BUG_ON(!list_empty(&migratepages));
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2192	return isolated;
Hugh Dickins	340ef39	2013-02-22 16:34:33 -0800	[diff] [blame]	2193
				2194	out:
				2195	put_page(page);
				2196	return 0;
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2197	}
Mel Gorman	220018d	2012-12-05 09:32:56 +0000	[diff] [blame]	2198	#endif /* CONFIG_NUMA_BALANCING */
Peter Zijlstra	7039e1d	2012-10-25 14:16:34 +0200	[diff] [blame]	2199	#endif /* CONFIG_NUMA */
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2200
Christoph Hellwig	9b2ed9c	2019-08-14 09:59:28 +0200	[diff] [blame]	2201	#ifdef CONFIG_DEVICE_PRIVATE
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2202	static int migrate_vma_collect_skip(unsigned long start,
				2203	unsigned long end,
				2204	struct mm_walk *walk)
				2205	{
				2206	struct migrate_vma *migrate = walk->private;
				2207	unsigned long addr;
				2208
Ralph Campbell	872ea70	2020-01-30 22:14:38 -0800	[diff] [blame]	2209	for (addr = start; addr < end; addr += PAGE_SIZE) {
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2210	migrate->dst[migrate->npages] = 0;
				2211	migrate->src[migrate->npages++] = 0;
				2212	}
				2213
				2214	return 0;
				2215	}
				2216
Miaohe Lin	843e1be	2021-05-04 18:37:13 -0700	[diff] [blame]	2217	static int migrate_vma_collect_hole(unsigned long start,
				2218	unsigned long end,
				2219	__always_unused int depth,
				2220	struct mm_walk *walk)
				2221	{
				2222	struct migrate_vma *migrate = walk->private;
				2223	unsigned long addr;
				2224
				2225	/* Only allow populating anonymous memory. */
				2226	if (!vma_is_anonymous(walk->vma))
				2227	return migrate_vma_collect_skip(start, end, walk);
				2228
				2229	for (addr = start; addr < end; addr += PAGE_SIZE) {
				2230	migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
				2231	migrate->dst[migrate->npages] = 0;
				2232	migrate->npages++;
				2233	migrate->cpages++;
				2234	}
				2235
				2236	return 0;
				2237	}
				2238
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2239	static int migrate_vma_collect_pmd(pmd_t *pmdp,
				2240	unsigned long start,
				2241	unsigned long end,
				2242	struct mm_walk *walk)
				2243	{
				2244	struct migrate_vma *migrate = walk->private;
				2245	struct vm_area_struct *vma = walk->vma;
				2246	struct mm_struct *mm = vma->vm_mm;
Jérôme Glisse	8c3328f	2017-09-08 16:12:13 -0700	[diff] [blame]	2247	unsigned long addr = start, unmapped = 0;
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2248	spinlock_t *ptl;
				2249	pte_t *ptep;
				2250
				2251	again:
				2252	if (pmd_none(*pmdp))
Steven Price	b7a16c7	2020-02-03 17:36:03 -0800	[diff] [blame]	2253	return migrate_vma_collect_hole(start, end, -1, walk);
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2254
				2255	if (pmd_trans_huge(*pmdp)) {
				2256	struct page *page;
				2257
				2258	ptl = pmd_lock(mm, pmdp);
				2259	if (unlikely(!pmd_trans_huge(*pmdp))) {
				2260	spin_unlock(ptl);
				2261	goto again;
				2262	}
				2263
				2264	page = pmd_page(*pmdp);
				2265	if (is_huge_zero_page(page)) {
				2266	spin_unlock(ptl);
				2267	split_huge_pmd(vma, pmdp, addr);
				2268	if (pmd_trans_unstable(pmdp))
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2269	return migrate_vma_collect_skip(start, end,
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2270	walk);
				2271	} else {
				2272	int ret;
				2273
				2274	get_page(page);
				2275	spin_unlock(ptl);
				2276	if (unlikely(!trylock_page(page)))
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2277	return migrate_vma_collect_skip(start, end,
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2278	walk);
				2279	ret = split_huge_page(page);
				2280	unlock_page(page);
				2281	put_page(page);
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2282	if (ret)
				2283	return migrate_vma_collect_skip(start, end,
				2284	walk);
				2285	if (pmd_none(*pmdp))
Steven Price	b7a16c7	2020-02-03 17:36:03 -0800	[diff] [blame]	2286	return migrate_vma_collect_hole(start, end, -1,
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2287	walk);
				2288	}
				2289	}
				2290
				2291	if (unlikely(pmd_bad(*pmdp)))
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2292	return migrate_vma_collect_skip(start, end, walk);
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2293
				2294	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
Jérôme Glisse	8c3328f	2017-09-08 16:12:13 -0700	[diff] [blame]	2295	arch_enter_lazy_mmu_mode();
				2296
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2297	for (; addr < end; addr += PAGE_SIZE, ptep++) {
Christoph Hellwig	800bb1c	2020-03-16 20:32:14 +0100	[diff] [blame]	2298	unsigned long mpfn = 0, pfn;
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2299	struct page *page;
Jérôme Glisse	8c3328f	2017-09-08 16:12:13 -0700	[diff] [blame]	2300	swp_entry_t entry;
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2301	pte_t pte;
				2302
				2303	pte = *ptep;
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2304
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2305	if (pte_none(pte)) {
Ralph Campbell	0744f28	2020-08-11 18:31:41 -0700	[diff] [blame]	2306	if (vma_is_anonymous(vma)) {
				2307	mpfn = MIGRATE_PFN_MIGRATE;
				2308	migrate->cpages++;
				2309	}
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2310	goto next;
				2311	}
				2312
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2313	if (!pte_present(pte)) {
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2314	/*
				2315	* Only care about unaddressable device page special
				2316	* page table entry. Other special swap entries are not
				2317	* migratable, and we ignore regular swapped page.
				2318	*/
				2319	entry = pte_to_swp_entry(pte);
				2320	if (!is_device_private_entry(entry))
				2321	goto next;
				2322
Alistair Popple	af5cdaf	2021-06-30 18:54:06 -0700	[diff] [blame]	2323	page = pfn_swap_entry_to_page(entry);
Ralph Campbell	5143192	2020-07-23 15:30:00 -0700	[diff] [blame]	2324	if (!(migrate->flags &
				2325	MIGRATE_VMA_SELECT_DEVICE_PRIVATE) \|\|
				2326	page->pgmap->owner != migrate->pgmap_owner)
Christoph Hellwig	800bb1c	2020-03-16 20:32:14 +0100	[diff] [blame]	2327	goto next;
				2328
Christoph Hellwig	06d462b	2019-08-14 09:59:27 +0200	[diff] [blame]	2329	mpfn = migrate_pfn(page_to_pfn(page)) \|
				2330	MIGRATE_PFN_MIGRATE;
Alistair Popple	4dd845b	2021-06-30 18:54:09 -0700	[diff] [blame]	2331	if (is_writable_device_private_entry(entry))
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2332	mpfn \|= MIGRATE_PFN_WRITE;
				2333	} else {
Ralph Campbell	5143192	2020-07-23 15:30:00 -0700	[diff] [blame]	2334	if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
Christoph Hellwig	800bb1c	2020-03-16 20:32:14 +0100	[diff] [blame]	2335	goto next;
Pingfan Liu	276f756	2019-09-23 15:37:38 -0700	[diff] [blame]	2336	pfn = pte_pfn(pte);
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2337	if (is_zero_pfn(pfn)) {
				2338	mpfn = MIGRATE_PFN_MIGRATE;
				2339	migrate->cpages++;
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2340	goto next;
				2341	}
Christoph Hellwig	25b2995	2019-06-13 22:50:49 +0200	[diff] [blame]	2342	page = vm_normal_page(migrate->vma, addr, pte);
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2343	mpfn = migrate_pfn(pfn) \| MIGRATE_PFN_MIGRATE;
				2344	mpfn \|= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
				2345	}
				2346
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2347	/* FIXME support THP */
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2348	if (!page \|\| !page->mapping \|\| PageTransCompound(page)) {
Pingfan Liu	276f756	2019-09-23 15:37:38 -0700	[diff] [blame]	2349	mpfn = 0;
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2350	goto next;
				2351	}
				2352
				2353	/*
				2354	* By getting a reference on the page we pin it and that blocks
				2355	* any kind of migration. Side effect is that it "freezes" the
				2356	* pte.
				2357	*
				2358	* We drop this reference after isolating the page from the lru
				2359	* for non device page (device page are not on the lru and thus
				2360	* can't be dropped from it).
				2361	*/
				2362	get_page(page);
				2363	migrate->cpages++;
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2364
Jérôme Glisse	8c3328f	2017-09-08 16:12:13 -0700	[diff] [blame]	2365	/*
				2366	* Optimize for the common case where page is only mapped once
				2367	* in one process. If we can lock the page, then we can safely
				2368	* set up a special migration page table entry now.
				2369	*/
				2370	if (trylock_page(page)) {
				2371	pte_t swp_pte;
				2372
				2373	mpfn \|= MIGRATE_PFN_LOCKED;
				2374	ptep_get_and_clear(mm, addr, ptep);
				2375
				2376	/* Setup special migration page table entry */
Alistair Popple	4dd845b	2021-06-30 18:54:09 -0700	[diff] [blame]	2377	if (mpfn & MIGRATE_PFN_WRITE)
				2378	entry = make_writable_migration_entry(
				2379	page_to_pfn(page));
				2380	else
				2381	entry = make_readable_migration_entry(
				2382	page_to_pfn(page));
Jérôme Glisse	8c3328f	2017-09-08 16:12:13 -0700	[diff] [blame]	2383	swp_pte = swp_entry_to_pte(entry);
Alistair Popple	ad7df76	2020-09-04 16:36:01 -0700	[diff] [blame]	2384	if (pte_present(pte)) {
				2385	if (pte_soft_dirty(pte))
				2386	swp_pte = pte_swp_mksoft_dirty(swp_pte);
				2387	if (pte_uffd_wp(pte))
				2388	swp_pte = pte_swp_mkuffd_wp(swp_pte);
				2389	} else {
				2390	if (pte_swp_soft_dirty(pte))
				2391	swp_pte = pte_swp_mksoft_dirty(swp_pte);
				2392	if (pte_swp_uffd_wp(pte))
				2393	swp_pte = pte_swp_mkuffd_wp(swp_pte);
				2394	}
Jérôme Glisse	8c3328f	2017-09-08 16:12:13 -0700	[diff] [blame]	2395	set_pte_at(mm, addr, ptep, swp_pte);
				2396
				2397	/*
				2398	* This is like regular unmap: we remove the rmap and
				2399	* drop page refcount. Page won't be freed, as we took
				2400	* a reference just above.
				2401	*/
				2402	page_remove_rmap(page, false);
				2403	put_page(page);
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2404
				2405	if (pte_present(pte))
				2406	unmapped++;
Jérôme Glisse	8c3328f	2017-09-08 16:12:13 -0700	[diff] [blame]	2407	}
				2408
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2409	next:
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2410	migrate->dst[migrate->npages] = 0;
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2411	migrate->src[migrate->npages++] = mpfn;
				2412	}
Jérôme Glisse	8c3328f	2017-09-08 16:12:13 -0700	[diff] [blame]	2413	arch_leave_lazy_mmu_mode();
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2414	pte_unmap_unlock(ptep - 1, ptl);
				2415
Jérôme Glisse	8c3328f	2017-09-08 16:12:13 -0700	[diff] [blame]	2416	/* Only flush the TLB if we actually modified any entries */
				2417	if (unmapped)
				2418	flush_tlb_range(walk->vma, start, end);
				2419
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2420	return 0;
				2421	}
				2422
Christoph Hellwig	7b86ac3	2019-08-28 16:19:54 +0200	[diff] [blame]	2423	static const struct mm_walk_ops migrate_vma_walk_ops = {
				2424	.pmd_entry = migrate_vma_collect_pmd,
				2425	.pte_hole = migrate_vma_collect_hole,
				2426	};
				2427
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2428	/*
				2429	* migrate_vma_collect() - collect pages over a range of virtual addresses
				2430	* @migrate: migrate struct containing all migration information
				2431	*
				2432	* This will walk the CPU page table. For each virtual address backed by a
				2433	* valid page, it updates the src array and takes a reference on the page, in
				2434	* order to pin the page until we lock it and unmap it.
				2435	*/
				2436	static void migrate_vma_collect(struct migrate_vma *migrate)
				2437	{
Jérôme Glisse	ac46d4f	2018-12-28 00:38:09 -0800	[diff] [blame]	2438	struct mmu_notifier_range range;
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2439
Ralph Campbell	998427b	2020-07-23 15:30:01 -0700	[diff] [blame]	2440	/*
				2441	* Note that the pgmap_owner is passed to the mmu notifier callback so
				2442	* that the registered device driver can skip invalidating device
				2443	* private page mappings that won't be migrated.
				2444	*/
Alistair Popple	6b49bf6	2021-06-30 18:54:19 -0700	[diff] [blame]	2445	mmu_notifier_range_init_owner(&range, MMU_NOTIFY_MIGRATE, 0,
				2446	migrate->vma, migrate->vma->vm_mm, migrate->start, migrate->end,
Ralph Campbell	c1a06df	2020-08-06 23:17:09 -0700	[diff] [blame]	2447	migrate->pgmap_owner);
Jérôme Glisse	ac46d4f	2018-12-28 00:38:09 -0800	[diff] [blame]	2448	mmu_notifier_invalidate_range_start(&range);
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2449
Christoph Hellwig	7b86ac3	2019-08-28 16:19:54 +0200	[diff] [blame]	2450	walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end,
				2451	&migrate_vma_walk_ops, migrate);
				2452
				2453	mmu_notifier_invalidate_range_end(&range);
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2454	migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
				2455	}
				2456
				2457	/*
				2458	* migrate_vma_check_page() - check if page is pinned or not
				2459	* @page: struct page to check
				2460	*
				2461	* Pinned pages cannot be migrated. This is the same test as in
				2462	* migrate_page_move_mapping(), except that here we allow migration of a
				2463	* ZONE_DEVICE page.
				2464	*/
				2465	static bool migrate_vma_check_page(struct page *page)
				2466	{
				2467	/*
				2468	* One extra ref because caller holds an extra reference, either from
				2469	* isolate_lru_page() for a regular page, or migrate_vma_collect() for
				2470	* a device page.
				2471	*/
				2472	int extra = 1;
				2473
				2474	/*
				2475	* FIXME support THP (transparent huge page), it is bit more complex to
				2476	* check them than regular pages, because they can be mapped with a pmd
				2477	* or with a pte (split pte mapping).
				2478	*/
				2479	if (PageCompound(page))
				2480	return false;
				2481
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2482	/* Page from ZONE_DEVICE have one extra reference */
				2483	if (is_zone_device_page(page)) {
				2484	/*
				2485	* Private page can never be pin as they have no valid pte and
				2486	* GUP will fail for those. Yet if there is a pending migration
				2487	* a thread might try to wait on the pte migration entry and
				2488	* will bump the page reference count. Sadly there is no way to
				2489	* differentiate a regular pin from migration wait. Hence to
				2490	* avoid 2 racing thread trying to migrate back to CPU to enter
Haitao Shi	8958b24	2020-12-15 20:47:26 -0800	[diff] [blame]	2491	* infinite loop (one stopping migration because the other is
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2492	* waiting on pte migration entry). We always return true here.
				2493	*
				2494	* FIXME proper solution is to rework migration_entry_wait() so
				2495	* it does not need to take a reference on page.
				2496	*/
Christoph Hellwig	25b2995	2019-06-13 22:50:49 +0200	[diff] [blame]	2497	return is_device_private_page(page);
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2498	}
				2499
Jérôme Glisse	df6ad69	2017-09-08 16:12:24 -0700	[diff] [blame]	2500	/* For file back page */
				2501	if (page_mapping(page))
				2502	extra += 1 + page_has_private(page);
				2503
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2504	if ((page_count(page) - extra) > page_mapcount(page))
				2505	return false;
				2506
				2507	return true;
				2508	}
				2509
				2510	/*
				2511	* migrate_vma_prepare() - lock pages and isolate them from the lru
				2512	* @migrate: migrate struct containing all migration information
				2513	*
				2514	* This locks pages that have been collected by migrate_vma_collect(). Once each
				2515	* page is locked it is isolated from the lru (for non-device pages). Finally,
				2516	* the ref taken by migrate_vma_collect() is dropped, as locked pages cannot be
				2517	* migrated by concurrent kernel threads.
				2518	*/
				2519	static void migrate_vma_prepare(struct migrate_vma *migrate)
				2520	{
				2521	const unsigned long npages = migrate->npages;
Jérôme Glisse	8c3328f	2017-09-08 16:12:13 -0700	[diff] [blame]	2522	const unsigned long start = migrate->start;
				2523	unsigned long addr, i, restore = 0;
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2524	bool allow_drain = true;
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2525
				2526	lru_add_drain();
				2527
				2528	for (i = 0; (i < npages) && migrate->cpages; i++) {
				2529	struct page *page = migrate_pfn_to_page(migrate->src[i]);
Jérôme Glisse	8c3328f	2017-09-08 16:12:13 -0700	[diff] [blame]	2530	bool remap = true;
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2531
				2532	if (!page)
				2533	continue;
				2534
Jérôme Glisse	8c3328f	2017-09-08 16:12:13 -0700	[diff] [blame]	2535	if (!(migrate->src[i] & MIGRATE_PFN_LOCKED)) {
				2536	/*
				2537	* Because we are migrating several pages there can be
				2538	* a deadlock between 2 concurrent migration where each
				2539	* are waiting on each other page lock.
				2540	*
				2541	* Make migrate_vma() a best effort thing and backoff
				2542	* for any page we can not lock right away.
				2543	*/
				2544	if (!trylock_page(page)) {
				2545	migrate->src[i] = 0;
				2546	migrate->cpages--;
				2547	put_page(page);
				2548	continue;
				2549	}
				2550	remap = false;
				2551	migrate->src[i] \|= MIGRATE_PFN_LOCKED;
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2552	}
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2553
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2554	/* ZONE_DEVICE pages are not on LRU */
				2555	if (!is_zone_device_page(page)) {
				2556	if (!PageLRU(page) && allow_drain) {
				2557	/* Drain CPU's pagevec */
				2558	lru_add_drain_all();
				2559	allow_drain = false;
Jérôme Glisse	8c3328f	2017-09-08 16:12:13 -0700	[diff] [blame]	2560	}
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2561
				2562	if (isolate_lru_page(page)) {
				2563	if (remap) {
				2564	migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
				2565	migrate->cpages--;
				2566	restore++;
				2567	} else {
				2568	migrate->src[i] = 0;
				2569	unlock_page(page);
				2570	migrate->cpages--;
				2571	put_page(page);
				2572	}
				2573	continue;
				2574	}
				2575
				2576	/* Drop the reference we took in collect */
				2577	put_page(page);
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2578	}
				2579
				2580	if (!migrate_vma_check_page(page)) {
Jérôme Glisse	8c3328f	2017-09-08 16:12:13 -0700	[diff] [blame]	2581	if (remap) {
				2582	migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
				2583	migrate->cpages--;
				2584	restore++;
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2585
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2586	if (!is_zone_device_page(page)) {
				2587	get_page(page);
				2588	putback_lru_page(page);
				2589	}
Jérôme Glisse	8c3328f	2017-09-08 16:12:13 -0700	[diff] [blame]	2590	} else {
				2591	migrate->src[i] = 0;
				2592	unlock_page(page);
				2593	migrate->cpages--;
				2594
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2595	if (!is_zone_device_page(page))
				2596	putback_lru_page(page);
				2597	else
				2598	put_page(page);
Jérôme Glisse	8c3328f	2017-09-08 16:12:13 -0700	[diff] [blame]	2599	}
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2600	}
				2601	}
Jérôme Glisse	8c3328f	2017-09-08 16:12:13 -0700	[diff] [blame]	2602
				2603	for (i = 0, addr = start; i < npages && restore; i++, addr += PAGE_SIZE) {
				2604	struct page *page = migrate_pfn_to_page(migrate->src[i]);
				2605
				2606	if (!page \|\| (migrate->src[i] & MIGRATE_PFN_MIGRATE))
				2607	continue;
				2608
				2609	remove_migration_pte(page, migrate->vma, addr, page);
				2610
				2611	migrate->src[i] = 0;
				2612	unlock_page(page);
				2613	put_page(page);
				2614	restore--;
				2615	}
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2616	}
				2617
				2618	/*
				2619	* migrate_vma_unmap() - replace page mapping with special migration pte entry
				2620	* @migrate: migrate struct containing all migration information
				2621	*
				2622	* Replace page mapping (CPU page table pte) with a special migration pte entry
				2623	* and check again if it has been pinned. Pinned pages are restored because we
				2624	* cannot migrate them.
				2625	*
				2626	* This is the last step before we call the device driver callback to allocate
				2627	* destination memory and copy contents of original page over to new page.
				2628	*/
				2629	static void migrate_vma_unmap(struct migrate_vma *migrate)
				2630	{
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2631	const unsigned long npages = migrate->npages;
				2632	const unsigned long start = migrate->start;
				2633	unsigned long addr, i, restore = 0;
				2634
				2635	for (i = 0; i < npages; i++) {
				2636	struct page *page = migrate_pfn_to_page(migrate->src[i]);
				2637
				2638	if (!page \|\| !(migrate->src[i] & MIGRATE_PFN_MIGRATE))
				2639	continue;
				2640
Jérôme Glisse	8c3328f	2017-09-08 16:12:13 -0700	[diff] [blame]	2641	if (page_mapped(page)) {
Alistair Popple	a98a2f0	2021-06-30 18:54:16 -0700	[diff] [blame]	2642	try_to_migrate(page, 0);
Jérôme Glisse	8c3328f	2017-09-08 16:12:13 -0700	[diff] [blame]	2643	if (page_mapped(page))
				2644	goto restore;
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2645	}
Jérôme Glisse	8c3328f	2017-09-08 16:12:13 -0700	[diff] [blame]	2646
				2647	if (migrate_vma_check_page(page))
				2648	continue;
				2649
				2650	restore:
				2651	migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
				2652	migrate->cpages--;
				2653	restore++;
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2654	}
				2655
				2656	for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) {
				2657	struct page *page = migrate_pfn_to_page(migrate->src[i]);
				2658
				2659	if (!page \|\| (migrate->src[i] & MIGRATE_PFN_MIGRATE))
				2660	continue;
				2661
				2662	remove_migration_ptes(page, page, false);
				2663
				2664	migrate->src[i] = 0;
				2665	unlock_page(page);
				2666	restore--;
				2667
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2668	if (is_zone_device_page(page))
				2669	put_page(page);
				2670	else
				2671	putback_lru_page(page);
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2672	}
				2673	}
				2674
Christoph Hellwig	a7d1f22	2019-08-14 09:59:19 +0200	[diff] [blame]	2675	/**
				2676	* migrate_vma_setup() - prepare to migrate a range of memory
Randy Dunlap	eaf444d	2020-08-11 18:33:08 -0700	[diff] [blame]	2677	* @args: contains the vma, start, and pfns arrays for the migration
Christoph Hellwig	a7d1f22	2019-08-14 09:59:19 +0200	[diff] [blame]	2678	*
				2679	* Returns: negative errno on failures, 0 when 0 or more pages were migrated
				2680	* without an error.
				2681	*
				2682	* Prepare to migrate a range of memory virtual address range by collecting all
				2683	* the pages backing each virtual address in the range, saving them inside the
				2684	* src array. Then lock those pages and unmap them. Once the pages are locked
				2685	* and unmapped, check whether each page is pinned or not. Pages that aren't
				2686	* pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) in the
				2687	* corresponding src array entry. Then restores any pages that are pinned, by
				2688	* remapping and unlocking those pages.
				2689	*
				2690	* The caller should then allocate destination memory and copy source memory to
				2691	* it for all those entries (ie with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE
				2692	* flag set). Once these are allocated and copied, the caller must update each
				2693	* corresponding entry in the dst array with the pfn value of the destination
				2694	* page and with the MIGRATE_PFN_VALID and MIGRATE_PFN_LOCKED flags set
				2695	* (destination pages must have their struct pages locked, via lock_page()).
				2696	*
				2697	* Note that the caller does not have to migrate all the pages that are marked
				2698	* with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration from
				2699	* device memory to system memory. If the caller cannot migrate a device page
				2700	* back to system memory, then it must return VM_FAULT_SIGBUS, which has severe
				2701	* consequences for the userspace process, so it must be avoided if at all
				2702	* possible.
				2703	*
				2704	* For empty entries inside CPU page table (pte_none() or pmd_none() is true) we
				2705	* do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus
Ingo Molnar	f0953a1	2021-05-06 18:06:47 -0700	[diff] [blame]	2706	* allowing the caller to allocate device memory for those unbacked virtual
				2707	* addresses. For this the caller simply has to allocate device memory and
Christoph Hellwig	a7d1f22	2019-08-14 09:59:19 +0200	[diff] [blame]	2708	* properly set the destination entry like for regular migration. Note that
Ingo Molnar	f0953a1	2021-05-06 18:06:47 -0700	[diff] [blame]	2709	* this can still fail, and thus inside the device driver you must check if the
				2710	* migration was successful for those entries after calling migrate_vma_pages(),
Christoph Hellwig	a7d1f22	2019-08-14 09:59:19 +0200	[diff] [blame]	2711	* just like for regular migration.
				2712	*
				2713	* After that, the callers must call migrate_vma_pages() to go over each entry
				2714	* in the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag
				2715	* set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set,
				2716	* then migrate_vma_pages() to migrate struct page information from the source
				2717	* struct page to the destination struct page. If it fails to migrate the
				2718	* struct page information, then it clears the MIGRATE_PFN_MIGRATE flag in the
				2719	* src array.
				2720	*
				2721	* At this point all successfully migrated pages have an entry in the src
				2722	* array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst
				2723	* array entry with MIGRATE_PFN_VALID flag set.
				2724	*
				2725	* Once migrate_vma_pages() returns the caller may inspect which pages were
				2726	* successfully migrated, and which were not. Successfully migrated pages will
				2727	* have the MIGRATE_PFN_MIGRATE flag set for their src array entry.
				2728	*
				2729	* It is safe to update device page table after migrate_vma_pages() because
Michel Lespinasse	c1e8d7c	2020-06-08 21:33:54 -0700	[diff] [blame]	2730	* both destination and source page are still locked, and the mmap_lock is held
Christoph Hellwig	a7d1f22	2019-08-14 09:59:19 +0200	[diff] [blame]	2731	* in read mode (hence no one can unmap the range being migrated).
				2732	*
				2733	* Once the caller is done cleaning up things and updating its page table (if it
				2734	* chose to do so, this is not an obligation) it finally calls
				2735	* migrate_vma_finalize() to update the CPU page table to point to new pages
				2736	* for successfully migrated pages or otherwise restore the CPU page table to
				2737	* point to the original source pages.
				2738	*/
				2739	int migrate_vma_setup(struct migrate_vma *args)
				2740	{
				2741	long nr_pages = (args->end - args->start) >> PAGE_SHIFT;
				2742
				2743	args->start &= PAGE_MASK;
				2744	args->end &= PAGE_MASK;
				2745	if (!args->vma \|\| is_vm_hugetlb_page(args->vma) \|\|
				2746	(args->vma->vm_flags & VM_SPECIAL) \|\| vma_is_dax(args->vma))
				2747	return -EINVAL;
				2748	if (nr_pages <= 0)
				2749	return -EINVAL;
				2750	if (args->start < args->vma->vm_start \|\|
				2751	args->start >= args->vma->vm_end)
				2752	return -EINVAL;
				2753	if (args->end <= args->vma->vm_start \|\| args->end > args->vma->vm_end)
				2754	return -EINVAL;
				2755	if (!args->src \|\| !args->dst)
				2756	return -EINVAL;
				2757
				2758	memset(args->src, 0, sizeof(args->src) nr_pages);
				2759	args->cpages = 0;
				2760	args->npages = 0;
				2761
				2762	migrate_vma_collect(args);
				2763
				2764	if (args->cpages)
				2765	migrate_vma_prepare(args);
				2766	if (args->cpages)
				2767	migrate_vma_unmap(args);
				2768
				2769	/*
				2770	* At this point pages are locked and unmapped, and thus they have
				2771	* stable content and can safely be copied to destination memory that
				2772	* is allocated by the drivers.
				2773	*/
				2774	return 0;
				2775
				2776	}
				2777	EXPORT_SYMBOL(migrate_vma_setup);
				2778
Ralph Campbell	34290e2	2020-01-30 22:14:44 -0800	[diff] [blame]	2779	/*
				2780	* This code closely matches the code in:
				2781	* __handle_mm_fault()
				2782	* handle_pte_fault()
				2783	* do_anonymous_page()
				2784	* to map in an anonymous zero page but the struct page will be a ZONE_DEVICE
				2785	* private page.
				2786	*/
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2787	static void migrate_vma_insert_page(struct migrate_vma *migrate,
				2788	unsigned long addr,
				2789	struct page *page,
Stephen Zhang	d85c6db	2020-12-14 19:13:20 -0800	[diff] [blame]	2790	unsigned long *src)
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2791	{
				2792	struct vm_area_struct *vma = migrate->vma;
				2793	struct mm_struct *mm = vma->vm_mm;
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2794	bool flush = false;
				2795	spinlock_t *ptl;
				2796	pte_t entry;
				2797	pgd_t *pgdp;
				2798	p4d_t *p4dp;
				2799	pud_t *pudp;
				2800	pmd_t *pmdp;
				2801	pte_t *ptep;
				2802
				2803	/* Only allow populating anonymous memory */
				2804	if (!vma_is_anonymous(vma))
				2805	goto abort;
				2806
				2807	pgdp = pgd_offset(mm, addr);
				2808	p4dp = p4d_alloc(mm, pgdp, addr);
				2809	if (!p4dp)
				2810	goto abort;
				2811	pudp = pud_alloc(mm, p4dp, addr);
				2812	if (!pudp)
				2813	goto abort;
				2814	pmdp = pmd_alloc(mm, pudp, addr);
				2815	if (!pmdp)
				2816	goto abort;
				2817
				2818	if (pmd_trans_huge(pmdp) \|\| pmd_devmap(pmdp))
				2819	goto abort;
				2820
				2821	/*
				2822	* Use pte_alloc() instead of pte_alloc_map(). We can't run
				2823	* pte_offset_map() on pmds where a huge pmd might be created
				2824	* from a different thread.
				2825	*
Michel Lespinasse	3e4e28c	2020-06-08 21:33:51 -0700	[diff] [blame]	2826	* pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2827	* parallel threads are excluded by other means.
				2828	*
Michel Lespinasse	3e4e28c	2020-06-08 21:33:51 -0700	[diff] [blame]	2829	* Here we only have mmap_read_lock(mm).
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2830	*/
Joel Fernandes (Google)	4cf5892	2019-01-03 15:28:34 -0800	[diff] [blame]	2831	if (pte_alloc(mm, pmdp))
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2832	goto abort;
				2833
				2834	/* See the comment in pte_alloc_one_map() */
				2835	if (unlikely(pmd_trans_unstable(pmdp)))
				2836	goto abort;
				2837
				2838	if (unlikely(anon_vma_prepare(vma)))
				2839	goto abort;
Johannes Weiner	d9eb1ea	2020-06-03 16:02:24 -0700	[diff] [blame]	2840	if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2841	goto abort;
				2842
				2843	/*
				2844	* The memory barrier inside __SetPageUptodate makes sure that
				2845	* preceding stores to the page contents become visible before
				2846	* the set_pte_at() write.
				2847	*/
				2848	__SetPageUptodate(page);
				2849
Jérôme Glisse	df6ad69	2017-09-08 16:12:24 -0700	[diff] [blame]	2850	if (is_zone_device_page(page)) {
				2851	if (is_device_private_page(page)) {
				2852	swp_entry_t swp_entry;
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2853
Alistair Popple	4dd845b	2021-06-30 18:54:09 -0700	[diff] [blame]	2854	if (vma->vm_flags & VM_WRITE)
				2855	swp_entry = make_writable_device_private_entry(
				2856	page_to_pfn(page));
				2857	else
				2858	swp_entry = make_readable_device_private_entry(
				2859	page_to_pfn(page));
Jérôme Glisse	df6ad69	2017-09-08 16:12:24 -0700	[diff] [blame]	2860	entry = swp_entry_to_pte(swp_entry);
Miaohe Lin	34f5e9b	2021-05-04 18:37:10 -0700	[diff] [blame]	2861	} else {
				2862	/*
				2863	* For now we only support migrating to un-addressable
				2864	* device memory.
				2865	*/
				2866	pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
				2867	goto abort;
Jérôme Glisse	df6ad69	2017-09-08 16:12:24 -0700	[diff] [blame]	2868	}
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2869	} else {
				2870	entry = mk_pte(page, vma->vm_page_prot);
				2871	if (vma->vm_flags & VM_WRITE)
				2872	entry = pte_mkwrite(pte_mkdirty(entry));
				2873	}
				2874
				2875	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
				2876
Ralph Campbell	34290e2	2020-01-30 22:14:44 -0800	[diff] [blame]	2877	if (check_stable_address_space(mm))
				2878	goto unlock_abort;
				2879
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2880	if (pte_present(*ptep)) {
				2881	unsigned long pfn = pte_pfn(*ptep);
				2882
Ralph Campbell	c23a0c9	2020-01-30 22:14:41 -0800	[diff] [blame]	2883	if (!is_zero_pfn(pfn))
				2884	goto unlock_abort;
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2885	flush = true;
Ralph Campbell	c23a0c9	2020-01-30 22:14:41 -0800	[diff] [blame]	2886	} else if (!pte_none(*ptep))
				2887	goto unlock_abort;
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2888
				2889	/*
Ralph Campbell	c23a0c9	2020-01-30 22:14:41 -0800	[diff] [blame]	2890	* Check for userfaultfd but do not deliver the fault. Instead,
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2891	* just back off.
				2892	*/
Ralph Campbell	c23a0c9	2020-01-30 22:14:41 -0800	[diff] [blame]	2893	if (userfaultfd_missing(vma))
				2894	goto unlock_abort;
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2895
				2896	inc_mm_counter(mm, MM_ANONPAGES);
Johannes Weiner	be5d0a7	2020-06-03 16:01:57 -0700	[diff] [blame]	2897	page_add_new_anon_rmap(page, vma, addr, false);
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2898	if (!is_zone_device_page(page))
Joonsoo Kim	b518154	2020-08-11 18:30:40 -0700	[diff] [blame]	2899	lru_cache_add_inactive_or_unevictable(page, vma);
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2900	get_page(page);
				2901
				2902	if (flush) {
				2903	flush_cache_page(vma, addr, pte_pfn(*ptep));
				2904	ptep_clear_flush_notify(vma, addr, ptep);
				2905	set_pte_at_notify(mm, addr, ptep, entry);
				2906	update_mmu_cache(vma, addr, ptep);
				2907	} else {
				2908	/* No need to invalidate - it was non-present before */
				2909	set_pte_at(mm, addr, ptep, entry);
				2910	update_mmu_cache(vma, addr, ptep);
				2911	}
				2912
				2913	pte_unmap_unlock(ptep, ptl);
				2914	*src = MIGRATE_PFN_MIGRATE;
				2915	return;
				2916
Ralph Campbell	c23a0c9	2020-01-30 22:14:41 -0800	[diff] [blame]	2917	unlock_abort:
				2918	pte_unmap_unlock(ptep, ptl);
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2919	abort:
				2920	*src &= ~MIGRATE_PFN_MIGRATE;
				2921	}
				2922
Christoph Hellwig	a7d1f22	2019-08-14 09:59:19 +0200	[diff] [blame]	2923	/**
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2924	* migrate_vma_pages() - migrate meta-data from src page to dst page
				2925	* @migrate: migrate struct containing all migration information
				2926	*
				2927	* This migrates struct page meta-data from source struct page to destination
				2928	* struct page. This effectively finishes the migration from source page to the
				2929	* destination page.
				2930	*/
Christoph Hellwig	a7d1f22	2019-08-14 09:59:19 +0200	[diff] [blame]	2931	void migrate_vma_pages(struct migrate_vma *migrate)
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2932	{
				2933	const unsigned long npages = migrate->npages;
				2934	const unsigned long start = migrate->start;
Jérôme Glisse	ac46d4f	2018-12-28 00:38:09 -0800	[diff] [blame]	2935	struct mmu_notifier_range range;
				2936	unsigned long addr, i;
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2937	bool notified = false;
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2938
				2939	for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
				2940	struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
				2941	struct page *page = migrate_pfn_to_page(migrate->src[i]);
				2942	struct address_space *mapping;
				2943	int r;
				2944
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2945	if (!newpage) {
				2946	migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2947	continue;
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2948	}
				2949
				2950	if (!page) {
Ralph Campbell	c23a0c9	2020-01-30 22:14:41 -0800	[diff] [blame]	2951	if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE))
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2952	continue;
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2953	if (!notified) {
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2954	notified = true;
Jérôme Glisse	ac46d4f	2018-12-28 00:38:09 -0800	[diff] [blame]	2955
Alistair Popple	6b49bf6	2021-06-30 18:54:19 -0700	[diff] [blame]	2956	mmu_notifier_range_init_owner(&range,
				2957	MMU_NOTIFY_MIGRATE, 0, migrate->vma,
				2958	migrate->vma->vm_mm, addr, migrate->end,
Ralph Campbell	5e5dda8	2020-12-14 19:12:55 -0800	[diff] [blame]	2959	migrate->pgmap_owner);
Jérôme Glisse	ac46d4f	2018-12-28 00:38:09 -0800	[diff] [blame]	2960	mmu_notifier_invalidate_range_start(&range);
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2961	}
				2962	migrate_vma_insert_page(migrate, addr, newpage,
Stephen Zhang	d85c6db	2020-12-14 19:13:20 -0800	[diff] [blame]	2963	&migrate->src[i]);
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2964	continue;
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2965	}
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2966
				2967	mapping = page_mapping(page);
				2968
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2969	if (is_zone_device_page(newpage)) {
				2970	if (is_device_private_page(newpage)) {
				2971	/*
				2972	* For now only support private anonymous when
				2973	* migrating to un-addressable device memory.
				2974	*/
				2975	if (mapping) {
				2976	migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
				2977	continue;
				2978	}
Christoph Hellwig	25b2995	2019-06-13 22:50:49 +0200	[diff] [blame]	2979	} else {
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	2980	/*
				2981	* Other types of ZONE_DEVICE page are not
				2982	* supported.
				2983	*/
				2984	migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
				2985	continue;
				2986	}
				2987	}
				2988
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	2989	r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY);
				2990	if (r != MIGRATEPAGE_SUCCESS)
				2991	migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
				2992	}
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2993
Jérôme Glisse	4645b9f	2017-11-15 17:34:11 -0800	[diff] [blame]	2994	/*
				2995	* No need to double call mmu_notifier->invalidate_range() callback as
				2996	* the above ptep_clear_flush_notify() inside migrate_vma_insert_page()
				2997	* did already call it.
				2998	*/
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	2999	if (notified)
Jérôme Glisse	ac46d4f	2018-12-28 00:38:09 -0800	[diff] [blame]	3000	mmu_notifier_invalidate_range_only_end(&range);
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	3001	}
Christoph Hellwig	a7d1f22	2019-08-14 09:59:19 +0200	[diff] [blame]	3002	EXPORT_SYMBOL(migrate_vma_pages);
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	3003
Christoph Hellwig	a7d1f22	2019-08-14 09:59:19 +0200	[diff] [blame]	3004	/**
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	3005	* migrate_vma_finalize() - restore CPU page table entry
				3006	* @migrate: migrate struct containing all migration information
				3007	*
				3008	* This replaces the special migration pte entry with either a mapping to the
				3009	* new page if migration was successful for that page, or to the original page
				3010	* otherwise.
				3011	*
				3012	* This also unlocks the pages and puts them back on the lru, or drops the extra
				3013	* refcount, for device pages.
				3014	*/
Christoph Hellwig	a7d1f22	2019-08-14 09:59:19 +0200	[diff] [blame]	3015	void migrate_vma_finalize(struct migrate_vma *migrate)
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	3016	{
				3017	const unsigned long npages = migrate->npages;
				3018	unsigned long i;
				3019
				3020	for (i = 0; i < npages; i++) {
				3021	struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
				3022	struct page *page = migrate_pfn_to_page(migrate->src[i]);
				3023
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	3024	if (!page) {
				3025	if (newpage) {
				3026	unlock_page(newpage);
				3027	put_page(newpage);
				3028	}
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	3029	continue;
Jérôme Glisse	8315ada	2017-09-08 16:12:21 -0700	[diff] [blame]	3030	}
				3031
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	3032	if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) \|\| !newpage) {
				3033	if (newpage) {
				3034	unlock_page(newpage);
				3035	put_page(newpage);
				3036	}
				3037	newpage = page;
				3038	}
				3039
				3040	remove_migration_ptes(page, newpage, false);
				3041	unlock_page(page);
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	3042
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	3043	if (is_zone_device_page(page))
				3044	put_page(page);
				3045	else
				3046	putback_lru_page(page);
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	3047
				3048	if (newpage != page) {
				3049	unlock_page(newpage);
Jérôme Glisse	a5430dd	2017-09-08 16:12:17 -0700	[diff] [blame]	3050	if (is_zone_device_page(newpage))
				3051	put_page(newpage);
				3052	else
				3053	putback_lru_page(newpage);
Jérôme Glisse	8763cb4	2017-09-08 16:12:09 -0700	[diff] [blame]	3054	}
				3055	}
				3056	}
Christoph Hellwig	a7d1f22	2019-08-14 09:59:19 +0200	[diff] [blame]	3057	EXPORT_SYMBOL(migrate_vma_finalize);
Christoph Hellwig	9b2ed9c	2019-08-14 09:59:28 +0200	[diff] [blame]	3058	#endif /* CONFIG_DEVICE_PRIVATE */
Dave Hansen	79c28a4	2021-09-02 14:59:06 -0700	[diff] [blame^]	3059
				3060	/* Disable reclaim-based migration. */
				3061	static void __disable_all_migrate_targets(void)
				3062	{
				3063	int node;
				3064
				3065	for_each_online_node(node)
				3066	node_demotion[node] = NUMA_NO_NODE;
				3067	}
				3068
				3069	static void disable_all_migrate_targets(void)
				3070	{
				3071	__disable_all_migrate_targets();
				3072
				3073	/*
				3074	* Ensure that the "disable" is visible across the system.
				3075	* Readers will see either a combination of before+disable
				3076	* state or disable+after. They will never see before and
				3077	* after state together.
				3078	*
				3079	* The before+after state together might have cycles and
				3080	* could cause readers to do things like loop until this
				3081	* function finishes. This ensures they can only see a
				3082	* single "bad" read and would, for instance, only loop
				3083	* once.
				3084	*/
				3085	synchronize_rcu();
				3086	}
				3087
				3088	/*
				3089	* Find an automatic demotion target for 'node'.
				3090	* Failing here is OK. It might just indicate
				3091	* being at the end of a chain.
				3092	*/
				3093	static int establish_migrate_target(int node, nodemask_t *used)
				3094	{
				3095	int migration_target;
				3096
				3097	/*
				3098	* Can not set a migration target on a
				3099	* node with it already set.
				3100	*
				3101	* No need for READ_ONCE() here since this
				3102	* in the write path for node_demotion[].
				3103	* This should be the only thread writing.
				3104	*/
				3105	if (node_demotion[node] != NUMA_NO_NODE)
				3106	return NUMA_NO_NODE;
				3107
				3108	migration_target = find_next_best_node(node, used);
				3109	if (migration_target == NUMA_NO_NODE)
				3110	return NUMA_NO_NODE;
				3111
				3112	node_demotion[node] = migration_target;
				3113
				3114	return migration_target;
				3115	}
				3116
				3117	/*
				3118	* When memory fills up on a node, memory contents can be
				3119	* automatically migrated to another node instead of
				3120	* discarded at reclaim.
				3121	*
				3122	* Establish a "migration path" which will start at nodes
				3123	* with CPUs and will follow the priorities used to build the
				3124	* page allocator zonelists.
				3125	*
				3126	* The difference here is that cycles must be avoided. If
				3127	* node0 migrates to node1, then neither node1, nor anything
				3128	* node1 migrates to can migrate to node0.
				3129	*
				3130	* This function can run simultaneously with readers of
				3131	* node_demotion[]. However, it can not run simultaneously
				3132	* with itself. Exclusion is provided by memory hotplug events
				3133	* being single-threaded.
				3134	*/
				3135	static void __set_migration_target_nodes(void)
				3136	{
				3137	nodemask_t next_pass = NODE_MASK_NONE;
				3138	nodemask_t this_pass = NODE_MASK_NONE;
				3139	nodemask_t used_targets = NODE_MASK_NONE;
				3140	int node;
				3141
				3142	/*
				3143	* Avoid any oddities like cycles that could occur
				3144	* from changes in the topology. This will leave
				3145	* a momentary gap when migration is disabled.
				3146	*/
				3147	disable_all_migrate_targets();
				3148
				3149	/*
				3150	* Allocations go close to CPUs, first. Assume that
				3151	* the migration path starts at the nodes with CPUs.
				3152	*/
				3153	next_pass = node_states[N_CPU];
				3154	again:
				3155	this_pass = next_pass;
				3156	next_pass = NODE_MASK_NONE;
				3157	/*
				3158	* To avoid cycles in the migration "graph", ensure
				3159	* that migration sources are not future targets by
				3160	* setting them in 'used_targets'. Do this only
				3161	* once per pass so that multiple source nodes can
				3162	* share a target node.
				3163	*
				3164	* 'used_targets' will become unavailable in future
				3165	* passes. This limits some opportunities for
				3166	* multiple source nodes to share a destination.
				3167	*/
				3168	nodes_or(used_targets, used_targets, this_pass);
				3169	for_each_node_mask(node, this_pass) {
				3170	int target_node = establish_migrate_target(node, &used_targets);
				3171
				3172	if (target_node == NUMA_NO_NODE)
				3173	continue;
				3174
				3175	/*
				3176	* Visit targets from this pass in the next pass.
				3177	* Eventually, every node will have been part of
				3178	* a pass, and will become set in 'used_targets'.
				3179	*/
				3180	node_set(target_node, next_pass);
				3181	}
				3182	/*
				3183	* 'next_pass' contains nodes which became migration
				3184	* targets in this pass. Make additional passes until
				3185	* no more migrations targets are available.
				3186	*/
				3187	if (!nodes_empty(next_pass))
				3188	goto again;
				3189	}
				3190
				3191	/*
				3192	* For callers that do not hold get_online_mems() already.
				3193	*/
				3194	__maybe_unused // <- temporay to prevent warnings during bisects
				3195	static void set_migration_target_nodes(void)
				3196	{
				3197	get_online_mems();
				3198	__set_migration_target_nodes();
				3199	put_online_mems();
				3200	}