// SPDX-License-Identifier: GPL-2.0
/*
 * Memory Migration functionality - linux/mm/migrate.c
 *
 * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
 *
 * Page migration was first developed in the context of the memory hotplug
 * project. The main authors of the migration code are:
 *
 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
 * Hirokazu Takahashi <taka@valinux.co.jp>
 * Dave Hansen <haveblue@us.ibm.com>
 * Christoph Lameter
 */

#include <linux/migrate.h>
#include <linux/export.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagemap.h>
#include <linux/buffer_head.h>
#include <linux/mm_inline.h>
#include <linux/nsproxy.h>
#include <linux/pagevec.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/writeback.h>
#include <linux/mempolicy.h>
#include <linux/vmalloc.h>
#include <linux/security.h>
#include <linux/backing-dev.h>
#include <linux/compaction.h>
#include <linux/syscalls.h>
#include <linux/compat.h>
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/gfp.h>
#include <linux/pagewalk.h>
#include <linux/pfn_t.h>
#include <linux/memremap.h>
#include <linux/userfaultfd_k.h>
#include <linux/balloon_compaction.h>
#include <linux/mmu_notifier.h>
#include <linux/page_idle.h>
#include <linux/page_owner.h>
#include <linux/sched/mm.h>
#include <linux/ptrace.h>
#include <linux/oom.h>
#include <linux/memory.h>

#include <asm/tlbflush.h>

#define CREATE_TRACE_POINTS
#include <trace/events/migrate.h>

#undef CREATE_TRACE_POINTS
#include <trace/hooks/mm.h>

#include "internal.h"

int isolate_movable_page(struct page *page, isolate_mode_t mode)
{
        struct address_space *mapping;

        /*
         * Avoid burning cycles with pages that are still under __free_pages(),
         * or that just got freed under us.
         *
         * In case we 'win' a race for a movable page being freed under us and
         * raise its refcount, preventing __free_pages() from doing its job,
         * the put_page() at the end of this block will take care of
         * releasing this page, thus avoiding a nasty leakage.
         */
        if (unlikely(!get_page_unless_zero(page)))
                goto out;

        /*
         * Check PageMovable before holding a PG_lock because the page's owner
         * assumes that nobody touches the PG_lock of a newly allocated page,
         * so unconditionally grabbing the lock ruins the page owner's side.
         */
        if (unlikely(!__PageMovable(page)))
                goto out_putpage;
        /*
         * As movable pages are not isolated from LRU lists, concurrent
         * compaction threads can race against page migration functions
         * as well as against a page being released.
         *
         * In order to avoid having an already isolated movable page
         * being (wrongly) re-isolated while it is under migration,
         * or to avoid attempting to isolate pages being released,
         * let's be sure we have the page lock
         * before proceeding with the movable page isolation steps.
         */
        if (unlikely(!trylock_page(page)))
                goto out_putpage;

        if (!PageMovable(page) || PageIsolated(page))
                goto out_no_isolated;

        mapping = page_mapping(page);
        VM_BUG_ON_PAGE(!mapping, page);

        if (!mapping->a_ops->isolate_page(page, mode))
                goto out_no_isolated;

        /* Driver shouldn't use PG_isolated bit of page->flags */
        WARN_ON_ONCE(PageIsolated(page));
        SetPageIsolated(page);
        unlock_page(page);

        return 0;

out_no_isolated:
        unlock_page(page);
out_putpage:
        put_page(page);
out:
        return -EBUSY;
}

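/*
 * Hand an isolated non-LRU movable page back to its driver via
 * mapping->a_ops->putback_page() and clear the PG_isolated marker.
 * Called with the page locked.
 */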
static void putback_movable_page(struct page *page)
{
        struct address_space *mapping;

        mapping = page_mapping(page);
        mapping->a_ops->putback_page(page);
        ClearPageIsolated(page);
}

/*
 * Put previously isolated pages back onto the appropriate lists
 * from where they were once taken off for compaction/migration.
 *
 * This function shall be used whenever the isolated pageset has been
 * built from LRU, balloon or hugetlbfs pages. See isolate_migratepages_range()
 * and isolate_hugetlb().
 */
void putback_movable_pages(struct list_head *l)
{
        struct page *page;
        struct page *page2;

        list_for_each_entry_safe(page, page2, l, lru) {
                if (unlikely(PageHuge(page))) {
                        putback_active_hugepage(page);
                        continue;
                }
                list_del(&page->lru);
                /*
                 * We isolated non-lru movable page so here we can use
                 * __PageMovable because LRU page's mapping cannot have
                 * PAGE_MAPPING_MOVABLE.
                 */
                if (unlikely(__PageMovable(page))) {
                        VM_BUG_ON_PAGE(!PageIsolated(page), page);
                        lock_page(page);
                        if (PageMovable(page))
                                putback_movable_page(page);
                        else
                                ClearPageIsolated(page);
                        unlock_page(page);
                        put_page(page);
                } else {
                        mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
                                        page_is_file_lru(page), -thp_nr_pages(page));
                        putback_lru_page(page);
                }
        }
}
EXPORT_SYMBOL_GPL(putback_movable_pages);

/*
 * Restore a potential migration pte to a working pte entry
 */
static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
                 unsigned long addr, void *old)
{
        struct page_vma_mapped_walk pvmw = {
                .page = old,
                .vma = vma,
                .address = addr,
                .flags = PVMW_SYNC | PVMW_MIGRATION,
        };
        struct page *new;
        pte_t pte;
        swp_entry_t entry;

        VM_BUG_ON_PAGE(PageTail(page), page);
        while (page_vma_mapped_walk(&pvmw)) {
                if (PageKsm(page))
                        new = page;
                else
                        new = page - pvmw.page->index +
                                linear_page_index(vma, pvmw.address);

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
                /* PMD-mapped THP migration entry */
                if (!pvmw.pte) {
                        VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page);
                        remove_migration_pmd(&pvmw, new);
                        continue;
                }
#endif

                get_page(new);
                pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot)));
                if (pte_swp_soft_dirty(*pvmw.pte))
                        pte = pte_mksoft_dirty(pte);

                /*
                 * Recheck VMA as permissions can change since migration started
                 */
                entry = pte_to_swp_entry(*pvmw.pte);
                if (is_writable_migration_entry(entry))
                        pte = maybe_mkwrite(pte, vma);
                else if (pte_swp_uffd_wp(*pvmw.pte))
                        pte = pte_mkuffd_wp(pte);

                if (unlikely(is_device_private_page(new))) {
                        if (pte_write(pte))
                                entry = make_writable_device_private_entry(
                                                        page_to_pfn(new));
                        else
                                entry = make_readable_device_private_entry(
                                                        page_to_pfn(new));
                        pte = swp_entry_to_pte(entry);
                        if (pte_swp_soft_dirty(*pvmw.pte))
                                pte = pte_swp_mksoft_dirty(pte);
                        if (pte_swp_uffd_wp(*pvmw.pte))
                                pte = pte_swp_mkuffd_wp(pte);
                }

#ifdef CONFIG_HUGETLB_PAGE
                if (PageHuge(new)) {
                        unsigned int shift = huge_page_shift(hstate_vma(vma));

                        pte = pte_mkhuge(pte);
                        pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
                        set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
                        if (PageAnon(new))
                                hugepage_add_anon_rmap(new, vma, pvmw.address);
                        else
                                page_dup_rmap(new, true);
                } else
#endif
                {
                        set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);

                        if (PageAnon(new))
                                page_add_anon_rmap(new, vma, pvmw.address, false);
                        else
                                page_add_file_rmap(new, false);
                }
                if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new))
                        mlock_vma_page(new);

                if (PageTransHuge(page) && PageMlocked(page))
                        clear_page_mlock(page);

                /* No need to invalidate - it was non-present before */
                update_mmu_cache(vma, pvmw.address, pvmw.pte);
        }

        return true;
}

/*
 * Get rid of all migration entries and replace them by
 * references to the indicated page.
 */
void remove_migration_ptes(struct page *old, struct page *new, bool locked)
{
        struct rmap_walk_control rwc = {
                .rmap_one = remove_migration_pte,
                .arg = old,
        };

        if (locked)
                rmap_walk_locked(new, &rwc);
        else
                rmap_walk(new, &rwc);
}

/*
 * Something used the pte of a page under migration. We need to
 * get to the page and wait until migration is finished.
 * When we return from this function the fault will be retried.
 */
void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
                                spinlock_t *ptl)
{
        pte_t pte;
        swp_entry_t entry;
        struct page *page;

        spin_lock(ptl);
        pte = *ptep;
        if (!is_swap_pte(pte))
                goto out;

        entry = pte_to_swp_entry(pte);
        if (!is_migration_entry(entry))
                goto out;

        page = pfn_swap_entry_to_page(entry);
        page = compound_head(page);

        /*
         * Once page cache replacement of page migration started, page_count
         * is zero; but we must not call put_and_wait_on_page_locked() without
         * a ref. Use get_page_unless_zero(), and just fault again if it fails.
         */
        if (!get_page_unless_zero(page))
                goto out;
        pte_unmap_unlock(ptep, ptl);
        put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
        return;
out:
        pte_unmap_unlock(ptep, ptl);
}

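/*
 * Wrappers around __migration_entry_wait() that look up the page table
 * lock for a regular pte and for a hugetlb pte, respectively.
 */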
void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
                                unsigned long address)
{
        spinlock_t *ptl = pte_lockptr(mm, pmd);
        pte_t *ptep = pte_offset_map(pmd, address);
        __migration_entry_wait(mm, ptep, ptl);
}

void migration_entry_wait_huge(struct vm_area_struct *vma,
                struct mm_struct *mm, pte_t *pte)
{
        spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte);
        __migration_entry_wait(mm, pte, ptl);
}

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
{
        spinlock_t *ptl;
        struct page *page;

        ptl = pmd_lock(mm, pmd);
        if (!is_pmd_migration_entry(*pmd))
                goto unlock;
        page = pfn_swap_entry_to_page(pmd_to_swp_entry(*pmd));
        if (!get_page_unless_zero(page))
                goto unlock;
        spin_unlock(ptl);
        put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
        return;
unlock:
        spin_unlock(ptl);
}
#endif

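/*
 * Reference count a page is expected to have while it is being migrated:
 * one base reference, one extra for device private (ZONE_DEVICE) pages
 * and, when the page has a mapping, one per subpage in the page cache
 * plus one for private data.
 */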
static int expected_page_refs(struct address_space *mapping, struct page *page)
{
        int expected_count = 1;

        /*
         * Device private pages have an extra refcount as they are
         * ZONE_DEVICE pages.
         */
        expected_count += is_device_private_page(page);
        if (mapping)
                expected_count += thp_nr_pages(page) + page_has_private(page);

        return expected_count;
}

/*
 * Replace the page in the mapping.
 *
 * The number of remaining references must be:
 * 1 for anonymous pages without a mapping
 * 2 for pages with a mapping
 * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
 */
int migrate_page_move_mapping(struct address_space *mapping,
                struct page *newpage, struct page *page, int extra_count)
{
        XA_STATE(xas, &mapping->i_pages, page_index(page));
        struct zone *oldzone, *newzone;
        int dirty;
        int expected_count = expected_page_refs(mapping, page) + extra_count;
        int nr = thp_nr_pages(page);

        if (!mapping) {
                /* Anonymous page without mapping */
                if (page_count(page) != expected_count)
                        return -EAGAIN;

                /* No turning back from here */
                newpage->index = page->index;
                newpage->mapping = page->mapping;
                if (PageSwapBacked(page))
                        __SetPageSwapBacked(newpage);

                return MIGRATEPAGE_SUCCESS;
        }

        oldzone = page_zone(page);
        newzone = page_zone(newpage);

        xas_lock_irq(&xas);
        if (page_count(page) != expected_count || xas_load(&xas) != page) {
                xas_unlock_irq(&xas);
                return -EAGAIN;
        }

        if (!page_ref_freeze(page, expected_count)) {
                xas_unlock_irq(&xas);
                return -EAGAIN;
        }

        /*
         * Now we know that no one else is looking at the page:
         * no turning back from here.
         */
        newpage->index = page->index;
        newpage->mapping = page->mapping;
        page_ref_add(newpage, nr); /* add cache reference */
        if (PageSwapBacked(page)) {
                __SetPageSwapBacked(newpage);
                if (PageSwapCache(page)) {
                        SetPageSwapCache(newpage);
                        set_page_private(newpage, page_private(page));
                }
        } else {
                VM_BUG_ON_PAGE(PageSwapCache(page), page);
        }

        /* Move dirty while page refs frozen and newpage not yet exposed */
        dirty = PageDirty(page);
        if (dirty) {
                ClearPageDirty(page);
                SetPageDirty(newpage);
        }

        xas_store(&xas, newpage);
        if (PageTransHuge(page)) {
                int i;

                for (i = 1; i < nr; i++) {
                        xas_next(&xas);
                        xas_store(&xas, newpage);
                }
        }

        /*
         * Drop cache reference from old page by unfreezing
         * to one less reference.
         * We know this isn't the last reference.
         */
        page_ref_unfreeze(page, expected_count - nr);

        xas_unlock(&xas);
        /* Leave irq disabled to prevent preemption while updating stats */

        /*
         * If moved to a different zone then also account
         * the page for that zone. Other VM counters will be
         * taken care of when we establish references to the
         * new page and drop references to the old page.
         *
         * Note that anonymous pages are accounted for
         * via NR_FILE_PAGES and NR_ANON_MAPPED if they
         * are mapped to swap space.
         */
        if (newzone != oldzone) {
                struct lruvec *old_lruvec, *new_lruvec;
                struct mem_cgroup *memcg;

                memcg = page_memcg(page);
                old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat);
                new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat);

                __mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr);
                __mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr);
                if (PageSwapBacked(page) && !PageSwapCache(page)) {
                        __mod_lruvec_state(old_lruvec, NR_SHMEM, -nr);
                        __mod_lruvec_state(new_lruvec, NR_SHMEM, nr);
                }
#ifdef CONFIG_SWAP
                if (PageSwapCache(page)) {
                        __mod_lruvec_state(old_lruvec, NR_SWAPCACHE, -nr);
                        __mod_lruvec_state(new_lruvec, NR_SWAPCACHE, nr);
                }
#endif
                if (dirty && mapping_can_writeback(mapping)) {
                        __mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr);
                        __mod_zone_page_state(oldzone, NR_ZONE_WRITE_PENDING, -nr);
                        __mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr);
                        __mod_zone_page_state(newzone, NR_ZONE_WRITE_PENDING, nr);
                }
        }
        local_irq_enable();

        return MIGRATEPAGE_SUCCESS;
}
EXPORT_SYMBOL(migrate_page_move_mapping);

/*
 * The expected number of remaining references is the same as that
 * of migrate_page_move_mapping().
 */
int migrate_huge_page_move_mapping(struct address_space *mapping,
                                   struct page *newpage, struct page *page)
{
        XA_STATE(xas, &mapping->i_pages, page_index(page));
        int expected_count;

        xas_lock_irq(&xas);
        expected_count = 2 + page_has_private(page);
        if (page_count(page) != expected_count || xas_load(&xas) != page) {
                xas_unlock_irq(&xas);
                return -EAGAIN;
        }

        if (!page_ref_freeze(page, expected_count)) {
                xas_unlock_irq(&xas);
                return -EAGAIN;
        }

        newpage->index = page->index;
        newpage->mapping = page->mapping;

        get_page(newpage);

        xas_store(&xas, newpage);

        page_ref_unfreeze(page, expected_count - 1);

        xas_unlock_irq(&xas);

        return MIGRATEPAGE_SUCCESS;
}

/*
 * Copy the page to its new location
 */
void migrate_page_states(struct page *newpage, struct page *page)
{
        int cpupid;

        if (PageError(page))
                SetPageError(newpage);
        if (PageReferenced(page))
                SetPageReferenced(newpage);
        if (PageUptodate(page))
                SetPageUptodate(newpage);
        if (TestClearPageActive(page)) {
                VM_BUG_ON_PAGE(PageUnevictable(page), page);
                SetPageActive(newpage);
        } else if (TestClearPageUnevictable(page))
                SetPageUnevictable(newpage);
        if (PageWorkingset(page))
                SetPageWorkingset(newpage);
        if (PageChecked(page))
                SetPageChecked(newpage);
        if (PageMappedToDisk(page))
                SetPageMappedToDisk(newpage);

        trace_android_vh_look_around_migrate_page(page, newpage);

        /* Move dirty on pages not done by migrate_page_move_mapping() */
        if (PageDirty(page))
                SetPageDirty(newpage);

        if (page_is_young(page))
                set_page_young(newpage);
        if (page_is_idle(page))
                set_page_idle(newpage);

        /*
         * Copy NUMA information to the new page, to prevent over-eager
         * future migrations of this same page.
         */
        cpupid = page_cpupid_xchg_last(page, -1);
        page_cpupid_xchg_last(newpage, cpupid);

        ksm_migrate_page(newpage, page);
        /*
         * Please do not reorder this without considering how mm/ksm.c's
         * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache().
         */
        if (PageSwapCache(page))
                ClearPageSwapCache(page);
        ClearPagePrivate(page);

        /* page->private contains hugetlb specific flags */
        if (!PageHuge(page))
                set_page_private(page, 0);

        /*
         * If any waiters have accumulated on the new page then
         * wake them up.
         */
        if (PageWriteback(newpage))
                end_page_writeback(newpage);

        /*
         * PG_readahead shares the same bit with PG_reclaim. The above
         * end_page_writeback() may clear PG_readahead mistakenly, so set the
         * bit after that.
         */
        if (PageReadahead(page))
                SetPageReadahead(newpage);

        copy_page_owner(page, newpage);

        if (!PageHuge(page))
                mem_cgroup_migrate(page, newpage);
}
EXPORT_SYMBOL(migrate_page_states);

void migrate_page_copy(struct page *newpage, struct page *page)
{
        if (PageHuge(page) || PageTransHuge(page))
                copy_huge_page(newpage, page);
        else
                copy_highpage(newpage, page);

        migrate_page_states(newpage, page);
}
EXPORT_SYMBOL(migrate_page_copy);

/************************************************************
 * Migration functions
 ***********************************************************/

/*
 * Common logic to directly migrate a single LRU page suitable for
 * pages that do not use PagePrivate/PagePrivate2.
 *
 * Pages are locked upon entry and exit.
 */
int migrate_page(struct address_space *mapping,
                struct page *newpage, struct page *page,
                enum migrate_mode mode)
{
        int rc;

        BUG_ON(PageWriteback(page));    /* Writeback must be complete */

        rc = migrate_page_move_mapping(mapping, newpage, page, 0);

        if (rc != MIGRATEPAGE_SUCCESS)
                return rc;

        if (mode != MIGRATE_SYNC_NO_COPY)
                migrate_page_copy(newpage, page);
        else
                migrate_page_states(newpage, page);
        return MIGRATEPAGE_SUCCESS;
}
EXPORT_SYMBOL(migrate_page);

#ifdef CONFIG_BLOCK
/* Returns true if all buffers are successfully locked */
static bool buffer_migrate_lock_buffers(struct buffer_head *head,
                                                        enum migrate_mode mode)
{
        struct buffer_head *bh = head;

        /* Simple case, sync compaction */
        if (mode != MIGRATE_ASYNC) {
                do {
                        lock_buffer(bh);
                        bh = bh->b_this_page;

                } while (bh != head);

                return true;
        }

        /* async case, we cannot block on lock_buffer so use trylock_buffer */
        do {
                if (!trylock_buffer(bh)) {
                        /*
                         * We failed to lock the buffer and cannot stall in
                         * async migration. Release the taken locks
                         */
                        struct buffer_head *failed_bh = bh;
                        bh = head;
                        while (bh != failed_bh) {
                                unlock_buffer(bh);
                                bh = bh->b_this_page;
                        }
                        return false;
                }

                bh = bh->b_this_page;
        } while (bh != head);
        return true;
}

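/*
 * Common implementation for buffer_migrate_page() and
 * buffer_migrate_page_norefs(). With @check_refs set, the buffer heads must
 * additionally have no extra references (b_count == 0) before the page may
 * be moved.
 */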
static int __buffer_migrate_page(struct address_space *mapping,
                struct page *newpage, struct page *page, enum migrate_mode mode,
                bool check_refs)
{
        struct buffer_head *bh, *head;
        int rc;
        int expected_count;

        if (!page_has_buffers(page))
                return migrate_page(mapping, newpage, page, mode);

        /* Check whether page does not have extra refs before we do more work */
        expected_count = expected_page_refs(mapping, page);
        if (page_count(page) != expected_count)
                return -EAGAIN;

        head = page_buffers(page);
        if (!buffer_migrate_lock_buffers(head, mode))
                return -EAGAIN;

        if (check_refs) {
                bool busy;
                bool invalidated = false;

recheck_buffers:
                busy = false;
                spin_lock(&mapping->private_lock);
                bh = head;
                do {
                        if (atomic_read(&bh->b_count)) {
                                busy = true;
                                break;
                        }
                        bh = bh->b_this_page;
                } while (bh != head);
                if (busy) {
                        if (invalidated) {
                                rc = -EAGAIN;
                                goto unlock_buffers;
                        }
                        spin_unlock(&mapping->private_lock);
                        invalidate_bh_lrus();
                        invalidated = true;
                        goto recheck_buffers;
                }
        }

        rc = migrate_page_move_mapping(mapping, newpage, page, 0);
        if (rc != MIGRATEPAGE_SUCCESS)
                goto unlock_buffers;

        attach_page_private(newpage, detach_page_private(page));

        bh = head;
        do {
                set_bh_page(bh, newpage, bh_offset(bh));
                bh = bh->b_this_page;

        } while (bh != head);

        if (mode != MIGRATE_SYNC_NO_COPY)
                migrate_page_copy(newpage, page);
        else
                migrate_page_states(newpage, page);

        rc = MIGRATEPAGE_SUCCESS;
unlock_buffers:
        if (check_refs)
                spin_unlock(&mapping->private_lock);
        bh = head;
        do {
                unlock_buffer(bh);
                bh = bh->b_this_page;

        } while (bh != head);

        return rc;
}

/*
 * Migration function for pages with buffers. This function can only be used
 * if the underlying filesystem guarantees that no other references to "page"
 * exist. For example attached buffer heads are accessed only under page lock.
 */
int buffer_migrate_page(struct address_space *mapping,
                struct page *newpage, struct page *page, enum migrate_mode mode)
{
        return __buffer_migrate_page(mapping, newpage, page, mode, false);
}
EXPORT_SYMBOL(buffer_migrate_page);

/*
 * Same as above except that this variant is more careful and checks that there
 * are also no buffer head references. This function is the right one for
 * mappings where buffer heads are directly looked up and referenced (such as
 * block device mappings).
 */
int buffer_migrate_page_norefs(struct address_space *mapping,
                struct page *newpage, struct page *page, enum migrate_mode mode)
{
        return __buffer_migrate_page(mapping, newpage, page, mode, true);
}
#endif

/*
 * Writeback a page to clean the dirty state
 */
static int writeout(struct address_space *mapping, struct page *page)
{
        struct writeback_control wbc = {
                .sync_mode = WB_SYNC_NONE,
                .nr_to_write = 1,
                .range_start = 0,
                .range_end = LLONG_MAX,
                .for_reclaim = 1
        };
        int rc;

        if (!mapping->a_ops->writepage)
                /* No write method for the address space */
                return -EINVAL;

        if (!clear_page_dirty_for_io(page))
                /* Someone else already triggered a write */
                return -EAGAIN;

        /*
         * A dirty page may imply that the underlying filesystem has
         * the page on some queue. So the page must be clean for
         * migration. Writeout may mean we lose the lock and the
         * page state is no longer what we checked for earlier.
         * At this point we know that the migration attempt cannot
         * be successful.
         */
        remove_migration_ptes(page, page, false);

        rc = mapping->a_ops->writepage(page, &wbc);

        if (rc != AOP_WRITEPAGE_ACTIVATE)
                /* unlocked. Relock */
                lock_page(page);

        return (rc < 0) ? -EIO : -EAGAIN;
}

/*
 * Default handling if a filesystem does not provide a migration function.
 */
static int fallback_migrate_page(struct address_space *mapping,
        struct page *newpage, struct page *page, enum migrate_mode mode)
{
        if (PageDirty(page)) {
                /* Only writeback pages in full synchronous migration */
                switch (mode) {
                case MIGRATE_SYNC:
                case MIGRATE_SYNC_NO_COPY:
                        break;
                default:
                        return -EBUSY;
                }
                return writeout(mapping, page);
        }

        /*
         * Buffers may be managed in a filesystem specific way.
         * We must have no buffers or drop them.
         */
        if (page_has_private(page) &&
            !try_to_release_page(page, GFP_KERNEL))
                return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY;

        return migrate_page(mapping, newpage, page, mode);
}

/*
 * Move a page to a newly allocated page
 * The page is locked and all ptes have been successfully removed.
 *
 * The new page will have replaced the old page if this function
 * is successful.
 *
 * Return value:
 *   < 0 - error code
 *  MIGRATEPAGE_SUCCESS - success
 */
static int move_to_new_page(struct page *newpage, struct page *page,
                                enum migrate_mode mode)
{
        struct address_space *mapping;
        int rc = -EAGAIN;
        bool is_lru = !__PageMovable(page);

        VM_BUG_ON_PAGE(!PageLocked(page), page);
        VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);

        mapping = page_mapping(page);

        if (likely(is_lru)) {
                if (!mapping)
                        rc = migrate_page(mapping, newpage, page, mode);
                else if (mapping->a_ops->migratepage)
                        /*
                         * Most pages have a mapping and most filesystems
                         * provide a migratepage callback. Anonymous pages
                         * are part of swap space which also has its own
                         * migratepage callback. This is the most common path
                         * for page migration.
                         */
                        rc = mapping->a_ops->migratepage(mapping, newpage,
                                                        page, mode);
                else
                        rc = fallback_migrate_page(mapping, newpage,
                                                        page, mode);
        } else {
                /*
                 * In case of non-lru page, it could be released after
                 * isolation step. In that case, we shouldn't try migration.
                 */
                VM_BUG_ON_PAGE(!PageIsolated(page), page);
                if (!PageMovable(page)) {
                        rc = MIGRATEPAGE_SUCCESS;
                        ClearPageIsolated(page);
                        goto out;
                }

                rc = mapping->a_ops->migratepage(mapping, newpage,
                                                page, mode);
                WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS &&
                        !PageIsolated(page));
        }

        /*
         * When successful, old pagecache page->mapping must be cleared before
         * page is freed; but stats require that PageAnon be left as PageAnon.
         */
        if (rc == MIGRATEPAGE_SUCCESS) {
                if (__PageMovable(page)) {
                        VM_BUG_ON_PAGE(!PageIsolated(page), page);

                        /*
                         * We clear PG_movable under page_lock so any compactor
                         * cannot try to migrate this page.
                         */
                        ClearPageIsolated(page);
                }

                /*
                 * Anonymous and movable page->mapping will be cleared by
                 * free_pages_prepare so don't reset it here; that keeps the
                 * page type checks (PageAnon, for example) working.
                 */
                if (!PageMappingFlags(page))
                        page->mapping = NULL;

                if (likely(!is_zone_device_page(newpage))) {
                        int i, nr = compound_nr(newpage);

                        for (i = 0; i < nr; i++)
                                flush_dcache_page(newpage + i);
                }
        }
out:
        return rc;
}

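/*
 * Lock the old page (and the new one), replace the old page's mappings
 * with migration entries and hand the actual move off to
 * move_to_new_page(). Returns MIGRATEPAGE_SUCCESS or a negative error
 * code such as -EAGAIN (try again later) or -EBUSY.
 */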
static int __unmap_and_move(struct page *page, struct page *newpage,
                                int force, enum migrate_mode mode)
{
        int rc = -EAGAIN;
        bool page_was_mapped = false;
        struct anon_vma *anon_vma = NULL;
        bool is_lru = !__PageMovable(page);

        if (!trylock_page(page)) {
                if (!force || mode == MIGRATE_ASYNC)
                        goto out;

                /*
                 * It's not safe for direct compaction to call lock_page.
                 * For example, during page readahead pages are added locked
                 * to the LRU. Later, when the IO completes the pages are
                 * marked uptodate and unlocked. However, the queueing
                 * could be merging multiple pages for one bio (e.g.
                 * mpage_readahead). If an allocation happens for the
                 * second or third page, the process can end up locking
                 * the same page twice and deadlocking. Rather than
                 * trying to be clever about what pages can be locked,
                 * avoid the use of lock_page for direct compaction
                 * altogether.
                 */
                if (current->flags & PF_MEMALLOC)
                        goto out;

                lock_page(page);
        }

        if (PageWriteback(page)) {
                /*
                 * Only in the case of a full synchronous migration is it
                 * necessary to wait for PageWriteback. In the async case,
                 * the retry loop is too short and in the sync-light case,
                 * the overhead of stalling is too much
                 */
                switch (mode) {
                case MIGRATE_SYNC:
                case MIGRATE_SYNC_NO_COPY:
                        break;
                default:
                        rc = -EBUSY;
                        goto out_unlock;
                }
                if (!force)
                        goto out_unlock;
                wait_on_page_writeback(page);
        }

        /*
         * By try_to_migrate(), page->mapcount goes down to 0 here. In this case,
         * we cannot notice that anon_vma is freed while we migrate a page.
         * This get_anon_vma() delays freeing the anon_vma pointer until the end
         * of migration. File cache pages are no problem because of page_lock():
         * file caches may use write_page() or lock_page() in migration, so we
         * only need to care about anon pages here.
         *
         * Only page_get_anon_vma() understands the subtleties of
         * getting a hold on an anon_vma from outside one of its mms.
         * But if we cannot get anon_vma, then we won't need it anyway,
         * because that implies that the anon page is no longer mapped
         * (and cannot be remapped so long as we hold the page lock).
         */
        if (PageAnon(page) && !PageKsm(page))
                anon_vma = page_get_anon_vma(page);

        /*
         * Block others from accessing the new page when we get around to
         * establishing additional references. We are usually the only one
         * holding a reference to newpage at this point. We used to have a BUG
         * here if trylock_page(newpage) fails, but would like to allow for
         * cases where there might be a race with the previous use of newpage.
         * This is much like races on refcount of oldpage: just don't BUG().
         */
        if (unlikely(!trylock_page(newpage)))
                goto out_unlock;

        if (unlikely(!is_lru)) {
                rc = move_to_new_page(newpage, page, mode);
                goto out_unlock_both;
        }

        /*
         * Corner case handling:
         * 1. When a new swap-cache page is read in, it is added to the LRU
         * and treated as swapcache but it has no rmap yet.
         * Calling try_to_unmap() against a page->mapping==NULL page will
         * trigger a BUG. So handle it here.
         * 2. An orphaned page (see truncate_cleanup_page) might have
         * fs-private metadata. The page can be picked up due to memory
         * offlining. Everywhere else except page reclaim, the page is
         * invisible to the vm, so the page can not be migrated. So try to
         * free the metadata, so the page can be freed.
         */
        if (!page->mapping) {
                VM_BUG_ON_PAGE(PageAnon(page), page);
                if (page_has_private(page)) {
                        try_to_free_buffers(page);
                        goto out_unlock_both;
                }
        } else if (page_mapped(page)) {
                /* Establish migration ptes */
                VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
                                page);
                try_to_migrate(page, 0);
                page_was_mapped = true;
        }

        if (!page_mapped(page))
                rc = move_to_new_page(newpage, page, mode);

        if (page_was_mapped)
                remove_migration_ptes(page,
                        rc == MIGRATEPAGE_SUCCESS ? newpage : page, false);

out_unlock_both:
        unlock_page(newpage);
out_unlock:
        /* Drop an anon_vma reference if we took one */
        if (anon_vma)
                put_anon_vma(anon_vma);
        unlock_page(page);
out:
        /*
         * If migration is successful, decrease refcount of the newpage
         * which will not free the page because new page owner increased
         * refcounter. As well, if it is LRU page, add the page to LRU
         * list in here. Use the old state of the isolated source page to
         * determine if we migrated a LRU page. newpage was already unlocked
         * and possibly modified by its owner - don't rely on the page
         * state.
         */
        if (rc == MIGRATEPAGE_SUCCESS) {
                if (unlikely(!is_lru))
                        put_page(newpage);
                else
                        putback_lru_page(newpage);
        }

        return rc;
}


/*
 * node_demotion[] example:
 *
 * Consider a system with two sockets. Each socket has
 * three classes of memory attached: fast, medium and slow.
 * Each memory class is placed in its own NUMA node. The
 * CPUs are placed in the node with the "fast" memory. The
 * 6 NUMA nodes (0-5) might be split among the sockets like
 * this:
 *
 *	Socket A: 0, 1, 2
 *	Socket B: 3, 4, 5
 *
 * When Node 0 fills up, its memory should be migrated to
 * Node 1. When Node 1 fills up, it should be migrated to
 * Node 2. The migration path starts on the nodes with the
 * processors (since allocations default to this node) and
 * fast memory, progresses through medium and ends with the
 * slow memory:
 *
 *	0 -> 1 -> 2 -> stop
 *	3 -> 4 -> 5 -> stop
 *
 * This is represented in the node_demotion[] like this:
 *
 *	{  1, // Node 0 migrates to 1
 *	   2, // Node 1 migrates to 2
 *	  -1, // Node 2 does not migrate
 *	   4, // Node 3 migrates to 4
 *	   5, // Node 4 migrates to 5
 *	  -1} // Node 5 does not migrate
 */

1146/*
1147 * Writes to this array occur without locking. Cycles are
1148 * not allowed: Node X demotes to Y which demotes to X...
1149 *
1150 * If multiple reads are performed, a single rcu_read_lock()
1151 * must be held over all reads to ensure that no cycles are
1152 * observed.
1153 */
1154static int node_demotion[MAX_NUMNODES] __read_mostly =
1155 {[0 ... MAX_NUMNODES - 1] = NUMA_NO_NODE};
1156
1157/**
1158 * next_demotion_node() - Get the next node in the demotion path
1159 * @node: The starting node to lookup the next node
1160 *
Randy Dunlapc9bd7d12021-09-02 15:00:36 -07001161 * Return: node id for next memory node in the demotion path hierarchy
Dave Hansen79c28a42021-09-02 14:59:06 -07001162 * from @node; NUMA_NO_NODE if @node is terminal. This does not keep
1163 * @node online or guarantee that it *continues* to be the next demotion
1164 * target.
1165 */
1166int next_demotion_node(int node)
1167{
1168 int target;
1169
1170 /*
1171 * node_demotion[] is updated without excluding this
1172 * function from running. RCU doesn't provide any
1173 * compiler barriers, so the READ_ONCE() is required
1174 * to avoid compiler reordering or read merging.
1175 *
1176 * Make sure to use RCU over entire code blocks if
1177 * node_demotion[] reads need to be consistent.
1178 */
1179 rcu_read_lock();
1180 target = READ_ONCE(node_demotion[node]);
1181 rcu_read_unlock();
1182
1183 return target;
1184}
1185
Minchan Kim0dabec92011-10-31 17:06:57 -07001186/*
1187 * Obtain the lock on page, remove all ptes and migrate the page
1188 * to the newly allocated page in newpage.
1189 */
Linus Torvalds6ec44762020-07-08 10:48:35 -07001190static int unmap_and_move(new_page_t get_new_page,
Geert Uytterhoevenef2a5152015-04-14 15:44:22 -07001191 free_page_t put_new_page,
1192 unsigned long private, struct page *page,
Naoya Horiguchiadd05ce2015-06-24 16:56:50 -07001193 int force, enum migrate_mode mode,
Yang Shidd4ae782020-12-14 19:13:06 -08001194 enum migrate_reason reason,
1195 struct list_head *ret)
Minchan Kim0dabec92011-10-31 17:06:57 -07001196{
Hugh Dickins2def7422015-11-05 18:49:46 -08001197 int rc = MIGRATEPAGE_SUCCESS;
Yang Shi74d4a572019-11-30 17:57:12 -08001198 struct page *newpage = NULL;
Minchan Kim0dabec92011-10-31 17:06:57 -07001199
Michal Hocko94723aa2018-04-10 16:30:07 -07001200 if (!thp_migration_supported() && PageTransHuge(page))
Yang Shid532e2e2020-12-14 19:13:16 -08001201 return -ENOSYS;
Michal Hocko94723aa2018-04-10 16:30:07 -07001202
Minchan Kim0dabec92011-10-31 17:06:57 -07001203 if (page_count(page) == 1) {
1204 /* page was freed from under us. So we are done. */
Minchan Kimc6c919e2016-07-26 15:23:02 -07001205 ClearPageActive(page);
1206 ClearPageUnevictable(page);
Minchan Kimbda807d2016-07-26 15:23:05 -07001207 if (unlikely(__PageMovable(page))) {
1208 lock_page(page);
1209 if (!PageMovable(page))
andrew.yang0d8a8362022-03-15 16:58:34 +11001210 ClearPageIsolated(page);
Minchan Kimbda807d2016-07-26 15:23:05 -07001211 unlock_page(page);
1212 }
Minchan Kim0dabec92011-10-31 17:06:57 -07001213 goto out;
1214 }
1215
Yang Shi74d4a572019-11-30 17:57:12 -08001216 newpage = get_new_page(page, private);
1217 if (!newpage)
1218 return -ENOMEM;
1219
Hugh Dickins9c620e22013-02-22 16:35:14 -08001220 rc = __unmap_and_move(page, newpage, force, mode);
Minchan Kimc6c919e2016-07-26 15:23:02 -07001221 if (rc == MIGRATEPAGE_SUCCESS)
Vlastimil Babka7cd12b42016-03-15 14:56:18 -07001222 set_page_owner_migrate_reason(newpage, reason);
Rafael Aquinibf6bddf12012-12-11 16:02:42 -08001223
Minchan Kim0dabec92011-10-31 17:06:57 -07001224out:
Christoph Lametere24f0b82006-06-23 02:03:51 -07001225 if (rc != -EAGAIN) {
Minchan Kim0dabec92011-10-31 17:06:57 -07001226 /*
1227 * A page that has been migrated has all references
1228 * removed and will be freed. A page that has not been
Ralph Campbellc23a0c92020-01-30 22:14:41 -08001229 * migrated will have kept its references and be restored.
Minchan Kim0dabec92011-10-31 17:06:57 -07001230 */
1231 list_del(&page->lru);
Christoph Lametere24f0b82006-06-23 02:03:51 -07001232 }
David Rientjes68711a72014-06-04 16:08:25 -07001233
Christoph Lameter95a402c2006-06-23 02:03:53 -07001234 /*
Minchan Kimc6c919e2016-07-26 15:23:02 -07001235	 * If migration is successful, release the reference grabbed during
1236	 * isolation. Otherwise, restore the page to the right list unless
1237 * we want to retry.
Christoph Lameter95a402c2006-06-23 02:03:53 -07001238 */
Minchan Kimc6c919e2016-07-26 15:23:02 -07001239 if (rc == MIGRATEPAGE_SUCCESS) {
Yang Shidd4ae782020-12-14 19:13:06 -08001240 /*
1241		 * Compaction can also migrate non-LRU pages, which are
1242		 * not accounted in NR_ISOLATED_*. They can be recognized
1243		 * as __PageMovable.
1244 */
1245 if (likely(!__PageMovable(page)))
1246 mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
1247 page_is_file_lru(page), -thp_nr_pages(page));
1248
Oscar Salvador79f5f8f2020-10-15 20:07:09 -07001249 if (reason != MR_MEMORY_FAILURE)
Minchan Kimc6c919e2016-07-26 15:23:02 -07001250 /*
Oscar Salvador79f5f8f2020-10-15 20:07:09 -07001251 * We release the page in page_handle_poison.
Minchan Kimc6c919e2016-07-26 15:23:02 -07001252 */
Oscar Salvador79f5f8f2020-10-15 20:07:09 -07001253 put_page(page);
Minchan Kimc6c919e2016-07-26 15:23:02 -07001254 } else {
Yang Shidd4ae782020-12-14 19:13:06 -08001255 if (rc != -EAGAIN)
1256 list_add_tail(&page->lru, ret);
Minchan Kimbda807d2016-07-26 15:23:05 -07001257
Minchan Kimc6c919e2016-07-26 15:23:02 -07001258 if (put_new_page)
1259 put_new_page(newpage, private);
1260 else
1261 put_page(newpage);
1262 }
David Rientjes68711a72014-06-04 16:08:25 -07001263
Christoph Lametere24f0b82006-06-23 02:03:51 -07001264 return rc;
1265}
1266
1267/*
Naoya Horiguchi290408d2010-09-08 10:19:35 +09001268 * Counterpart of unmap_and_move() for hugepage migration.
1269 *
1270 * This function doesn't wait for the completion of hugepage I/O
1271 * because there is no race between I/O and migration for hugepages.
1272 * Note that currently hugepage I/O occurs only in direct I/O
1273 * where no lock is held and PG_writeback is irrelevant,
1274 * and the writeback status of all subpages is counted in the reference
1275 * count of the head page (i.e. if all subpages of a 2MB hugepage are
1276 * under direct I/O, the refcount of the head page is 512 plus a bit more.)
1277 * This means that when we try to migrate a hugepage whose subpages are
1278 * doing direct I/O, some references remain after try_to_unmap() and
1279 * the hugepage migration fails without data corruption.
1280 *
1281 * There is also no race when direct I/O is issued on the page under migration,
1282 * because then pte is replaced with migration swap entry and direct I/O code
1283 * will wait in the page fault for migration to complete.
1284 */
1285static int unmap_and_move_huge_page(new_page_t get_new_page,
David Rientjes68711a72014-06-04 16:08:25 -07001286 free_page_t put_new_page, unsigned long private,
1287 struct page *hpage, int force,
Yang Shidd4ae782020-12-14 19:13:06 -08001288 enum migrate_mode mode, int reason,
1289 struct list_head *ret)
Naoya Horiguchi290408d2010-09-08 10:19:35 +09001290{
Hugh Dickins2def7422015-11-05 18:49:46 -08001291 int rc = -EAGAIN;
Hugh Dickins2ebba6b2014-12-12 16:56:19 -08001292 int page_was_mapped = 0;
Joonsoo Kim32665f22014-01-21 15:51:15 -08001293 struct page *new_hpage;
Naoya Horiguchi290408d2010-09-08 10:19:35 +09001294 struct anon_vma *anon_vma = NULL;
Mike Kravetzc0d03812020-04-01 21:11:05 -07001295 struct address_space *mapping = NULL;
Naoya Horiguchi290408d2010-09-08 10:19:35 +09001296
Naoya Horiguchi83467ef2013-09-11 14:22:11 -07001297 /*
Anshuman Khandual7ed2c312019-03-05 15:43:44 -08001298	 * Migratability of hugepages depends on the architecture and the hugepage size.
Naoya Horiguchi83467ef2013-09-11 14:22:11 -07001299 * This check is necessary because some callers of hugepage migration
1300 * like soft offline and memory hotremove don't walk through page
1301 * tables or check whether the hugepage is pmd-based or not before
1302 * kicking migration.
1303 */
Naoya Horiguchi100873d2014-06-04 16:10:56 -07001304 if (!hugepage_migration_supported(page_hstate(hpage))) {
Yang Shidd4ae782020-12-14 19:13:06 -08001305 list_move_tail(&hpage->lru, ret);
Naoya Horiguchi83467ef2013-09-11 14:22:11 -07001306 return -ENOSYS;
Joonsoo Kim32665f22014-01-21 15:51:15 -08001307 }
Naoya Horiguchi83467ef2013-09-11 14:22:11 -07001308
Muchun Song71a64f62021-02-04 18:32:17 -08001309 if (page_count(hpage) == 1) {
1310 /* page was freed from under us. So we are done. */
1311 putback_active_hugepage(hpage);
1312 return MIGRATEPAGE_SUCCESS;
1313 }
1314
Michal Hocko666feb22018-04-10 16:30:03 -07001315 new_hpage = get_new_page(hpage, private);
Naoya Horiguchi290408d2010-09-08 10:19:35 +09001316 if (!new_hpage)
1317 return -ENOMEM;
1318
Naoya Horiguchi290408d2010-09-08 10:19:35 +09001319 if (!trylock_page(hpage)) {
Jérôme Glisse2916ecc2017-09-08 16:12:06 -07001320 if (!force)
Naoya Horiguchi290408d2010-09-08 10:19:35 +09001321 goto out;
Jérôme Glisse2916ecc2017-09-08 16:12:06 -07001322 switch (mode) {
1323 case MIGRATE_SYNC:
1324 case MIGRATE_SYNC_NO_COPY:
1325 break;
1326 default:
1327 goto out;
1328 }
Naoya Horiguchi290408d2010-09-08 10:19:35 +09001329 lock_page(hpage);
1330 }
1331
Mike Kravetzcb6acd02019-02-28 16:22:02 -08001332 /*
1333 * Check for pages which are in the process of being freed. Without
1334	 * page_mapping() set, the hugetlbfs-specific move page routine will not
1335 * be called and we could leak usage counts for subpools.
1336 */
Muchun Song6acfb5b2021-06-30 18:51:29 -07001337 if (hugetlb_page_subpool(hpage) && !page_mapping(hpage)) {
Mike Kravetzcb6acd02019-02-28 16:22:02 -08001338 rc = -EBUSY;
1339 goto out_unlock;
1340 }
1341
Peter Zijlstra746b18d2011-05-24 17:12:10 -07001342 if (PageAnon(hpage))
1343 anon_vma = page_get_anon_vma(hpage);
Naoya Horiguchi290408d2010-09-08 10:19:35 +09001344
Hugh Dickins7db76712015-11-05 18:49:49 -08001345 if (unlikely(!trylock_page(new_hpage)))
1346 goto put_anon;
1347
Hugh Dickins2ebba6b2014-12-12 16:56:19 -08001348 if (page_mapped(hpage)) {
Mike Kravetz336bf302020-11-13 22:52:16 -08001349 bool mapping_locked = false;
Alistair Popplea98a2f02021-06-30 18:54:16 -07001350 enum ttu_flags ttu = 0;
Mike Kravetzc0d03812020-04-01 21:11:05 -07001351
Mike Kravetz336bf302020-11-13 22:52:16 -08001352 if (!PageAnon(hpage)) {
1353 /*
1354 * In shared mappings, try_to_unmap could potentially
1355 * call huge_pmd_unshare. Because of this, take
1356			 * the semaphore in write mode here and set TTU_RMAP_LOCKED
1357 * to let lower levels know we have taken the lock.
1358 */
1359 mapping = hugetlb_page_mapping_lock_write(hpage);
1360 if (unlikely(!mapping))
1361 goto unlock_put_anon;
1362
1363 mapping_locked = true;
1364 ttu |= TTU_RMAP_LOCKED;
1365 }
1366
Alistair Popplea98a2f02021-06-30 18:54:16 -07001367 try_to_migrate(hpage, ttu);
Hugh Dickins2ebba6b2014-12-12 16:56:19 -08001368 page_was_mapped = 1;
Mike Kravetz336bf302020-11-13 22:52:16 -08001369
1370 if (mapping_locked)
1371 i_mmap_unlock_write(mapping);
Hugh Dickins2ebba6b2014-12-12 16:56:19 -08001372 }
Naoya Horiguchi290408d2010-09-08 10:19:35 +09001373
1374 if (!page_mapped(hpage))
Hugh Dickins5c3f9a62015-11-05 18:49:53 -08001375 rc = move_to_new_page(new_hpage, hpage, mode);
Naoya Horiguchi290408d2010-09-08 10:19:35 +09001376
Mike Kravetz336bf302020-11-13 22:52:16 -08001377 if (page_was_mapped)
Hugh Dickins5c3f9a62015-11-05 18:49:53 -08001378 remove_migration_ptes(hpage,
Mike Kravetz336bf302020-11-13 22:52:16 -08001379 rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage, false);
Naoya Horiguchi290408d2010-09-08 10:19:35 +09001380
Mike Kravetzc0d03812020-04-01 21:11:05 -07001381unlock_put_anon:
Hugh Dickins7db76712015-11-05 18:49:49 -08001382 unlock_page(new_hpage);
1383
1384put_anon:
Hugh Dickinsfd4a4662011-01-13 15:47:31 -08001385 if (anon_vma)
Peter Zijlstra9e601092011-03-22 16:32:46 -07001386 put_anon_vma(anon_vma);
Aneesh Kumar K.V8e6ac7f2012-07-31 16:42:27 -07001387
Hugh Dickins2def7422015-11-05 18:49:46 -08001388 if (rc == MIGRATEPAGE_SUCCESS) {
Michal Hockoab5ac902018-01-31 16:20:48 -08001389 move_hugetlb_state(hpage, new_hpage, reason);
Hugh Dickins2def7422015-11-05 18:49:46 -08001390 put_new_page = NULL;
1391 }
Aneesh Kumar K.V8e6ac7f2012-07-31 16:42:27 -07001392
Mike Kravetzcb6acd02019-02-28 16:22:02 -08001393out_unlock:
Naoya Horiguchi290408d2010-09-08 10:19:35 +09001394 unlock_page(hpage);
Hillf Danton09761332011-12-08 14:34:20 -08001395out:
Yang Shidd4ae782020-12-14 19:13:06 -08001396 if (rc == MIGRATEPAGE_SUCCESS)
Naoya Horiguchib8ec1ce2013-09-11 14:22:01 -07001397 putback_active_hugepage(hpage);
Miaohe Lina04840c2021-05-04 18:37:07 -07001398 else if (rc != -EAGAIN)
Yang Shidd4ae782020-12-14 19:13:06 -08001399 list_move_tail(&hpage->lru, ret);
David Rientjes68711a72014-06-04 16:08:25 -07001400
1401 /*
1402 * If migration was not successful and there's a freeing callback, use
1403 * it. Otherwise, put_page() will drop the reference grabbed during
1404 * isolation.
1405 */
Hugh Dickins2def7422015-11-05 18:49:46 -08001406 if (put_new_page)
David Rientjes68711a72014-06-04 16:08:25 -07001407 put_new_page(new_hpage, private);
1408 else
Naoya Horiguchi3aaa76e2015-09-22 14:59:14 -07001409 putback_active_hugepage(new_hpage);
David Rientjes68711a72014-06-04 16:08:25 -07001410
Naoya Horiguchi290408d2010-09-08 10:19:35 +09001411 return rc;
1412}
1413
Yang Shid532e2e2020-12-14 19:13:16 -08001414static inline int try_split_thp(struct page *page, struct page **page2,
1415 struct list_head *from)
1416{
1417 int rc = 0;
1418
1419 lock_page(page);
1420 rc = split_huge_page_to_list(page, from);
1421 unlock_page(page);
1422 if (!rc)
1423 list_safe_reset_next(page, *page2, lru);
1424
1425 return rc;
1426}
1427
Naoya Horiguchi290408d2010-09-08 10:19:35 +09001428/*
Srivatsa S. Bhatc73e5c92013-04-29 15:08:16 -07001429 * migrate_pages - migrate the pages specified in a list, to the free pages
1430 * supplied as the target for the page migration
Christoph Lameterb20a3502006-03-22 00:09:12 -08001431 *
Srivatsa S. Bhatc73e5c92013-04-29 15:08:16 -07001432 * @from: The list of pages to be migrated.
1433 * @get_new_page: The function used to allocate free pages to be used
1434 * as the target of the page migration.
David Rientjes68711a72014-06-04 16:08:25 -07001435 * @put_new_page: The function used to free target pages if migration
1436 * fails, or NULL if no special handling is necessary.
Srivatsa S. Bhatc73e5c92013-04-29 15:08:16 -07001437 * @private: Private data to be passed on to get_new_page()
1438 * @mode: The migration mode that specifies the constraints for
1439 * page migration, if any.
1440 * @reason: The reason for page migration.
Yang Shi5ac95882021-09-02 14:59:13 -07001441 * @ret_succeeded: Set to the number of pages migrated successfully if
1442 * the caller passes a non-NULL pointer.
Christoph Lameterb20a3502006-03-22 00:09:12 -08001443 *
Srivatsa S. Bhatc73e5c92013-04-29 15:08:16 -07001444 * The function returns after 10 attempts or if no pages are movable any more
1445 * because the list has become empty or no retryable pages remain.
Yang Shidd4ae782020-12-14 19:13:06 -08001446 * It is the caller's responsibility to call putback_movable_pages() to return pages
1447 * to the LRU or free list only if ret != 0.
Christoph Lameterb20a3502006-03-22 00:09:12 -08001448 *
Srivatsa S. Bhatc73e5c92013-04-29 15:08:16 -07001449 * Returns the number of pages that were not migrated, or an error code.
Christoph Lameterb20a3502006-03-22 00:09:12 -08001450 */
Hugh Dickins9c620e22013-02-22 16:35:14 -08001451int migrate_pages(struct list_head *from, new_page_t get_new_page,
David Rientjes68711a72014-06-04 16:08:25 -07001452 free_page_t put_new_page, unsigned long private,
Yang Shi5ac95882021-09-02 14:59:13 -07001453 enum migrate_mode mode, int reason, unsigned int *ret_succeeded)
Christoph Lameterb20a3502006-03-22 00:09:12 -08001454{
Christoph Lametere24f0b82006-06-23 02:03:51 -07001455 int retry = 1;
Anshuman Khandual1a5bae22020-08-11 18:31:51 -07001456 int thp_retry = 1;
Christoph Lameterb20a3502006-03-22 00:09:12 -08001457 int nr_failed = 0;
Mel Gorman5647bc22012-10-19 10:46:20 +01001458 int nr_succeeded = 0;
Anshuman Khandual1a5bae22020-08-11 18:31:51 -07001459 int nr_thp_succeeded = 0;
1460 int nr_thp_failed = 0;
1461 int nr_thp_split = 0;
Christoph Lameterb20a3502006-03-22 00:09:12 -08001462 int pass = 0;
Anshuman Khandual1a5bae22020-08-11 18:31:51 -07001463 bool is_thp = false;
Christoph Lameterb20a3502006-03-22 00:09:12 -08001464 struct page *page;
1465 struct page *page2;
1466 int swapwrite = current->flags & PF_SWAPWRITE;
Anshuman Khandual1a5bae22020-08-11 18:31:51 -07001467 int rc, nr_subpages;
Yang Shidd4ae782020-12-14 19:13:06 -08001468 LIST_HEAD(ret_pages);
Yang Shib0b515b2021-06-30 18:51:48 -07001469 bool nosplit = (reason == MR_NUMA_MISPLACED);
Christoph Lameterb20a3502006-03-22 00:09:12 -08001470
Liam Mark7bc1aec2021-05-04 18:37:25 -07001471 trace_mm_migrate_pages_start(mode, reason);
1472
Christoph Lameterb20a3502006-03-22 00:09:12 -08001473 if (!swapwrite)
1474 current->flags |= PF_SWAPWRITE;
1475
Anshuman Khandual1a5bae22020-08-11 18:31:51 -07001476 for (pass = 0; pass < 10 && (retry || thp_retry); pass++) {
Christoph Lametere24f0b82006-06-23 02:03:51 -07001477 retry = 0;
Anshuman Khandual1a5bae22020-08-11 18:31:51 -07001478 thp_retry = 0;
Christoph Lameterb20a3502006-03-22 00:09:12 -08001479
Christoph Lametere24f0b82006-06-23 02:03:51 -07001480 list_for_each_entry_safe(page, page2, from, lru) {
Michal Hocko94723aa2018-04-10 16:30:07 -07001481retry:
Anshuman Khandual1a5bae22020-08-11 18:31:51 -07001482 /*
1483			 * THP statistics are based on the source huge page.
1484 * Capture required information that might get lost
1485 * during migration.
1486 */
Zi Yan6c5c7b92020-09-25 21:19:14 -07001487 is_thp = PageTransHuge(page) && !PageHuge(page);
Matthew Wilcox (Oracle)6c357842020-08-14 17:30:37 -07001488 nr_subpages = thp_nr_pages(page);
Christoph Lametere24f0b82006-06-23 02:03:51 -07001489 cond_resched();
Christoph Lameterb20a3502006-03-22 00:09:12 -08001490
Naoya Horiguchi31caf662013-09-11 14:21:59 -07001491 if (PageHuge(page))
1492 rc = unmap_and_move_huge_page(get_new_page,
David Rientjes68711a72014-06-04 16:08:25 -07001493 put_new_page, private, page,
Yang Shidd4ae782020-12-14 19:13:06 -08001494 pass > 2, mode, reason,
1495 &ret_pages);
Naoya Horiguchi31caf662013-09-11 14:21:59 -07001496 else
David Rientjes68711a72014-06-04 16:08:25 -07001497 rc = unmap_and_move(get_new_page, put_new_page,
Naoya Horiguchiadd05ce2015-06-24 16:56:50 -07001498 private, page, pass > 2, mode,
Yang Shidd4ae782020-12-14 19:13:06 -08001499 reason, &ret_pages);
1500 /*
1501 * The rules are:
1502			 * Success: a non-hugetlb page will be freed, a hugetlb
1503			 * page will be put back
1504 * -EAGAIN: stay on the from list
1505 * -ENOMEM: stay on the from list
1506 * Other errno: put on ret_pages list then splice to
1507 * from list
1508 */
Christoph Lametere24f0b82006-06-23 02:03:51 -07001509 switch(rc) {
Yang Shid532e2e2020-12-14 19:13:16 -08001510 /*
1511 * THP migration might be unsupported or the
1512 * allocation could've failed so we should
1513 * retry on the same page with the THP split
1514 * to base pages.
1515 *
1516 * Head page is retried immediately and tail
1517 * pages are added to the tail of the list so
1518 * we encounter them after the rest of the list
1519 * is processed.
1520 */
1521 case -ENOSYS:
1522 /* THP migration is unsupported */
1523 if (is_thp) {
1524 if (!try_split_thp(page, &page2, from)) {
1525 nr_thp_split++;
1526 goto retry;
1527 }
1528
1529 nr_thp_failed++;
1530 nr_failed += nr_subpages;
1531 break;
1532 }
1533
1534 /* Hugetlb migration is unsupported */
1535 nr_failed++;
1536 break;
Christoph Lameter95a402c2006-06-23 02:03:53 -07001537 case -ENOMEM:
Michal Hocko94723aa2018-04-10 16:30:07 -07001538 /*
Yang Shid532e2e2020-12-14 19:13:16 -08001539 * When memory is low, don't bother to try to migrate
1540 * other pages, just exit.
Yang Shib0b515b2021-06-30 18:51:48 -07001541 * THP NUMA faulting doesn't split THP to retry.
Michal Hocko94723aa2018-04-10 16:30:07 -07001542 */
Yang Shib0b515b2021-06-30 18:51:48 -07001543 if (is_thp && !nosplit) {
Yang Shid532e2e2020-12-14 19:13:16 -08001544 if (!try_split_thp(page, &page2, from)) {
Anshuman Khandual1a5bae22020-08-11 18:31:51 -07001545 nr_thp_split++;
Michal Hocko94723aa2018-04-10 16:30:07 -07001546 goto retry;
1547 }
Zi Yan6c5c7b92020-09-25 21:19:14 -07001548
Anshuman Khandual1a5bae22020-08-11 18:31:51 -07001549 nr_thp_failed++;
1550 nr_failed += nr_subpages;
1551 goto out;
1552 }
David Rientjesdfef2ef2016-05-20 16:59:05 -07001553 nr_failed++;
Christoph Lameter95a402c2006-06-23 02:03:53 -07001554 goto out;
Christoph Lametere24f0b82006-06-23 02:03:51 -07001555 case -EAGAIN:
Anshuman Khandual1a5bae22020-08-11 18:31:51 -07001556 if (is_thp) {
1557 thp_retry++;
1558 break;
1559 }
Christoph Lameter2d1db3b2006-06-23 02:03:33 -07001560 retry++;
Christoph Lametere24f0b82006-06-23 02:03:51 -07001561 break;
Rafael Aquini78bd5202012-12-11 16:02:31 -08001562 case MIGRATEPAGE_SUCCESS:
Anshuman Khandual1a5bae22020-08-11 18:31:51 -07001563 if (is_thp) {
1564 nr_thp_succeeded++;
1565 nr_succeeded += nr_subpages;
1566 break;
1567 }
Mel Gorman5647bc22012-10-19 10:46:20 +01001568 nr_succeeded++;
Christoph Lametere24f0b82006-06-23 02:03:51 -07001569 break;
1570 default:
Naoya Horiguchi354a3362014-01-21 15:51:14 -08001571 /*
Yang Shid532e2e2020-12-14 19:13:16 -08001572 * Permanent failure (-EBUSY, etc.):
Naoya Horiguchi354a3362014-01-21 15:51:14 -08001573			 * unlike the -EAGAIN case, the failed page is
1574 * removed from migration page list and not
1575 * retried in the next outer loop.
1576 */
Anshuman Khandual1a5bae22020-08-11 18:31:51 -07001577 if (is_thp) {
1578 nr_thp_failed++;
1579 nr_failed += nr_subpages;
1580 break;
1581 }
Christoph Lameter2d1db3b2006-06-23 02:03:33 -07001582 nr_failed++;
Christoph Lametere24f0b82006-06-23 02:03:51 -07001583 break;
Christoph Lameter2d1db3b2006-06-23 02:03:33 -07001584 }
Christoph Lameterb20a3502006-03-22 00:09:12 -08001585 }
1586 }
Anshuman Khandual1a5bae22020-08-11 18:31:51 -07001587 nr_failed += retry + thp_retry;
1588 nr_thp_failed += thp_retry;
Vlastimil Babkaf2f81fb2015-11-05 18:47:03 -08001589 rc = nr_failed;
Christoph Lameter95a402c2006-06-23 02:03:53 -07001590out:
Yang Shidd4ae782020-12-14 19:13:06 -08001591 /*
1592	 * Put the permanently failed pages back on the migration list;
1593	 * they will be moved to the right list by the caller.
1594 */
1595 list_splice(&ret_pages, from);
1596
Anshuman Khandual1a5bae22020-08-11 18:31:51 -07001597 count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
1598 count_vm_events(PGMIGRATE_FAIL, nr_failed);
1599 count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded);
1600 count_vm_events(THP_MIGRATION_FAIL, nr_thp_failed);
1601 count_vm_events(THP_MIGRATION_SPLIT, nr_thp_split);
1602 trace_mm_migrate_pages(nr_succeeded, nr_failed, nr_thp_succeeded,
1603 nr_thp_failed, nr_thp_split, mode, reason);
Mel Gorman7b2a2d42012-10-19 14:07:31 +01001604
Christoph Lameterb20a3502006-03-22 00:09:12 -08001605 if (!swapwrite)
1606 current->flags &= ~PF_SWAPWRITE;
1607
Yang Shi5ac95882021-09-02 14:59:13 -07001608 if (ret_succeeded)
1609 *ret_succeeded = nr_succeeded;
1610
Rafael Aquini78bd5202012-12-11 16:02:31 -08001611 return rc;
Christoph Lameterb20a3502006-03-22 00:09:12 -08001612}
Charan Teja Reddyf47b8522021-02-16 13:59:45 +05301613EXPORT_SYMBOL_GPL(migrate_pages);
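
/*
 * Usage sketch of the calling convention documented above (illustration
 * only): isolate pages onto a private list, migrate them towards @nid
 * with the generic allocation callback, and put back whatever could not
 * be migrated. @pagelist and @nid are assumed to be set up by the caller.
 *
 *	struct migration_target_control mtc = {
 *		.nid = nid,
 *		.gfp_mask = GFP_HIGHUSER_MOVABLE,
 *	};
 *	int nr_failed;
 *
 *	nr_failed = migrate_pages(&pagelist, alloc_migration_target, NULL,
 *				  (unsigned long)&mtc, MIGRATE_SYNC,
 *				  MR_SYSCALL, NULL);
 *	if (nr_failed)
 *		putback_movable_pages(&pagelist);
 */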
Christoph Lameterb20a3502006-03-22 00:09:12 -08001614
Joonsoo Kim19fc7be2020-08-11 18:37:25 -07001615struct page *alloc_migration_target(struct page *page, unsigned long private)
Joonsoo Kimb4b38222020-08-11 18:37:14 -07001616{
Joonsoo Kim19fc7be2020-08-11 18:37:25 -07001617 struct migration_target_control *mtc;
1618 gfp_t gfp_mask;
Joonsoo Kimb4b38222020-08-11 18:37:14 -07001619 unsigned int order = 0;
1620 struct page *new_page = NULL;
Joonsoo Kim19fc7be2020-08-11 18:37:25 -07001621 int nid;
1622 int zidx;
1623
1624 mtc = (struct migration_target_control *)private;
1625 gfp_mask = mtc->gfp_mask;
1626 nid = mtc->nid;
1627 if (nid == NUMA_NO_NODE)
1628 nid = page_to_nid(page);
Joonsoo Kimb4b38222020-08-11 18:37:14 -07001629
Joonsoo Kimd92bbc22020-08-11 18:37:17 -07001630 if (PageHuge(page)) {
1631 struct hstate *h = page_hstate(compound_head(page));
1632
Joonsoo Kim19fc7be2020-08-11 18:37:25 -07001633 gfp_mask = htlb_modify_alloc_mask(h, gfp_mask);
1634 return alloc_huge_page_nodemask(h, nid, mtc->nmask, gfp_mask);
Joonsoo Kimd92bbc22020-08-11 18:37:17 -07001635 }
Joonsoo Kimb4b38222020-08-11 18:37:14 -07001636
1637 if (PageTransHuge(page)) {
Joonsoo Kim9933a0c2020-08-11 18:37:20 -07001638 /*
1639 * clear __GFP_RECLAIM to make the migration callback
1640 * consistent with regular THP allocations.
1641 */
1642 gfp_mask &= ~__GFP_RECLAIM;
Joonsoo Kimb4b38222020-08-11 18:37:14 -07001643 gfp_mask |= GFP_TRANSHUGE;
1644 order = HPAGE_PMD_ORDER;
1645 }
Joonsoo Kim19fc7be2020-08-11 18:37:25 -07001646 zidx = zone_idx(page_zone(page));
1647 if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE)
Joonsoo Kimb4b38222020-08-11 18:37:14 -07001648 gfp_mask |= __GFP_HIGHMEM;
1649
Matthew Wilcox (Oracle)84172f42021-04-29 23:01:15 -07001650 new_page = __alloc_pages(gfp_mask, order, nid, mtc->nmask);
Joonsoo Kimb4b38222020-08-11 18:37:14 -07001651
1652 if (new_page && PageTransHuge(new_page))
1653 prep_transhuge_page(new_page);
1654
1655 return new_page;
1656}
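
/*
 * Sketch of a target-control setup for the allocator above (illustration
 * only; allowed_mask is assumed to be prepared by the caller). A nid of
 * NUMA_NO_NODE means "allocate on the source page's node", as handled at
 * the top of alloc_migration_target().
 *
 *	static nodemask_t allowed_mask;
 *
 *	struct migration_target_control mtc = {
 *		.nid = NUMA_NO_NODE,
 *		.nmask = &allowed_mask,
 *		.gfp_mask = GFP_HIGHUSER_MOVABLE,
 *	};
 */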
1657
Christoph Lameter742755a2006-06-23 02:03:55 -07001658#ifdef CONFIG_NUMA
Christoph Lameter742755a2006-06-23 02:03:55 -07001659
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001660static int store_status(int __user *status, int start, int value, int nr)
Christoph Lameter742755a2006-06-23 02:03:55 -07001661{
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001662 while (nr-- > 0) {
1663 if (put_user(value, status + start))
1664 return -EFAULT;
1665 start++;
1666 }
Christoph Lameter742755a2006-06-23 02:03:55 -07001667
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001668 return 0;
1669}
Christoph Lameter742755a2006-06-23 02:03:55 -07001670
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001671static int do_move_pages_to_node(struct mm_struct *mm,
1672 struct list_head *pagelist, int node)
1673{
1674 int err;
Joonsoo Kima0976312020-08-11 18:37:28 -07001675 struct migration_target_control mtc = {
1676 .nid = node,
1677 .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
1678 };
Christoph Lameter742755a2006-06-23 02:03:55 -07001679
Joonsoo Kima0976312020-08-11 18:37:28 -07001680 err = migrate_pages(pagelist, alloc_migration_target, NULL,
Yang Shi5ac95882021-09-02 14:59:13 -07001681 (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001682 if (err)
1683 putback_movable_pages(pagelist);
1684 return err;
Christoph Lameter742755a2006-06-23 02:03:55 -07001685}
1686
1687/*
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001688 * Resolves the given address to a struct page, isolates it from the LRU and
1689 * puts it on the given pagelist.
Yang Shie0153fc2020-01-04 12:59:46 -08001690 * Returns:
1691 * errno - if the page cannot be found/isolated
1692 * 0 - when it doesn't have to be migrated because it is already on the
1693 * target node
1694 * 1 - when it has been queued
Christoph Lameter742755a2006-06-23 02:03:55 -07001695 */
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001696static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
1697 int node, struct list_head *pagelist, bool migrate_all)
Christoph Lameter742755a2006-06-23 02:03:55 -07001698{
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001699 struct vm_area_struct *vma;
1700 struct page *page;
1701 unsigned int follflags;
Christoph Lameter742755a2006-06-23 02:03:55 -07001702 int err;
Christoph Lameter742755a2006-06-23 02:03:55 -07001703
Michel Lespinassed8ed45c2020-06-08 21:33:25 -07001704 mmap_read_lock(mm);
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001705 err = -EFAULT;
1706 vma = find_vma(mm, addr);
1707 if (!vma || addr < vma->vm_start || !vma_migratable(vma))
1708 goto out;
Christoph Lameter742755a2006-06-23 02:03:55 -07001709
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001710 /* FOLL_DUMP to ignore special (like zero) pages */
1711 follflags = FOLL_GET | FOLL_DUMP;
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001712 page = follow_page(vma, addr, follflags);
Christoph Lameter742755a2006-06-23 02:03:55 -07001713
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001714 err = PTR_ERR(page);
1715 if (IS_ERR(page))
1716 goto out;
Christoph Lameter742755a2006-06-23 02:03:55 -07001717
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001718 err = -ENOENT;
1719 if (!page)
1720 goto out;
Christoph Lameter742755a2006-06-23 02:03:55 -07001721
Brice Gogline78bbfa2008-10-18 20:27:15 -07001722 err = 0;
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001723 if (page_to_nid(page) == node)
1724 goto out_putpage;
Christoph Lameter742755a2006-06-23 02:03:55 -07001725
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001726 err = -EACCES;
1727 if (page_mapcount(page) > 1 && !migrate_all)
1728 goto out_putpage;
1729
1730 if (PageHuge(page)) {
1731 if (PageHead(page)) {
Miaohe Lin072e7412022-05-30 19:30:15 +08001732 err = isolate_hugetlb(page, pagelist);
1733 if (!err)
1734 err = 1;
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001735 }
1736 } else {
1737 struct page *head;
1738
1739 head = compound_head(page);
1740 err = isolate_lru_page(head);
1741 if (err)
1742 goto out_putpage;
1743
Yang Shie0153fc2020-01-04 12:59:46 -08001744 err = 1;
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001745 list_add_tail(&head->lru, pagelist);
1746 mod_node_page_state(page_pgdat(head),
Huang Ying9de4f222020-04-06 20:04:41 -07001747 NR_ISOLATED_ANON + page_is_file_lru(head),
Matthew Wilcox (Oracle)6c357842020-08-14 17:30:37 -07001748 thp_nr_pages(head));
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001749 }
1750out_putpage:
1751 /*
1752 * Either remove the duplicate refcount from
1753 * isolate_lru_page() or drop the page ref if it was
1754 * not isolated.
1755 */
1756 put_page(page);
1757out:
Michel Lespinassed8ed45c2020-06-08 21:33:25 -07001758 mmap_read_unlock(mm);
Christoph Lameter742755a2006-06-23 02:03:55 -07001759 return err;
1760}
1761
Wei Yang7ca87832020-04-06 20:04:12 -07001762static int move_pages_and_store_status(struct mm_struct *mm, int node,
1763 struct list_head *pagelist, int __user *status,
1764 int start, int i, unsigned long nr_pages)
1765{
1766 int err;
1767
Wei Yang5d7ae892020-04-06 20:04:15 -07001768 if (list_empty(pagelist))
1769 return 0;
1770
Wei Yang7ca87832020-04-06 20:04:12 -07001771 err = do_move_pages_to_node(mm, pagelist, node);
1772 if (err) {
1773 /*
1774		 * A positive err means the number of pages that
1775		 * failed to migrate. Since we are going to
1776		 * abort and return the number of non-migrated
Long Liab9dd4f2020-12-14 19:12:52 -08001777		 * pages, we need to include the rest of the
Wei Yang7ca87832020-04-06 20:04:12 -07001778		 * nr_pages that have not been attempted as
1779		 * well.
1780 */
1781 if (err > 0)
1782 err += nr_pages - i - 1;
1783 return err;
1784 }
1785 return store_status(status, start, node, i - start);
1786}
1787
Christoph Lameter742755a2006-06-23 02:03:55 -07001788/*
Brice Goglin5e9a0f02008-10-18 20:27:17 -07001789 * Migrate an array of page addresses onto an array of nodes and fill
1790 * the corresponding array of status.
1791 */
Christoph Lameter3268c632012-03-21 16:34:06 -07001792static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
Brice Goglin5e9a0f02008-10-18 20:27:17 -07001793 unsigned long nr_pages,
1794 const void __user * __user *pages,
1795 const int __user *nodes,
1796 int __user *status, int flags)
1797{
Gregory Price556b68d2023-10-03 10:48:56 -04001798 compat_uptr_t __user *compat_pages = (void __user *)pages;
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001799 int current_node = NUMA_NO_NODE;
1800 LIST_HEAD(pagelist);
1801 int start, i;
1802 int err = 0, err1;
Brice Goglin35282a22009-06-16 15:32:43 -07001803
Minchan Kim361a2a22021-05-04 18:36:57 -07001804 lru_cache_disable();
Brice Goglin35282a22009-06-16 15:32:43 -07001805
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001806 for (i = start = 0; i < nr_pages; i++) {
1807 const void __user *p;
1808 unsigned long addr;
1809 int node;
Brice Goglin5e9a0f02008-10-18 20:27:17 -07001810
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001811 err = -EFAULT;
Gregory Price556b68d2023-10-03 10:48:56 -04001812 if (in_compat_syscall()) {
1813 compat_uptr_t cp;
1814
1815 if (get_user(cp, compat_pages + i))
1816 goto out_flush;
1817
1818 p = compat_ptr(cp);
1819 } else {
1820 if (get_user(p, pages + i))
1821 goto out_flush;
1822 }
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001823 if (get_user(node, nodes + i))
1824 goto out_flush;
Andrey Konovalov057d33892019-09-25 16:48:30 -07001825 addr = (unsigned long)untagged_addr(p);
Brice Goglin5e9a0f02008-10-18 20:27:17 -07001826
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001827 err = -ENODEV;
1828 if (node < 0 || node >= MAX_NUMNODES)
1829 goto out_flush;
1830 if (!node_state(node, N_MEMORY))
1831 goto out_flush;
Brice Goglin3140a222009-01-06 14:38:57 -08001832
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001833 err = -EACCES;
1834 if (!node_isset(node, task_nodes))
1835 goto out_flush;
Brice Goglin5e9a0f02008-10-18 20:27:17 -07001836
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001837 if (current_node == NUMA_NO_NODE) {
1838 current_node = node;
1839 start = i;
1840 } else if (node != current_node) {
Wei Yang7ca87832020-04-06 20:04:12 -07001841 err = move_pages_and_store_status(mm, current_node,
1842 &pagelist, status, start, i, nr_pages);
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001843 if (err)
1844 goto out;
1845 start = i;
1846 current_node = node;
Brice Goglin3140a222009-01-06 14:38:57 -08001847 }
Brice Goglin5e9a0f02008-10-18 20:27:17 -07001848
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001849 /*
1850 * Errors in the page lookup or isolation are not fatal and we simply
1851 * report them via status
1852 */
1853 err = add_page_for_migration(mm, addr, current_node,
1854 &pagelist, flags & MPOL_MF_MOVE_ALL);
Yang Shie0153fc2020-01-04 12:59:46 -08001855
Wei Yangd08221a2020-04-06 20:04:18 -07001856 if (err > 0) {
Yang Shie0153fc2020-01-04 12:59:46 -08001857 /* The page is successfully queued for migration */
1858 continue;
1859 }
Brice Goglin3140a222009-01-06 14:38:57 -08001860
Wei Yangd08221a2020-04-06 20:04:18 -07001861 /*
1862 * If the page is already on the target node (!err), store the
1863 * node, otherwise, store the err.
1864 */
1865 err = store_status(status, i, err ? : current_node, 1);
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001866 if (err)
1867 goto out_flush;
Brice Goglin3140a222009-01-06 14:38:57 -08001868
Wei Yang7ca87832020-04-06 20:04:12 -07001869 err = move_pages_and_store_status(mm, current_node, &pagelist,
1870 status, start, i, nr_pages);
Wei Yang4afdace2020-04-06 20:04:09 -07001871 if (err)
1872 goto out;
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001873 current_node = NUMA_NO_NODE;
Brice Goglin3140a222009-01-06 14:38:57 -08001874 }
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001875out_flush:
1876 /* Make sure we do not overwrite the existing error */
Wei Yang7ca87832020-04-06 20:04:12 -07001877 err1 = move_pages_and_store_status(mm, current_node, &pagelist,
1878 status, start, i, nr_pages);
Wei Yangdfe9aa22020-01-30 22:11:14 -08001879 if (err >= 0)
Michal Hockoa49bd4d2018-04-10 16:29:59 -07001880 err = err1;
Brice Goglin5e9a0f02008-10-18 20:27:17 -07001881out:
Minchan Kim361a2a22021-05-04 18:36:57 -07001882 lru_cache_enable();
Brice Goglin5e9a0f02008-10-18 20:27:17 -07001883 return err;
1884}
1885
1886/*
Brice Goglin2f007e72008-10-18 20:27:16 -07001887 * Determine the nodes of an array of pages and store them in an array of status.
Christoph Lameter742755a2006-06-23 02:03:55 -07001888 */
Brice Goglin80bba122008-12-09 13:14:23 -08001889static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
1890 const void __user **pages, int *status)
Christoph Lameter742755a2006-06-23 02:03:55 -07001891{
Brice Goglin2f007e72008-10-18 20:27:16 -07001892 unsigned long i;
Brice Goglin2f007e72008-10-18 20:27:16 -07001893
Michel Lespinassed8ed45c2020-06-08 21:33:25 -07001894 mmap_read_lock(mm);
Christoph Lameter742755a2006-06-23 02:03:55 -07001895
Brice Goglin2f007e72008-10-18 20:27:16 -07001896 for (i = 0; i < nr_pages; i++) {
Brice Goglin80bba122008-12-09 13:14:23 -08001897 unsigned long addr = (unsigned long)(*pages);
Christoph Lameter742755a2006-06-23 02:03:55 -07001898 struct vm_area_struct *vma;
1899 struct page *page;
KOSAKI Motohiroc095adb2008-12-16 16:06:43 +09001900 int err = -EFAULT;
Brice Goglin2f007e72008-10-18 20:27:16 -07001901
Liam Howlett059b8b42021-06-28 19:39:44 -07001902 vma = vma_lookup(mm, addr);
1903 if (!vma)
Christoph Lameter742755a2006-06-23 02:03:55 -07001904 goto set_status;
1905
Kirill A. Shutemovd8998442015-09-04 15:47:53 -07001906 /* FOLL_DUMP to ignore special (like zero) pages */
1907 page = follow_page(vma, addr, FOLL_DUMP);
Linus Torvalds89f5b7d2008-06-20 11:18:25 -07001908
1909 err = PTR_ERR(page);
1910 if (IS_ERR(page))
1911 goto set_status;
1912
Kirill A. Shutemovd8998442015-09-04 15:47:53 -07001913 err = page ? page_to_nid(page) : -ENOENT;
Christoph Lameter742755a2006-06-23 02:03:55 -07001914set_status:
Brice Goglin80bba122008-12-09 13:14:23 -08001915 *status = err;
1916
1917 pages++;
1918 status++;
1919 }
1920
Michel Lespinassed8ed45c2020-06-08 21:33:25 -07001921 mmap_read_unlock(mm);
Brice Goglin80bba122008-12-09 13:14:23 -08001922}
1923
Arnd Bergmann5b1b5612021-09-08 15:18:17 -07001924static int get_compat_pages_array(const void __user *chunk_pages[],
1925 const void __user * __user *pages,
1926 unsigned long chunk_nr)
1927{
1928 compat_uptr_t __user *pages32 = (compat_uptr_t __user *)pages;
1929 compat_uptr_t p;
1930 int i;
1931
1932 for (i = 0; i < chunk_nr; i++) {
1933 if (get_user(p, pages32 + i))
1934 return -EFAULT;
1935 chunk_pages[i] = compat_ptr(p);
1936 }
1937
1938 return 0;
1939}
1940
Brice Goglin80bba122008-12-09 13:14:23 -08001941/*
1942 * Determine the nodes of a user array of pages and store them in
1943 * a user array of status.
1944 */
1945static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
1946 const void __user * __user *pages,
1947 int __user *status)
1948{
1949#define DO_PAGES_STAT_CHUNK_NR 16
1950 const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
1951 int chunk_status[DO_PAGES_STAT_CHUNK_NR];
Brice Goglin80bba122008-12-09 13:14:23 -08001952
H. Peter Anvin87b8d1a2010-02-18 16:13:40 -08001953 while (nr_pages) {
1954 unsigned long chunk_nr;
Brice Goglin80bba122008-12-09 13:14:23 -08001955
H. Peter Anvin87b8d1a2010-02-18 16:13:40 -08001956 chunk_nr = nr_pages;
1957 if (chunk_nr > DO_PAGES_STAT_CHUNK_NR)
1958 chunk_nr = DO_PAGES_STAT_CHUNK_NR;
1959
Arnd Bergmann5b1b5612021-09-08 15:18:17 -07001960 if (in_compat_syscall()) {
1961 if (get_compat_pages_array(chunk_pages, pages,
1962 chunk_nr))
1963 break;
1964 } else {
1965 if (copy_from_user(chunk_pages, pages,
1966 chunk_nr * sizeof(*chunk_pages)))
1967 break;
1968 }
Brice Goglin80bba122008-12-09 13:14:23 -08001969
1970 do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
1971
H. Peter Anvin87b8d1a2010-02-18 16:13:40 -08001972 if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
1973 break;
Christoph Lameter742755a2006-06-23 02:03:55 -07001974
H. Peter Anvin87b8d1a2010-02-18 16:13:40 -08001975 pages += chunk_nr;
1976 status += chunk_nr;
1977 nr_pages -= chunk_nr;
1978 }
1979 return nr_pages ? -EFAULT : 0;
Christoph Lameter742755a2006-06-23 02:03:55 -07001980}
1981
Miaohe Lin4dc200c2020-10-17 16:14:03 -07001982static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes)
1983{
1984 struct task_struct *task;
1985 struct mm_struct *mm;
1986
1987 /*
1988	 * There is no need to check if the current process has the right to modify
1989	 * the specified process when they are the same.
1990 */
1991 if (!pid) {
1992 mmget(current->mm);
1993 *mem_nodes = cpuset_mems_allowed(current);
1994 return current->mm;
1995 }
1996
1997 /* Find the mm_struct */
1998 rcu_read_lock();
1999 task = find_task_by_vpid(pid);
2000 if (!task) {
2001 rcu_read_unlock();
2002 return ERR_PTR(-ESRCH);
2003 }
2004 get_task_struct(task);
2005
2006 /*
2007 * Check if this process has the right to modify the specified
2008 * process. Use the regular "ptrace_may_access()" checks.
2009 */
2010 if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
2011 rcu_read_unlock();
2012 mm = ERR_PTR(-EPERM);
2013 goto out;
2014 }
2015 rcu_read_unlock();
2016
2017 mm = ERR_PTR(security_task_movememory(task));
2018 if (IS_ERR(mm))
2019 goto out;
2020 *mem_nodes = cpuset_mems_allowed(task);
2021 mm = get_task_mm(task);
2022out:
2023 put_task_struct(task);
2024 if (!mm)
2025 mm = ERR_PTR(-EINVAL);
2026 return mm;
2027}
2028
Christoph Lameter742755a2006-06-23 02:03:55 -07002029/*
2030 * Move a list of pages in the address space of the currently executing
2031 * process.
2032 */
Dominik Brodowski7addf442018-03-17 16:08:03 +01002033static int kernel_move_pages(pid_t pid, unsigned long nr_pages,
2034 const void __user * __user *pages,
2035 const int __user *nodes,
2036 int __user *status, int flags)
Christoph Lameter742755a2006-06-23 02:03:55 -07002037{
Christoph Lameter742755a2006-06-23 02:03:55 -07002038 struct mm_struct *mm;
Brice Goglin5e9a0f02008-10-18 20:27:17 -07002039 int err;
Christoph Lameter3268c632012-03-21 16:34:06 -07002040 nodemask_t task_nodes;
Christoph Lameter742755a2006-06-23 02:03:55 -07002041
2042 /* Check flags */
2043 if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
2044 return -EINVAL;
2045
2046 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
2047 return -EPERM;
2048
Miaohe Lin4dc200c2020-10-17 16:14:03 -07002049 mm = find_mm_struct(pid, &task_nodes);
2050 if (IS_ERR(mm))
2051 return PTR_ERR(mm);
Sasha Levin6e8b09e2012-04-25 16:01:53 -07002052
2053 if (nodes)
2054 err = do_pages_move(mm, task_nodes, nr_pages, pages,
2055 nodes, status, flags);
2056 else
2057 err = do_pages_stat(mm, nr_pages, pages, status);
Christoph Lameter3268c632012-03-21 16:34:06 -07002058
2059 mmput(mm);
2060 return err;
Christoph Lameter742755a2006-06-23 02:03:55 -07002061}
Christoph Lameter742755a2006-06-23 02:03:55 -07002062
Dominik Brodowski7addf442018-03-17 16:08:03 +01002063SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
2064 const void __user * __user *, pages,
2065 const int __user *, nodes,
2066 int __user *, status, int, flags)
2067{
2068 return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags);
2069}
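
/*
 * Userspace-side sketch of the syscall above (illustration only, error
 * handling omitted): via libnuma's <numaif.h> wrapper, move one page of
 * the calling process to node 1 and read back its per-page status.
 * Passing a NULL nodes array instead only queries current placement.
 *
 *	void *pages[1] = { addr };	/* addr: some mapped address */
 *	int nodes[1] = { 1 };
 *	int status[1];
 *
 *	if (move_pages(0, 1, pages, nodes, status, MPOL_MF_MOVE) == 0)
 *		printf("status[0] = %d\n", status[0]);
 */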
2070
Peter Zijlstra7039e1d2012-10-25 14:16:34 +02002071#ifdef CONFIG_NUMA_BALANCING
2072/*
2073 * Returns true if this is a safe migration target node for misplaced NUMA
2074 * pages. Currently it only checks the watermarks, which is crude.
2075 */
2076static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
Mel Gorman3abef4e2013-02-22 16:34:27 -08002077 unsigned long nr_migrate_pages)
Peter Zijlstra7039e1d2012-10-25 14:16:34 +02002078{
2079 int z;
Mel Gorman599d0c92016-07-28 15:45:31 -07002080
Peter Zijlstra7039e1d2012-10-25 14:16:34 +02002081 for (z = pgdat->nr_zones - 1; z >= 0; z--) {
2082 struct zone *zone = pgdat->node_zones + z;
2083
2084 if (!populated_zone(zone))
2085 continue;
2086
Peter Zijlstra7039e1d2012-10-25 14:16:34 +02002087 /* Avoid waking kswapd by allocating pages_to_migrate pages. */
2088 if (!zone_watermark_ok(zone, 0,
2089 high_wmark_pages(zone) +
2090 nr_migrate_pages,
Huang Yingbfe9d002019-11-30 17:57:28 -08002091 ZONE_MOVABLE, 0))
Peter Zijlstra7039e1d2012-10-25 14:16:34 +02002092 continue;
2093 return true;
2094 }
2095 return false;
2096}
2097
2098static struct page *alloc_misplaced_dst_page(struct page *page,
Michal Hocko666feb22018-04-10 16:30:03 -07002099 unsigned long data)
Peter Zijlstra7039e1d2012-10-25 14:16:34 +02002100{
2101 int nid = (int) data;
2102 struct page *newpage;
2103
Vlastimil Babka96db8002015-09-08 15:03:50 -07002104 newpage = __alloc_pages_node(nid,
Johannes Weinere97ca8e52014-03-10 15:49:43 -07002105 (GFP_HIGHUSER_MOVABLE |
2106 __GFP_THISNODE | __GFP_NOMEMALLOC |
2107 __GFP_NORETRY | __GFP_NOWARN) &
Mel Gorman8479eba2016-02-26 15:19:31 -08002108 ~__GFP_RECLAIM, 0);
Hillf Dantonbac03822012-11-27 14:46:24 +00002109
Peter Zijlstra7039e1d2012-10-25 14:16:34 +02002110 return newpage;
2111}
2112
Yang Shic5b5a3d2021-06-30 18:51:42 -07002113static struct page *alloc_misplaced_dst_page_thp(struct page *page,
2114 unsigned long data)
2115{
2116 int nid = (int) data;
2117 struct page *newpage;
2118
2119 newpage = alloc_pages_node(nid, (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
2120 HPAGE_PMD_ORDER);
2121 if (!newpage)
2122 goto out;
2123
2124 prep_transhuge_page(newpage);
2125
2126out:
2127 return newpage;
2128}
2129
Mel Gorman1c30e012014-01-21 15:50:58 -08002130static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
Mel Gormanb32967f2012-11-19 12:35:47 +00002131{
Hugh Dickins340ef392013-02-22 16:34:33 -08002132 int page_lru;
Baolin Wang2b9b6242021-09-08 15:18:01 -07002133 int nr_pages = thp_nr_pages(page);
Mel Gormanb32967f2012-11-19 12:35:47 +00002134
Sasha Levin309381fea2014-01-23 15:52:54 -08002135 VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
Mel Gorman3abef4e2013-02-22 16:34:27 -08002136
Yang Shi662aeea2021-06-30 18:51:51 -07002137 /* Do not migrate THP mapped by multiple processes */
2138 if (PageTransHuge(page) && total_mapcount(page) > 1)
2139 return 0;
2140
Mel Gormanb32967f2012-11-19 12:35:47 +00002141 /* Avoid migrating to a node that is nearly full */
Baolin Wang2b9b6242021-09-08 15:18:01 -07002142 if (!migrate_balanced_pgdat(pgdat, nr_pages))
Hugh Dickins340ef392013-02-22 16:34:33 -08002143 return 0;
Mel Gormanb32967f2012-11-19 12:35:47 +00002144
Hugh Dickins340ef392013-02-22 16:34:33 -08002145 if (isolate_lru_page(page))
2146 return 0;
Mel Gormanb32967f2012-11-19 12:35:47 +00002147
Huang Ying9de4f222020-04-06 20:04:41 -07002148 page_lru = page_is_file_lru(page);
Mel Gorman599d0c92016-07-28 15:45:31 -07002149 mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru,
Baolin Wang2b9b6242021-09-08 15:18:01 -07002150 nr_pages);
Hugh Dickins340ef392013-02-22 16:34:33 -08002151
2152 /*
2153 * Isolating the page has taken another reference, so the
2154 * caller's reference can be safely dropped without the page
2155 * disappearing underneath us during migration.
Mel Gormanb32967f2012-11-19 12:35:47 +00002156 */
2157 put_page(page);
Hugh Dickins340ef392013-02-22 16:34:33 -08002158 return 1;
Mel Gormanb32967f2012-11-19 12:35:47 +00002159}
2160
Mel Gormana8f60772012-11-14 21:41:46 +00002161/*
Peter Zijlstra7039e1d2012-10-25 14:16:34 +02002162 * Attempt to migrate a misplaced page to the specified destination
2163 * node. The caller is expected to hold an elevated reference count on
2164 * the page, which will be dropped by this function before returning.
2165 */
Mel Gorman1bc115d2013-10-07 11:29:05 +01002166int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
2167 int node)
Peter Zijlstra7039e1d2012-10-25 14:16:34 +02002168{
Mel Gormana8f60772012-11-14 21:41:46 +00002169 pg_data_t *pgdat = NODE_DATA(node);
Hugh Dickins340ef392013-02-22 16:34:33 -08002170 int isolated;
Mel Gormanb32967f2012-11-19 12:35:47 +00002171 int nr_remaining;
Peter Zijlstra7039e1d2012-10-25 14:16:34 +02002172 LIST_HEAD(migratepages);
Yang Shic5b5a3d2021-06-30 18:51:42 -07002173 new_page_t *new;
2174 bool compound;
Aneesh Kumar K.Vb5916c02021-07-29 14:53:47 -07002175 int nr_pages = thp_nr_pages(page);
Yang Shic5b5a3d2021-06-30 18:51:42 -07002176
2177 /*
2178	 * A PTE-mapped THP or HugeTLB page can't reach here, so the page is
2179	 * either a base page or a THP, and it must be a head page if it is
2180	 * a THP.
2181 */
2182 compound = PageTransHuge(page);
2183
2184 if (compound)
2185 new = alloc_misplaced_dst_page_thp;
2186 else
2187 new = alloc_misplaced_dst_page;
Peter Zijlstra7039e1d2012-10-25 14:16:34 +02002188
2189 /*
Mel Gorman1bc115d2013-10-07 11:29:05 +01002190 * Don't migrate file pages that are mapped in multiple processes
2191 * with execute permissions as they are probably shared libraries.
Peter Zijlstra7039e1d2012-10-25 14:16:34 +02002192 */
Miaohe Lin7ee820e2021-05-04 18:37:16 -07002193 if (page_mapcount(page) != 1 && page_is_file_lru(page) &&
2194 (vma->vm_flags & VM_EXEC))
Peter Zijlstra7039e1d2012-10-25 14:16:34 +02002195 goto out;
Peter Zijlstra7039e1d2012-10-25 14:16:34 +02002196
Mel Gormana8f60772012-11-14 21:41:46 +00002197 /*
Mel Gorman09a913a2018-04-10 16:29:20 -07002198 * Also do not migrate dirty pages as not all filesystems can move
2199	 * dirty pages in MIGRATE_ASYNC mode, which is a waste of cycles.
2200 */
Huang Ying9de4f222020-04-06 20:04:41 -07002201 if (page_is_file_lru(page) && PageDirty(page))
Mel Gorman09a913a2018-04-10 16:29:20 -07002202 goto out;
2203
Mel Gormanb32967f2012-11-19 12:35:47 +00002204 isolated = numamigrate_isolate_page(pgdat, page);
2205 if (!isolated)
2206 goto out;
Peter Zijlstra7039e1d2012-10-25 14:16:34 +02002207
Mel Gormanb32967f2012-11-19 12:35:47 +00002208 list_add(&page->lru, &migratepages);
Yang Shic5b5a3d2021-06-30 18:51:42 -07002209 nr_remaining = migrate_pages(&migratepages, *new, NULL, node,
Yang Shi5ac95882021-09-02 14:59:13 -07002210 MIGRATE_ASYNC, MR_NUMA_MISPLACED, NULL);
Mel Gormanb32967f2012-11-19 12:35:47 +00002211 if (nr_remaining) {
Joonsoo Kim59c82b72014-01-21 15:51:17 -08002212 if (!list_empty(&migratepages)) {
2213 list_del(&page->lru);
Yang Shic5fc5c32021-06-30 18:51:45 -07002214 mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
2215 page_is_file_lru(page), -nr_pages);
Joonsoo Kim59c82b72014-01-21 15:51:17 -08002216 putback_lru_page(page);
2217 }
Mel Gormanb32967f2012-11-19 12:35:47 +00002218 isolated = 0;
2219 } else
Yang Shic5fc5c32021-06-30 18:51:45 -07002220 count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_pages);
Peter Zijlstra7039e1d2012-10-25 14:16:34 +02002221 BUG_ON(!list_empty(&migratepages));
Peter Zijlstra7039e1d2012-10-25 14:16:34 +02002222 return isolated;
Hugh Dickins340ef392013-02-22 16:34:33 -08002223
2224out:
2225 put_page(page);
2226 return 0;
Peter Zijlstra7039e1d2012-10-25 14:16:34 +02002227}
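
/*
 * Sketch of how the NUMA hinting fault path is expected to use this
 * (illustration only; the real callers live in mm/memory.c and
 * mm/huge_memory.c):
 *
 *	if (target_nid != NUMA_NO_NODE &&
 *	    migrate_misplaced_page(page, vma, target_nid))
 *		page_nid = target_nid;
 */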
Mel Gorman220018d2012-12-05 09:32:56 +00002228#endif /* CONFIG_NUMA_BALANCING */
Peter Zijlstra7039e1d2012-10-25 14:16:34 +02002229#endif /* CONFIG_NUMA */
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002230
Christoph Hellwig9b2ed9c2019-08-14 09:59:28 +02002231#ifdef CONFIG_DEVICE_PRIVATE
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002232static int migrate_vma_collect_skip(unsigned long start,
2233 unsigned long end,
2234 struct mm_walk *walk)
2235{
2236 struct migrate_vma *migrate = walk->private;
2237 unsigned long addr;
2238
Ralph Campbell872ea702020-01-30 22:14:38 -08002239 for (addr = start; addr < end; addr += PAGE_SIZE) {
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002240 migrate->dst[migrate->npages] = 0;
2241 migrate->src[migrate->npages++] = 0;
2242 }
2243
2244 return 0;
2245}
2246
Miaohe Lin843e1be2021-05-04 18:37:13 -07002247static int migrate_vma_collect_hole(unsigned long start,
2248 unsigned long end,
2249 __always_unused int depth,
2250 struct mm_walk *walk)
2251{
2252 struct migrate_vma *migrate = walk->private;
2253 unsigned long addr;
2254
2255 /* Only allow populating anonymous memory. */
2256 if (!vma_is_anonymous(walk->vma))
2257 return migrate_vma_collect_skip(start, end, walk);
2258
2259 for (addr = start; addr < end; addr += PAGE_SIZE) {
2260 migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
2261 migrate->dst[migrate->npages] = 0;
2262 migrate->npages++;
2263 migrate->cpages++;
2264 }
2265
2266 return 0;
2267}
2268
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002269static int migrate_vma_collect_pmd(pmd_t *pmdp,
2270 unsigned long start,
2271 unsigned long end,
2272 struct mm_walk *walk)
2273{
2274 struct migrate_vma *migrate = walk->private;
2275 struct vm_area_struct *vma = walk->vma;
2276 struct mm_struct *mm = vma->vm_mm;
Jérôme Glisse8c3328f2017-09-08 16:12:13 -07002277 unsigned long addr = start, unmapped = 0;
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002278 spinlock_t *ptl;
2279 pte_t *ptep;
2280
2281again:
2282 if (pmd_none(*pmdp))
Steven Priceb7a16c72020-02-03 17:36:03 -08002283 return migrate_vma_collect_hole(start, end, -1, walk);
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002284
2285 if (pmd_trans_huge(*pmdp)) {
2286 struct page *page;
2287
2288 ptl = pmd_lock(mm, pmdp);
2289 if (unlikely(!pmd_trans_huge(*pmdp))) {
2290 spin_unlock(ptl);
2291 goto again;
2292 }
2293
2294 page = pmd_page(*pmdp);
2295 if (is_huge_zero_page(page)) {
2296 spin_unlock(ptl);
2297 split_huge_pmd(vma, pmdp, addr);
2298 if (pmd_trans_unstable(pmdp))
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002299 return migrate_vma_collect_skip(start, end,
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002300 walk);
2301 } else {
2302 int ret;
2303
2304 get_page(page);
2305 spin_unlock(ptl);
2306 if (unlikely(!trylock_page(page)))
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002307 return migrate_vma_collect_skip(start, end,
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002308 walk);
2309 ret = split_huge_page(page);
2310 unlock_page(page);
2311 put_page(page);
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002312 if (ret)
2313 return migrate_vma_collect_skip(start, end,
2314 walk);
2315 if (pmd_none(*pmdp))
Steven Priceb7a16c72020-02-03 17:36:03 -08002316 return migrate_vma_collect_hole(start, end, -1,
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002317 walk);
2318 }
2319 }
2320
2321 if (unlikely(pmd_bad(*pmdp)))
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002322 return migrate_vma_collect_skip(start, end, walk);
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002323
2324 ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
Jérôme Glisse8c3328f2017-09-08 16:12:13 -07002325 arch_enter_lazy_mmu_mode();
2326
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002327 for (; addr < end; addr += PAGE_SIZE, ptep++) {
Christoph Hellwig800bb1c2020-03-16 20:32:14 +01002328 unsigned long mpfn = 0, pfn;
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002329 struct page *page;
Jérôme Glisse8c3328f2017-09-08 16:12:13 -07002330 swp_entry_t entry;
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002331 pte_t pte;
2332
2333 pte = *ptep;
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002334
Jérôme Glissea5430dd2017-09-08 16:12:17 -07002335 if (pte_none(pte)) {
Ralph Campbell0744f282020-08-11 18:31:41 -07002336 if (vma_is_anonymous(vma)) {
2337 mpfn = MIGRATE_PFN_MIGRATE;
2338 migrate->cpages++;
2339 }
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002340 goto next;
2341 }
2342
Jérôme Glissea5430dd2017-09-08 16:12:17 -07002343 if (!pte_present(pte)) {
Jérôme Glissea5430dd2017-09-08 16:12:17 -07002344 /*
2345 * Only care about unaddressable device page special
2346			 * page table entries. Other special swap entries are not
2347			 * migratable, and we ignore regular swapped pages.
2348 */
2349 entry = pte_to_swp_entry(pte);
2350 if (!is_device_private_entry(entry))
2351 goto next;
2352
Alistair Poppleaf5cdaf2021-06-30 18:54:06 -07002353 page = pfn_swap_entry_to_page(entry);
Ralph Campbell51431922020-07-23 15:30:00 -07002354 if (!(migrate->flags &
2355 MIGRATE_VMA_SELECT_DEVICE_PRIVATE) ||
2356 page->pgmap->owner != migrate->pgmap_owner)
Christoph Hellwig800bb1c2020-03-16 20:32:14 +01002357 goto next;
2358
Christoph Hellwig06d462b2019-08-14 09:59:27 +02002359 mpfn = migrate_pfn(page_to_pfn(page)) |
2360 MIGRATE_PFN_MIGRATE;
Alistair Popple4dd845b2021-06-30 18:54:09 -07002361 if (is_writable_device_private_entry(entry))
Jérôme Glissea5430dd2017-09-08 16:12:17 -07002362 mpfn |= MIGRATE_PFN_WRITE;
2363 } else {
Ralph Campbell51431922020-07-23 15:30:00 -07002364 if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
Christoph Hellwig800bb1c2020-03-16 20:32:14 +01002365 goto next;
Pingfan Liu276f7562019-09-23 15:37:38 -07002366 pfn = pte_pfn(pte);
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002367 if (is_zero_pfn(pfn)) {
2368 mpfn = MIGRATE_PFN_MIGRATE;
2369 migrate->cpages++;
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002370 goto next;
2371 }
Christoph Hellwig25b29952019-06-13 22:50:49 +02002372 page = vm_normal_page(migrate->vma, addr, pte);
Jérôme Glissea5430dd2017-09-08 16:12:17 -07002373 mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
2374 mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
2375 }
2376
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002377 /* FIXME support THP */
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002378 if (!page || !page->mapping || PageTransCompound(page)) {
Pingfan Liu276f7562019-09-23 15:37:38 -07002379 mpfn = 0;
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002380 goto next;
2381 }
2382
2383 /*
2384 * By getting a reference on the page we pin it and that blocks
2385		 * any kind of migration. A side effect is that it "freezes" the
2386 * pte.
2387 *
2388 * We drop this reference after isolating the page from the lru
2389		 * for non-device pages (device pages are not on the lru and thus
2390 * can't be dropped from it).
2391 */
2392 get_page(page);
2393 migrate->cpages++;
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002394
Jérôme Glisse8c3328f2017-09-08 16:12:13 -07002395 /*
2396 * Optimize for the common case where page is only mapped once
2397 * in one process. If we can lock the page, then we can safely
2398 * set up a special migration page table entry now.
2399 */
2400 if (trylock_page(page)) {
2401 pte_t swp_pte;
2402
2403 mpfn |= MIGRATE_PFN_LOCKED;
2404 ptep_get_and_clear(mm, addr, ptep);
2405
2406 /* Setup special migration page table entry */
Alistair Popple4dd845b2021-06-30 18:54:09 -07002407 if (mpfn & MIGRATE_PFN_WRITE)
2408 entry = make_writable_migration_entry(
2409 page_to_pfn(page));
2410 else
2411 entry = make_readable_migration_entry(
2412 page_to_pfn(page));
Jérôme Glisse8c3328f2017-09-08 16:12:13 -07002413 swp_pte = swp_entry_to_pte(entry);
Alistair Popplead7df762020-09-04 16:36:01 -07002414 if (pte_present(pte)) {
2415 if (pte_soft_dirty(pte))
2416 swp_pte = pte_swp_mksoft_dirty(swp_pte);
2417 if (pte_uffd_wp(pte))
2418 swp_pte = pte_swp_mkuffd_wp(swp_pte);
2419 } else {
2420 if (pte_swp_soft_dirty(pte))
2421 swp_pte = pte_swp_mksoft_dirty(swp_pte);
2422 if (pte_swp_uffd_wp(pte))
2423 swp_pte = pte_swp_mkuffd_wp(swp_pte);
2424 }
Jérôme Glisse8c3328f2017-09-08 16:12:13 -07002425 set_pte_at(mm, addr, ptep, swp_pte);
2426
2427 /*
2428 * This is like regular unmap: we remove the rmap and
2429 * drop page refcount. Page won't be freed, as we took
2430 * a reference just above.
2431 */
2432 page_remove_rmap(page, false);
2433 put_page(page);
Jérôme Glissea5430dd2017-09-08 16:12:17 -07002434
2435 if (pte_present(pte))
2436 unmapped++;
Jérôme Glisse8c3328f2017-09-08 16:12:13 -07002437 }
2438
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002439next:
Jérôme Glissea5430dd2017-09-08 16:12:17 -07002440 migrate->dst[migrate->npages] = 0;
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002441 migrate->src[migrate->npages++] = mpfn;
2442 }
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002443
Jérôme Glisse8c3328f2017-09-08 16:12:13 -07002444 /* Only flush the TLB if we actually modified any entries */
2445 if (unmapped)
2446 flush_tlb_range(walk->vma, start, end);
2447
Alistair Popple1299c112022-09-02 10:35:51 +10002448 arch_leave_lazy_mmu_mode();
2449 pte_unmap_unlock(ptep - 1, ptl);
2450
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002451 return 0;
2452}
2453
Christoph Hellwig7b86ac32019-08-28 16:19:54 +02002454static const struct mm_walk_ops migrate_vma_walk_ops = {
2455 .pmd_entry = migrate_vma_collect_pmd,
2456 .pte_hole = migrate_vma_collect_hole,
2457};
2458
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002459/*
2460 * migrate_vma_collect() - collect pages over a range of virtual addresses
2461 * @migrate: migrate struct containing all migration information
2462 *
2463 * This will walk the CPU page table. For each virtual address backed by a
2464 * valid page, it updates the src array and takes a reference on the page, in
2465 * order to pin the page until we lock it and unmap it.
2466 */
2467static void migrate_vma_collect(struct migrate_vma *migrate)
2468{
Jérôme Glisseac46d4f2018-12-28 00:38:09 -08002469 struct mmu_notifier_range range;
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002470
Ralph Campbell998427b2020-07-23 15:30:01 -07002471 /*
2472 * Note that the pgmap_owner is passed to the mmu notifier callback so
2473 * that the registered device driver can skip invalidating device
2474 * private page mappings that won't be migrated.
2475 */
Alistair Popple6b49bf62021-06-30 18:54:19 -07002476 mmu_notifier_range_init_owner(&range, MMU_NOTIFY_MIGRATE, 0,
2477 migrate->vma, migrate->vma->vm_mm, migrate->start, migrate->end,
Ralph Campbellc1a06df2020-08-06 23:17:09 -07002478 migrate->pgmap_owner);
Jérôme Glisseac46d4f2018-12-28 00:38:09 -08002479 mmu_notifier_invalidate_range_start(&range);
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002480
Christoph Hellwig7b86ac32019-08-28 16:19:54 +02002481 walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end,
2482 &migrate_vma_walk_ops, migrate);
2483
2484 mmu_notifier_invalidate_range_end(&range);
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002485 migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
2486}
2487
2488/*
2489 * migrate_vma_check_page() - check if page is pinned or not
2490 * @page: struct page to check
2491 *
2492 * Pinned pages cannot be migrated. This is the same test as in
2493 * migrate_page_move_mapping(), except that here we allow migration of a
2494 * ZONE_DEVICE page.
2495 */
2496static bool migrate_vma_check_page(struct page *page)
2497{
2498 /*
2499 * One extra ref because the caller holds an extra reference, either from
2500 * isolate_lru_page() for a regular page, or migrate_vma_collect() for
2501 * a device page.
2502 */
2503 int extra = 1;
2504
2505 /*
2506 * FIXME support THP (transparent huge page), it is a bit more complex to
2507 * check them than regular pages, because they can be mapped with a pmd
2508 * or with a pte (split pte mapping).
2509 */
2510 if (PageCompound(page))
2511 return false;
2512
Jérôme Glissea5430dd2017-09-08 16:12:17 -07002513 /* Pages from ZONE_DEVICE have one extra reference */
2514 if (is_zone_device_page(page)) {
2515 /*
2516 * Private pages can never be pinned as they have no valid pte and
2517 * GUP will fail for those. Yet if there is a pending migration,
2518 * a thread might try to wait on the pte migration entry and
2519 * will bump the page reference count. Sadly there is no way to
2520 * differentiate a regular pin from a migration wait. Hence, to
2521 * avoid two racing threads trying to migrate back to the CPU and
Haitao Shi8958b242020-12-15 20:47:26 -08002522 * entering an infinite loop (one stopping migration because the other
Jérôme Glissea5430dd2017-09-08 16:12:17 -07002523 * is waiting on a pte migration entry), we always return true here.
2524 *
2525 * FIXME proper solution is to rework migration_entry_wait() so
2526 * it does not need to take a reference on page.
2527 */
Christoph Hellwig25b29952019-06-13 22:50:49 +02002528 return is_device_private_page(page);
Jérôme Glissea5430dd2017-09-08 16:12:17 -07002529 }
2530
Jérôme Glissedf6ad692017-09-08 16:12:24 -07002531 /* For file-backed pages */
2532 if (page_mapping(page))
2533 extra += 1 + page_has_private(page);
2534
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002535 if ((page_count(page) - extra) > page_mapcount(page))
2536 return false;
2537
2538 return true;
2539}
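
/*
 * Worked example for the check above (illustrative, not part of the
 * original file): an anonymous page mapped in exactly one process and
 * referenced only by that mapping plus the caller's reference has
 * page_count() == page_mapcount() + 1, so with extra == 1 the test
 * (page_count(page) - extra) > page_mapcount(page) is false and the page
 * is treated as migratable. Any additional reference, for instance one
 * taken by GUP, pushes page_count() above that and the page is reported
 * as pinned.
 */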
2540
2541/*
2542 * migrate_vma_prepare() - lock pages and isolate them from the lru
2543 * @migrate: migrate struct containing all migration information
2544 *
2545 * This locks pages that have been collected by migrate_vma_collect(). Once each
2546 * page is locked it is isolated from the lru (for non-device pages). Finally,
2547 * the ref taken by migrate_vma_collect() is dropped, as locked pages cannot be
2548 * migrated by concurrent kernel threads.
2549 */
2550static void migrate_vma_prepare(struct migrate_vma *migrate)
2551{
2552 const unsigned long npages = migrate->npages;
Jérôme Glisse8c3328f2017-09-08 16:12:13 -07002553 const unsigned long start = migrate->start;
2554 unsigned long addr, i, restore = 0;
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002555 bool allow_drain = true;
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002556
2557 lru_add_drain();
2558
2559 for (i = 0; (i < npages) && migrate->cpages; i++) {
2560 struct page *page = migrate_pfn_to_page(migrate->src[i]);
Jérôme Glisse8c3328f2017-09-08 16:12:13 -07002561 bool remap = true;
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002562
2563 if (!page)
2564 continue;
2565
Jérôme Glisse8c3328f2017-09-08 16:12:13 -07002566 if (!(migrate->src[i] & MIGRATE_PFN_LOCKED)) {
2567 /*
2568 * Because we are migrating several pages there can be
2569 * a deadlock between two concurrent migrations where each
2570 * is waiting on the other's page lock.
2571 *
2572 * Make migrate_vma() a best-effort operation and back off
2573 * for any page we cannot lock right away.
2574 */
2575 if (!trylock_page(page)) {
2576 migrate->src[i] = 0;
2577 migrate->cpages--;
2578 put_page(page);
2579 continue;
2580 }
2581 remap = false;
2582 migrate->src[i] |= MIGRATE_PFN_LOCKED;
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002583 }
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002584
Jérôme Glissea5430dd2017-09-08 16:12:17 -07002585 /* ZONE_DEVICE pages are not on LRU */
2586 if (!is_zone_device_page(page)) {
2587 if (!PageLRU(page) && allow_drain) {
2588 /* Drain CPU's pagevec */
2589 lru_add_drain_all();
2590 allow_drain = false;
Jérôme Glisse8c3328f2017-09-08 16:12:13 -07002591 }
Jérôme Glissea5430dd2017-09-08 16:12:17 -07002592
2593 if (isolate_lru_page(page)) {
2594 if (remap) {
2595 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
2596 migrate->cpages--;
2597 restore++;
2598 } else {
2599 migrate->src[i] = 0;
2600 unlock_page(page);
2601 migrate->cpages--;
2602 put_page(page);
2603 }
2604 continue;
2605 }
2606
2607 /* Drop the reference we took in collect */
2608 put_page(page);
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002609 }
2610
2611 if (!migrate_vma_check_page(page)) {
Jérôme Glisse8c3328f2017-09-08 16:12:13 -07002612 if (remap) {
2613 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
2614 migrate->cpages--;
2615 restore++;
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002616
Jérôme Glissea5430dd2017-09-08 16:12:17 -07002617 if (!is_zone_device_page(page)) {
2618 get_page(page);
2619 putback_lru_page(page);
2620 }
Jérôme Glisse8c3328f2017-09-08 16:12:13 -07002621 } else {
2622 migrate->src[i] = 0;
2623 unlock_page(page);
2624 migrate->cpages--;
2625
Jérôme Glissea5430dd2017-09-08 16:12:17 -07002626 if (!is_zone_device_page(page))
2627 putback_lru_page(page);
2628 else
2629 put_page(page);
Jérôme Glisse8c3328f2017-09-08 16:12:13 -07002630 }
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002631 }
2632 }
Jérôme Glisse8c3328f2017-09-08 16:12:13 -07002633
2634 for (i = 0, addr = start; i < npages && restore; i++, addr += PAGE_SIZE) {
2635 struct page *page = migrate_pfn_to_page(migrate->src[i]);
2636
2637 if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
2638 continue;
2639
2640 remove_migration_pte(page, migrate->vma, addr, page);
2641
2642 migrate->src[i] = 0;
2643 unlock_page(page);
2644 put_page(page);
2645 restore--;
2646 }
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002647}
2648
2649/*
2650 * migrate_vma_unmap() - replace page mapping with special migration pte entry
2651 * @migrate: migrate struct containing all migration information
2652 *
2653 * Replace page mapping (CPU page table pte) with a special migration pte entry
2654 * and check again if it has been pinned. Pinned pages are restored because we
2655 * cannot migrate them.
2656 *
2657 * This is the last step before we call the device driver callback to allocate
2658 * destination memory and copy the contents of the original page over to the new page.
2659 */
2660static void migrate_vma_unmap(struct migrate_vma *migrate)
2661{
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002662 const unsigned long npages = migrate->npages;
2663 const unsigned long start = migrate->start;
2664 unsigned long addr, i, restore = 0;
2665
2666 for (i = 0; i < npages; i++) {
2667 struct page *page = migrate_pfn_to_page(migrate->src[i]);
2668
2669 if (!page || !(migrate->src[i] & MIGRATE_PFN_MIGRATE))
2670 continue;
2671
Jérôme Glisse8c3328f2017-09-08 16:12:13 -07002672 if (page_mapped(page)) {
Alistair Popplea98a2f02021-06-30 18:54:16 -07002673 try_to_migrate(page, 0);
Jérôme Glisse8c3328f2017-09-08 16:12:13 -07002674 if (page_mapped(page))
2675 goto restore;
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002676 }
Jérôme Glisse8c3328f2017-09-08 16:12:13 -07002677
2678 if (migrate_vma_check_page(page))
2679 continue;
2680
2681restore:
2682 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
2683 migrate->cpages--;
2684 restore++;
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002685 }
2686
2687 for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) {
2688 struct page *page = migrate_pfn_to_page(migrate->src[i]);
2689
2690 if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
2691 continue;
2692
2693 remove_migration_ptes(page, page, false);
2694
2695 migrate->src[i] = 0;
2696 unlock_page(page);
2697 restore--;
2698
Jérôme Glissea5430dd2017-09-08 16:12:17 -07002699 if (is_zone_device_page(page))
2700 put_page(page);
2701 else
2702 putback_lru_page(page);
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002703 }
2704}
2705
Christoph Hellwiga7d1f222019-08-14 09:59:19 +02002706/**
2707 * migrate_vma_setup() - prepare to migrate a range of memory
Randy Dunlapeaf444d2020-08-11 18:33:08 -07002708 * @args: contains the vma, start, and pfns arrays for the migration
Christoph Hellwiga7d1f222019-08-14 09:59:19 +02002709 *
2710 * Returns: negative errno on failures, 0 when 0 or more pages were migrated
2711 * without an error.
2712 *
2713 * Prepare to migrate a range of virtual addresses by collecting all
2714 * the pages backing each virtual address in the range, saving them inside the
2715 * src array. Then lock those pages and unmap them. Once the pages are locked
2716 * and unmapped, check whether each page is pinned or not. Pages that aren't
2717 * pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) in the
2718 * corresponding src array entry. Any pages that are pinned are then restored by
2719 * remapping and unlocking them.
2720 *
2721 * The caller should then allocate destination memory and copy source memory to
2722 * it for all those entries (ie with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE
2723 * flag set). Once these are allocated and copied, the caller must update each
2724 * corresponding entry in the dst array with the pfn value of the destination
2725 * page and with the MIGRATE_PFN_VALID and MIGRATE_PFN_LOCKED flags set
2726 * (destination pages must have their struct pages locked, via lock_page()).
2727 *
2728 * Note that the caller does not have to migrate all the pages that are marked
2729 * with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration from
2730 * device memory to system memory. If the caller cannot migrate a device page
2731 * back to system memory, then it must return VM_FAULT_SIGBUS, which has severe
2732 * consequences for the userspace process, so it must be avoided if at all
2733 * possible.
2734 *
2735 * For empty entries inside the CPU page table (pte_none() or pmd_none() is true) we
2736 * do set the MIGRATE_PFN_MIGRATE flag in the corresponding source array entry, thus
Ingo Molnarf0953a12021-05-06 18:06:47 -07002737 * allowing the caller to allocate device memory for those unbacked virtual
2738 * addresses. For this the caller simply has to allocate device memory and
Christoph Hellwiga7d1f222019-08-14 09:59:19 +02002739 * properly set the destination entry like for regular migration. Note that
Ingo Molnarf0953a12021-05-06 18:06:47 -07002740 * this can still fail, and thus inside the device driver you must check if the
2741 * migration was successful for those entries after calling migrate_vma_pages(),
Christoph Hellwiga7d1f222019-08-14 09:59:19 +02002742 * just like for regular migration.
2743 *
2744 * After that, the callers must call migrate_vma_pages() to go over each entry
2745 * in the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag
2746 * set. If the corresponding entry in the dst array has the MIGRATE_PFN_VALID flag
2747 * set, then migrate_vma_pages() migrates struct page information from the source
2748 * struct page to the destination struct page. If it fails to migrate the
2749 * struct page information, then it clears the MIGRATE_PFN_MIGRATE flag in the
2750 * src array.
2751 *
2752 * At this point all successfully migrated pages have an entry in the src
2753 * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst
2754 * array entry with MIGRATE_PFN_VALID flag set.
2755 *
2756 * Once migrate_vma_pages() returns the caller may inspect which pages were
2757 * successfully migrated, and which were not. Successfully migrated pages will
2758 * have the MIGRATE_PFN_MIGRATE flag set for their src array entry.
2759 *
2760 * It is safe to update the device page table after migrate_vma_pages() because
Michel Lespinassec1e8d7c2020-06-08 21:33:54 -07002761 * both the destination and source pages are still locked, and the mmap_lock is held
Christoph Hellwiga7d1f222019-08-14 09:59:19 +02002762 * in read mode (hence no one can unmap the range being migrated).
2763 *
2764 * Once the caller is done cleaning up things and updating its page table (if it
2765 * chose to do so, this is not an obligation) it finally calls
2766 * migrate_vma_finalize() to update the CPU page table to point to new pages
2767 * for successfully migrated pages or otherwise restore the CPU page table to
2768 * point to the original source pages.
2769 */
2770int migrate_vma_setup(struct migrate_vma *args)
2771{
2772 long nr_pages = (args->end - args->start) >> PAGE_SHIFT;
2773
2774 args->start &= PAGE_MASK;
2775 args->end &= PAGE_MASK;
2776 if (!args->vma || is_vm_hugetlb_page(args->vma) ||
2777 (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma))
2778 return -EINVAL;
2779 if (nr_pages <= 0)
2780 return -EINVAL;
2781 if (args->start < args->vma->vm_start ||
2782 args->start >= args->vma->vm_end)
2783 return -EINVAL;
2784 if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end)
2785 return -EINVAL;
2786 if (!args->src || !args->dst)
2787 return -EINVAL;
2788
2789 memset(args->src, 0, sizeof(*args->src) * nr_pages);
2790 args->cpages = 0;
2791 args->npages = 0;
2792
2793 migrate_vma_collect(args);
2794
2795 if (args->cpages)
2796 migrate_vma_prepare(args);
2797 if (args->cpages)
2798 migrate_vma_unmap(args);
2799
2800 /*
2801 * At this point pages are locked and unmapped, and thus they have
2802 * stable content and can safely be copied to destination memory that
2803 * is allocated by the drivers.
2804 */
2805 return 0;
2806
2807}
2808EXPORT_SYMBOL(migrate_vma_setup);
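
/*
 * Illustrative driver-side sketch of the flow documented above; it is not
 * part of mm/migrate.c. my_alloc_device_page() and my_copy_to_device() are
 * hypothetical driver helpers, unbacked entries are simply skipped, and the
 * caller is assumed to hold the mmap_lock in read mode.
 *
 *	static int my_migrate_to_device(struct vm_area_struct *vma,
 *					unsigned long start, unsigned long end,
 *					void *pgmap_owner)
 *	{
 *		unsigned long npages = (end - start) >> PAGE_SHIFT;
 *		unsigned long *src, *dst;
 *		struct migrate_vma args = {};
 *		unsigned long i;
 *		int ret = -ENOMEM;
 *
 *		src = kcalloc(npages, sizeof(*src), GFP_KERNEL);
 *		dst = kcalloc(npages, sizeof(*dst), GFP_KERNEL);
 *		if (!src || !dst)
 *			goto out;
 *
 *		args.vma = vma;
 *		args.start = start;
 *		args.end = end;
 *		args.src = src;
 *		args.dst = dst;
 *		args.pgmap_owner = pgmap_owner;
 *		args.flags = MIGRATE_VMA_SELECT_SYSTEM;
 *
 *		ret = migrate_vma_setup(&args);
 *		if (ret)
 *			goto out;
 *
 *		for (i = 0; i < args.npages; i++) {
 *			struct page *spage = migrate_pfn_to_page(args.src[i]);
 *			struct page *dpage;
 *
 *			if (!(args.src[i] & MIGRATE_PFN_MIGRATE) || !spage)
 *				continue;
 *
 *			dpage = my_alloc_device_page();
 *			if (!dpage)
 *				continue;
 *
 *			lock_page(dpage);
 *			my_copy_to_device(dpage, spage);
 *			args.dst[i] = migrate_pfn(page_to_pfn(dpage)) |
 *				      MIGRATE_PFN_LOCKED;
 *		}
 *
 *		migrate_vma_pages(&args);
 *		migrate_vma_finalize(&args);
 *		ret = 0;
 *	out:
 *		kfree(dst);
 *		kfree(src);
 *		return ret;
 *	}
 */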
2809
Ralph Campbell34290e22020-01-30 22:14:44 -08002810/*
2811 * This code closely matches the code in:
2812 * __handle_mm_fault()
2813 * handle_pte_fault()
2814 * do_anonymous_page()
2815 * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE
2816 * private page.
2817 */
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002818static void migrate_vma_insert_page(struct migrate_vma *migrate,
2819 unsigned long addr,
2820 struct page *page,
Stephen Zhangd85c6db2020-12-14 19:13:20 -08002821 unsigned long *src)
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002822{
2823 struct vm_area_struct *vma = migrate->vma;
2824 struct mm_struct *mm = vma->vm_mm;
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002825 bool flush = false;
2826 spinlock_t *ptl;
2827 pte_t entry;
2828 pgd_t *pgdp;
2829 p4d_t *p4dp;
2830 pud_t *pudp;
2831 pmd_t *pmdp;
2832 pte_t *ptep;
2833
2834 /* Only allow populating anonymous memory */
2835 if (!vma_is_anonymous(vma))
2836 goto abort;
2837
2838 pgdp = pgd_offset(mm, addr);
2839 p4dp = p4d_alloc(mm, pgdp, addr);
2840 if (!p4dp)
2841 goto abort;
2842 pudp = pud_alloc(mm, p4dp, addr);
2843 if (!pudp)
2844 goto abort;
2845 pmdp = pmd_alloc(mm, pudp, addr);
2846 if (!pmdp)
2847 goto abort;
2848
2849 if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp))
2850 goto abort;
2851
2852 /*
2853 * Use pte_alloc() instead of pte_alloc_map(). We can't run
2854 * pte_offset_map() on pmds where a huge pmd might be created
2855 * from a different thread.
2856 *
Michel Lespinasse3e4e28c2020-06-08 21:33:51 -07002857 * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002858 * parallel threads are excluded by other means.
2859 *
Michel Lespinasse3e4e28c2020-06-08 21:33:51 -07002860 * Here we only have mmap_read_lock(mm).
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002861 */
Joel Fernandes (Google)4cf58922019-01-03 15:28:34 -08002862 if (pte_alloc(mm, pmdp))
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002863 goto abort;
2864
2865 /* See the comment in pte_alloc_one_map() */
2866 if (unlikely(pmd_trans_unstable(pmdp)))
2867 goto abort;
2868
2869 if (unlikely(anon_vma_prepare(vma)))
2870 goto abort;
Johannes Weinerd9eb1ea2020-06-03 16:02:24 -07002871 if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002872 goto abort;
2873
2874 /*
2875 * The memory barrier inside __SetPageUptodate makes sure that
2876 * preceding stores to the page contents become visible before
2877 * the set_pte_at() write.
2878 */
2879 __SetPageUptodate(page);
2880
Jérôme Glissedf6ad692017-09-08 16:12:24 -07002881 if (is_zone_device_page(page)) {
2882 if (is_device_private_page(page)) {
2883 swp_entry_t swp_entry;
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002884
Alistair Popple4dd845b2021-06-30 18:54:09 -07002885 if (vma->vm_flags & VM_WRITE)
2886 swp_entry = make_writable_device_private_entry(
2887 page_to_pfn(page));
2888 else
2889 swp_entry = make_readable_device_private_entry(
2890 page_to_pfn(page));
Jérôme Glissedf6ad692017-09-08 16:12:24 -07002891 entry = swp_entry_to_pte(swp_entry);
Miaohe Lin34f5e9b2021-05-04 18:37:10 -07002892 } else {
2893 /*
2894 * For now we only support migrating to un-addressable
2895 * device memory.
2896 */
2897 pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
2898 goto abort;
Jérôme Glissedf6ad692017-09-08 16:12:24 -07002899 }
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002900 } else {
2901 entry = mk_pte(page, vma->vm_page_prot);
2902 if (vma->vm_flags & VM_WRITE)
2903 entry = pte_mkwrite(pte_mkdirty(entry));
2904 }
2905
2906 ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
2907
Ralph Campbell34290e22020-01-30 22:14:44 -08002908 if (check_stable_address_space(mm))
2909 goto unlock_abort;
2910
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002911 if (pte_present(*ptep)) {
2912 unsigned long pfn = pte_pfn(*ptep);
2913
Ralph Campbellc23a0c92020-01-30 22:14:41 -08002914 if (!is_zero_pfn(pfn))
2915 goto unlock_abort;
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002916 flush = true;
Ralph Campbellc23a0c92020-01-30 22:14:41 -08002917 } else if (!pte_none(*ptep))
2918 goto unlock_abort;
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002919
2920 /*
Ralph Campbellc23a0c92020-01-30 22:14:41 -08002921 * Check for userfaultfd but do not deliver the fault. Instead,
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002922 * just back off.
2923 */
Ralph Campbellc23a0c92020-01-30 22:14:41 -08002924 if (userfaultfd_missing(vma))
2925 goto unlock_abort;
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002926
2927 inc_mm_counter(mm, MM_ANONPAGES);
Johannes Weinerbe5d0a72020-06-03 16:01:57 -07002928 page_add_new_anon_rmap(page, vma, addr, false);
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002929 if (!is_zone_device_page(page))
Joonsoo Kimb5181542020-08-11 18:30:40 -07002930 lru_cache_add_inactive_or_unevictable(page, vma);
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002931 get_page(page);
2932
2933 if (flush) {
2934 flush_cache_page(vma, addr, pte_pfn(*ptep));
2935 ptep_clear_flush_notify(vma, addr, ptep);
2936 set_pte_at_notify(mm, addr, ptep, entry);
2937 update_mmu_cache(vma, addr, ptep);
2938 } else {
2939 /* No need to invalidate - it was non-present before */
2940 set_pte_at(mm, addr, ptep, entry);
2941 update_mmu_cache(vma, addr, ptep);
2942 }
2943
2944 pte_unmap_unlock(ptep, ptl);
2945 *src = MIGRATE_PFN_MIGRATE;
2946 return;
2947
Ralph Campbellc23a0c92020-01-30 22:14:41 -08002948unlock_abort:
2949 pte_unmap_unlock(ptep, ptl);
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002950abort:
2951 *src &= ~MIGRATE_PFN_MIGRATE;
2952}
2953
Christoph Hellwiga7d1f222019-08-14 09:59:19 +02002954/**
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002955 * migrate_vma_pages() - migrate meta-data from src page to dst page
2956 * @migrate: migrate struct containing all migration information
2957 *
2958 * This migrates struct page meta-data from source struct page to destination
2959 * struct page. This effectively finishes the migration from source page to the
2960 * destination page.
2961 */
Christoph Hellwiga7d1f222019-08-14 09:59:19 +02002962void migrate_vma_pages(struct migrate_vma *migrate)
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002963{
2964 const unsigned long npages = migrate->npages;
2965 const unsigned long start = migrate->start;
Jérôme Glisseac46d4f2018-12-28 00:38:09 -08002966 struct mmu_notifier_range range;
2967 unsigned long addr, i;
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002968 bool notified = false;
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002969
2970 for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
2971 struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
2972 struct page *page = migrate_pfn_to_page(migrate->src[i]);
2973 struct address_space *mapping;
2974 int r;
2975
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002976 if (!newpage) {
2977 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002978 continue;
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002979 }
2980
2981 if (!page) {
Ralph Campbellc23a0c92020-01-30 22:14:41 -08002982 if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE))
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002983 continue;
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002984 if (!notified) {
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002985 notified = true;
Jérôme Glisseac46d4f2018-12-28 00:38:09 -08002986
Alistair Popple6b49bf62021-06-30 18:54:19 -07002987 mmu_notifier_range_init_owner(&range,
2988 MMU_NOTIFY_MIGRATE, 0, migrate->vma,
2989 migrate->vma->vm_mm, addr, migrate->end,
Ralph Campbell5e5dda82020-12-14 19:12:55 -08002990 migrate->pgmap_owner);
Jérôme Glisseac46d4f2018-12-28 00:38:09 -08002991 mmu_notifier_invalidate_range_start(&range);
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002992 }
2993 migrate_vma_insert_page(migrate, addr, newpage,
Stephen Zhangd85c6db2020-12-14 19:13:20 -08002994 &migrate->src[i]);
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002995 continue;
Jérôme Glisse8315ada2017-09-08 16:12:21 -07002996 }
Jérôme Glisse8763cb42017-09-08 16:12:09 -07002997
2998 mapping = page_mapping(page);
2999
Jérôme Glissea5430dd2017-09-08 16:12:17 -07003000 if (is_zone_device_page(newpage)) {
3001 if (is_device_private_page(newpage)) {
3002 /*
3003 * For now we only support private anonymous memory when
3004 * migrating to un-addressable device memory.
3005 */
3006 if (mapping) {
3007 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
3008 continue;
3009 }
Christoph Hellwig25b29952019-06-13 22:50:49 +02003010 } else {
Jérôme Glissea5430dd2017-09-08 16:12:17 -07003011 /*
3012 * Other types of ZONE_DEVICE page are not
3013 * supported.
3014 */
3015 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
3016 continue;
3017 }
3018 }
3019
Jérôme Glisse8763cb42017-09-08 16:12:09 -07003020 r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY);
3021 if (r != MIGRATEPAGE_SUCCESS)
3022 migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
3023 }
Jérôme Glisse8315ada2017-09-08 16:12:21 -07003024
Jérôme Glisse4645b9f2017-11-15 17:34:11 -08003025 /*
3026 * No need to double call mmu_notifier->invalidate_range() callback as
3027 * the above ptep_clear_flush_notify() inside migrate_vma_insert_page()
3028 * did already call it.
3029 */
Jérôme Glisse8315ada2017-09-08 16:12:21 -07003030 if (notified)
Jérôme Glisseac46d4f2018-12-28 00:38:09 -08003031 mmu_notifier_invalidate_range_only_end(&range);
Jérôme Glisse8763cb42017-09-08 16:12:09 -07003032}
Christoph Hellwiga7d1f222019-08-14 09:59:19 +02003033EXPORT_SYMBOL(migrate_vma_pages);
Jérôme Glisse8763cb42017-09-08 16:12:09 -07003034
Christoph Hellwiga7d1f222019-08-14 09:59:19 +02003035/**
Jérôme Glisse8763cb42017-09-08 16:12:09 -07003036 * migrate_vma_finalize() - restore CPU page table entry
3037 * @migrate: migrate struct containing all migration information
3038 *
3039 * This replaces the special migration pte entry with either a mapping to the
3040 * new page if migration was successful for that page, or to the original page
3041 * otherwise.
3042 *
3043 * This also unlocks the pages and puts them back on the lru, or drops the extra
3044 * refcount, for device pages.
3045 */
Christoph Hellwiga7d1f222019-08-14 09:59:19 +02003046void migrate_vma_finalize(struct migrate_vma *migrate)
Jérôme Glisse8763cb42017-09-08 16:12:09 -07003047{
3048 const unsigned long npages = migrate->npages;
3049 unsigned long i;
3050
3051 for (i = 0; i < npages; i++) {
3052 struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
3053 struct page *page = migrate_pfn_to_page(migrate->src[i]);
3054
Jérôme Glisse8315ada2017-09-08 16:12:21 -07003055 if (!page) {
3056 if (newpage) {
3057 unlock_page(newpage);
3058 put_page(newpage);
3059 }
Jérôme Glisse8763cb42017-09-08 16:12:09 -07003060 continue;
Jérôme Glisse8315ada2017-09-08 16:12:21 -07003061 }
3062
Jérôme Glisse8763cb42017-09-08 16:12:09 -07003063 if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) {
3064 if (newpage) {
3065 unlock_page(newpage);
3066 put_page(newpage);
3067 }
3068 newpage = page;
3069 }
3070
3071 remove_migration_ptes(page, newpage, false);
3072 unlock_page(page);
Jérôme Glisse8763cb42017-09-08 16:12:09 -07003073
Jérôme Glissea5430dd2017-09-08 16:12:17 -07003074 if (is_zone_device_page(page))
3075 put_page(page);
3076 else
3077 putback_lru_page(page);
Jérôme Glisse8763cb42017-09-08 16:12:09 -07003078
3079 if (newpage != page) {
3080 unlock_page(newpage);
Jérôme Glissea5430dd2017-09-08 16:12:17 -07003081 if (is_zone_device_page(newpage))
3082 put_page(newpage);
3083 else
3084 putback_lru_page(newpage);
Jérôme Glisse8763cb42017-09-08 16:12:09 -07003085 }
3086 }
3087}
Christoph Hellwiga7d1f222019-08-14 09:59:19 +02003088EXPORT_SYMBOL(migrate_vma_finalize);
Christoph Hellwig9b2ed9c2019-08-14 09:59:28 +02003089#endif /* CONFIG_DEVICE_PRIVATE */
Dave Hansen79c28a42021-09-02 14:59:06 -07003090
Dave Hansen76af6a02021-10-18 15:15:32 -07003091#if defined(CONFIG_HOTPLUG_CPU)
Dave Hansen79c28a42021-09-02 14:59:06 -07003092/* Disable reclaim-based migration. */
3093static void __disable_all_migrate_targets(void)
3094{
3095 int node;
3096
3097 for_each_online_node(node)
3098 node_demotion[node] = NUMA_NO_NODE;
3099}
3100
3101static void disable_all_migrate_targets(void)
3102{
3103 __disable_all_migrate_targets();
3104
3105 /*
3106 * Ensure that the "disable" is visible across the system.
3107 * Readers will see either a combination of before+disable
3108 * state or disable+after. They will never see before and
3109 * after state together.
3110 *
3111 * The before+after state together might have cycles and
3112 * could cause readers to do things like loop until this
3113 * function finishes. This ensures they can only see a
3114 * single "bad" read and would, for instance, only loop
3115 * once.
3116 */
3117 synchronize_rcu();
3118}
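
/*
 * Reader-side sketch implied by the ordering comment above (illustrative,
 * not part of the original file; the in-tree consumer of node_demotion[]
 * may differ in detail). A reader is expected to sample the array with
 * READ_ONCE() inside an RCU read-side critical section, e.g.:
 *
 *	rcu_read_lock();
 *	target = READ_ONCE(node_demotion[node]);
 *	rcu_read_unlock();
 *
 * so that the synchronize_rcu() above guarantees each reader sees either
 * the pre-disable targets or NUMA_NO_NODE, never a mix of old and new
 * targets.
 */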
3119
3120/*
3121 * Find an automatic demotion target for 'node'.
3122 * Failing here is OK. It might just indicate
3123 * being at the end of a chain.
3124 */
3125static int establish_migrate_target(int node, nodemask_t *used)
3126{
3127 int migration_target;
3128
3129 /*
3130 * Can not set a migration target on a
3131 * node with it already set.
3132 *
3133 * No need for READ_ONCE() here since this
3134 * is in the write path for node_demotion[].
3135 * This should be the only thread writing.
3136 */
3137 if (node_demotion[node] != NUMA_NO_NODE)
3138 return NUMA_NO_NODE;
3139
3140 migration_target = find_next_best_node(node, used);
3141 if (migration_target == NUMA_NO_NODE)
3142 return NUMA_NO_NODE;
3143
3144 node_demotion[node] = migration_target;
3145
3146 return migration_target;
3147}
3148
3149/*
3150 * When memory fills up on a node, memory contents can be
3151 * automatically migrated to another node instead of
3152 * discarded at reclaim.
3153 *
3154 * Establish a "migration path" which will start at nodes
3155 * with CPUs and will follow the priorities used to build the
3156 * page allocator zonelists.
3157 *
3158 * The difference here is that cycles must be avoided. If
3159 * node0 migrates to node1, then neither node1, nor anything
3160 * node1 migrates to can migrate to node0.
3161 *
3162 * This function can run simultaneously with readers of
3163 * node_demotion[]. However, it can not run simultaneously
3164 * with itself. Exclusion is provided by memory hotplug events
3165 * being single-threaded.
3166 */
3167static void __set_migration_target_nodes(void)
3168{
3169 nodemask_t next_pass = NODE_MASK_NONE;
3170 nodemask_t this_pass = NODE_MASK_NONE;
3171 nodemask_t used_targets = NODE_MASK_NONE;
3172 int node;
3173
3174 /*
3175 * Avoid any oddities like cycles that could occur
3176 * from changes in the topology. This will leave
3177 * a momentary gap when migration is disabled.
3178 */
3179 disable_all_migrate_targets();
3180
3181 /*
3182 * Allocations go close to CPUs, first. Assume that
3183 * the migration path starts at the nodes with CPUs.
3184 */
3185 next_pass = node_states[N_CPU];
3186again:
3187 this_pass = next_pass;
3188 next_pass = NODE_MASK_NONE;
3189 /*
3190 * To avoid cycles in the migration "graph", ensure
3191 * that migration sources are not future targets by
3192 * setting them in 'used_targets'. Do this only
3193 * once per pass so that multiple source nodes can
3194 * share a target node.
3195 *
3196 * 'used_targets' will become unavailable in future
3197 * passes. This limits some opportunities for
3198 * multiple source nodes to share a destination.
3199 */
3200 nodes_or(used_targets, used_targets, this_pass);
3201 for_each_node_mask(node, this_pass) {
3202 int target_node = establish_migrate_target(node, &used_targets);
3203
3204 if (target_node == NUMA_NO_NODE)
3205 continue;
3206
3207 /*
3208 * Visit targets from this pass in the next pass.
3209 * Eventually, every node will have been part of
3210 * a pass, and will become set in 'used_targets'.
3211 */
3212 node_set(target_node, next_pass);
3213 }
3214 /*
3215 * 'next_pass' contains nodes which became migration
3216 * targets in this pass. Make additional passes until
3217 * no more migrations targets are available.
3218 */
3219 if (!nodes_empty(next_pass))
3220 goto again;
3221}
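
/*
 * Worked example (hypothetical topology, not taken from the original
 * file): with CPU nodes {0,1} and memory-only nodes {2,3}, the first
 * pass starts from {0,1}; establish_migrate_target() might pick node 2
 * for node 0 and node 3 for node 1, while {0,1} are already in
 * 'used_targets' so later passes cannot point back at them. The second
 * pass starts from {2,3}; by then every node is a used target, so no
 * further targets are found and node_demotion[] ends up as 0->2, 1->3,
 * with nodes 2 and 3 left at NUMA_NO_NODE.
 */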
3222
3223/*
3224 * For callers that do not hold get_online_mems() already.
3225 */
Dave Hansen79c28a42021-09-02 14:59:06 -07003226static void set_migration_target_nodes(void)
3227{
3228 get_online_mems();
3229 __set_migration_target_nodes();
3230 put_online_mems();
3231}
Dave Hansen884a6e52021-09-02 14:59:09 -07003232
3233/*
Dave Hansen884a6e52021-09-02 14:59:09 -07003234 * This leaves migrate-on-reclaim transiently disabled between
3235 * the MEM_GOING_OFFLINE and MEM_OFFLINE events. This runs
3236 * whether reclaim-based migration is enabled or not, which
3237 * ensures that the user can turn reclaim-based migration on at
3238 * any time without needing to recalculate migration targets.
3239 *
3240 * These callbacks already hold get_online_mems(). That is why
3241 * __set_migration_target_nodes() can be used as opposed to
3242 * set_migration_target_nodes().
3243 */
3244static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
Dave Hansen295be912021-10-18 15:15:29 -07003245 unsigned long action, void *_arg)
Dave Hansen884a6e52021-09-02 14:59:09 -07003246{
Dave Hansen295be912021-10-18 15:15:29 -07003247 struct memory_notify *arg = _arg;
3248
3249 /*
3250 * Only update the node migration order when a node is
3251 * changing status, like online->offline. This avoids
3252 * the overhead of synchronize_rcu() in most cases.
3253 */
3254 if (arg->status_change_nid < 0)
3255 return notifier_from_errno(0);
3256
Dave Hansen884a6e52021-09-02 14:59:09 -07003257 switch (action) {
3258 case MEM_GOING_OFFLINE:
3259 /*
3260 * Make sure there are not transient states where
3261 * an offline node is a migration target. This
3262 * will leave migration disabled until the offline
3263 * completes and the MEM_OFFLINE case below runs.
3264 */
3265 disable_all_migrate_targets();
3266 break;
3267 case MEM_OFFLINE:
3268 case MEM_ONLINE:
3269 /*
3270 * Recalculate the target nodes once the node
3271 * reaches its final state (online or offline).
3272 */
3273 __set_migration_target_nodes();
3274 break;
3275 case MEM_CANCEL_OFFLINE:
3276 /*
3277 * MEM_GOING_OFFLINE disabled all the migration
3278 * targets. Reenable them.
3279 */
3280 __set_migration_target_nodes();
3281 break;
3282 case MEM_GOING_ONLINE:
3283 case MEM_CANCEL_ONLINE:
3284 break;
3285 }
3286
3287 return notifier_from_errno(0);
3288}
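
/*
 * Illustrative event sequence (not from the original file): offlining
 * memory on a node with status_change_nid >= 0 first delivers
 * MEM_GOING_OFFLINE, which disables all demotion targets. If the offline
 * succeeds, MEM_OFFLINE recomputes the targets without that node; if it
 * is aborted, MEM_CANCEL_OFFLINE recomputes them with the node still
 * present. Either way, demotion is only transiently disabled.
 */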
3289
Dave Hansen76af6a02021-10-18 15:15:32 -07003290/*
3291 * React to hotplug events that might affect the migration targets,
3292 * like events that online or offline NUMA nodes.
3293 *
3294 * The ordering is also currently dependent on which nodes have
3295 * CPUs. That means we need CPU on/offline notification too.
3296 */
3297static int migration_online_cpu(unsigned int cpu)
3298{
3299 set_migration_target_nodes();
3300 return 0;
3301}
3302
3303static int migration_offline_cpu(unsigned int cpu)
3304{
3305 set_migration_target_nodes();
3306 return 0;
3307}
3308
Dave Hansen884a6e52021-09-02 14:59:09 -07003309static int __init migrate_on_reclaim_init(void)
3310{
3311 int ret;
3312
Huang Yinga6a02512021-10-18 15:15:35 -07003313 ret = cpuhp_setup_state_nocalls(CPUHP_MM_DEMOTION_DEAD, "mm/demotion:offline",
3314 NULL, migration_offline_cpu);
Dave Hansen884a6e52021-09-02 14:59:09 -07003315 /*
3316 * In the unlikely case that this fails, the automatic
3317 * migration targets may become suboptimal for nodes
3318 * where N_CPU changes. With such a small impact in a
3319 * rare case, do not bother trying to do anything special.
3320 */
3321 WARN_ON(ret < 0);
Huang Yinga6a02512021-10-18 15:15:35 -07003322 ret = cpuhp_setup_state(CPUHP_AP_MM_DEMOTION_ONLINE, "mm/demotion:online",
3323 migration_online_cpu, NULL);
3324 WARN_ON(ret < 0);
Dave Hansen884a6e52021-09-02 14:59:09 -07003325
3326 hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
3327 return 0;
3328}
3329late_initcall(migrate_on_reclaim_init);
Dave Hansen76af6a02021-10-18 15:15:32 -07003330#endif /* CONFIG_HOTPLUG_CPU */