// SPDX-License-Identifier: GPL-2.0-only
/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Supports four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave	Allocate memory interleaved over a set of nodes,
 *		with normal fallback if it fails.
 *		For VMA based allocations this interleaves based on the
 *		offset into the backing object or offset into the mapping
 *		for anonymous memory. For process policy a process counter
 *		is used.
 *
 * bind		Only allocate memory on a specific set of nodes,
 *		no fallback.
 *		FIXME: memory is allocated starting with the first node
 *		to the last. It would be better if bind would truly restrict
 *		the allocation to memory nodes instead.
 *
 * preferred	Try a specific node first before normal fallback.
 *		As a special case NUMA_NO_NODE here means do the allocation
 *		on the local CPU. This is normally identical to default,
 *		but useful to set in a VMA when you have a non-default
 *		process policy.
 *
 * default	Allocate on the local node first, or when on a VMA
 *		use the process policy. This is what Linux always did
 *		in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non-interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always graceful with that.
*/

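/*
 * Illustrative userspace sketch of how these policies are selected (not
 * kernel code; assumes the raw syscall wrappers declared in <numaif.h>):
 *
 *	unsigned long nodes = (1UL << 0) | (1UL << 1);	// nodes 0 and 1
 *
 *	// process policy: interleave future allocations over nodes 0-1
 *	set_mempolicy(MPOL_INTERLEAVE, &nodes, 8 * sizeof(nodes));
 *
 *	// VMA policy: bind one mapping to node 0 only, overriding the above
 *	unsigned long node0 = 1UL << 0;
 *	mbind(addr, len, MPOL_BIND, &node0, 8 * sizeof(node0), 0);
 */
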
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mempolicy.h>
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/ptrace.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/printk.h>
#include <linux/swapops.h>

#include <asm/tlbflush.h>
#include <linux/uaccess.h>

#include "internal.h"

/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */

static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 */
static struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.mode = MPOL_PREFERRED,
	.flags = MPOL_F_LOCAL,
};

static struct mempolicy preferred_node_policy[MAX_NUMNODES];

/**
 * numa_map_to_online_node - Find closest online node
 * @node: Node id to start the search
 *
 * Lookup the next closest node by distance if @node is not online.
 */
int numa_map_to_online_node(int node)
{
	int min_node;

	if (node == NUMA_NO_NODE)
		node = 0;

	min_node = node;
	if (!node_online(node)) {
		int min_dist = INT_MAX, dist, n;

		for_each_online_node(n) {
			dist = node_distance(node, n);
			if (dist < min_dist) {
				min_dist = dist;
				min_node = n;
			}
		}
	}

	return min_node;
}
EXPORT_SYMBOL_GPL(numa_map_to_online_node);

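/*
 * Worked example (hypothetical distances): if node 2 is offline and
 * node_distance(2, 0) == 20 while node_distance(2, 1) == 10, then
 * numa_map_to_online_node(2) returns 1, the closest online node.
 * Passing NUMA_NO_NODE starts the search from node 0.
 */
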
struct mempolicy *get_task_policy(struct task_struct *p)
{
	struct mempolicy *pol = p->mempolicy;
	int node;

	if (pol)
		return pol;

	node = numa_node_id();
	if (node != NUMA_NO_NODE) {
		pol = &preferred_node_policy[node];
		/* preferred_node_policy is not initialised early in boot */
		if (pol->mode)
			return pol;
	}

	return &default_policy;
}

static const struct mempolicy_operations {
	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
} mpol_ops[MPOL_MAX];

static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
	return pol->flags & MPOL_MODE_FLAGS;
}

static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
				   const nodemask_t *rel)
{
	nodemask_t tmp;
	nodes_fold(tmp, *orig, nodes_weight(*rel));
	nodes_onto(*ret, tmp, *rel);
}

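/*
 * Worked example for mpol_relative_nodemask(): with *orig = {0,3} and
 * *rel = {4,5,6,7} (weight 4), nodes_fold() wraps *orig modulo 4 giving
 * tmp = {0,3}, and nodes_onto() maps those positions onto *rel, so
 * *ret = {4,7}: the 0th and 3rd nodes of the relative set.
 * (Illustrative node numbers only.)
 */
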
static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (nodes_empty(*nodes))
		return -EINVAL;
	pol->v.nodes = *nodes;
	return 0;
}

static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (!nodes)
		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
	else if (nodes_empty(*nodes))
		return -EINVAL;			/* no allowed nodes */
	else
		pol->v.preferred_node = first_node(*nodes);
	return 0;
}

static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (nodes_empty(*nodes))
		return -EINVAL;
	pol->v.nodes = *nodes;
	return 0;
}

/*
 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 * any, for the new policy. mpol_new() has already validated the nodes
 * parameter with respect to the policy mode and flags. But, we need to
 * handle an empty nodemask with MPOL_PREFERRED here.
 *
 * Must be called holding task's alloc_lock to protect task's mems_allowed
 * and mempolicy. May also be called holding the mmap_sem for write.
 */
static int mpol_set_nodemask(struct mempolicy *pol,
		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
{
	int ret;

	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
	if (pol == NULL)
		return 0;
	/* Check N_MEMORY */
	nodes_and(nsc->mask1,
		  cpuset_current_mems_allowed, node_states[N_MEMORY]);

	VM_BUG_ON(!nodes);
	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
		nodes = NULL;	/* explicit local allocation */
	else {
		if (pol->flags & MPOL_F_RELATIVE_NODES)
			mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
		else
			nodes_and(nsc->mask2, *nodes, nsc->mask1);

		if (mpol_store_user_nodemask(pol))
			pol->w.user_nodemask = *nodes;
		else
			pol->w.cpuset_mems_allowed =
						cpuset_current_mems_allowed;
	}

	if (nodes)
		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
	else
		ret = mpol_ops[pol->mode].create(pol, NULL);
	return ret;
}

/*
 * This function just creates a new policy, does some basic checks and simple
 * initialization. You must invoke mpol_set_nodemask() to set nodes.
 */
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
				  nodemask_t *nodes)
{
	struct mempolicy *policy;

	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);

	if (mode == MPOL_DEFAULT) {
		if (nodes && !nodes_empty(*nodes))
			return ERR_PTR(-EINVAL);
		return NULL;
	}
	VM_BUG_ON(!nodes);

	/*
	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
	 * All other modes require a valid pointer to a non-empty nodemask.
	 */
	if (mode == MPOL_PREFERRED) {
		if (nodes_empty(*nodes)) {
			if (((flags & MPOL_F_STATIC_NODES) ||
			     (flags & MPOL_F_RELATIVE_NODES)))
				return ERR_PTR(-EINVAL);
		}
	} else if (mode == MPOL_LOCAL) {
		if (!nodes_empty(*nodes) ||
		    (flags & MPOL_F_STATIC_NODES) ||
		    (flags & MPOL_F_RELATIVE_NODES))
			return ERR_PTR(-EINVAL);
		mode = MPOL_PREFERRED;
	} else if (nodes_empty(*nodes))
		return ERR_PTR(-EINVAL);
	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	policy->mode = mode;
	policy->flags = flags;

	return policy;
}

/* Slow path of a mpol destructor. */
void __mpol_put(struct mempolicy *p)
{
	if (!atomic_dec_and_test(&p->refcnt))
		return;
	kmem_cache_free(policy_cache, p);
}

static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
}

static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
	nodemask_t tmp;

	if (pol->flags & MPOL_F_STATIC_NODES)
		nodes_and(tmp, pol->w.user_nodemask, *nodes);
	else if (pol->flags & MPOL_F_RELATIVE_NODES)
		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
	else {
		nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
			    *nodes);
		pol->w.cpuset_mems_allowed = *nodes;
	}

	if (nodes_empty(tmp))
		tmp = *nodes;

	pol->v.nodes = tmp;
}

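/*
 * Rebind example: a policy created over nodes {0,1} while the cpuset
 * allowed {0,1,2,3}, with the cpuset later shrinking to {2,3}:
 *   - MPOL_F_STATIC_NODES intersects the remembered user mask with the new
 *     set, here yielding an empty mask, so tmp falls back to {2,3};
 *   - MPOL_F_RELATIVE_NODES refolds the user mask onto {2,3}, giving {2,3};
 *   - otherwise nodes_remap() slides the old nodes into the new set by
 *     position, {0,1} -> {2,3}.
 * (Illustrative node numbers only.)
 */
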
static void mpol_rebind_preferred(struct mempolicy *pol,
						const nodemask_t *nodes)
{
	nodemask_t tmp;

	if (pol->flags & MPOL_F_STATIC_NODES) {
		int node = first_node(pol->w.user_nodemask);

		if (node_isset(node, *nodes)) {
			pol->v.preferred_node = node;
			pol->flags &= ~MPOL_F_LOCAL;
		} else
			pol->flags |= MPOL_F_LOCAL;
	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
		pol->v.preferred_node = first_node(tmp);
	} else if (!(pol->flags & MPOL_F_LOCAL)) {
		pol->v.preferred_node = node_remap(pol->v.preferred_node,
						   pol->w.cpuset_mems_allowed,
						   *nodes);
		pol->w.cpuset_mems_allowed = *nodes;
	}
}

/*
 * mpol_rebind_policy - Migrate a policy to a different set of nodes
 *
 * Per-vma policies are protected by mmap_sem. Allocations using per-task
 * policies are protected by task->mems_allowed_seq to prevent a premature
 * OOM/allocation failure due to parallel nodemask modification.
 */
static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
{
	if (!pol)
		return;
	if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
		return;

	mpol_ops[pol->mode].rebind(pol, newmask);
}

/*
 * Wrapper for mpol_rebind_policy() that just requires task
 * pointer, and updates task mempolicy.
 *
 * Called with task's alloc_lock held.
 */

void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
{
	mpol_rebind_policy(tsk->mempolicy, new);
}

/*
 * Rebind each vma in mm to new nodemask.
 *
 * Call holding a reference to mm. Takes mm->mmap_sem during call.
 */

void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
	struct vm_area_struct *vma;

	down_write(&mm->mmap_sem);
	for (vma = mm->mmap; vma; vma = vma->vm_next)
		mpol_rebind_policy(vma->vm_policy, new);
	up_write(&mm->mmap_sem);
}

static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
	[MPOL_DEFAULT] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_INTERLEAVE] = {
		.create = mpol_new_interleave,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_PREFERRED] = {
		.create = mpol_new_preferred,
		.rebind = mpol_rebind_preferred,
	},
	[MPOL_BIND] = {
		.create = mpol_new_bind,
		.rebind = mpol_rebind_nodemask,
	},
};

static int migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags);

struct queue_pages {
	struct list_head *pagelist;
	unsigned long flags;
	nodemask_t *nmask;
	unsigned long start;
	unsigned long end;
	struct vm_area_struct *first;
};

/*
 * Check if the page's nid is in qp->nmask.
 *
 * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
 * in the inverse of qp->nmask.
 */
static inline bool queue_pages_required(struct page *page,
					struct queue_pages *qp)
{
	int nid = page_to_nid(page);
	unsigned long flags = qp->flags;

	return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
}

/*
 * queue_pages_pmd() has four possible return values:
 * 0 - pages are placed on the right node or queued successfully.
 * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * 2 - THP was split.
 * -EIO - the pmd is a migration entry, or only MPOL_MF_STRICT was specified
 *        and an existing page was already on a node that does not follow
 *        the policy.
 */
static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
{
	int ret = 0;
	struct page *page;
	struct queue_pages *qp = walk->private;
	unsigned long flags;

	if (unlikely(is_pmd_migration_entry(*pmd))) {
		ret = -EIO;
		goto unlock;
	}
	page = pmd_page(*pmd);
	if (is_huge_zero_page(page)) {
		spin_unlock(ptl);
		__split_huge_pmd(walk->vma, pmd, addr, false, NULL);
		ret = 2;
		goto out;
	}
	if (!queue_pages_required(page, qp))
		goto unlock;

	flags = qp->flags;
	/* go to thp migration */
	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
		if (!vma_migratable(walk->vma) ||
		    migrate_page_add(page, qp->pagelist, flags)) {
			ret = 1;
			goto unlock;
		}
	} else
		ret = -EIO;
unlock:
	spin_unlock(ptl);
out:
	return ret;
}

/*
 * Scan through pages checking if pages follow certain conditions,
 * and move them to the pagelist if they do.
 *
 * queue_pages_pte_range() has three possible return values:
 * 0 - pages are placed on the right node or queued successfully.
 * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
 *        on a node that does not follow the policy.
 */
static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
			unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct page *page;
	struct queue_pages *qp = walk->private;
	unsigned long flags = qp->flags;
	int ret;
	bool has_unmovable = false;
	pte_t *pte;
	spinlock_t *ptl;

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
		if (ret != 2)
			return ret;
	}
	/* THP was split, fall through to pte walk */

	if (pmd_trans_unstable(pmd))
		return 0;

	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		if (!pte_present(*pte))
			continue;
		page = vm_normal_page(vma, addr, *pte);
		if (!page)
			continue;
		/*
		 * vm_normal_page() filters out zero pages, but there might
		 * still be PageReserved pages to skip, perhaps in a VDSO.
		 */
		if (PageReserved(page))
			continue;
		if (!queue_pages_required(page, qp))
			continue;
		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
			/* MPOL_MF_STRICT must be specified if we get here */
			if (!vma_migratable(vma)) {
				has_unmovable = true;
				break;
			}

			/*
			 * Do not abort immediately since there may be
			 * temporarily off-LRU pages in the range. Still
			 * need to migrate other LRU pages.
			 */
			if (migrate_page_add(page, qp->pagelist, flags))
				has_unmovable = true;
		} else
			break;
	}
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();

	if (has_unmovable)
		return 1;

	return addr != end ? -EIO : 0;
}

static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
			       unsigned long addr, unsigned long end,
			       struct mm_walk *walk)
{
#ifdef CONFIG_HUGETLB_PAGE
	struct queue_pages *qp = walk->private;
	unsigned long flags = qp->flags;
	struct page *page;
	spinlock_t *ptl;
	pte_t entry;

	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
	entry = huge_ptep_get(pte);
	if (!pte_present(entry))
		goto unlock;
	page = pte_page(entry);
	if (!queue_pages_required(page, qp))
		goto unlock;
	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
	if (flags & (MPOL_MF_MOVE_ALL) ||
	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
		isolate_huge_page(page, qp->pagelist);
unlock:
	spin_unlock(ptl);
#else
	BUG();
#endif
	return 0;
}

#ifdef CONFIG_NUMA_BALANCING
/*
 * This is used to mark a range of virtual addresses to be inaccessible.
 * These are later cleared by a NUMA hinting fault. Depending on these
 * faults, pages may be migrated for better NUMA placement.
 *
 * This is assuming that NUMA faults are handled using PROT_NONE. If
 * an architecture makes a different choice, it will need further
 * changes to the core.
 */
unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	int nr_updated;

	nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
	if (nr_updated)
		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);

	return nr_updated;
}
#else
static unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	return 0;
}
#endif /* CONFIG_NUMA_BALANCING */

static int queue_pages_test_walk(unsigned long start, unsigned long end,
				struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct queue_pages *qp = walk->private;
	unsigned long endvma = vma->vm_end;
	unsigned long flags = qp->flags;

	/* range check first */
	VM_BUG_ON((vma->vm_start > start) || (vma->vm_end < end));

	if (!qp->first) {
		qp->first = vma;
		if (!(flags & MPOL_MF_DISCONTIG_OK) &&
			(qp->start < vma->vm_start))
			/* hole at head side of range */
			return -EFAULT;
	}
	if (!(flags & MPOL_MF_DISCONTIG_OK) &&
		((vma->vm_end < qp->end) &&
		(!vma->vm_next || vma->vm_end < vma->vm_next->vm_start)))
		/* hole at middle or tail of range */
		return -EFAULT;

	/*
	 * Need to check MPOL_MF_STRICT to return -EIO if possible
	 * regardless of vma_migratable
	 */
	if (!vma_migratable(vma) &&
	    !(flags & MPOL_MF_STRICT))
		return 1;

	if (endvma > end)
		endvma = end;

	if (flags & MPOL_MF_LAZY) {
		/* Similar to task_numa_work, skip inaccessible VMAs */
		if (!is_vm_hugetlb_page(vma) &&
			(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
			!(vma->vm_flags & VM_MIXEDMAP))
			change_prot_numa(vma, start, endvma);
		return 1;
	}

	/* queue pages from current vma */
	if (flags & MPOL_MF_VALID)
		return 0;
	return 1;
}

static const struct mm_walk_ops queue_pages_walk_ops = {
	.hugetlb_entry		= queue_pages_hugetlb,
	.pmd_entry		= queue_pages_pte_range,
	.test_walk		= queue_pages_test_walk,
};

/*
 * Walk through page tables and collect pages to be migrated.
 *
 * If pages found in a given range are on a set of nodes (determined by
 * @nodes and @flags), they are isolated and queued to the pagelist, which
 * is passed via @private.
 *
 * queue_pages_range() has three possible return values:
 * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * 0 - queue pages successfully or no misplaced page.
 * errno - i.e. misplaced pages with MPOL_MF_STRICT specified (-EIO) or
 *         memory range specified by nodemask and maxnode points outside
 *         your accessible address space (-EFAULT)
 */
static int
queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
		nodemask_t *nodes, unsigned long flags,
		struct list_head *pagelist)
{
	int err;
	struct queue_pages qp = {
		.pagelist = pagelist,
		.flags = flags,
		.nmask = nodes,
		.start = start,
		.end = end,
		.first = NULL,
	};

	err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);

	if (!qp.first)
		/* whole range in hole */
		err = -EFAULT;

	return err;
}

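/*
 * Usage sketch: migrate_to_node() below passes the whole address space with
 * MPOL_MF_DISCONTIG_OK, so the hole checks are skipped and the walk simply
 * collects every page of the source node onto @pagelist, which is then
 * handed to migrate_pages().  do_mbind() instead inspects the return value
 * to decide whether MPOL_MF_STRICT should turn unmovable pages into -EIO.
 * (Descriptive summary of the callers, not an additional code path.)
 */
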
/*
 * Apply policy to a single VMA
 * This must be called with the mmap_sem held for writing.
 */
static int vma_replace_policy(struct vm_area_struct *vma,
						struct mempolicy *pol)
{
	int err;
	struct mempolicy *old;
	struct mempolicy *new;

	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
		 vma->vm_ops, vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

	new = mpol_dup(pol);
	if (IS_ERR(new))
		return PTR_ERR(new);

	if (vma->vm_ops && vma->vm_ops->set_policy) {
		err = vma->vm_ops->set_policy(vma, new);
		if (err)
			goto err_out;
	}

	old = vma->vm_policy;
	vma->vm_policy = new; /* protected by mmap_sem */
	mpol_put(old);

	return 0;
 err_out:
	mpol_put(new);
	return err;
}

/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct mm_struct *mm, unsigned long start,
		       unsigned long end, struct mempolicy *new_pol)
{
	struct vm_area_struct *next;
	struct vm_area_struct *prev;
	struct vm_area_struct *vma;
	int err = 0;
	pgoff_t pgoff;
	unsigned long vmstart;
	unsigned long vmend;

	vma = find_vma(mm, start);
	VM_BUG_ON(!vma);

	prev = vma->vm_prev;
	if (start > vma->vm_start)
		prev = vma;

	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
		next = vma->vm_next;
		vmstart = max(start, vma->vm_start);
		vmend   = min(end, vma->vm_end);

		if (mpol_equal(vma_policy(vma), new_pol))
			continue;

		pgoff = vma->vm_pgoff +
			((vmstart - vma->vm_start) >> PAGE_SHIFT);
		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
				 vma->anon_vma, vma->vm_file, pgoff,
				 new_pol, vma->vm_userfaultfd_ctx);
		if (prev) {
			vma = prev;
			next = vma->vm_next;
			if (mpol_equal(vma_policy(vma), new_pol))
				continue;
			/* vma_merge() joined vma && vma->next, case 8 */
			goto replace;
		}
		if (vma->vm_start != vmstart) {
			err = split_vma(vma->vm_mm, vma, vmstart, 1);
			if (err)
				goto out;
		}
		if (vma->vm_end != vmend) {
			err = split_vma(vma->vm_mm, vma, vmend, 0);
			if (err)
				goto out;
		}
 replace:
		err = vma_replace_policy(vma, new_pol);
		if (err)
			goto out;
	}

 out:
	return err;
}

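/*
 * mbind_range() example: applying a new policy to the middle third of a
 * single three-page VMA first tries vma_merge() with the neighbours; when
 * that fails it splits twice, leaving three VMAs of which only the middle
 * one gets the new policy via vma_replace_policy().  (Illustrative case,
 * corresponding to the split_vma() calls above.)
 */
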
/* Set the process memory policy */
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
			     nodemask_t *nodes)
{
	struct mempolicy *new, *old;
	NODEMASK_SCRATCH(scratch);
	int ret;

	if (!scratch)
		return -ENOMEM;

	new = mpol_new(mode, flags, nodes);
	if (IS_ERR(new)) {
		ret = PTR_ERR(new);
		goto out;
	}

	task_lock(current);
	ret = mpol_set_nodemask(new, nodes, scratch);
	if (ret) {
		task_unlock(current);
		mpol_put(new);
		goto out;
	}
	old = current->mempolicy;
	current->mempolicy = new;
	if (new && new->mode == MPOL_INTERLEAVE)
		current->il_prev = MAX_NUMNODES-1;
	task_unlock(current);
	mpol_put(old);
	ret = 0;
out:
	NODEMASK_SCRATCH_FREE(scratch);
	return ret;
}

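/*
 * Illustrative call sequence (the set_mempolicy(2) syscall, defined later
 * in this file, ends up here after copying in the user nodemask):
 *
 *	nodemask_t nodes = nodemask_of_node(1);
 *	do_set_mempolicy(MPOL_PREFERRED, 0, &nodes);
 *
 * After this, task allocations prefer node 1 until the policy is replaced;
 * an empty nodemask with MPOL_PREFERRED would mean local allocation instead.
 */
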
/*
 * Return nodemask for policy for get_mempolicy() query
 *
 * Called with task's alloc_lock held
 */
static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
{
	nodes_clear(*nodes);
	if (p == &default_policy)
		return;

	switch (p->mode) {
	case MPOL_BIND:
		/* Fall through */
	case MPOL_INTERLEAVE:
		*nodes = p->v.nodes;
		break;
	case MPOL_PREFERRED:
		if (!(p->flags & MPOL_F_LOCAL))
			node_set(p->v.preferred_node, *nodes);
		/* else return empty node mask for local allocation */
		break;
	default:
		BUG();
	}
}

static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
	struct page *p;
	int err;

	int locked = 1;
	err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
	if (err >= 0) {
		err = page_to_nid(p);
		put_page(p);
	}
	if (locked)
		up_read(&mm->mmap_sem);
	return err;
}

/* Retrieve NUMA policy */
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
			     unsigned long addr, unsigned long flags)
{
	int err;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;

	if (flags &
		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
		return -EINVAL;

	if (flags & MPOL_F_MEMS_ALLOWED) {
		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
			return -EINVAL;
		*policy = 0;	/* just so it's initialized */
		task_lock(current);
		*nmask  = cpuset_current_mems_allowed;
		task_unlock(current);
		return 0;
	}

	if (flags & MPOL_F_ADDR) {
		/*
		 * Do NOT fall back to task policy if the
		 * vma/shared policy at addr is NULL. We
		 * want to return MPOL_DEFAULT in this case.
		 */
		down_read(&mm->mmap_sem);
		vma = find_vma_intersection(mm, addr, addr+1);
		if (!vma) {
			up_read(&mm->mmap_sem);
			return -EFAULT;
		}
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else
			pol = vma->vm_policy;
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;	/* indicates default behavior */

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			/*
			 * Take a refcount on the mpol, lookup_node()
			 * will drop the mmap_sem, so after calling
			 * lookup_node() only "pol" remains valid, "vma"
			 * is stale.
			 */
			pol_refcount = pol;
			vma = NULL;
			mpol_get(pol);
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			*policy = err;
		} else if (pol == current->mempolicy &&
				pol->mode == MPOL_INTERLEAVE) {
			*policy = next_node_in(current->il_prev, pol->v.nodes);
		} else {
			err = -EINVAL;
			goto out;
		}
	} else {
		*policy = pol == &default_policy ? MPOL_DEFAULT :
						pol->mode;
		/*
		 * Internal mempolicy flags must be masked off before exposing
		 * the policy to userspace.
		 */
		*policy |= (pol->flags & MPOL_MODE_FLAGS);
	}

	err = 0;
	if (nmask) {
		if (mpol_store_user_nodemask(pol)) {
			*nmask = pol->w.user_nodemask;
		} else {
			task_lock(current);
			get_policy_nodemask(pol, nmask);
			task_unlock(current);
		}
	}

 out:
	mpol_cond_put(pol);
	if (vma)
		up_read(&mm->mmap_sem);
	if (pol_refcount)
		mpol_put(pol_refcount);
	return err;
}

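/*
 * Illustrative userspace queries handled above (raw syscall, <numaif.h>):
 *
 *	int mode;
 *	get_mempolicy(&mode, NULL, 0, NULL, 0);		// task policy mode
 *
 *	int node;
 *	get_mempolicy(&node, NULL, 0, addr,
 *		      MPOL_F_NODE | MPOL_F_ADDR);	// node backing addr
 *
 * The second form is the path that takes the mpol refcount and calls
 * lookup_node(), where get_user_pages_locked() may drop the mmap_sem.
 */
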
#ifdef CONFIG_MIGRATION
/*
 * page migration, thp tail pages can be passed.
 */
static int migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags)
{
	struct page *head = compound_head(page);
	/*
	 * Avoid migrating a page that is shared with others.
	 */
	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
		if (!isolate_lru_page(head)) {
			list_add_tail(&head->lru, pagelist);
			mod_node_page_state(page_pgdat(head),
				NR_ISOLATED_ANON + page_is_file_cache(head),
				hpage_nr_pages(head));
		} else if (flags & MPOL_MF_STRICT) {
			/*
			 * Non-movable page may reach here. And, there may be
			 * temporarily off-LRU pages or non-LRU movable pages.
			 * Treat them as unmovable pages since they can't be
			 * isolated, so they can't be moved at the moment. It
			 * should return -EIO for this case too.
			 */
			return -EIO;
		}
	}

	return 0;
}

/* page allocation callback for NUMA node migration */
struct page *alloc_new_node_page(struct page *page, unsigned long node)
{
	if (PageHuge(page))
		return alloc_huge_page_node(page_hstate(compound_head(page)),
					node);
	else if (PageTransHuge(page)) {
		struct page *thp;

		thp = alloc_pages_node(node,
			(GFP_TRANSHUGE | __GFP_THISNODE),
			HPAGE_PMD_ORDER);
		if (!thp)
			return NULL;
		prep_transhuge_page(thp);
		return thp;
	} else
		return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
						    __GFP_THISNODE, 0);
}

/*
 * Migrate pages from one node to a target node.
 * Returns error or the number of pages not migrated.
 */
static int migrate_to_node(struct mm_struct *mm, int source, int dest,
			   int flags)
{
	nodemask_t nmask;
	LIST_HEAD(pagelist);
	int err = 0;

	nodes_clear(nmask);
	node_set(source, nmask);

	/*
	 * This does not "check" the range but isolates all pages that
	 * need migration. Between passing in the full user address
	 * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail.
	 */
	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
	queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
			flags | MPOL_MF_DISCONTIG_OK, &pagelist);

	if (!list_empty(&pagelist)) {
		err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest,
					MIGRATE_SYNC, MR_SYSCALL);
		if (err)
			putback_movable_pages(&pagelist);
	}

	return err;
}

/*
 * Move pages between the two nodesets so as to preserve the physical
 * layout as much as possible.
 *
 * Returns the number of pages that could not be moved.
 */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
		     const nodemask_t *to, int flags)
{
	int busy = 0;
	int err;
	nodemask_t tmp;

	err = migrate_prep();
	if (err)
		return err;

	down_read(&mm->mmap_sem);

	/*
	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
	 * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
	 * bit in 'tmp', and return that <source, dest> pair for migration.
	 * The pair of nodemasks 'to' and 'from' define the map.
	 *
	 * If no pair of bits is found that way, fallback to picking some
	 * pair of 'source' and 'dest' bits that are not the same. If the
	 * 'source' and 'dest' bits are the same, this represents a node
	 * that will be migrating to itself, so no pages need move.
	 *
	 * If no bits are left in 'tmp', or if all remaining bits left
	 * in 'tmp' correspond to the same bit in 'to', return false
	 * (nothing left to migrate).
	 *
	 * This lets us pick a pair of nodes to migrate between, such that
	 * if possible the dest node is not already occupied by some other
	 * source node, minimizing the risk of overloading the memory on a
	 * node that would happen if we migrated incoming memory to a node
	 * before migrating outgoing memory from that same node.
	 *
	 * A single scan of tmp is sufficient. As we go, we remember the
	 * most recent <s, d> pair that moved (s != d). If we find a pair
	 * that not only moved, but what's better, moved to an empty slot
	 * (d is not set in tmp), then we break out then, with that pair.
	 * Otherwise when we finish scanning tmp, we at least have the
	 * most recent <s, d> pair that moved. If we get all the way through
	 * the scan of tmp without finding any node that moved, much less
	 * moved to an empty node, then there is nothing left worth migrating.
	 */

	tmp = *from;
	while (!nodes_empty(tmp)) {
		int s, d;
		int source = NUMA_NO_NODE;
		int dest = 0;

		for_each_node_mask(s, tmp) {

			/*
			 * do_migrate_pages() tries to maintain the relative
			 * node relationship of the pages established between
			 * threads and memory areas.
			 *
			 * However if the number of source nodes is not equal to
			 * the number of destination nodes we can not preserve
			 * this node relative relationship. In that case, skip
			 * copying memory from a node that is in the destination
			 * mask.
			 *
			 * Example: [2,3,4] -> [3,4,5] moves everything.
			 *          [0-7] -> [3,4,5] moves only 0,1,2,6,7.
			 */

			if ((nodes_weight(*from) != nodes_weight(*to)) &&
						(node_isset(s, *to)))
				continue;

			d = node_remap(s, *from, *to);
			if (s == d)
				continue;

			source = s;	/* Node moved. Memorize */
			dest = d;

			/* dest not in remaining from nodes? */
			if (!node_isset(dest, tmp))
				break;
		}
		if (source == NUMA_NO_NODE)
			break;

		node_clear(source, tmp);
		err = migrate_to_node(mm, source, dest, flags);
		if (err > 0)
			busy += err;
		if (err < 0)
			break;
	}
	up_read(&mm->mmap_sem);
	if (err < 0)
		return err;
	return busy;
}

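/*
 * Worked example for the loop above: migrating [0,1] -> [1,2] starts with
 * tmp = {0,1}.  The first scan finds s=0, d=1 but keeps going because node 1
 * is still in tmp; s=1, d=2 lands on an empty slot, so <1,2> is migrated
 * first and <0,1> on the next pass.  That order avoids piling node 0's pages
 * onto node 1 before node 1's own pages have moved on to node 2.
 * (Illustrative node numbers only.)
 */
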
Lee Schermerhorn3ad33b242007-11-14 16:59:10 -08001203/*
1204 * Allocate a new page for page migration based on vma policy.
Hugh Dickinsd05f0cdcb2014-06-23 13:22:07 -07001205 * Start by assuming the page is mapped by the same vma as contains @start.
Lee Schermerhorn3ad33b242007-11-14 16:59:10 -08001206 * Search forward from there, if not. N.B., this assumes that the
1207 * list of pages handed to migrate_pages()--which is how we get here--
1208 * is in virtual address order.
1209 */
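/*
 * Example (illustrative only, not from the original source): if the pagelist
 * was gathered from two vmas, A containing @start and B at higher addresses,
 * a page belonging to B makes page_address_in_vma() return -EFAULT for A, so
 * the loop below simply walks vma->vm_next until it reaches B and gets a
 * valid address for the allocation.
 */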
Michal Hocko666feb22018-04-10 16:30:03 -07001210static struct page *new_page(struct page *page, unsigned long start)
Christoph Lameter95a402c2006-06-23 02:03:53 -07001211{
Hugh Dickinsd05f0cdcb2014-06-23 13:22:07 -07001212 struct vm_area_struct *vma;
Lee Schermerhorn3ad33b242007-11-14 16:59:10 -08001213 unsigned long uninitialized_var(address);
Christoph Lameter95a402c2006-06-23 02:03:53 -07001214
Hugh Dickinsd05f0cdcb2014-06-23 13:22:07 -07001215 vma = find_vma(current->mm, start);
Lee Schermerhorn3ad33b242007-11-14 16:59:10 -08001216 while (vma) {
1217 address = page_address_in_vma(page, vma);
1218 if (address != -EFAULT)
1219 break;
1220 vma = vma->vm_next;
1221 }
1222
Wanpeng Li11c731e2013-12-18 17:08:56 -08001223 if (PageHuge(page)) {
Michal Hocko389c8172018-01-31 16:21:03 -08001224 return alloc_huge_page_vma(page_hstate(compound_head(page)),
1225 vma, address);
Michal Hocko94723aa2018-04-10 16:30:07 -07001226 } else if (PageTransHuge(page)) {
Naoya Horiguchic8633792017-09-08 16:11:08 -07001227 struct page *thp;
1228
David Rientjes19deb762019-09-04 12:54:20 -07001229 thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
1230 HPAGE_PMD_ORDER);
Naoya Horiguchic8633792017-09-08 16:11:08 -07001231 if (!thp)
1232 return NULL;
1233 prep_transhuge_page(thp);
1234 return thp;
Wanpeng Li11c731e2013-12-18 17:08:56 -08001235 }
1236 /*
1237 * if !vma, alloc_page_vma() will use task or system default policy
1238 */
Michal Hocko0f556852017-07-12 14:36:58 -07001239 return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
1240 vma, address);
Christoph Lameter95a402c2006-06-23 02:03:53 -07001241}
Christoph Lameterb20a3502006-03-22 00:09:12 -08001242#else
1243
Yang Shia53190a2019-08-13 15:37:18 -07001244static int migrate_page_add(struct page *page, struct list_head *pagelist,
Christoph Lameterb20a3502006-03-22 00:09:12 -08001245 unsigned long flags)
1246{
Yang Shia53190a2019-08-13 15:37:18 -07001247 return -EIO;
Christoph Lameterb20a3502006-03-22 00:09:12 -08001248}
1249
Andrew Morton0ce72d42012-05-29 15:06:24 -07001250int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1251 const nodemask_t *to, int flags)
Christoph Lameterb20a3502006-03-22 00:09:12 -08001252{
1253 return -ENOSYS;
1254}
Christoph Lameter95a402c2006-06-23 02:03:53 -07001255
Michal Hocko666feb22018-04-10 16:30:03 -07001256static struct page *new_page(struct page *page, unsigned long start)
Christoph Lameter95a402c2006-06-23 02:03:53 -07001257{
1258 return NULL;
1259}
Christoph Lameterb20a3502006-03-22 00:09:12 -08001260#endif
1261
Adrian Bunkdbcb0f12007-10-16 01:26:26 -07001262static long do_mbind(unsigned long start, unsigned long len,
David Rientjes028fec42008-04-28 02:12:25 -07001263 unsigned short mode, unsigned short mode_flags,
1264 nodemask_t *nmask, unsigned long flags)
Christoph Lameter6ce3c4c2006-01-08 01:01:04 -08001265{
Christoph Lameter6ce3c4c2006-01-08 01:01:04 -08001266 struct mm_struct *mm = current->mm;
1267 struct mempolicy *new;
1268 unsigned long end;
1269 int err;
Yang Shid8835442019-08-13 15:37:15 -07001270 int ret;
Christoph Lameter6ce3c4c2006-01-08 01:01:04 -08001271 LIST_HEAD(pagelist);
1272
Lee Schermerhornb24f53a2012-10-25 14:16:32 +02001273 if (flags & ~(unsigned long)MPOL_MF_VALID)
Christoph Lameter6ce3c4c2006-01-08 01:01:04 -08001274 return -EINVAL;
Christoph Lameter74c00242006-03-14 19:50:21 -08001275 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
Christoph Lameter6ce3c4c2006-01-08 01:01:04 -08001276 return -EPERM;
1277
1278 if (start & ~PAGE_MASK)
1279 return -EINVAL;
1280
1281 if (mode == MPOL_DEFAULT)
1282 flags &= ~MPOL_MF_STRICT;
1283
1284 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1285 end = start + len;
1286
1287 if (end < start)
1288 return -EINVAL;
1289 if (end == start)
1290 return 0;
1291
David Rientjes028fec42008-04-28 02:12:25 -07001292 new = mpol_new(mode, mode_flags, nmask);
Christoph Lameter6ce3c4c2006-01-08 01:01:04 -08001293 if (IS_ERR(new))
1294 return PTR_ERR(new);
1295
Lee Schermerhornb24f53a2012-10-25 14:16:32 +02001296 if (flags & MPOL_MF_LAZY)
1297 new->flags |= MPOL_F_MOF;
1298
Christoph Lameter6ce3c4c2006-01-08 01:01:04 -08001299 /*
1300 * If we are using the default policy then operation
1301 * on discontinuous address spaces is okay after all
1302 */
1303 if (!new)
1304 flags |= MPOL_MF_DISCONTIG_OK;
1305
David Rientjes028fec42008-04-28 02:12:25 -07001306 pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1307 start, start + len, mode, mode_flags,
David Rientjes00ef2d22013-02-22 16:35:36 -08001308 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
Christoph Lameter6ce3c4c2006-01-08 01:01:04 -08001309
Christoph Lameter0aedadf2008-11-06 12:53:30 -08001310 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1311
1312 err = migrate_prep();
1313 if (err)
KOSAKI Motohirob05ca732009-10-26 16:49:59 -07001314 goto mpol_out;
Christoph Lameter0aedadf2008-11-06 12:53:30 -08001315 }
KAMEZAWA Hiroyuki4bfc4492009-08-06 15:07:33 -07001316 {
1317 NODEMASK_SCRATCH(scratch);
1318 if (scratch) {
1319 down_write(&mm->mmap_sem);
1320 task_lock(current);
1321 err = mpol_set_nodemask(new, nmask, scratch);
1322 task_unlock(current);
1323 if (err)
1324 up_write(&mm->mmap_sem);
1325 } else
1326 err = -ENOMEM;
1327 NODEMASK_SCRATCH_FREE(scratch);
1328 }
KOSAKI Motohirob05ca732009-10-26 16:49:59 -07001329 if (err)
1330 goto mpol_out;
1331
Yang Shid8835442019-08-13 15:37:15 -07001332 ret = queue_pages_range(mm, start, end, nmask,
Christoph Lameter6ce3c4c2006-01-08 01:01:04 -08001333 flags | MPOL_MF_INVERT, &pagelist);
Yang Shid8835442019-08-13 15:37:15 -07001334
1335 if (ret < 0) {
Yang Shia85dfc32019-11-15 17:34:33 -08001336 err = ret;
Yang Shid8835442019-08-13 15:37:15 -07001337 goto up_out;
1338 }
1339
1340 err = mbind_range(mm, start, end, new);
Christoph Lameter7e2ab152006-02-01 03:05:40 -08001341
Lee Schermerhornb24f53a2012-10-25 14:16:32 +02001342 if (!err) {
1343 int nr_failed = 0;
1344
Minchan Kimcf608ac2010-10-26 14:21:29 -07001345 if (!list_empty(&pagelist)) {
Lee Schermerhornb24f53a2012-10-25 14:16:32 +02001346 WARN_ON_ONCE(flags & MPOL_MF_LAZY);
Hugh Dickinsd05f0cdcb2014-06-23 13:22:07 -07001347 nr_failed = migrate_pages(&pagelist, new_page, NULL,
1348 start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
Minchan Kimcf608ac2010-10-26 14:21:29 -07001349 if (nr_failed)
Naoya Horiguchi74060e42013-09-11 14:22:06 -07001350 putback_movable_pages(&pagelist);
Minchan Kimcf608ac2010-10-26 14:21:29 -07001351 }
Christoph Lameter6ce3c4c2006-01-08 01:01:04 -08001352
Yang Shid8835442019-08-13 15:37:15 -07001353 if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
Christoph Lameter6ce3c4c2006-01-08 01:01:04 -08001354 err = -EIO;
Yang Shia85dfc32019-11-15 17:34:33 -08001355 } else {
Yang Shid8835442019-08-13 15:37:15 -07001356up_out:
Yang Shia85dfc32019-11-15 17:34:33 -08001357 if (!list_empty(&pagelist))
1358 putback_movable_pages(&pagelist);
1359 }
1360
Christoph Lameter6ce3c4c2006-01-08 01:01:04 -08001361 up_write(&mm->mmap_sem);
Yang Shid8835442019-08-13 15:37:15 -07001362mpol_out:
Lee Schermerhornf0be3d32008-04-28 02:13:08 -07001363 mpol_put(new);
Christoph Lameter6ce3c4c2006-01-08 01:01:04 -08001364 return err;
1365}
1366
Christoph Lameter39743882006-01-08 01:00:51 -08001367/*
Christoph Lameter8bccd852005-10-29 18:16:59 -07001368 * User space interface with variable sized bitmaps for nodelists.
1369 */
1370
1371/* Copy a node mask from user space. */
Christoph Lameter39743882006-01-08 01:00:51 -08001372static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
Christoph Lameter8bccd852005-10-29 18:16:59 -07001373 unsigned long maxnode)
1374{
1375 unsigned long k;
Yisheng Xie56521e72018-01-31 16:16:11 -08001376 unsigned long t;
Christoph Lameter8bccd852005-10-29 18:16:59 -07001377 unsigned long nlongs;
1378 unsigned long endmask;
1379
1380 --maxnode;
1381 nodes_clear(*nodes);
1382 if (maxnode == 0 || !nmask)
1383 return 0;
Andi Kleena9c930b2006-02-20 18:27:59 -08001384 if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
Chris Wright636f13c2006-02-17 13:59:36 -08001385 return -EINVAL;
Christoph Lameter8bccd852005-10-29 18:16:59 -07001386
1387 nlongs = BITS_TO_LONGS(maxnode);
1388 if ((maxnode % BITS_PER_LONG) == 0)
1389 endmask = ~0UL;
1390 else
1391 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1392
Yisheng Xie56521e72018-01-31 16:16:11 -08001393 /*
 1394 * When the user specifies more nodes than supported, just check
 1395 * that the unsupported part is all zero.
 1396 *
 1397 * If maxnode has more longs than MAX_NUMNODES, check
 1398 * the bits in that area first, and then go on to check
 1399 * the remaining bits, i.e. those at or above MAX_NUMNODES.
1400 * Otherwise, just check bits [MAX_NUMNODES, maxnode).
1401 */
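	/*
	 * Worked example (illustrative, assuming 64-bit longs): a caller
	 * passing maxnode == 65 ends up with maxnode == 64 after the
	 * decrement above, so nlongs == 1 and endmask == ~0UL; passing
	 * maxnode == 66 gives maxnode == 65, nlongs == 2 and endmask == 0x1,
	 * i.e. only bit 64 (bit 0 of the second long) is accepted.
	 */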
Christoph Lameter8bccd852005-10-29 18:16:59 -07001402 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
Christoph Lameter8bccd852005-10-29 18:16:59 -07001403 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
Christoph Lameter8bccd852005-10-29 18:16:59 -07001404 if (get_user(t, nmask + k))
1405 return -EFAULT;
1406 if (k == nlongs - 1) {
1407 if (t & endmask)
1408 return -EINVAL;
1409 } else if (t)
1410 return -EINVAL;
1411 }
1412 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1413 endmask = ~0UL;
1414 }
1415
Yisheng Xie56521e72018-01-31 16:16:11 -08001416 if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
1417 unsigned long valid_mask = endmask;
1418
1419 valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1420 if (get_user(t, nmask + nlongs - 1))
1421 return -EFAULT;
1422 if (t & valid_mask)
1423 return -EINVAL;
1424 }
1425
Christoph Lameter8bccd852005-10-29 18:16:59 -07001426 if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1427 return -EFAULT;
1428 nodes_addr(*nodes)[nlongs-1] &= endmask;
1429 return 0;
1430}
1431
1432/* Copy a kernel node mask to user space */
1433static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1434 nodemask_t *nodes)
1435{
1436 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
Ralph Campbell050c17f2019-02-20 22:18:58 -08001437 unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
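	/*
	 * Illustrative numbers (added for exposition, not from the original
	 * source): with nr_node_ids == 64 on a 64-bit kernel, nbytes == 8.
	 * A caller asking for maxnode == 1025 gets copy == ALIGN(1024, 64) / 8
	 * == 128, so the 120 user-space bytes beyond the real nodemask are
	 * cleared and only the first 8 bytes are copied from the kernel.
	 */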
Christoph Lameter8bccd852005-10-29 18:16:59 -07001438
1439 if (copy > nbytes) {
1440 if (copy > PAGE_SIZE)
1441 return -EINVAL;
1442 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1443 return -EFAULT;
1444 copy = nbytes;
1445 }
1446 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1447}
1448
Dominik Brodowskie7dc9ad62018-03-17 16:12:22 +01001449static long kernel_mbind(unsigned long start, unsigned long len,
1450 unsigned long mode, const unsigned long __user *nmask,
1451 unsigned long maxnode, unsigned int flags)
Christoph Lameter8bccd852005-10-29 18:16:59 -07001452{
1453 nodemask_t nodes;
1454 int err;
David Rientjes028fec42008-04-28 02:12:25 -07001455 unsigned short mode_flags;
Christoph Lameter8bccd852005-10-29 18:16:59 -07001456
Andrey Konovalov057d33892019-09-25 16:48:30 -07001457 start = untagged_addr(start);
David Rientjes028fec42008-04-28 02:12:25 -07001458 mode_flags = mode & MPOL_MODE_FLAGS;
1459 mode &= ~MPOL_MODE_FLAGS;
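	/*
	 * Example (illustrative): a caller passing
	 * mode == (MPOL_INTERLEAVE | MPOL_F_STATIC_NODES) is split by the two
	 * statements above into mode_flags == MPOL_F_STATIC_NODES and
	 * mode == MPOL_INTERLEAVE.
	 */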
David Rientjesa3b51e02008-04-28 02:12:23 -07001460 if (mode >= MPOL_MAX)
1461 return -EINVAL;
David Rientjes4c50bc02008-04-28 02:12:30 -07001462 if ((mode_flags & MPOL_F_STATIC_NODES) &&
1463 (mode_flags & MPOL_F_RELATIVE_NODES))
1464 return -EINVAL;
Christoph Lameter8bccd852005-10-29 18:16:59 -07001465 err = get_nodes(&nodes, nmask, maxnode);
1466 if (err)
1467 return err;
David Rientjes028fec42008-04-28 02:12:25 -07001468 return do_mbind(start, len, mode, mode_flags, &nodes, flags);
Christoph Lameter8bccd852005-10-29 18:16:59 -07001469}
1470
Dominik Brodowskie7dc9ad62018-03-17 16:12:22 +01001471SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1472 unsigned long, mode, const unsigned long __user *, nmask,
1473 unsigned long, maxnode, unsigned int, flags)
1474{
1475 return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1476}
1477
Christoph Lameter8bccd852005-10-29 18:16:59 -07001478/* Set the process memory policy */
Dominik Brodowskiaf03c4a2018-03-17 16:20:01 +01001479static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1480 unsigned long maxnode)
Christoph Lameter8bccd852005-10-29 18:16:59 -07001481{
1482 int err;
1483 nodemask_t nodes;
David Rientjes028fec42008-04-28 02:12:25 -07001484 unsigned short flags;
Christoph Lameter8bccd852005-10-29 18:16:59 -07001485
David Rientjes028fec42008-04-28 02:12:25 -07001486 flags = mode & MPOL_MODE_FLAGS;
1487 mode &= ~MPOL_MODE_FLAGS;
1488 if ((unsigned int)mode >= MPOL_MAX)
Christoph Lameter8bccd852005-10-29 18:16:59 -07001489 return -EINVAL;
David Rientjes4c50bc02008-04-28 02:12:30 -07001490 if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1491 return -EINVAL;
Christoph Lameter8bccd852005-10-29 18:16:59 -07001492 err = get_nodes(&nodes, nmask, maxnode);
1493 if (err)
1494 return err;
David Rientjes028fec42008-04-28 02:12:25 -07001495 return do_set_mempolicy(mode, flags, &nodes);
Christoph Lameter8bccd852005-10-29 18:16:59 -07001496}
1497
Dominik Brodowskiaf03c4a2018-03-17 16:20:01 +01001498SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1499 unsigned long, maxnode)
1500{
1501 return kernel_set_mempolicy(mode, nmask, maxnode);
1502}
1503
Dominik Brodowskib6e9b0b2018-03-17 16:00:25 +01001504static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1505 const unsigned long __user *old_nodes,
1506 const unsigned long __user *new_nodes)
Christoph Lameter39743882006-01-08 01:00:51 -08001507{
KOSAKI Motohiro596d7cf2010-08-09 17:19:01 -07001508 struct mm_struct *mm = NULL;
Christoph Lameter39743882006-01-08 01:00:51 -08001509 struct task_struct *task;
Christoph Lameter39743882006-01-08 01:00:51 -08001510 nodemask_t task_nodes;
1511 int err;
KOSAKI Motohiro596d7cf2010-08-09 17:19:01 -07001512 nodemask_t *old;
1513 nodemask_t *new;
1514 NODEMASK_SCRATCH(scratch);
Christoph Lameter39743882006-01-08 01:00:51 -08001515
KOSAKI Motohiro596d7cf2010-08-09 17:19:01 -07001516 if (!scratch)
1517 return -ENOMEM;
Christoph Lameter39743882006-01-08 01:00:51 -08001518
KOSAKI Motohiro596d7cf2010-08-09 17:19:01 -07001519 old = &scratch->mask1;
1520 new = &scratch->mask2;
1521
1522 err = get_nodes(old, old_nodes, maxnode);
Christoph Lameter39743882006-01-08 01:00:51 -08001523 if (err)
KOSAKI Motohiro596d7cf2010-08-09 17:19:01 -07001524 goto out;
1525
1526 err = get_nodes(new, new_nodes, maxnode);
1527 if (err)
1528 goto out;
Christoph Lameter39743882006-01-08 01:00:51 -08001529
1530 /* Find the mm_struct */
Zeng Zhaoming55cfaa32010-12-02 14:31:13 -08001531 rcu_read_lock();
Pavel Emelyanov228ebcb2007-10-18 23:40:16 -07001532 task = pid ? find_task_by_vpid(pid) : current;
Christoph Lameter39743882006-01-08 01:00:51 -08001533 if (!task) {
Zeng Zhaoming55cfaa32010-12-02 14:31:13 -08001534 rcu_read_unlock();
KOSAKI Motohiro596d7cf2010-08-09 17:19:01 -07001535 err = -ESRCH;
1536 goto out;
Christoph Lameter39743882006-01-08 01:00:51 -08001537 }
Christoph Lameter3268c632012-03-21 16:34:06 -07001538 get_task_struct(task);
Christoph Lameter39743882006-01-08 01:00:51 -08001539
KOSAKI Motohiro596d7cf2010-08-09 17:19:01 -07001540 err = -EINVAL;
Christoph Lameter39743882006-01-08 01:00:51 -08001541
1542 /*
Otto Ebeling31367462017-11-15 17:38:14 -08001543 * Check if this process has the right to modify the specified process.
1544 * Use the regular "ptrace_may_access()" checks.
Christoph Lameter39743882006-01-08 01:00:51 -08001545 */
Otto Ebeling31367462017-11-15 17:38:14 -08001546 if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
David Howellsc69e8d92008-11-14 10:39:19 +11001547 rcu_read_unlock();
Christoph Lameter39743882006-01-08 01:00:51 -08001548 err = -EPERM;
Christoph Lameter3268c632012-03-21 16:34:06 -07001549 goto out_put;
Christoph Lameter39743882006-01-08 01:00:51 -08001550 }
David Howellsc69e8d92008-11-14 10:39:19 +11001551 rcu_read_unlock();
Christoph Lameter39743882006-01-08 01:00:51 -08001552
1553 task_nodes = cpuset_mems_allowed(task);
1554 /* Is the user allowed to access the target nodes? */
KOSAKI Motohiro596d7cf2010-08-09 17:19:01 -07001555 if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
Christoph Lameter39743882006-01-08 01:00:51 -08001556 err = -EPERM;
Christoph Lameter3268c632012-03-21 16:34:06 -07001557 goto out_put;
Christoph Lameter39743882006-01-08 01:00:51 -08001558 }
1559
Yisheng Xie0486a382018-01-31 16:16:15 -08001560 task_nodes = cpuset_mems_allowed(current);
1561 nodes_and(*new, *new, task_nodes);
1562 if (nodes_empty(*new))
Christoph Lameter3268c632012-03-21 16:34:06 -07001563 goto out_put;
Yisheng Xie0486a382018-01-31 16:16:15 -08001564
David Quigley86c3a762006-06-23 02:04:02 -07001565 err = security_task_movememory(task);
1566 if (err)
Christoph Lameter3268c632012-03-21 16:34:06 -07001567 goto out_put;
David Quigley86c3a762006-06-23 02:04:02 -07001568
Christoph Lameter3268c632012-03-21 16:34:06 -07001569 mm = get_task_mm(task);
1570 put_task_struct(task);
Sasha Levinf2a9ef82012-04-25 16:01:52 -07001571
1572 if (!mm) {
Christoph Lameter3268c632012-03-21 16:34:06 -07001573 err = -EINVAL;
Sasha Levinf2a9ef82012-04-25 16:01:52 -07001574 goto out;
1575 }
1576
1577 err = do_migrate_pages(mm, old, new,
1578 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
Christoph Lameter3268c632012-03-21 16:34:06 -07001579
1580 mmput(mm);
1581out:
KOSAKI Motohiro596d7cf2010-08-09 17:19:01 -07001582 NODEMASK_SCRATCH_FREE(scratch);
1583
Christoph Lameter39743882006-01-08 01:00:51 -08001584 return err;
Christoph Lameter3268c632012-03-21 16:34:06 -07001585
1586out_put:
1587 put_task_struct(task);
1588 goto out;
1589
Christoph Lameter39743882006-01-08 01:00:51 -08001590}
1591
Dominik Brodowskib6e9b0b2018-03-17 16:00:25 +01001592SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1593 const unsigned long __user *, old_nodes,
1594 const unsigned long __user *, new_nodes)
1595{
1596 return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1597}
1598
Christoph Lameter39743882006-01-08 01:00:51 -08001599
Christoph Lameter8bccd852005-10-29 18:16:59 -07001600/* Retrieve NUMA policy */
Dominik Brodowskiaf03c4a2018-03-17 16:20:01 +01001601static int kernel_get_mempolicy(int __user *policy,
1602 unsigned long __user *nmask,
1603 unsigned long maxnode,
1604 unsigned long addr,
1605 unsigned long flags)
Christoph Lameter8bccd852005-10-29 18:16:59 -07001606{
Adrian Bunkdbcb0f12007-10-16 01:26:26 -07001607 int err;
1608 int uninitialized_var(pval);
Christoph Lameter8bccd852005-10-29 18:16:59 -07001609 nodemask_t nodes;
1610
Andrey Konovalov057d33892019-09-25 16:48:30 -07001611 addr = untagged_addr(addr);
1612
Ralph Campbell050c17f2019-02-20 22:18:58 -08001613 if (nmask != NULL && maxnode < nr_node_ids)
Christoph Lameter8bccd852005-10-29 18:16:59 -07001614 return -EINVAL;
1615
1616 err = do_get_mempolicy(&pval, &nodes, addr, flags);
1617
1618 if (err)
1619 return err;
1620
1621 if (policy && put_user(pval, policy))
1622 return -EFAULT;
1623
1624 if (nmask)
1625 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1626
1627 return err;
1628}
1629
Dominik Brodowskiaf03c4a2018-03-17 16:20:01 +01001630SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1631 unsigned long __user *, nmask, unsigned long, maxnode,
1632 unsigned long, addr, unsigned long, flags)
1633{
1634 return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1635}
1636
Linus Torvalds1da177e2005-04-16 15:20:36 -07001637#ifdef CONFIG_COMPAT
1638
Heiko Carstensc93e0f62014-03-03 16:32:26 +01001639COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1640 compat_ulong_t __user *, nmask,
1641 compat_ulong_t, maxnode,
1642 compat_ulong_t, addr, compat_ulong_t, flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001643{
1644 long err;
1645 unsigned long __user *nm = NULL;
1646 unsigned long nr_bits, alloc_size;
1647 DECLARE_BITMAP(bm, MAX_NUMNODES);
1648
Ralph Campbell050c17f2019-02-20 22:18:58 -08001649 nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001650 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
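	/*
	 * Illustrative sizing (values assumed, not from the source): with
	 * nr_node_ids == 64 and maxnode > 64, nr_bits == 64 and
	 * alloc_size == 8, so nm is an 8-byte scratch buffer in compat user
	 * space; the native nodemask written there is copied back through bm
	 * and handed to the caller as 32-bit words by compat_put_bitmap()
	 * below.
	 */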
1651
1652 if (nmask)
1653 nm = compat_alloc_user_space(alloc_size);
1654
Dominik Brodowskiaf03c4a2018-03-17 16:20:01 +01001655 err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001656
1657 if (!err && nmask) {
KAMEZAWA Hiroyuki2bbff6c2011-09-14 16:21:02 -07001658 unsigned long copy_size;
1659 copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1660 err = copy_from_user(bm, nm, copy_size);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001661 /* ensure entire bitmap is zeroed */
1662 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1663 err |= compat_put_bitmap(nmask, bm, nr_bits);
1664 }
1665
1666 return err;
1667}
1668
Heiko Carstensc93e0f62014-03-03 16:32:26 +01001669COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1670 compat_ulong_t, maxnode)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001671{
Linus Torvalds1da177e2005-04-16 15:20:36 -07001672 unsigned long __user *nm = NULL;
1673 unsigned long nr_bits, alloc_size;
1674 DECLARE_BITMAP(bm, MAX_NUMNODES);
1675
1676 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1677 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1678
1679 if (nmask) {
Chris Sallscf01fb92017-04-07 23:48:11 -07001680 if (compat_get_bitmap(bm, nmask, nr_bits))
1681 return -EFAULT;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001682 nm = compat_alloc_user_space(alloc_size);
Chris Sallscf01fb92017-04-07 23:48:11 -07001683 if (copy_to_user(nm, bm, alloc_size))
1684 return -EFAULT;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001685 }
1686
Dominik Brodowskiaf03c4a2018-03-17 16:20:01 +01001687 return kernel_set_mempolicy(mode, nm, nr_bits+1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001688}
1689
Heiko Carstensc93e0f62014-03-03 16:32:26 +01001690COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1691 compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1692 compat_ulong_t, maxnode, compat_ulong_t, flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001693{
Linus Torvalds1da177e2005-04-16 15:20:36 -07001694 unsigned long __user *nm = NULL;
1695 unsigned long nr_bits, alloc_size;
Andi Kleendfcd3c02005-10-29 18:15:48 -07001696 nodemask_t bm;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001697
1698 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1699 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1700
1701 if (nmask) {
Chris Sallscf01fb92017-04-07 23:48:11 -07001702 if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
1703 return -EFAULT;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001704 nm = compat_alloc_user_space(alloc_size);
Chris Sallscf01fb92017-04-07 23:48:11 -07001705 if (copy_to_user(nm, nodes_addr(bm), alloc_size))
1706 return -EFAULT;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001707 }
1708
Dominik Brodowskie7dc9ad62018-03-17 16:12:22 +01001709 return kernel_mbind(start, len, mode, nm, nr_bits+1, flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001710}
1711
Dominik Brodowskib6e9b0b2018-03-17 16:00:25 +01001712COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
1713 compat_ulong_t, maxnode,
1714 const compat_ulong_t __user *, old_nodes,
1715 const compat_ulong_t __user *, new_nodes)
1716{
1717 unsigned long __user *old = NULL;
1718 unsigned long __user *new = NULL;
1719 nodemask_t tmp_mask;
1720 unsigned long nr_bits;
1721 unsigned long size;
1722
1723 nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
1724 size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1725 if (old_nodes) {
1726 if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
1727 return -EFAULT;
1728 old = compat_alloc_user_space(new_nodes ? size * 2 : size);
1729 if (new_nodes)
1730 new = old + size / sizeof(unsigned long);
1731 if (copy_to_user(old, nodes_addr(tmp_mask), size))
1732 return -EFAULT;
1733 }
1734 if (new_nodes) {
1735 if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
1736 return -EFAULT;
1737 if (new == NULL)
1738 new = compat_alloc_user_space(size);
1739 if (copy_to_user(new, nodes_addr(tmp_mask), size))
1740 return -EFAULT;
1741 }
1742 return kernel_migrate_pages(pid, nr_bits + 1, old, new);
1743}
1744
1745#endif /* CONFIG_COMPAT */
Linus Torvalds1da177e2005-04-16 15:20:36 -07001746
Oleg Nesterov74d2c3a2014-10-09 15:27:50 -07001747struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1748 unsigned long addr)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001749{
Oleg Nesterov8d902742014-10-09 15:27:45 -07001750 struct mempolicy *pol = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001751
1752 if (vma) {
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07001753 if (vma->vm_ops && vma->vm_ops->get_policy) {
Oleg Nesterov8d902742014-10-09 15:27:45 -07001754 pol = vma->vm_ops->get_policy(vma, addr);
Mel Gorman00442ad2012-10-08 16:29:20 -07001755 } else if (vma->vm_policy) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001756 pol = vma->vm_policy;
Mel Gorman00442ad2012-10-08 16:29:20 -07001757
1758 /*
1759 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1760 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1761 * count on these policies which will be dropped by
1762 * mpol_cond_put() later
1763 */
1764 if (mpol_needs_cond_ref(pol))
1765 mpol_get(pol);
1766 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001767 }
Oleg Nesterovf15ca782014-10-09 15:27:43 -07001768
Oleg Nesterov74d2c3a2014-10-09 15:27:50 -07001769 return pol;
1770}
1771
1772/*
Oleg Nesterovdd6eecb2014-10-09 15:27:57 -07001773 * get_vma_policy(@vma, @addr)
Oleg Nesterov74d2c3a2014-10-09 15:27:50 -07001774 * @vma: virtual memory area whose policy is sought
1775 * @addr: address in @vma for shared policy lookup
1776 *
1777 * Returns effective policy for a VMA at specified address.
Oleg Nesterovdd6eecb2014-10-09 15:27:57 -07001778 * Falls back to current->mempolicy or system default policy, as necessary.
Oleg Nesterov74d2c3a2014-10-09 15:27:50 -07001779 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1780 * count--added by the get_policy() vm_op, as appropriate--to protect against
1781 * freeing by another task. It is the caller's responsibility to free the
1782 * extra reference for shared policies.
1783 */
David Rientjesac79f782019-09-04 12:54:18 -07001784static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
Oleg Nesterovdd6eecb2014-10-09 15:27:57 -07001785 unsigned long addr)
Oleg Nesterov74d2c3a2014-10-09 15:27:50 -07001786{
1787 struct mempolicy *pol = __get_vma_policy(vma, addr);
1788
Oleg Nesterov8d902742014-10-09 15:27:45 -07001789 if (!pol)
Oleg Nesterovdd6eecb2014-10-09 15:27:57 -07001790 pol = get_task_policy(current);
Oleg Nesterov8d902742014-10-09 15:27:45 -07001791
Linus Torvalds1da177e2005-04-16 15:20:36 -07001792 return pol;
1793}
1794
Oleg Nesterov6b6482b2014-10-09 15:27:48 -07001795bool vma_policy_mof(struct vm_area_struct *vma)
Mel Gormanfc3147242013-10-07 11:29:09 +01001796{
Oleg Nesterov6b6482b2014-10-09 15:27:48 -07001797 struct mempolicy *pol;
Oleg Nesterovf15ca782014-10-09 15:27:43 -07001798
Oleg Nesterov6b6482b2014-10-09 15:27:48 -07001799 if (vma->vm_ops && vma->vm_ops->get_policy) {
1800 bool ret = false;
Mel Gormanfc3147242013-10-07 11:29:09 +01001801
Oleg Nesterov6b6482b2014-10-09 15:27:48 -07001802 pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1803 if (pol && (pol->flags & MPOL_F_MOF))
1804 ret = true;
1805 mpol_cond_put(pol);
Mel Gormanfc3147242013-10-07 11:29:09 +01001806
Oleg Nesterov6b6482b2014-10-09 15:27:48 -07001807 return ret;
Mel Gormanfc3147242013-10-07 11:29:09 +01001808 }
1809
Oleg Nesterov6b6482b2014-10-09 15:27:48 -07001810 pol = vma->vm_policy;
Oleg Nesterov8d902742014-10-09 15:27:45 -07001811 if (!pol)
Oleg Nesterov6b6482b2014-10-09 15:27:48 -07001812 pol = get_task_policy(current);
Oleg Nesterov8d902742014-10-09 15:27:45 -07001813
Mel Gormanfc3147242013-10-07 11:29:09 +01001814 return pol->flags & MPOL_F_MOF;
1815}
1816
Lai Jiangshand3eb1572013-02-22 16:33:22 -08001817static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1818{
1819 enum zone_type dynamic_policy_zone = policy_zone;
1820
1821 BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1822
1823 /*
 1824 * If policy->v.nodes has movable memory only,
 1825 * we apply the policy only when gfp_zone(gfp) == ZONE_MOVABLE.
 1826 *
 1827 * policy->v.nodes is intersected with node_states[N_MEMORY],
 1828 * so if the following test fails, it implies
 1829 * policy->v.nodes has movable memory only.
1830 */
1831 if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1832 dynamic_policy_zone = ZONE_MOVABLE;
1833
1834 return zone >= dynamic_policy_zone;
1835}
1836
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07001837/*
1838 * Return a nodemask representing a mempolicy for filtering nodes for
1839 * page allocation
1840 */
1841static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
Mel Gorman19770b32008-04-28 02:12:18 -07001842{
1843 /* Lower zones don't get a nodemask applied for MPOL_BIND */
Lee Schermerhorn45c47452008-04-28 02:13:12 -07001844 if (unlikely(policy->mode == MPOL_BIND) &&
Lai Jiangshand3eb1572013-02-22 16:33:22 -08001845 apply_policy_zone(policy, gfp_zone(gfp)) &&
Mel Gorman19770b32008-04-28 02:12:18 -07001846 cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1847 return &policy->v.nodes;
1848
1849 return NULL;
1850}
1851
Vlastimil Babka04ec6262017-07-06 15:40:03 -07001852/* Return the node id preferred by the given mempolicy, or the given id */
1853static int policy_node(gfp_t gfp, struct mempolicy *policy,
1854 int nd)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001855{
Michal Hocko6d840952016-12-12 16:42:23 -08001856 if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
1857 nd = policy->v.preferred_node;
1858 else {
Mel Gorman19770b32008-04-28 02:12:18 -07001859 /*
Michal Hocko6d840952016-12-12 16:42:23 -08001860 * __GFP_THISNODE shouldn't even be used with the bind policy
1861 * because we might easily break the expectation to stay on the
1862 * requested node and not break the policy.
Mel Gorman19770b32008-04-28 02:12:18 -07001863 */
Michal Hocko6d840952016-12-12 16:42:23 -08001864 WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001865 }
Michal Hocko6d840952016-12-12 16:42:23 -08001866
Vlastimil Babka04ec6262017-07-06 15:40:03 -07001867 return nd;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001868}
1869
1870/* Do dynamic interleaving for a process */
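/*
 * Example (illustrative, not from the original source): with
 * policy->v.nodes == {0,2,5} and me->il_prev == 2, next_node_in() yields 5;
 * the call after that wraps around and returns 0, so successive allocations
 * rotate through 0, 2, 5, 0, ...
 */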
1871static unsigned interleave_nodes(struct mempolicy *policy)
1872{
Vlastimil Babka45816682017-07-06 15:39:59 -07001873 unsigned next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001874 struct task_struct *me = current;
1875
Vlastimil Babka45816682017-07-06 15:39:59 -07001876 next = next_node_in(me->il_prev, policy->v.nodes);
David Rientjesf5b087b2008-04-28 02:12:27 -07001877 if (next < MAX_NUMNODES)
Vlastimil Babka45816682017-07-06 15:39:59 -07001878 me->il_prev = next;
1879 return next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001880}
1881
Christoph Lameterdc85da12006-01-18 17:42:36 -08001882/*
1883 * Depending on the memory policy provide a node from which to allocate the
1884 * next slab entry.
1885 */
David Rientjes2a389612014-04-07 15:37:29 -07001886unsigned int mempolicy_slab_node(void)
Christoph Lameterdc85da12006-01-18 17:42:36 -08001887{
Andi Kleene7b691b2012-06-09 02:40:03 -07001888 struct mempolicy *policy;
David Rientjes2a389612014-04-07 15:37:29 -07001889 int node = numa_mem_id();
Andi Kleene7b691b2012-06-09 02:40:03 -07001890
1891 if (in_interrupt())
David Rientjes2a389612014-04-07 15:37:29 -07001892 return node;
Andi Kleene7b691b2012-06-09 02:40:03 -07001893
1894 policy = current->mempolicy;
Lee Schermerhornfc36b8d2008-04-28 02:13:21 -07001895 if (!policy || policy->flags & MPOL_F_LOCAL)
David Rientjes2a389612014-04-07 15:37:29 -07001896 return node;
Christoph Lameter765c4502006-09-27 01:50:08 -07001897
Lee Schermerhornbea904d2008-04-28 02:13:18 -07001898 switch (policy->mode) {
1899 case MPOL_PREFERRED:
Lee Schermerhornfc36b8d2008-04-28 02:13:21 -07001900 /*
1901 * handled MPOL_F_LOCAL above
1902 */
1903 return policy->v.preferred_node;
Lee Schermerhornbea904d2008-04-28 02:13:18 -07001904
Christoph Lameterdc85da12006-01-18 17:42:36 -08001905 case MPOL_INTERLEAVE:
1906 return interleave_nodes(policy);
1907
Mel Gormandd1a2392008-04-28 02:12:17 -07001908 case MPOL_BIND: {
Mel Gormanc33d6c02016-05-19 17:14:10 -07001909 struct zoneref *z;
1910
Christoph Lameterdc85da12006-01-18 17:42:36 -08001911 /*
1912 * Follow bind policy behavior and start allocation at the
1913 * first node.
1914 */
Mel Gorman19770b32008-04-28 02:12:18 -07001915 struct zonelist *zonelist;
Mel Gorman19770b32008-04-28 02:12:18 -07001916 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
Aneesh Kumar K.Vc9634cf2016-10-07 16:59:12 -07001917 zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
Mel Gormanc33d6c02016-05-19 17:14:10 -07001918 z = first_zones_zonelist(zonelist, highest_zoneidx,
1919 &policy->v.nodes);
Pavel Tatashinc1093b72018-08-21 21:53:32 -07001920 return z->zone ? zone_to_nid(z->zone) : node;
Mel Gormandd1a2392008-04-28 02:12:17 -07001921 }
Christoph Lameterdc85da12006-01-18 17:42:36 -08001922
Christoph Lameterdc85da12006-01-18 17:42:36 -08001923 default:
Lee Schermerhornbea904d2008-04-28 02:13:18 -07001924 BUG();
Christoph Lameterdc85da12006-01-18 17:42:36 -08001925 }
1926}
1927
Andrew Mortonfee83b32016-05-19 17:11:43 -07001928/*
1929 * Do static interleaving for a VMA with known offset @n. Returns the n'th
1930 * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
1931 * number of present nodes.
1932 */
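/*
 * Example (illustrative): for pol->v.nodes == {0,2,5} (nnodes == 3) and
 * n == 7, target == 7 % 3 == 1, so the loop below steps once past the
 * first node and returns node 2.
 */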
Laurent Dufour98c70ba2017-09-08 16:12:39 -07001933static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001934{
Andi Kleendfcd3c02005-10-29 18:15:48 -07001935 unsigned nnodes = nodes_weight(pol->v.nodes);
David Rientjesf5b087b2008-04-28 02:12:27 -07001936 unsigned target;
Andrew Mortonfee83b32016-05-19 17:11:43 -07001937 int i;
1938 int nid;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001939
David Rientjesf5b087b2008-04-28 02:12:27 -07001940 if (!nnodes)
1941 return numa_node_id();
Andrew Mortonfee83b32016-05-19 17:11:43 -07001942 target = (unsigned int)n % nnodes;
1943 nid = first_node(pol->v.nodes);
1944 for (i = 0; i < target; i++)
Andi Kleendfcd3c02005-10-29 18:15:48 -07001945 nid = next_node(nid, pol->v.nodes);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001946 return nid;
1947}
1948
Christoph Lameter5da7ca82006-01-06 00:10:46 -08001949/* Determine a node number for interleave */
1950static inline unsigned interleave_nid(struct mempolicy *pol,
1951 struct vm_area_struct *vma, unsigned long addr, int shift)
1952{
1953 if (vma) {
1954 unsigned long off;
1955
Nishanth Aravamudan3b98b082006-08-31 21:27:53 -07001956 /*
1957 * for small pages, there is no difference between
1958 * shift and PAGE_SHIFT, so the bit-shift is safe.
1959 * for huge pages, since vm_pgoff is in units of small
1960 * pages, we need to shift off the always 0 bits to get
1961 * a useful offset.
1962 */
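		/*
		 * Example (illustrative, assuming PAGE_SHIFT == 12 and a 2MB
		 * huge page, i.e. shift == 21): vm_pgoff, which counts 4KB
		 * pages, is shifted right by 9 so that off counts 2MB units
		 * before the in-vma offset is added in the same units.
		 */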
1963 BUG_ON(shift < PAGE_SHIFT);
1964 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
Christoph Lameter5da7ca82006-01-06 00:10:46 -08001965 off += (addr - vma->vm_start) >> shift;
Laurent Dufour98c70ba2017-09-08 16:12:39 -07001966 return offset_il_node(pol, off);
Christoph Lameter5da7ca82006-01-06 00:10:46 -08001967 } else
1968 return interleave_nodes(pol);
1969}
1970
Chen, Kenneth W00ac59a2006-02-03 21:51:14 +01001971#ifdef CONFIG_HUGETLBFS
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07001972/*
Vlastimil Babka04ec6262017-07-06 15:40:03 -07001973 * huge_node(@vma, @addr, @gfp_flags, @mpol)
Fabian Frederickb46e14a2014-06-04 16:08:18 -07001974 * @vma: virtual memory area whose policy is sought
1975 * @addr: address in @vma for shared policy lookup and interleave policy
1976 * @gfp_flags: for requested zone
1977 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1978 * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07001979 *
Vlastimil Babka04ec6262017-07-06 15:40:03 -07001980 * Returns a nid suitable for a huge page allocation and a pointer
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07001981 * to the struct mempolicy for conditional unref after allocation.
 1982 * If the effective policy is 'BIND', returns a pointer to the mempolicy's
1983 * @nodemask for filtering the zonelist.
Miao Xiec0ff7452010-05-24 14:32:08 -07001984 *
Mel Gormand26914d2014-04-03 14:47:24 -07001985 * Must be protected by read_mems_allowed_begin()
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07001986 */
Vlastimil Babka04ec6262017-07-06 15:40:03 -07001987int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
1988 struct mempolicy **mpol, nodemask_t **nodemask)
Christoph Lameter5da7ca82006-01-06 00:10:46 -08001989{
Vlastimil Babka04ec6262017-07-06 15:40:03 -07001990 int nid;
Christoph Lameter5da7ca82006-01-06 00:10:46 -08001991
Oleg Nesterovdd6eecb2014-10-09 15:27:57 -07001992 *mpol = get_vma_policy(vma, addr);
Mel Gorman19770b32008-04-28 02:12:18 -07001993 *nodemask = NULL; /* assume !MPOL_BIND */
Christoph Lameter5da7ca82006-01-06 00:10:46 -08001994
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07001995 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
Vlastimil Babka04ec6262017-07-06 15:40:03 -07001996 nid = interleave_nid(*mpol, vma, addr,
1997 huge_page_shift(hstate_vma(vma)));
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07001998 } else {
Vlastimil Babka04ec6262017-07-06 15:40:03 -07001999 nid = policy_node(gfp_flags, *mpol, numa_node_id());
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07002000 if ((*mpol)->mode == MPOL_BIND)
2001 *nodemask = &(*mpol)->v.nodes;
Lee Schermerhorn480eccf2007-09-18 22:46:47 -07002002 }
Vlastimil Babka04ec6262017-07-06 15:40:03 -07002003 return nid;
Christoph Lameter5da7ca82006-01-06 00:10:46 -08002004}
Lee Schermerhorn06808b02009-12-14 17:58:21 -08002005
2006/*
2007 * init_nodemask_of_mempolicy
2008 *
2009 * If the current task's mempolicy is "default" [NULL], return 'false'
2010 * to indicate default policy. Otherwise, extract the policy nodemask
2011 * for 'bind' or 'interleave' policy into the argument nodemask, or
2012 * initialize the argument nodemask to contain the single node for
2013 * 'preferred' or 'local' policy and return 'true' to indicate presence
2014 * of non-default mempolicy.
2015 *
2016 * We don't bother with reference counting the mempolicy [mpol_get/put]
 2017 * because the current task is examining its own mempolicy and a task's
2018 * mempolicy is only ever changed by the task itself.
2019 *
2020 * N.B., it is the caller's responsibility to free a returned nodemask.
2021 */
2022bool init_nodemask_of_mempolicy(nodemask_t *mask)
2023{
2024 struct mempolicy *mempolicy;
2025 int nid;
2026
2027 if (!(mask && current->mempolicy))
2028 return false;
2029
Miao Xiec0ff7452010-05-24 14:32:08 -07002030 task_lock(current);
Lee Schermerhorn06808b02009-12-14 17:58:21 -08002031 mempolicy = current->mempolicy;
2032 switch (mempolicy->mode) {
2033 case MPOL_PREFERRED:
2034 if (mempolicy->flags & MPOL_F_LOCAL)
2035 nid = numa_node_id();
2036 else
2037 nid = mempolicy->v.preferred_node;
2038 init_nodemask_of_node(mask, nid);
2039 break;
2040
2041 case MPOL_BIND:
2042 /* Fall through */
2043 case MPOL_INTERLEAVE:
2044 *mask = mempolicy->v.nodes;
2045 break;
2046
2047 default:
2048 BUG();
2049 }
Miao Xiec0ff7452010-05-24 14:32:08 -07002050 task_unlock(current);
Lee Schermerhorn06808b02009-12-14 17:58:21 -08002051
2052 return true;
2053}
Chen, Kenneth W00ac59a2006-02-03 21:51:14 +01002054#endif
Christoph Lameter5da7ca82006-01-06 00:10:46 -08002055
David Rientjes6f48d0eb2010-08-09 17:18:52 -07002056/*
2057 * mempolicy_nodemask_intersects
2058 *
2059 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
2060 * policy. Otherwise, check for intersection between mask and the policy
 2061 * nodemask for 'bind' or 'interleave' policy. For 'preferred' or 'local'
2062 * policy, always return true since it may allocate elsewhere on fallback.
2063 *
2064 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
2065 */
2066bool mempolicy_nodemask_intersects(struct task_struct *tsk,
2067 const nodemask_t *mask)
2068{
2069 struct mempolicy *mempolicy;
2070 bool ret = true;
2071
2072 if (!mask)
2073 return ret;
2074 task_lock(tsk);
2075 mempolicy = tsk->mempolicy;
2076 if (!mempolicy)
2077 goto out;
2078
2079 switch (mempolicy->mode) {
2080 case MPOL_PREFERRED:
2081 /*
2082 * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
2083 * allocate from, they may fallback to other nodes when oom.
2084 * Thus, it's possible for tsk to have allocated memory from
2085 * nodes in mask.
2086 */
2087 break;
2088 case MPOL_BIND:
2089 case MPOL_INTERLEAVE:
2090 ret = nodes_intersects(mempolicy->v.nodes, *mask);
2091 break;
2092 default:
2093 BUG();
2094 }
2095out:
2096 task_unlock(tsk);
2097 return ret;
2098}
2099
Linus Torvalds1da177e2005-04-16 15:20:36 -07002100/* Allocate a page in interleaved policy.
2101 Own path because it needs to do special accounting. */
Andi Kleen662f3a02005-10-29 18:15:49 -07002102static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
2103 unsigned nid)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002104{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002105 struct page *page;
2106
Vlastimil Babka04ec6262017-07-06 15:40:03 -07002107 page = __alloc_pages(gfp, order, nid);
Kemi Wang45180852017-11-15 17:38:22 -08002108 /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
2109 if (!static_branch_likely(&vm_numa_stat_key))
2110 return page;
Andrey Ryabininde55c8b2017-10-13 15:57:43 -07002111 if (page && page_to_nid(page) == nid) {
2112 preempt_disable();
2113 __inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
2114 preempt_enable();
2115 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002116 return page;
2117}
2118
2119/**
Andrea Arcangeli0bbbc0b2011-01-13 15:47:05 -08002120 * alloc_pages_vma - Allocate a page for a VMA.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002121 *
2122 * @gfp:
2123 * %GFP_USER user allocation.
2124 * %GFP_KERNEL kernel allocations,
2125 * %GFP_HIGHMEM highmem/user allocations,
2126 * %GFP_FS allocation should not call back into a file system.
2127 * %GFP_ATOMIC don't sleep.
2128 *
Andrea Arcangeli0bbbc0b2011-01-13 15:47:05 -08002129 * @order:Order of the GFP allocation.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002130 * @vma: Pointer to VMA or NULL if not available.
2131 * @addr: Virtual Address of the allocation. Must be inside the VMA.
Vlastimil Babkabe97a412015-02-11 15:27:15 -08002132 * @node: Which node to prefer for allocation (modulo policy).
David Rientjes19deb762019-09-04 12:54:20 -07002133 * @hugepage: for hugepages try only the preferred node if possible
Linus Torvalds1da177e2005-04-16 15:20:36 -07002134 *
2135 * This function allocates a page from the kernel page pool and applies
2136 * a NUMA policy associated with the VMA or the current process.
 2137 * When VMA is not NULL, the caller must hold down_read on the mmap_sem of the
2138 * mm_struct of the VMA to prevent it from going away. Should be used for
Vlastimil Babkabe97a412015-02-11 15:27:15 -08002139 * all allocations for pages that will be mapped into user space. Returns
2140 * NULL when no page can be allocated.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002141 */
2142struct page *
Andrea Arcangeli0bbbc0b2011-01-13 15:47:05 -08002143alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
David Rientjes19deb762019-09-04 12:54:20 -07002144 unsigned long addr, int node, bool hugepage)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002145{
Mel Gormancc9a6c82012-03-21 16:34:11 -07002146 struct mempolicy *pol;
Miao Xiec0ff7452010-05-24 14:32:08 -07002147 struct page *page;
Vlastimil Babka04ec6262017-07-06 15:40:03 -07002148 int preferred_nid;
Vlastimil Babkabe97a412015-02-11 15:27:15 -08002149 nodemask_t *nmask;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002150
Oleg Nesterovdd6eecb2014-10-09 15:27:57 -07002151 pol = get_vma_policy(vma, addr);
Mel Gormancc9a6c82012-03-21 16:34:11 -07002152
Vlastimil Babkabe97a412015-02-11 15:27:15 -08002153 if (pol->mode == MPOL_INTERLEAVE) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002154 unsigned nid;
Christoph Lameter5da7ca82006-01-06 00:10:46 -08002155
Andi Kleen8eac5632011-02-25 14:44:28 -08002156 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07002157 mpol_cond_put(pol);
Andrea Arcangeli0bbbc0b2011-01-13 15:47:05 -08002158 page = alloc_page_interleave(gfp, order, nid);
Vlastimil Babkabe97a412015-02-11 15:27:15 -08002159 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002160 }
Vlastimil Babkabe97a412015-02-11 15:27:15 -08002161
David Rientjes19deb762019-09-04 12:54:20 -07002162 if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
2163 int hpage_node = node;
2164
2165 /*
2166 * For hugepage allocation and non-interleave policy which
2167 * allows the current node (or other explicitly preferred
2168 * node) we only try to allocate from the current/preferred
2169 * node and don't fall back to other nodes, as the cost of
2170 * remote accesses would likely offset THP benefits.
2171 *
2172 * If the policy is interleave, or does not allow the current
2173 * node in its nodemask, we allocate the standard way.
2174 */
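		/*
		 * Illustrative flow (added for exposition, not from the
		 * original source): with an MPOL_PREFERRED policy naming
		 * node 3 (and no MPOL_F_LOCAL), hpage_node becomes 3 and the
		 * first attempt below is confined to node 3 with
		 * __GFP_THISNODE | __GFP_NORETRY; only if that fails and the
		 * gfp allows direct reclaim is the allocation retried without
		 * __GFP_THISNODE, permitting remote nodes.
		 */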
2175 if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
2176 hpage_node = pol->v.preferred_node;
2177
2178 nmask = policy_nodemask(gfp, pol);
2179 if (!nmask || node_isset(hpage_node, *nmask)) {
2180 mpol_cond_put(pol);
Vlastimil Babkacc638f32020-01-13 16:29:04 -08002181 /*
2182 * First, try to allocate THP only on local node, but
2183 * don't reclaim unnecessarily, just compact.
2184 */
David Rientjes19deb762019-09-04 12:54:20 -07002185 page = __alloc_pages_node(hpage_node,
Vlastimil Babkacc638f32020-01-13 16:29:04 -08002186 gfp | __GFP_THISNODE | __GFP_NORETRY, order);
David Rientjes76e654c2019-09-04 12:54:25 -07002187
2188 /*
2189 * If hugepage allocations are configured to always
2190 * synchronous compact or the vma has been madvised
2191 * to prefer hugepage backing, retry allowing remote
Vlastimil Babkacc638f32020-01-13 16:29:04 -08002192 * memory with both reclaim and compact as well.
David Rientjes76e654c2019-09-04 12:54:25 -07002193 */
2194 if (!page && (gfp & __GFP_DIRECT_RECLAIM))
2195 page = __alloc_pages_node(hpage_node,
Vlastimil Babkacc638f32020-01-13 16:29:04 -08002196 gfp, order);
David Rientjes76e654c2019-09-04 12:54:25 -07002197
David Rientjes19deb762019-09-04 12:54:20 -07002198 goto out;
2199 }
2200 }
2201
Vlastimil Babkabe97a412015-02-11 15:27:15 -08002202 nmask = policy_nodemask(gfp, pol);
Vlastimil Babka04ec6262017-07-06 15:40:03 -07002203 preferred_nid = policy_node(gfp, pol, node);
2204 page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
Vlastimil Babkad51e9892017-01-24 15:18:18 -08002205 mpol_cond_put(pol);
Vlastimil Babkabe97a412015-02-11 15:27:15 -08002206out:
Miao Xiec0ff7452010-05-24 14:32:08 -07002207 return page;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002208}
Christoph Hellwig69262212019-06-26 14:27:05 +02002209EXPORT_SYMBOL(alloc_pages_vma);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002210
2211/**
2212 * alloc_pages_current - Allocate pages.
2213 *
2214 * @gfp:
2215 * %GFP_USER user allocation,
2216 * %GFP_KERNEL kernel allocation,
2217 * %GFP_HIGHMEM highmem allocation,
2218 * %GFP_FS don't call back into a file system.
2219 * %GFP_ATOMIC don't sleep.
2220 * @order: Power of two of allocation size in pages. 0 is a single page.
2221 *
2222 * Allocate a page from the kernel page pool. When not in
 2223 * interrupt context, apply the current process' NUMA policy.
2224 * Returns NULL when no page can be allocated.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002225 */
Al Virodd0fc662005-10-07 07:46:04 +01002226struct page *alloc_pages_current(gfp_t gfp, unsigned order)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002227{
Oleg Nesterov8d902742014-10-09 15:27:45 -07002228 struct mempolicy *pol = &default_policy;
Miao Xiec0ff7452010-05-24 14:32:08 -07002229 struct page *page;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002230
Oleg Nesterov8d902742014-10-09 15:27:45 -07002231 if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2232 pol = get_task_policy(current);
Lee Schermerhorn52cd3b02008-04-28 02:13:16 -07002233
2234 /*
2235 * No reference counting needed for current->mempolicy
2236 * nor system default_policy
2237 */
Lee Schermerhorn45c47452008-04-28 02:13:12 -07002238 if (pol->mode == MPOL_INTERLEAVE)
Miao Xiec0ff7452010-05-24 14:32:08 -07002239 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2240 else
2241 page = __alloc_pages_nodemask(gfp, order,
Vlastimil Babka04ec6262017-07-06 15:40:03 -07002242 policy_node(gfp, pol, numa_node_id()),
Andi Kleen5c4b4be2011-03-04 17:36:32 -08002243 policy_nodemask(gfp, pol));
Mel Gormancc9a6c82012-03-21 16:34:11 -07002244
Miao Xiec0ff7452010-05-24 14:32:08 -07002245 return page;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002246}
2247EXPORT_SYMBOL(alloc_pages_current);
2248
Oleg Nesterovef0855d2013-09-11 14:20:14 -07002249int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2250{
2251 struct mempolicy *pol = mpol_dup(vma_policy(src));
2252
2253 if (IS_ERR(pol))
2254 return PTR_ERR(pol);
2255 dst->vm_policy = pol;
2256 return 0;
2257}
2258
Paul Jackson42253992006-01-08 01:01:59 -08002259/*
Lee Schermerhorn846a16b2008-04-28 02:13:09 -07002260 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
Paul Jackson42253992006-01-08 01:01:59 -08002261 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2262 * with the mems_allowed returned by cpuset_mems_allowed(). This
2263 * keeps mempolicies cpuset relative after its cpuset moves. See
2264 * further kernel/cpuset.c update_nodemask().
Miao Xie708c1bb2010-05-24 14:32:07 -07002265 *
 2266 * current's mempolicy may be rebound by another task (the task that changes
 2267 * the cpuset's mems), so we needn't do rebind work for the current task.
Paul Jackson42253992006-01-08 01:01:59 -08002268 */
Paul Jackson42253992006-01-08 01:01:59 -08002269
Lee Schermerhorn846a16b2008-04-28 02:13:09 -07002270/* Slow path of a mempolicy duplicate */
2271struct mempolicy *__mpol_dup(struct mempolicy *old)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002272{
2273 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2274
2275 if (!new)
2276 return ERR_PTR(-ENOMEM);
Miao Xie708c1bb2010-05-24 14:32:07 -07002277
2278 /* task's mempolicy is protected by alloc_lock */
2279 if (old == current->mempolicy) {
2280 task_lock(current);
2281 *new = *old;
2282 task_unlock(current);
2283 } else
2284 *new = *old;
2285
Paul Jackson42253992006-01-08 01:01:59 -08002286 if (current_cpuset_is_being_rebound()) {
2287 nodemask_t mems = cpuset_mems_allowed(current);
Vlastimil Babka213980c2017-07-06 15:40:06 -07002288 mpol_rebind_policy(new, &mems);
Paul Jackson42253992006-01-08 01:01:59 -08002289 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002290 atomic_set(&new->refcnt, 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002291 return new;
2292}
2293
2294/* Slow path of a mempolicy comparison */
KOSAKI Motohirofcfb4dc2012-01-10 15:08:21 -08002295bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002296{
2297 if (!a || !b)
KOSAKI Motohirofcfb4dc2012-01-10 15:08:21 -08002298 return false;
Lee Schermerhorn45c47452008-04-28 02:13:12 -07002299 if (a->mode != b->mode)
KOSAKI Motohirofcfb4dc2012-01-10 15:08:21 -08002300 return false;
Bob Liu19800502010-05-24 14:32:01 -07002301 if (a->flags != b->flags)
KOSAKI Motohirofcfb4dc2012-01-10 15:08:21 -08002302 return false;
Bob Liu19800502010-05-24 14:32:01 -07002303 if (mpol_store_user_nodemask(a))
2304 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
KOSAKI Motohirofcfb4dc2012-01-10 15:08:21 -08002305 return false;
Bob Liu19800502010-05-24 14:32:01 -07002306
Lee Schermerhorn45c47452008-04-28 02:13:12 -07002307 switch (a->mode) {
Mel Gorman19770b32008-04-28 02:12:18 -07002308 case MPOL_BIND:
2309 /* Fall through */
Linus Torvalds1da177e2005-04-16 15:20:36 -07002310 case MPOL_INTERLEAVE:
KOSAKI Motohirofcfb4dc2012-01-10 15:08:21 -08002311 return !!nodes_equal(a->v.nodes, b->v.nodes);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002312 case MPOL_PREFERRED:
Yisheng Xie8970a632018-03-22 16:17:02 -07002313 /* a's ->flags is the same as b's */
2314 if (a->flags & MPOL_F_LOCAL)
2315 return true;
Namhyung Kim75719662011-03-22 16:33:02 -07002316 return a->v.preferred_node == b->v.preferred_node;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002317 default:
2318 BUG();
KOSAKI Motohirofcfb4dc2012-01-10 15:08:21 -08002319 return false;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002320 }
2321}
2322
Linus Torvalds1da177e2005-04-16 15:20:36 -07002323/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07002324 * Shared memory backing store policy support.
2325 *
2326 * Remember policies even when nobody has shared memory mapped.
2327 * The policies are kept in Red-Black tree linked from the inode.
Nathan Zimmer4a8c7bb2016-01-14 15:18:36 -08002328 * They are protected by the sp->lock rwlock, which should be held
Linus Torvalds1da177e2005-04-16 15:20:36 -07002329 * for any accesses to the tree.
2330 */
2331
Nathan Zimmer4a8c7bb2016-01-14 15:18:36 -08002332/*
2333 * lookup first element intersecting start-end. Caller holds sp->lock for
2334 * reading or for writing
2335 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07002336static struct sp_node *
2337sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2338{
2339 struct rb_node *n = sp->root.rb_node;
2340
2341 while (n) {
2342 struct sp_node *p = rb_entry(n, struct sp_node, nd);
2343
2344 if (start >= p->end)
2345 n = n->rb_right;
2346 else if (end <= p->start)
2347 n = n->rb_left;
2348 else
2349 break;
2350 }
2351 if (!n)
2352 return NULL;
2353 for (;;) {
2354 struct sp_node *w = NULL;
2355 struct rb_node *prev = rb_prev(n);
2356 if (!prev)
2357 break;
2358 w = rb_entry(prev, struct sp_node, nd);
2359 if (w->end <= start)
2360 break;
2361 n = prev;
2362 }
2363 return rb_entry(n, struct sp_node, nd);
2364}
2365
Nathan Zimmer4a8c7bb2016-01-14 15:18:36 -08002366/*
2367 * Insert a new shared policy into the list. Caller holds sp->lock for
2368 * writing.
2369 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07002370static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2371{
2372 struct rb_node **p = &sp->root.rb_node;
2373 struct rb_node *parent = NULL;
2374 struct sp_node *nd;
2375
2376 while (*p) {
2377 parent = *p;
2378 nd = rb_entry(parent, struct sp_node, nd);
2379 if (new->start < nd->start)
2380 p = &(*p)->rb_left;
2381 else if (new->end > nd->end)
2382 p = &(*p)->rb_right;
2383 else
2384 BUG();
2385 }
2386 rb_link_node(&new->nd, parent, p);
2387 rb_insert_color(&new->nd, &sp->root);
Paul Mundt140d5a42007-07-15 23:38:16 -07002388 pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
Lee Schermerhorn45c47452008-04-28 02:13:12 -07002389 new->policy ? new->policy->mode : 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002390}
2391
2392/* Find shared policy intersecting idx */
2393struct mempolicy *
2394mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2395{
2396 struct mempolicy *pol = NULL;
2397 struct sp_node *sn;
2398
2399 if (!sp->root.rb_node)
2400 return NULL;
Nathan Zimmer4a8c7bb2016-01-14 15:18:36 -08002401 read_lock(&sp->lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002402 sn = sp_lookup(sp, idx, idx+1);
2403 if (sn) {
2404 mpol_get(sn->policy);
2405 pol = sn->policy;
2406 }
Nathan Zimmer4a8c7bb2016-01-14 15:18:36 -08002407 read_unlock(&sp->lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002408 return pol;
2409}
2410
KOSAKI Motohiro63f74ca2012-10-08 16:29:19 -07002411static void sp_free(struct sp_node *n)
2412{
2413 mpol_put(n->policy);
2414 kmem_cache_free(sn_cache, n);
2415}
2416
Lee Schermerhorn771fb4d2012-10-25 14:16:30 +02002417/**
2418 * mpol_misplaced - check whether current page node is valid in policy
2419 *
Fabian Frederickb46e14a2014-06-04 16:08:18 -07002420 * @page: page to be checked
2421 * @vma: vm area where page mapped
2422 * @addr: virtual address where page mapped
Lee Schermerhorn771fb4d2012-10-25 14:16:30 +02002423 *
2424 * Lookup current policy node id for vma,addr and "compare to" page's
2425 * node id.
2426 *
2427 * Returns:
2428 * -1 - not misplaced, page is in the right node
2429 * node - node id where the page should be
2430 *
2431 * Policy determination "mimics" alloc_page_vma().
2432 * Called from fault path where we know the vma and faulting address.
2433 */
2434int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2435{
2436 struct mempolicy *pol;
Mel Gormanc33d6c02016-05-19 17:14:10 -07002437 struct zoneref *z;
Lee Schermerhorn771fb4d2012-10-25 14:16:30 +02002438 int curnid = page_to_nid(page);
2439 unsigned long pgoff;
Peter Zijlstra90572892013-10-07 11:29:20 +01002440 int thiscpu = raw_smp_processor_id();
2441 int thisnid = cpu_to_node(thiscpu);
Anshuman Khandual98fa15f2019-03-05 15:42:58 -08002442 int polnid = NUMA_NO_NODE;
Lee Schermerhorn771fb4d2012-10-25 14:16:30 +02002443 int ret = -1;
2444
Oleg Nesterovdd6eecb2014-10-09 15:27:57 -07002445 pol = get_vma_policy(vma, addr);
Lee Schermerhorn771fb4d2012-10-25 14:16:30 +02002446 if (!(pol->flags & MPOL_F_MOF))
2447 goto out;
2448
2449 switch (pol->mode) {
2450 case MPOL_INTERLEAVE:
Lee Schermerhorn771fb4d2012-10-25 14:16:30 +02002451 pgoff = vma->vm_pgoff;
2452 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
Laurent Dufour98c70ba2017-09-08 16:12:39 -07002453 polnid = offset_il_node(pol, pgoff);
Lee Schermerhorn771fb4d2012-10-25 14:16:30 +02002454 break;
2455
2456 case MPOL_PREFERRED:
2457 if (pol->flags & MPOL_F_LOCAL)
2458 polnid = numa_node_id();
2459 else
2460 polnid = pol->v.preferred_node;
2461 break;
2462
2463 case MPOL_BIND:
Mel Gormanc33d6c02016-05-19 17:14:10 -07002464
Lee Schermerhorn771fb4d2012-10-25 14:16:30 +02002465 /*
 2466 * MPOL_BIND allows binding to multiple nodes.
 2467 * Use the current page's node if it is in the policy nodemask;
 2468 * otherwise select the nearest allowed node, if any.
 2469 * If there are no allowed nodes, keep the current node [!misplaced].
2470 */
2471 if (node_isset(curnid, pol->v.nodes))
2472 goto out;
Mel Gormanc33d6c02016-05-19 17:14:10 -07002473 z = first_zones_zonelist(
Lee Schermerhorn771fb4d2012-10-25 14:16:30 +02002474 node_zonelist(numa_node_id(), GFP_HIGHUSER),
2475 gfp_zone(GFP_HIGHUSER),
Mel Gormanc33d6c02016-05-19 17:14:10 -07002476 &pol->v.nodes);
Pavel Tatashinc1093b72018-08-21 21:53:32 -07002477 polnid = zone_to_nid(z->zone);
Lee Schermerhorn771fb4d2012-10-25 14:16:30 +02002478 break;
2479
2480 default:
2481 BUG();
2482 }
Mel Gorman5606e382012-11-02 18:19:13 +00002483
2484 /* Migrate the page towards the node whose CPU is referencing it */
Mel Gormane42c8ff2012-11-12 09:17:07 +00002485 if (pol->flags & MPOL_F_MORON) {
Peter Zijlstra90572892013-10-07 11:29:20 +01002486 polnid = thisnid;
Mel Gorman5606e382012-11-02 18:19:13 +00002487
Rik van Riel10f39042014-01-27 17:03:44 -05002488 if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
Rik van Rielde1c9ce2013-10-07 11:29:39 +01002489 goto out;
Mel Gormane42c8ff2012-11-12 09:17:07 +00002490 }
2491
Lee Schermerhorn771fb4d2012-10-25 14:16:30 +02002492 if (curnid != polnid)
2493 ret = polnid;
2494out:
2495 mpol_cond_put(pol);
2496
2497 return ret;
2498}
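/*
 * Example (sketch only): mpol_misplaced() is meant for the NUMA hinting
 * fault path.  A caller along the lines of do_numa_page() in mm/memory.c
 * checks whether the faulting page should live on another node and, if
 * so, tries to migrate it.  The snippet is a simplified illustration;
 * locking, error handling and the surrounding fault plumbing are omitted.
 *
 *	int target = mpol_misplaced(page, vma, addr);
 *
 *	if (target != -1) {
 *		// Move the page towards the node that is referencing it.
 *		if (migrate_misplaced_page(page, vma, target))
 *			page_nid = target;
 *	}
 */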
2499
David Rientjesc11600e2016-09-01 16:15:07 -07002500/*
2501 * Drop the (possibly final) reference to task->mempolicy. It needs to be
2502 * dropped after task->mempolicy is set to NULL so that any allocation done as
2503 * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2504 * policy.
2505 */
2506void mpol_put_task_policy(struct task_struct *task)
2507{
2508 struct mempolicy *pol;
2509
2510 task_lock(task);
2511 pol = task->mempolicy;
2512 task->mempolicy = NULL;
2513 task_unlock(task);
2514 mpol_put(pol);
2515}
2516
Linus Torvalds1da177e2005-04-16 15:20:36 -07002517static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2518{
Paul Mundt140d5a42007-07-15 23:38:16 -07002519	pr_debug("deleting %lx-%lx\n", n->start, n->end);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002520 rb_erase(&n->nd, &sp->root);
KOSAKI Motohiro63f74ca2012-10-08 16:29:19 -07002521 sp_free(n);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002522}
2523
Mel Gorman42288fe2012-12-21 23:10:25 +00002524static void sp_node_init(struct sp_node *node, unsigned long start,
2525 unsigned long end, struct mempolicy *pol)
2526{
2527 node->start = start;
2528 node->end = end;
2529 node->policy = pol;
2530}
2531
Adrian Bunkdbcb0f12007-10-16 01:26:26 -07002532static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2533 struct mempolicy *pol)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002534{
KOSAKI Motohiro869833f2012-10-08 16:29:16 -07002535 struct sp_node *n;
2536 struct mempolicy *newpol;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002537
KOSAKI Motohiro869833f2012-10-08 16:29:16 -07002538 n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002539 if (!n)
2540 return NULL;
KOSAKI Motohiro869833f2012-10-08 16:29:16 -07002541
2542 newpol = mpol_dup(pol);
2543 if (IS_ERR(newpol)) {
2544 kmem_cache_free(sn_cache, n);
2545 return NULL;
2546 }
2547 newpol->flags |= MPOL_F_SHARED;
Mel Gorman42288fe2012-12-21 23:10:25 +00002548 sp_node_init(n, start, end, newpol);
KOSAKI Motohiro869833f2012-10-08 16:29:16 -07002549
Linus Torvalds1da177e2005-04-16 15:20:36 -07002550 return n;
2551}
2552
2553/* Replace a policy range. */
2554static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2555 unsigned long end, struct sp_node *new)
2556{
Mel Gormanb22d1272012-10-08 16:29:17 -07002557 struct sp_node *n;
Mel Gorman42288fe2012-12-21 23:10:25 +00002558 struct sp_node *n_new = NULL;
2559 struct mempolicy *mpol_new = NULL;
Mel Gormanb22d1272012-10-08 16:29:17 -07002560 int ret = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002561
Mel Gorman42288fe2012-12-21 23:10:25 +00002562restart:
Nathan Zimmer4a8c7bb2016-01-14 15:18:36 -08002563 write_lock(&sp->lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002564 n = sp_lookup(sp, start, end);
2565 /* Take care of old policies in the same range. */
2566 while (n && n->start < end) {
2567 struct rb_node *next = rb_next(&n->nd);
2568 if (n->start >= start) {
2569 if (n->end <= end)
2570 sp_delete(sp, n);
2571 else
2572 n->start = end;
2573 } else {
2574 /* Old policy spanning whole new range. */
2575 if (n->end > end) {
Mel Gorman42288fe2012-12-21 23:10:25 +00002576 if (!n_new)
2577 goto alloc_new;
2578
2579 *mpol_new = *n->policy;
2580 atomic_set(&mpol_new->refcnt, 1);
KOSAKI Motohiro78806392013-03-08 12:43:29 -08002581 sp_node_init(n_new, end, n->end, mpol_new);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002582 n->end = start;
Hillf Danton5ca39572013-03-08 12:43:28 -08002583 sp_insert(sp, n_new);
Mel Gorman42288fe2012-12-21 23:10:25 +00002584 n_new = NULL;
2585 mpol_new = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002586 break;
2587 } else
2588 n->end = start;
2589 }
2590 if (!next)
2591 break;
2592 n = rb_entry(next, struct sp_node, nd);
2593 }
2594 if (new)
2595 sp_insert(sp, new);
Nathan Zimmer4a8c7bb2016-01-14 15:18:36 -08002596 write_unlock(&sp->lock);
Mel Gorman42288fe2012-12-21 23:10:25 +00002597 ret = 0;
2598
2599err_out:
2600 if (mpol_new)
2601 mpol_put(mpol_new);
2602 if (n_new)
2603 kmem_cache_free(sn_cache, n_new);
2604
Mel Gormanb22d1272012-10-08 16:29:17 -07002605 return ret;
Mel Gorman42288fe2012-12-21 23:10:25 +00002606
2607alloc_new:
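	/*
	 * sp->lock is a non-sleeping rwlock and the GFP_KERNEL allocations
	 * below may sleep, so drop the lock, allocate, and retry from the top.
	 */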
Nathan Zimmer4a8c7bb2016-01-14 15:18:36 -08002608 write_unlock(&sp->lock);
Mel Gorman42288fe2012-12-21 23:10:25 +00002609 ret = -ENOMEM;
2610 n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2611 if (!n_new)
2612 goto err_out;
2613 mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2614 if (!mpol_new)
2615 goto err_out;
2616 goto restart;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002617}
2618
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002619/**
2620 * mpol_shared_policy_init - initialize shared policy for inode
2621 * @sp: pointer to inode shared policy
2622 * @mpol: struct mempolicy to install
2623 *
2624 * Install non-NULL @mpol in inode's shared policy rb-tree.
2625 * On entry, the current task has a reference on a non-NULL @mpol.
2626 * This must be released on exit.
KAMEZAWA Hiroyuki4bfc4492009-08-06 15:07:33 -07002627 * This is called at get_inode() time, so GFP_KERNEL allocations may be used.
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002628 */
2629void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
Robin Holt7339ff82006-01-14 13:20:48 -08002630{
Miao Xie58568d22009-06-16 15:31:49 -07002631 int ret;
2632
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002633 sp->root = RB_ROOT; /* empty tree == default mempolicy */
Nathan Zimmer4a8c7bb2016-01-14 15:18:36 -08002634 rwlock_init(&sp->lock);
Robin Holt7339ff82006-01-14 13:20:48 -08002635
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002636 if (mpol) {
2637 struct vm_area_struct pvma;
2638 struct mempolicy *new;
KAMEZAWA Hiroyuki4bfc4492009-08-06 15:07:33 -07002639 NODEMASK_SCRATCH(scratch);
Robin Holt7339ff82006-01-14 13:20:48 -08002640
KAMEZAWA Hiroyuki4bfc4492009-08-06 15:07:33 -07002641 if (!scratch)
Lee Schermerhorn5c0c1652010-06-29 15:05:30 -07002642 goto put_mpol;
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002643 /* contextualize the tmpfs mount point mempolicy */
2644 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
Lee Schermerhorn15d77832010-05-24 14:32:04 -07002645 if (IS_ERR(new))
Dan Carpenter0cae3452010-05-25 23:42:58 -07002646 goto free_scratch; /* no valid nodemask intersection */
Miao Xie58568d22009-06-16 15:31:49 -07002647
2648 task_lock(current);
KAMEZAWA Hiroyuki4bfc4492009-08-06 15:07:33 -07002649 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
Miao Xie58568d22009-06-16 15:31:49 -07002650 task_unlock(current);
Lee Schermerhorn15d77832010-05-24 14:32:04 -07002651 if (ret)
Lee Schermerhorn5c0c1652010-06-29 15:05:30 -07002652 goto put_new;
Robin Holt7339ff82006-01-14 13:20:48 -08002653
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002654 /* Create pseudo-vma that contains just the policy */
Kirill A. Shutemov2c4541e2018-07-26 16:37:30 -07002655 vma_init(&pvma, NULL);
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002656 pvma.vm_end = TASK_SIZE; /* policy covers entire file */
2657 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
Lee Schermerhorn15d77832010-05-24 14:32:04 -07002658
Lee Schermerhorn5c0c1652010-06-29 15:05:30 -07002659put_new:
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002660 mpol_put(new); /* drop initial ref */
Dan Carpenter0cae3452010-05-25 23:42:58 -07002661free_scratch:
KAMEZAWA Hiroyuki4bfc4492009-08-06 15:07:33 -07002662 NODEMASK_SCRATCH_FREE(scratch);
Lee Schermerhorn5c0c1652010-06-29 15:05:30 -07002663put_mpol:
2664 mpol_put(mpol); /* drop our incoming ref on sb mpol */
Robin Holt7339ff82006-01-14 13:20:48 -08002665 }
2666}
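/*
 * Example (sketch, assumptions flagged): a tmpfs-like filesystem calls
 * this when creating an inode, handing over its reference on the
 * superblock's mount-option mempolicy.  The helper shmem_get_sbmpol() and
 * the info/sbinfo variables are assumed here purely for illustration.
 *
 *	struct shmem_inode_info *info = SHMEM_I(inode);
 *
 *	// mpol_shared_policy_init() consumes the reference returned below.
 *	mpol_shared_policy_init(&info->policy, shmem_get_sbmpol(sbinfo));
 */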
2667
Linus Torvalds1da177e2005-04-16 15:20:36 -07002668int mpol_set_shared_policy(struct shared_policy *info,
2669 struct vm_area_struct *vma, struct mempolicy *npol)
2670{
2671 int err;
2672 struct sp_node *new = NULL;
2673 unsigned long sz = vma_pages(vma);
2674
David Rientjes028fec42008-04-28 02:12:25 -07002675 pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -07002676 vma->vm_pgoff,
Lee Schermerhorn45c47452008-04-28 02:13:12 -07002677 sz, npol ? npol->mode : -1,
David Rientjes028fec42008-04-28 02:12:25 -07002678 npol ? npol->flags : -1,
David Rientjes00ef2d22013-02-22 16:35:36 -08002679 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002680
2681 if (npol) {
2682 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2683 if (!new)
2684 return -ENOMEM;
2685 }
2686 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2687 if (err && new)
KOSAKI Motohiro63f74ca2012-10-08 16:29:19 -07002688 sp_free(new);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002689 return err;
2690}
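/*
 * Example (sketch): filesystems typically expose this through the
 * vm_operations ->set_policy() hook so that mbind() on a mapping of a
 * shared object updates the backing inode's policy tree.  Loosely modeled
 * on shmem's hook; treat the exact names as illustrative.
 *
 *	static int example_set_policy(struct vm_area_struct *vma,
 *				      struct mempolicy *mpol)
 *	{
 *		struct inode *inode = file_inode(vma->vm_file);
 *
 *		return mpol_set_shared_policy(&SHMEM_I(inode)->policy,
 *					      vma, mpol);
 *	}
 */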
2691
2692/* Free a backing policy store on inode delete. */
2693void mpol_free_shared_policy(struct shared_policy *p)
2694{
2695 struct sp_node *n;
2696 struct rb_node *next;
2697
2698 if (!p->root.rb_node)
2699 return;
Nathan Zimmer4a8c7bb2016-01-14 15:18:36 -08002700 write_lock(&p->lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002701 next = rb_first(&p->root);
2702 while (next) {
2703 n = rb_entry(next, struct sp_node, nd);
2704 next = rb_next(&n->nd);
KOSAKI Motohiro63f74ca2012-10-08 16:29:19 -07002705 sp_delete(p, n);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002706 }
Nathan Zimmer4a8c7bb2016-01-14 15:18:36 -08002707 write_unlock(&p->lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002708}
2709
Mel Gorman1a687c22012-11-22 11:16:36 +00002710#ifdef CONFIG_NUMA_BALANCING
Mel Gormanc2976632014-01-29 14:05:42 -08002711static int __initdata numabalancing_override;
Mel Gorman1a687c22012-11-22 11:16:36 +00002712
2713static void __init check_numabalancing_enable(void)
2714{
2715 bool numabalancing_default = false;
2716
2717 if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2718 numabalancing_default = true;
2719
Mel Gormanc2976632014-01-29 14:05:42 -08002720 /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2721 if (numabalancing_override)
2722 set_numabalancing_state(numabalancing_override == 1);
2723
Mel Gormanb0dc2b92015-05-14 15:17:09 -07002724 if (num_online_nodes() > 1 && !numabalancing_override) {
Joe Perches756a0252016-03-17 14:19:47 -07002725 pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
Mel Gormanc2976632014-01-29 14:05:42 -08002726 numabalancing_default ? "Enabling" : "Disabling");
Mel Gorman1a687c22012-11-22 11:16:36 +00002727 set_numabalancing_state(numabalancing_default);
2728 }
2729}
2730
2731static int __init setup_numabalancing(char *str)
2732{
2733 int ret = 0;
2734 if (!str)
2735 goto out;
Mel Gorman1a687c22012-11-22 11:16:36 +00002736
2737 if (!strcmp(str, "enable")) {
Mel Gormanc2976632014-01-29 14:05:42 -08002738 numabalancing_override = 1;
Mel Gorman1a687c22012-11-22 11:16:36 +00002739 ret = 1;
2740 } else if (!strcmp(str, "disable")) {
Mel Gormanc2976632014-01-29 14:05:42 -08002741 numabalancing_override = -1;
Mel Gorman1a687c22012-11-22 11:16:36 +00002742 ret = 1;
2743 }
2744out:
2745 if (!ret)
Andrew Morton4a404be2014-01-29 14:05:43 -08002746 pr_warn("Unable to parse numa_balancing=\n");
Mel Gorman1a687c22012-11-22 11:16:36 +00002747
2748 return ret;
2749}
2750__setup("numa_balancing=", setup_numabalancing);
2751#else
2752static inline void __init check_numabalancing_enable(void)
2753{
2754}
2755#endif /* CONFIG_NUMA_BALANCING */
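/*
 * Usage note: automatic NUMA balancing can be toggled at boot with
 * "numa_balancing=enable" or "numa_balancing=disable" on the kernel
 * command line, or at runtime through the kernel.numa_balancing sysctl
 * (both are mentioned in the pr_info above).
 */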
2756
Linus Torvalds1da177e2005-04-16 15:20:36 -07002757/* assumes fs == KERNEL_DS */
2758void __init numa_policy_init(void)
2759{
Paul Mundtb71636e2007-07-15 23:38:15 -07002760 nodemask_t interleave_nodes;
2761 unsigned long largest = 0;
2762 int nid, prefer = 0;
2763
Linus Torvalds1da177e2005-04-16 15:20:36 -07002764 policy_cache = kmem_cache_create("numa_policy",
2765 sizeof(struct mempolicy),
Paul Mundt20c2df82007-07-20 10:11:58 +09002766 0, SLAB_PANIC, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002767
2768 sn_cache = kmem_cache_create("shared_policy_node",
2769 sizeof(struct sp_node),
Paul Mundt20c2df82007-07-20 10:11:58 +09002770 0, SLAB_PANIC, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002771
Mel Gorman5606e382012-11-02 18:19:13 +00002772 for_each_node(nid) {
2773 preferred_node_policy[nid] = (struct mempolicy) {
2774 .refcnt = ATOMIC_INIT(1),
2775 .mode = MPOL_PREFERRED,
2776 .flags = MPOL_F_MOF | MPOL_F_MORON,
2777 .v = { .preferred_node = nid, },
2778 };
2779 }
2780
Paul Mundtb71636e2007-07-15 23:38:15 -07002781 /*
2782 * Set interleaving policy for system init. Interleaving is only
2783 * enabled across suitably sized nodes (default is >= 16MB), or
2784 * fall back to the largest node if they're all smaller.
2785 */
2786 nodes_clear(interleave_nodes);
Lai Jiangshan01f13bd2012-12-12 13:51:33 -08002787 for_each_node_state(nid, N_MEMORY) {
Paul Mundtb71636e2007-07-15 23:38:15 -07002788 unsigned long total_pages = node_present_pages(nid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002789
Paul Mundtb71636e2007-07-15 23:38:15 -07002790 /* Preserve the largest node */
2791 if (largest < total_pages) {
2792 largest = total_pages;
2793 prefer = nid;
2794 }
2795
2796 /* Interleave this node? */
2797 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2798 node_set(nid, interleave_nodes);
2799 }
2800
2801 /* All too small, use the largest */
2802 if (unlikely(nodes_empty(interleave_nodes)))
2803 node_set(prefer, interleave_nodes);
2804
David Rientjes028fec42008-04-28 02:12:25 -07002805 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
Mitchel Humpherysb1de0d12014-06-06 14:38:30 -07002806 pr_err("%s: interleaving failed\n", __func__);
Mel Gorman1a687c22012-11-22 11:16:36 +00002807
2808 check_numabalancing_enable();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002809}
2810
Christoph Lameter8bccd852005-10-29 18:16:59 -07002811/* Reset policy of current process to default */
Linus Torvalds1da177e2005-04-16 15:20:36 -07002812void numa_default_policy(void)
2813{
David Rientjes028fec42008-04-28 02:12:25 -07002814 do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002815}
Paul Jackson68860ec2005-10-30 15:02:36 -08002816
Paul Jackson42253992006-01-08 01:01:59 -08002817/*
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002818 * Parse and format mempolicy from/to strings
2819 */
2820
2821/*
Hugh Dickinsf2a07f42013-01-02 02:01:33 -08002822 * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002823 */
Lee Schermerhorn345ace92010-05-24 14:32:04 -07002824static const char * const policy_modes[] =
2825{
2826 [MPOL_DEFAULT] = "default",
2827 [MPOL_PREFERRED] = "prefer",
2828 [MPOL_BIND] = "bind",
2829 [MPOL_INTERLEAVE] = "interleave",
Lee Schermerhornd3a71032012-10-25 14:16:29 +02002830 [MPOL_LOCAL] = "local",
Lee Schermerhorn345ace92010-05-24 14:32:04 -07002831};
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002832
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002833
2834#ifdef CONFIG_TMPFS
2835/**
Hugh Dickinsf2a07f42013-01-02 02:01:33 -08002836 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002837 * @str: string containing mempolicy to parse
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002838 * @mpol: pointer to struct mempolicy pointer, returned on success.
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002839 *
2840 * Format of input:
2841 * <mode>[=<flags>][:<nodelist>]
2842 *
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002843 * On success, returns 0, else 1
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002844 */
Hugh Dickinsa7a88b22013-01-02 02:04:23 -08002845int mpol_parse_str(char *str, struct mempolicy **mpol)
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002846{
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002847 struct mempolicy *new = NULL;
Hugh Dickinsf2a07f42013-01-02 02:01:33 -08002848 unsigned short mode_flags;
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002849 nodemask_t nodes;
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002850 char *nodelist = strchr(str, ':');
2851 char *flags = strchr(str, '=');
zhong jiangdedf2c72018-10-26 15:06:57 -07002852 int err = 1, mode;
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002853
Dan Carpenterc7a91bc2020-01-30 22:11:07 -08002854 if (flags)
2855 *flags++ = '\0'; /* terminate mode string */
2856
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002857 if (nodelist) {
2858 /* NUL-terminate mode or flags string */
2859 *nodelist++ = '\0';
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002860 if (nodelist_parse(nodelist, nodes))
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002861 goto out;
Lai Jiangshan01f13bd2012-12-12 13:51:33 -08002862 if (!nodes_subset(nodes, node_states[N_MEMORY]))
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002863 goto out;
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002864 } else
2865 nodes_clear(nodes);
2866
zhong jiangdedf2c72018-10-26 15:06:57 -07002867 mode = match_string(policy_modes, MPOL_MAX, str);
2868 if (mode < 0)
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002869 goto out;
2870
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002871 switch (mode) {
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002872 case MPOL_PREFERRED:
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002873 /*
2874 * Insist on a nodelist of one node only
2875 */
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002876 if (nodelist) {
2877 char *rest = nodelist;
2878 while (isdigit(*rest))
2879 rest++;
KOSAKI Motohiro926f2ae2010-03-23 13:35:32 -07002880 if (*rest)
2881 goto out;
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002882 }
2883 break;
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002884 case MPOL_INTERLEAVE:
2885 /*
2886 * Default to online nodes with memory if no nodelist
2887 */
2888 if (!nodelist)
Lai Jiangshan01f13bd2012-12-12 13:51:33 -08002889 nodes = node_states[N_MEMORY];
Lee Schermerhorn3f226aa2008-04-28 02:13:24 -07002890 break;
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002891 case MPOL_LOCAL:
Lee Schermerhorn3f226aa2008-04-28 02:13:24 -07002892 /*
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002893 * Don't allow a nodelist; mpol_new() checks flags
Lee Schermerhorn3f226aa2008-04-28 02:13:24 -07002894 */
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002895 if (nodelist)
Lee Schermerhorn3f226aa2008-04-28 02:13:24 -07002896 goto out;
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002897 mode = MPOL_PREFERRED;
Lee Schermerhorn3f226aa2008-04-28 02:13:24 -07002898 break;
Ravikiran G Thirumalai413b43d2010-03-23 13:35:28 -07002899 case MPOL_DEFAULT:
2900 /*
2901 * Insist on a empty nodelist
2902 */
2903 if (!nodelist)
2904 err = 0;
2905 goto out;
KOSAKI Motohirod69b2e62010-03-23 13:35:30 -07002906 case MPOL_BIND:
2907 /*
2908 * Insist on a nodelist
2909 */
2910 if (!nodelist)
2911 goto out;
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002912 }
2913
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002914 mode_flags = 0;
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002915 if (flags) {
2916 /*
2917 * Currently, we only support two mutually exclusive
2918 * mode flags.
2919 */
2920 if (!strcmp(flags, "static"))
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002921 mode_flags |= MPOL_F_STATIC_NODES;
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002922 else if (!strcmp(flags, "relative"))
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002923 mode_flags |= MPOL_F_RELATIVE_NODES;
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002924 else
KOSAKI Motohiro926f2ae2010-03-23 13:35:32 -07002925 goto out;
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002926 }
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002927
2928 new = mpol_new(mode, mode_flags, &nodes);
2929 if (IS_ERR(new))
KOSAKI Motohiro926f2ae2010-03-23 13:35:32 -07002930 goto out;
2931
Hugh Dickinsf2a07f42013-01-02 02:01:33 -08002932 /*
2933 * Save nodes for mpol_to_str() to show the tmpfs mount options
2934 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2935 */
2936 if (mode != MPOL_PREFERRED)
2937 new->v.nodes = nodes;
2938 else if (nodelist)
2939 new->v.preferred_node = first_node(nodes);
2940 else
2941 new->flags |= MPOL_F_LOCAL;
2942
2943 /*
2944 * Save nodes for contextualization: this will be used to "clone"
2945 * the mempolicy in a specific context [cpuset] at a later time.
2946 */
2947 new->w.user_nodemask = nodes;
2948
KOSAKI Motohiro926f2ae2010-03-23 13:35:32 -07002949 err = 0;
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002950
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002951out:
2952 /* Restore string for error message */
2953 if (nodelist)
2954 *--nodelist = ':';
2955 if (flags)
2956 *--flags = '=';
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002957 if (!err)
2958 *mpol = new;
Lee Schermerhorn095f1fc2008-04-28 02:13:23 -07002959 return err;
2960}
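/*
 * Example (sketch): parsing a tmpfs "mpol=" mount option.  The option
 * string has the form <mode>[=<flags>][:<nodelist>], e.g. "interleave:0-3"
 * or "prefer=static:1".  The string must be writable because parsing
 * NUL-terminates the mode and flags in place.  The surrounding
 * mount-option plumbing is illustrative only.
 *
 *	struct mempolicy *mpol;
 *	char str[] = "interleave:0-3";
 *
 *	if (mpol_parse_str(str, &mpol))
 *		return -EINVAL;		// could not parse
 *	// On success *mpol holds a referenced policy; drop it with
 *	// mpol_put() when the mount options are released.
 */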
2961#endif /* CONFIG_TMPFS */
2962
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002963/**
2964 * mpol_to_str - format a mempolicy structure for printing
2965 * @buffer: to contain formatted mempolicy string
2966 * @maxlen: length of @buffer
2967 * @pol: pointer to mempolicy to be formatted
Lee Schermerhorn71fe8042008-04-28 02:13:26 -07002968 *
David Rientjes948927e2013-11-12 15:07:28 -08002969 * Convert @pol into a string. If @buffer is too short, truncate the string.
2970 * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2971 * longest flag, "relative", and to display at least a few node ids.
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002972 */
David Rientjes948927e2013-11-12 15:07:28 -08002973void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002974{
2975 char *p = buffer;
David Rientjes948927e2013-11-12 15:07:28 -08002976 nodemask_t nodes = NODE_MASK_NONE;
2977 unsigned short mode = MPOL_DEFAULT;
2978 unsigned short flags = 0;
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002979
David Rientjes8790c71a2014-01-30 15:46:08 -08002980 if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
Lee Schermerhornbea904d2008-04-28 02:13:18 -07002981 mode = pol->mode;
David Rientjes948927e2013-11-12 15:07:28 -08002982 flags = pol->flags;
2983 }
Lee Schermerhornbea904d2008-04-28 02:13:18 -07002984
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002985 switch (mode) {
2986 case MPOL_DEFAULT:
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002987 break;
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002988 case MPOL_PREFERRED:
Lee Schermerhornfc36b8d2008-04-28 02:13:21 -07002989 if (flags & MPOL_F_LOCAL)
Hugh Dickinsf2a07f42013-01-02 02:01:33 -08002990 mode = MPOL_LOCAL;
Lee Schermerhorn53f25562008-04-28 02:13:20 -07002991 else
Lee Schermerhornfc36b8d2008-04-28 02:13:21 -07002992 node_set(pol->v.preferred_node, nodes);
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002993 break;
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002994 case MPOL_BIND:
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002995 case MPOL_INTERLEAVE:
Hugh Dickinsf2a07f42013-01-02 02:01:33 -08002996 nodes = pol->v.nodes;
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002997 break;
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08002998 default:
David Rientjes948927e2013-11-12 15:07:28 -08002999 WARN_ON_ONCE(1);
3000 snprintf(p, maxlen, "unknown");
3001 return;
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08003002 }
3003
David Rientjesb7a9f422013-11-21 14:32:06 -08003004 p += snprintf(p, maxlen, "%s", policy_modes[mode]);
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08003005
Lee Schermerhornfc36b8d2008-04-28 02:13:21 -07003006 if (flags & MPOL_MODE_FLAGS) {
David Rientjes948927e2013-11-12 15:07:28 -08003007 p += snprintf(p, buffer + maxlen - p, "=");
David Rientjesf5b087b2008-04-28 02:12:27 -07003008
Lee Schermerhorn22919902008-04-28 02:13:22 -07003009 /*
3010 * Currently, the only defined flags are mutually exclusive
3011 */
David Rientjesf5b087b2008-04-28 02:12:27 -07003012 if (flags & MPOL_F_STATIC_NODES)
Lee Schermerhorn22919902008-04-28 02:13:22 -07003013 p += snprintf(p, buffer + maxlen - p, "static");
3014 else if (flags & MPOL_F_RELATIVE_NODES)
3015 p += snprintf(p, buffer + maxlen - p, "relative");
David Rientjesf5b087b2008-04-28 02:12:27 -07003016 }
3017
Tejun Heo9e763e02015-02-13 14:38:02 -08003018 if (!nodes_empty(nodes))
3019 p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
3020 nodemask_pr_args(&nodes));
Christoph Lameter1a75a6c2006-01-08 01:01:02 -08003021}
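/*
 * Example (sketch): formatting a policy for procfs output, in the style of
 * show_numa_map() and shmem_show_mpol().  A 64-byte buffer comfortably
 * covers the longest mode, the longest flag and a short nodelist.
 *
 *	char buffer[64];
 *
 *	mpol_to_str(buffer, sizeof(buffer), pol);
 *	seq_printf(m, "mpol=%s", buffer);
 */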