/*
 * linux/mm/page_alloc.c
 *
 * Manages the free list, the system allocates free pages here.
 * Note that kmalloc() lives in slab.c
 *
 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
 * Swap reorganised 29.12.95, Stephen Tweedie
 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
 *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
 */

#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/bootmem.h>
#include <linux/memblock.h>
#include <linux/compiler.h>
#include <linux/kernel.h>
#include <linux/kmemcheck.h>
#include <linux/kasan.h>
#include <linux/module.h>
#include <linux/suspend.h>
#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/oom.h>
#include <linux/notifier.h>
#include <linux/topology.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/memory_hotplug.h>
#include <linux/nodemask.h>
#include <linux/vmalloc.h>
#include <linux/vmstat.h>
#include <linux/mempolicy.h>
#include <linux/stop_machine.h>
#include <linux/sort.h>
#include <linux/pfn.h>
#include <linux/backing-dev.h>
#include <linux/fault-inject.h>
#include <linux/page-isolation.h>
#include <linux/page_ext.h>
#include <linux/debugobjects.h>
#include <linux/kmemleak.h>
#include <linux/compaction.h>
#include <trace/events/kmem.h>
#include <linux/prefetch.h>
#include <linux/mm_inline.h>
#include <linux/migrate.h>
#include <linux/page_ext.h>
#include <linux/hugetlb.h>
#include <linux/sched/rt.h>
#include <linux/page_owner.h>

#include <asm/sections.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
#include "internal.h"

/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
#define MIN_PERCPU_PAGELIST_FRACTION	(8)

#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DEFINE_PER_CPU(int, numa_node);
EXPORT_PER_CPU_SYMBOL(numa_node);
#endif

#ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
 * defined in <linux/topology.h>.
 */
DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
EXPORT_PER_CPU_SYMBOL(_numa_mem_);
int _node_numa_mem_[MAX_NUMNODES];
#endif

/*
 * Array of node states.
 */
nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
	[N_POSSIBLE] = NODE_MASK_ALL,
	[N_ONLINE] = { { [0] = 1UL } },
#ifndef CONFIG_NUMA
	[N_NORMAL_MEMORY] = { { [0] = 1UL } },
#ifdef CONFIG_HIGHMEM
	[N_HIGH_MEMORY] = { { [0] = 1UL } },
#endif
#ifdef CONFIG_MOVABLE_NODE
	[N_MEMORY] = { { [0] = 1UL } },
#endif
	[N_CPU] = { { [0] = 1UL } },
#endif	/* NUMA */
};
EXPORT_SYMBOL(node_states);

/* Protect totalram_pages and zone->managed_pages */
static DEFINE_SPINLOCK(managed_page_count_lock);

unsigned long totalram_pages __read_mostly;
unsigned long totalreserve_pages __read_mostly;
unsigned long totalcma_pages __read_mostly;
/*
 * When calculating the number of globally allowed dirty pages, there
 * is a certain number of per-zone reserves that should not be
 * considered dirtyable memory. This is the sum of those reserves
 * over all existing zones that contribute dirtyable memory.
 */
unsigned long dirty_balance_reserve __read_mostly;

int percpu_pagelist_fraction;
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;

#ifdef CONFIG_PM_SLEEP
/*
 * The following functions are used by the suspend/hibernate code to temporarily
 * change gfp_allowed_mask in order to avoid using I/O during memory allocations
 * while devices are suspended. To avoid races with the suspend/hibernate code,
 * they should always be called with pm_mutex held (gfp_allowed_mask also should
 * only be modified with pm_mutex held, unless the suspend/hibernate code is
 * guaranteed not to run in parallel with that modification).
 */

static gfp_t saved_gfp_mask;

void pm_restore_gfp_mask(void)
{
	WARN_ON(!mutex_is_locked(&pm_mutex));
	if (saved_gfp_mask) {
		gfp_allowed_mask = saved_gfp_mask;
		saved_gfp_mask = 0;
	}
}

void pm_restrict_gfp_mask(void)
{
	WARN_ON(!mutex_is_locked(&pm_mutex));
	WARN_ON(saved_gfp_mask);
	saved_gfp_mask = gfp_allowed_mask;
	gfp_allowed_mask &= ~GFP_IOFS;
}

bool pm_suspended_storage(void)
{
	if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
		return false;
	return true;
}
#endif /* CONFIG_PM_SLEEP */
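
/*
 * Illustrative note (editorial addition, not part of the original source):
 * a minimal sketch of the expected calling pattern, assuming the suspend
 * core serialises on pm_mutex via lock_system_sleep():
 *
 *	lock_system_sleep();		// takes pm_mutex
 *	pm_restrict_gfp_mask();		// strip __GFP_IO/__GFP_FS from the mask
 *	... suspend devices / write the hibernation image ...
 *	pm_restore_gfp_mask();
 *	unlock_system_sleep();
 *
 * While the mask is restricted, pm_suspended_storage() returns true and
 * reclaim must not issue block I/O.
 */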

#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
int pageblock_order __read_mostly;
#endif

static void __free_pages_ok(struct page *page, unsigned int order);

/*
 * results with 256, 32 in the lowmem_reserve sysctl:
 *	1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
 *	1G machine -> (16M dma, 784M normal, 224M high)
 *	NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
 *	HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
 *	HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
 *
 * TBD: should special case ZONE_DMA32 machines here - in those we normally
 * don't need any ZONE_NORMAL reservation
 */
int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
#ifdef CONFIG_ZONE_DMA
	 256,
#endif
#ifdef CONFIG_ZONE_DMA32
	 256,
#endif
#ifdef CONFIG_HIGHMEM
	 32,
#endif
	 32,
};
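
/*
 * Illustrative note (editorial addition, not part of the original source):
 * with the default ratios above, a zone's protection against allocations
 * that could have been satisfied from a higher zone is roughly
 *
 *	reserve = (managed pages of the higher zones) / ratio
 *
 * e.g. shielding a 16M ZONE_DMA from allocations that could have used 784M
 * of ZONE_NORMAL reserves about 784M/256 ~= 3M of DMA memory, while
 * ZONE_NORMAL is protected from HIGHMEM-capable allocations at the stronger
 * 1/32 ratio. The actual sums are computed in setup_per_zone_lowmem_reserve().
 */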

EXPORT_SYMBOL(totalram_pages);

static char * const zone_names[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
	 "DMA",
#endif
#ifdef CONFIG_ZONE_DMA32
	 "DMA32",
#endif
	 "Normal",
#ifdef CONFIG_HIGHMEM
	 "HighMem",
#endif
	 "Movable",
};

int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;

static unsigned long __meminitdata nr_kernel_pages;
static unsigned long __meminitdata nr_all_pages;
static unsigned long __meminitdata dma_reserve;

#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
static unsigned long __initdata required_kernelcore;
static unsigned long __initdata required_movablecore;
static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];

/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
int movable_zone;
EXPORT_SYMBOL(movable_zone);
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */

#if MAX_NUMNODES > 1
int nr_node_ids __read_mostly = MAX_NUMNODES;
int nr_online_nodes __read_mostly = 1;
EXPORT_SYMBOL(nr_node_ids);
EXPORT_SYMBOL(nr_online_nodes);
#endif

int page_group_by_mobility_disabled __read_mostly;

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
static inline void reset_deferred_meminit(pg_data_t *pgdat)
{
	pgdat->first_deferred_pfn = ULONG_MAX;
}

/* Returns true if the struct page for the pfn is uninitialised */
static inline bool __defermem_init early_page_uninitialised(unsigned long pfn)
{
	int nid = early_pfn_to_nid(pfn);

	if (pfn >= NODE_DATA(nid)->first_deferred_pfn)
		return true;

	return false;
}

/*
 * Returns false when the remaining initialisation should be deferred until
 * later in the boot cycle when it can be parallelised.
 */
static inline bool update_defer_init(pg_data_t *pgdat,
				unsigned long pfn, unsigned long zone_end,
				unsigned long *nr_initialised)
{
	/* Always populate low zones for address-constrained allocations */
	if (zone_end < pgdat_end_pfn(pgdat))
		return true;

	/* Initialise at least 2G of the highest zone */
	(*nr_initialised)++;
	if (*nr_initialised > (2UL << (30 - PAGE_SHIFT)) &&
	    (pfn & (PAGES_PER_SECTION - 1)) == 0) {
		pgdat->first_deferred_pfn = pfn;
		return false;
	}

	return true;
}
#else
static inline void reset_deferred_meminit(pg_data_t *pgdat)
{
}

static inline bool early_page_uninitialised(unsigned long pfn)
{
	return false;
}

static inline bool update_defer_init(pg_data_t *pgdat,
				unsigned long pfn, unsigned long zone_end,
				unsigned long *nr_initialised)
{
	return true;
}
#endif
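
/*
 * Illustrative note (editorial addition, not part of the original source):
 * with 4K pages (PAGE_SHIFT == 12) the threshold in update_defer_init() is
 * 2UL << (30 - 12) = 524288 pages, i.e. 2GB per node is initialised eagerly
 * and the remainder of the highest zone is left to the deferred, parallel
 * init path; the cut-off pfn is rounded to a PAGES_PER_SECTION boundary so
 * first_deferred_pfn is section-aligned.
 */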


void set_pageblock_migratetype(struct page *page, int migratetype)
{
	if (unlikely(page_group_by_mobility_disabled &&
		     migratetype < MIGRATE_PCPTYPES))
		migratetype = MIGRATE_UNMOVABLE;

	set_pageblock_flags_group(page, (unsigned long)migratetype,
					PB_migrate, PB_migrate_end);
}

#ifdef CONFIG_DEBUG_VM
static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
{
	int ret = 0;
	unsigned seq;
	unsigned long pfn = page_to_pfn(page);
	unsigned long sp, start_pfn;

	do {
		seq = zone_span_seqbegin(zone);
		start_pfn = zone->zone_start_pfn;
		sp = zone->spanned_pages;
		if (!zone_spans_pfn(zone, pfn))
			ret = 1;
	} while (zone_span_seqretry(zone, seq));

	if (ret)
		pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n",
			pfn, zone_to_nid(zone), zone->name,
			start_pfn, start_pfn + sp);

	return ret;
}

static int page_is_consistent(struct zone *zone, struct page *page)
{
	if (!pfn_valid_within(page_to_pfn(page)))
		return 0;
	if (zone != page_zone(page))
		return 0;

	return 1;
}
/*
 * Temporary debugging check for pages not lying within a given zone.
 */
static int bad_range(struct zone *zone, struct page *page)
{
	if (page_outside_zone_boundaries(zone, page))
		return 1;
	if (!page_is_consistent(zone, page))
		return 1;

	return 0;
}
#else
static inline int bad_range(struct zone *zone, struct page *page)
{
	return 0;
}
#endif

static void bad_page(struct page *page, const char *reason,
		unsigned long bad_flags)
{
	static unsigned long resume;
	static unsigned long nr_shown;
	static unsigned long nr_unshown;

	/* Don't complain about poisoned pages */
	if (PageHWPoison(page)) {
		page_mapcount_reset(page); /* remove PageBuddy */
		return;
	}

	/*
	 * Allow a burst of 60 reports, then keep quiet for that minute;
	 * or allow a steady drip of one report per second.
	 */
	if (nr_shown == 60) {
		if (time_before(jiffies, resume)) {
			nr_unshown++;
			goto out;
		}
		if (nr_unshown) {
			printk(KERN_ALERT
			      "BUG: Bad page state: %lu messages suppressed\n",
				nr_unshown);
			nr_unshown = 0;
		}
		nr_shown = 0;
	}
	if (nr_shown++ == 0)
		resume = jiffies + 60 * HZ;

	printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
		current->comm, page_to_pfn(page));
	dump_page_badflags(page, reason, bad_flags);

	print_modules();
	dump_stack();
out:
	/* Leave bad fields for debug, except PageBuddy could make trouble */
	page_mapcount_reset(page); /* remove PageBuddy */
	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}

/*
 * Higher-order pages are called "compound pages". They are structured thusly:
 *
 * The first PAGE_SIZE page is called the "head page".
 *
 * The remaining PAGE_SIZE pages are called "tail pages".
 *
 * All pages have PG_compound set. All tail pages have their ->first_page
 * pointing at the head page.
 *
 * The first tail page's ->lru.next holds the address of the compound page's
 * put_page() function. Its ->lru.prev holds the order of allocation.
 * This usage means that zero-order pages may not be compound.
 */

static void free_compound_page(struct page *page)
{
	__free_pages_ok(page, compound_order(page));
}

void prep_compound_page(struct page *page, unsigned long order)
{
	int i;
	int nr_pages = 1 << order;

	set_compound_page_dtor(page, free_compound_page);
	set_compound_order(page, order);
	__SetPageHead(page);
	for (i = 1; i < nr_pages; i++) {
		struct page *p = page + i;
		set_page_count(p, 0);
		p->first_page = page;
		/* Make sure p->first_page is always valid for PageTail() */
		smp_wmb();
		__SetPageTail(p);
	}
}
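
/*
 * Illustrative note (editorial addition, not part of the original source):
 * after prep_compound_page() on an order-2 allocation, the four struct pages
 * look roughly like
 *
 *	page[0]: PG_head set, returned to the caller
 *	page[1..3]: PG_tail set, ->first_page pointing back at page[0],
 *		    refcount 0
 *
 * with the compound order and destructor (free_compound_page) recorded so a
 * later put_page() on the head frees the whole order-2 unit at once.
 */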

#ifdef CONFIG_DEBUG_PAGEALLOC
unsigned int _debug_guardpage_minorder;
bool _debug_pagealloc_enabled __read_mostly;
bool _debug_guardpage_enabled __read_mostly;

static int __init early_debug_pagealloc(char *buf)
{
	if (!buf)
		return -EINVAL;

	if (strcmp(buf, "on") == 0)
		_debug_pagealloc_enabled = true;

	return 0;
}
early_param("debug_pagealloc", early_debug_pagealloc);

static bool need_debug_guardpage(void)
{
	/* If we don't use debug_pagealloc, we don't need guard page */
	if (!debug_pagealloc_enabled())
		return false;

	return true;
}

static void init_debug_guardpage(void)
{
	if (!debug_pagealloc_enabled())
		return;

	_debug_guardpage_enabled = true;
}

struct page_ext_operations debug_guardpage_ops = {
	.need = need_debug_guardpage,
	.init = init_debug_guardpage,
};

static int __init debug_guardpage_minorder_setup(char *buf)
{
	unsigned long res;

	if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
		printk(KERN_ERR "Bad debug_guardpage_minorder value\n");
		return 0;
	}
	_debug_guardpage_minorder = res;
	printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res);
	return 0;
}
__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);

static inline void set_page_guard(struct zone *zone, struct page *page,
				unsigned int order, int migratetype)
{
	struct page_ext *page_ext;

	if (!debug_guardpage_enabled())
		return;

	page_ext = lookup_page_ext(page);
	__set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);

	INIT_LIST_HEAD(&page->lru);
	set_page_private(page, order);
	/* Guard pages are not available for any usage */
	__mod_zone_freepage_state(zone, -(1 << order), migratetype);
}

static inline void clear_page_guard(struct zone *zone, struct page *page,
				unsigned int order, int migratetype)
{
	struct page_ext *page_ext;

	if (!debug_guardpage_enabled())
		return;

	page_ext = lookup_page_ext(page);
	__clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);

	set_page_private(page, 0);
	if (!is_migrate_isolate(migratetype))
		__mod_zone_freepage_state(zone, (1 << order), migratetype);
}
#else
struct page_ext_operations debug_guardpage_ops = { NULL, };
static inline void set_page_guard(struct zone *zone, struct page *page,
				unsigned int order, int migratetype) {}
static inline void clear_page_guard(struct zone *zone, struct page *page,
				unsigned int order, int migratetype) {}
#endif

static inline void set_page_order(struct page *page, unsigned int order)
{
	set_page_private(page, order);
	__SetPageBuddy(page);
}

static inline void rmv_page_order(struct page *page)
{
	__ClearPageBuddy(page);
	set_page_private(page, 0);
}

/*
 * This function checks whether a page is free && is the buddy.
 * We can coalesce a page and its buddy if
 * (a) the buddy is not in a hole &&
 * (b) the buddy is in the buddy system &&
 * (c) a page and its buddy have the same order &&
 * (d) a page and its buddy are in the same zone.
 *
 * For recording whether a page is in the buddy system, we set ->_mapcount
 * to PAGE_BUDDY_MAPCOUNT_VALUE.
 * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is
 * serialized by zone->lock.
 *
 * For recording page's order, we use page_private(page).
 */
static inline int page_is_buddy(struct page *page, struct page *buddy,
							unsigned int order)
{
	if (!pfn_valid_within(page_to_pfn(buddy)))
		return 0;

	if (page_is_guard(buddy) && page_order(buddy) == order) {
		if (page_zone_id(page) != page_zone_id(buddy))
			return 0;

		VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);

		return 1;
	}

	if (PageBuddy(buddy) && page_order(buddy) == order) {
		/*
		 * zone check is done late to avoid uselessly
		 * calculating zone/node ids for pages that could
		 * never merge.
		 */
		if (page_zone_id(page) != page_zone_id(buddy))
			return 0;

		VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);

		return 1;
	}
	return 0;
}

/*
 * Freeing function for a buddy system allocator.
 *
 * The concept of a buddy system is to maintain a direct-mapped table
 * (containing bit values) for memory blocks of various "orders".
 * The bottom level table contains the map for the smallest allocatable
 * units of memory (here, pages), and each level above it describes
 * pairs of units from the levels below, hence, "buddies".
 * At a high level, all that happens here is marking the table entry
 * at the bottom level available, and propagating the changes upward
 * as necessary, plus some accounting needed to play nicely with other
 * parts of the VM system.
 * At each level, we keep a list of pages, which are heads of continuous
 * free pages of length (1 << order) and marked with _mapcount
 * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page)
 * field.
 * So when we are allocating or freeing one, we can derive the state of the
 * other. That is, if we allocate a small block, and both were
 * free, the remainder of the region must be split into blocks.
 * If a block is freed, and its buddy is also free, then this
 * triggers coalescing into a block of larger size.
 *
 * -- nyc
 */
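
/*
 * Illustrative note (editorial addition, not part of the original source):
 * within a max_order-aligned group, a page's buddy at a given order is found
 * by flipping the order bit of its index, i.e. __find_buddy_index(page_idx,
 * order) is page_idx ^ (1 << order). For example, page_idx 8 at order 1 has
 * buddy_idx 10; if both order-1 blocks are free they merge into the order-2
 * block at combined_idx = buddy_idx & page_idx = 8.
 */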

static inline void __free_one_page(struct page *page,
		unsigned long pfn,
		struct zone *zone, unsigned int order,
		int migratetype)
{
	unsigned long page_idx;
	unsigned long combined_idx;
	unsigned long uninitialized_var(buddy_idx);
	struct page *buddy;
	int max_order = MAX_ORDER;

	VM_BUG_ON(!zone_is_initialized(zone));
	VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);

	VM_BUG_ON(migratetype == -1);
	if (is_migrate_isolate(migratetype)) {
		/*
		 * We restrict max order of merging to prevent merge
		 * between freepages on isolate pageblock and normal
		 * pageblock. Without this, pageblock isolation
		 * could cause incorrect freepage accounting.
		 */
		max_order = min(MAX_ORDER, pageblock_order + 1);
	} else {
		__mod_zone_freepage_state(zone, 1 << order, migratetype);
	}

	page_idx = pfn & ((1 << max_order) - 1);

	VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page);
	VM_BUG_ON_PAGE(bad_range(zone, page), page);

	while (order < max_order - 1) {
		buddy_idx = __find_buddy_index(page_idx, order);
		buddy = page + (buddy_idx - page_idx);
		if (!page_is_buddy(page, buddy, order))
			break;
		/*
		 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
		 * merge with it and move up one order.
		 */
		if (page_is_guard(buddy)) {
			clear_page_guard(zone, buddy, order, migratetype);
		} else {
			list_del(&buddy->lru);
			zone->free_area[order].nr_free--;
			rmv_page_order(buddy);
		}
		combined_idx = buddy_idx & page_idx;
		page = page + (combined_idx - page_idx);
		page_idx = combined_idx;
		order++;
	}
	set_page_order(page, order);

	/*
	 * If this is not the largest possible page, check if the buddy
	 * of the next-highest order is free. If it is, it's possible
	 * that pages are being freed that will coalesce soon. In case
	 * that is happening, add the free page to the tail of the list
	 * so it's less likely to be used soon and more likely to be merged
	 * as a higher order page.
	 */
	if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
		struct page *higher_page, *higher_buddy;
		combined_idx = buddy_idx & page_idx;
		higher_page = page + (combined_idx - page_idx);
		buddy_idx = __find_buddy_index(combined_idx, order + 1);
		higher_buddy = higher_page + (buddy_idx - combined_idx);
		if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
			list_add_tail(&page->lru,
				&zone->free_area[order].free_list[migratetype]);
			goto out;
		}
	}

	list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
out:
	zone->free_area[order].nr_free++;
}

static inline int free_pages_check(struct page *page)
{
	const char *bad_reason = NULL;
	unsigned long bad_flags = 0;

	if (unlikely(page_mapcount(page)))
		bad_reason = "nonzero mapcount";
	if (unlikely(page->mapping != NULL))
		bad_reason = "non-NULL mapping";
	if (unlikely(atomic_read(&page->_count) != 0))
		bad_reason = "nonzero _count";
	if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) {
		bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
		bad_flags = PAGE_FLAGS_CHECK_AT_FREE;
	}
#ifdef CONFIG_MEMCG
	if (unlikely(page->mem_cgroup))
		bad_reason = "page still charged to cgroup";
#endif
	if (unlikely(bad_reason)) {
		bad_page(page, bad_reason, bad_flags);
		return 1;
	}
	page_cpupid_reset_last(page);
	if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
		page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
	return 0;
}

/*
 * Frees a number of pages from the PCP lists.
 * Assumes all pages on list are in same zone, and of same order.
 * count is the number of pages to free.
 *
 * If the zone was previously in an "all pages pinned" state then look to
 * see if this freeing clears that state.
 *
 * And clear the zone's pages_scanned counter, to hold off the "all pages are
 * pinned" detection logic.
 */
static void free_pcppages_bulk(struct zone *zone, int count,
					struct per_cpu_pages *pcp)
{
	int migratetype = 0;
	int batch_free = 0;
	int to_free = count;
	unsigned long nr_scanned;

	spin_lock(&zone->lock);
	nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
	if (nr_scanned)
		__mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);

	while (to_free) {
		struct page *page;
		struct list_head *list;

		/*
		 * Remove pages from lists in a round-robin fashion. A
		 * batch_free count is maintained that is incremented when an
		 * empty list is encountered. This is so more pages are freed
		 * off fuller lists instead of spinning excessively around
		 * empty lists.
		 */
		do {
			batch_free++;
			if (++migratetype == MIGRATE_PCPTYPES)
				migratetype = 0;
			list = &pcp->lists[migratetype];
		} while (list_empty(list));

		/* This is the only non-empty list. Free them all. */
		if (batch_free == MIGRATE_PCPTYPES)
			batch_free = to_free;

		do {
			int mt;	/* migratetype of the to-be-freed page */

			page = list_entry(list->prev, struct page, lru);
			/* must delete as __free_one_page list manipulates */
			list_del(&page->lru);
			mt = get_freepage_migratetype(page);
			if (unlikely(has_isolate_pageblock(zone)))
				mt = get_pageblock_migratetype(page);

			/* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
			__free_one_page(page, page_to_pfn(page), zone, 0, mt);
			trace_mm_page_pcpu_drain(page, 0, mt);
		} while (--to_free && --batch_free && !list_empty(list));
	}
	spin_unlock(&zone->lock);
}
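
/*
 * Illustrative note (editorial addition, not part of the original source):
 * with MIGRATE_PCPTYPES == 3, a drain of count == 6 pages walks the
 * unmovable, reclaimable and movable pcp lists in turn; if only one of them
 * has pages, batch_free quickly reaches MIGRATE_PCPTYPES and the remaining
 * quota is taken entirely from that list instead of repeatedly stepping
 * over the empty ones.
 */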

static void free_one_page(struct zone *zone,
				struct page *page, unsigned long pfn,
				unsigned int order,
				int migratetype)
{
	unsigned long nr_scanned;
	spin_lock(&zone->lock);
	nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
	if (nr_scanned)
		__mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);

	if (unlikely(has_isolate_pageblock(zone) ||
		is_migrate_isolate(migratetype))) {
		migratetype = get_pfnblock_migratetype(page, pfn);
	}
	__free_one_page(page, pfn, zone, order, migratetype);
	spin_unlock(&zone->lock);
}

static int free_tail_pages_check(struct page *head_page, struct page *page)
{
	if (!IS_ENABLED(CONFIG_DEBUG_VM))
		return 0;
	if (unlikely(!PageTail(page))) {
		bad_page(page, "PageTail not set", 0);
		return 1;
	}
	if (unlikely(page->first_page != head_page)) {
		bad_page(page, "first_page not consistent", 0);
		return 1;
	}
	return 0;
}

static void __meminit __init_single_page(struct page *page, unsigned long pfn,
				unsigned long zone, int nid)
{
	struct zone *z = &NODE_DATA(nid)->node_zones[zone];

	set_page_links(page, zone, nid, pfn);
	mminit_verify_page_links(page, zone, nid, pfn);
	init_page_count(page);
	page_mapcount_reset(page);
	page_cpupid_reset_last(page);

	/*
	 * Mark the block movable so that blocks are reserved for
	 * movable at startup. This will force kernel allocations
	 * to reserve their blocks rather than leaking throughout
	 * the address space during boot when many long-lived
	 * kernel allocations are made. Later some blocks near
	 * the start are marked MIGRATE_RESERVE by
	 * setup_zone_migrate_reserve().
	 *
	 * The bitmap is created for the zone's valid pfn range, but the
	 * memmap can be created for invalid pages (for alignment), so
	 * check here that we do not call set_pageblock_migratetype()
	 * against a pfn out of the zone.
	 */
	if ((z->zone_start_pfn <= pfn)
	    && (pfn < zone_end_pfn(z))
	    && !(pfn & (pageblock_nr_pages - 1)))
		set_pageblock_migratetype(page, MIGRATE_MOVABLE);

	INIT_LIST_HEAD(&page->lru);
#ifdef WANT_PAGE_VIRTUAL
	/* The shift won't overflow because ZONE_NORMAL is below 4G. */
	if (!is_highmem_idx(zone))
		set_page_address(page, __va(pfn << PAGE_SHIFT));
#endif
}

static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone,
					int nid)
{
	return __init_single_page(pfn_to_page(pfn), pfn, zone, nid);
}

/*
 * Initialised pages do not have PageReserved set. This function is
 * called for each range allocated by the bootmem allocator and
 * marks the pages PageReserved. The remaining valid pages are later
 * sent to the buddy page allocator.
 */
void reserve_bootmem_region(unsigned long start, unsigned long end)
{
	unsigned long start_pfn = PFN_DOWN(start);
	unsigned long end_pfn = PFN_UP(end);

	for (; start_pfn < end_pfn; start_pfn++)
		if (pfn_valid(start_pfn))
			SetPageReserved(pfn_to_page(start_pfn));
}

static bool free_pages_prepare(struct page *page, unsigned int order)
{
	bool compound = PageCompound(page);
	int i, bad = 0;

	VM_BUG_ON_PAGE(PageTail(page), page);
	VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);

	trace_mm_page_free(page, order);
	kmemcheck_free_shadow(page, order);
	kasan_free_pages(page, order);

	if (PageAnon(page))
		page->mapping = NULL;
	bad += free_pages_check(page);
	for (i = 1; i < (1 << order); i++) {
		if (compound)
			bad += free_tail_pages_check(page, page + i);
		bad += free_pages_check(page + i);
	}
	if (bad)
		return false;

	reset_page_owner(page, order);

	if (!PageHighMem(page)) {
		debug_check_no_locks_freed(page_address(page),
					   PAGE_SIZE << order);
		debug_check_no_obj_freed(page_address(page),
					   PAGE_SIZE << order);
	}
	arch_free_page(page, order);
	kernel_map_pages(page, 1 << order, 0);

	return true;
}

static void __free_pages_ok(struct page *page, unsigned int order)
{
	unsigned long flags;
	int migratetype;
	unsigned long pfn = page_to_pfn(page);

	if (!free_pages_prepare(page, order))
		return;

	migratetype = get_pfnblock_migratetype(page, pfn);
	local_irq_save(flags);
	__count_vm_events(PGFREE, 1 << order);
	set_freepage_migratetype(page, migratetype);
	free_one_page(page_zone(page), page, pfn, order, migratetype);
	local_irq_restore(flags);
}

static void __defer_init __free_pages_boot_core(struct page *page,
					unsigned long pfn, unsigned int order)
{
	unsigned int nr_pages = 1 << order;
	struct page *p = page;
	unsigned int loop;

	prefetchw(p);
	for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
		prefetchw(p + 1);
		__ClearPageReserved(p);
		set_page_count(p, 0);
	}
	__ClearPageReserved(p);
	set_page_count(p, 0);

	page_zone(page)->managed_pages += nr_pages;
	set_page_refcounted(page);
	__free_pages(page, order);
}

#if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \
	defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
/* Only safe to use early in boot when initialisation is single-threaded */
static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;

int __meminit early_pfn_to_nid(unsigned long pfn)
{
	int nid;

	/* The system will behave unpredictably otherwise */
	BUG_ON(system_state != SYSTEM_BOOTING);

	nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
	if (nid >= 0)
		return nid;
	/* just returns 0 */
	return 0;
}
#endif

#ifdef CONFIG_NODES_SPAN_OTHER_NODES
static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node,
					struct mminit_pfnnid_cache *state)
{
	int nid;

	nid = __early_pfn_to_nid(pfn, state);
	if (nid >= 0 && nid != node)
		return false;
	return true;
}

/* Only safe to use early in boot when initialisation is single-threaded */
static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
{
	return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache);
}

#else

static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
{
	return true;
}
static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node,
					struct mminit_pfnnid_cache *state)
{
	return true;
}
#endif


void __defer_init __free_pages_bootmem(struct page *page, unsigned long pfn,
							unsigned int order)
{
	if (early_page_uninitialised(pfn))
		return;
	return __free_pages_boot_core(page, pfn, order);
}

#ifdef CONFIG_CMA
/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
void __init init_cma_reserved_pageblock(struct page *page)
{
	unsigned i = pageblock_nr_pages;
	struct page *p = page;

	do {
		__ClearPageReserved(p);
		set_page_count(p, 0);
	} while (++p, --i);

	set_pageblock_migratetype(page, MIGRATE_CMA);

	if (pageblock_order >= MAX_ORDER) {
		i = pageblock_nr_pages;
		p = page;
		do {
			set_page_refcounted(p);
			__free_pages(p, MAX_ORDER - 1);
			p += MAX_ORDER_NR_PAGES;
		} while (i -= MAX_ORDER_NR_PAGES);
	} else {
		set_page_refcounted(page);
		__free_pages(page, pageblock_order);
	}

	adjust_managed_page_count(page, pageblock_nr_pages);
}
#endif

/*
 * The order of subdivision here is critical for the IO subsystem.
 * Please do not alter this order without good reasons and regression
 * testing. Specifically, as large blocks of memory are subdivided,
 * the order in which smaller blocks are delivered depends on the order
 * they're subdivided in this function. This is the primary factor
 * influencing the order in which pages are delivered to the IO
 * subsystem according to empirical testing, and this is also justified
 * by considering the behavior of a buddy system containing a single
 * large block of memory acted on by a series of small allocations.
 * This behavior is a critical factor in sglist merging's success.
 *
 * -- nyc
 */
static inline void expand(struct zone *zone, struct page *page,
	int low, int high, struct free_area *area,
	int migratetype)
{
	unsigned long size = 1 << high;

	while (high > low) {
		area--;
		high--;
		size >>= 1;
		VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);

		if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) &&
			debug_guardpage_enabled() &&
			high < debug_guardpage_minorder()) {
			/*
			 * Mark as guard pages (or page), so that they can be
			 * merged back into the allocator when the buddy is
			 * freed. Corresponding page table entries will not be
			 * touched; the pages stay not present in the virtual
			 * address space.
			 */
			set_page_guard(zone, &page[size], high, migratetype);
			continue;
		}
		list_add(&page[size].lru, &area->free_list[migratetype]);
		area->nr_free++;
		set_page_order(&page[size], high);
	}
}
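
/*
 * Illustrative note (editorial addition, not part of the original source):
 * serving an order-0 request from an order-3 block calls
 * expand(zone, page, 0, 3, ...), which splits off and re-frees the upper
 * halves in decreasing order: the order-2 half at page[4], the order-1 half
 * at page[2], then the order-0 buddy at page[1], leaving page[0] for the
 * caller. Each split-off half lands on the free_list of its new, smaller
 * order.
 */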
1094
Linus Torvalds1da177e2005-04-16 15:20:36 -07001095/*
1096 * This page is about to be returned from the page allocator
1097 */
Wu Fengguang2a7684a2009-09-16 11:50:12 +02001098static inline int check_new_page(struct page *page)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001099{
Kirill A. Shutemovd230dec2014-04-07 15:37:38 -07001100 const char *bad_reason = NULL;
Dave Hansenf0b791a2014-01-23 15:52:49 -08001101 unsigned long bad_flags = 0;
1102
1103 if (unlikely(page_mapcount(page)))
1104 bad_reason = "nonzero mapcount";
1105 if (unlikely(page->mapping != NULL))
1106 bad_reason = "non-NULL mapping";
1107 if (unlikely(atomic_read(&page->_count) != 0))
1108 bad_reason = "nonzero _count";
1109 if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) {
1110 bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
1111 bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
1112 }
Johannes Weiner9edad6e2014-12-10 15:44:58 -08001113#ifdef CONFIG_MEMCG
1114 if (unlikely(page->mem_cgroup))
1115 bad_reason = "page still charged to cgroup";
1116#endif
Dave Hansenf0b791a2014-01-23 15:52:49 -08001117 if (unlikely(bad_reason)) {
1118 bad_page(page, bad_reason, bad_flags);
Hugh Dickins689bceb2005-11-21 21:32:20 -08001119 return 1;
Hugh Dickins8cc3b392009-01-06 14:40:06 -08001120 }
Wu Fengguang2a7684a2009-09-16 11:50:12 +02001121 return 0;
1122}
1123
Vlastimil Babka75379192015-02-11 15:25:38 -08001124static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
1125 int alloc_flags)
Wu Fengguang2a7684a2009-09-16 11:50:12 +02001126{
1127 int i;
1128
1129 for (i = 0; i < (1 << order); i++) {
1130 struct page *p = page + i;
1131 if (unlikely(check_new_page(p)))
1132 return 1;
1133 }
Hugh Dickins689bceb2005-11-21 21:32:20 -08001134
Hugh Dickins4c21e2f2005-10-29 18:16:40 -07001135 set_page_private(page, 0);
Nick Piggin7835e982006-03-22 00:08:40 -08001136 set_page_refcounted(page);
Nick Piggincc1025092006-12-06 20:32:00 -08001137
1138 arch_alloc_page(page, order);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001139 kernel_map_pages(page, 1 << order, 1);
Andrey Ryabininb8c73fc2015-02-13 14:39:28 -08001140 kasan_alloc_pages(page, order);
Nick Piggin17cf4402006-03-22 00:08:41 -08001141
1142 if (gfp_flags & __GFP_ZERO)
Anisse Astierf4d28972015-06-24 16:56:36 -07001143 for (i = 0; i < (1 << order); i++)
1144 clear_highpage(page + i);
Nick Piggin17cf4402006-03-22 00:08:41 -08001145
1146 if (order && (gfp_flags & __GFP_COMP))
1147 prep_compound_page(page, order);
1148
Joonsoo Kim48c96a32014-12-12 16:56:01 -08001149 set_page_owner(page, order, gfp_flags);
1150
Vlastimil Babka75379192015-02-11 15:25:38 -08001151 /*
1152 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was necessary to
1153 * allocate the page. The expectation is that the caller is taking
1154 * steps that will free more memory. The caller should avoid the page
1155 * being used for !PFMEMALLOC purposes.
1156 */
1157 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
1158
Hugh Dickins689bceb2005-11-21 21:32:20 -08001159 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001160}
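
/*
 * Illustrative note (editorial addition, not part of the original source):
 * for an order-2 allocation with GFP_KERNEL | __GFP_ZERO | __GFP_COMP, the
 * loop above clears all four pages with clear_highpage() and then links
 * them into a compound page via prep_compound_page(); a plain GFP_KERNEL
 * order-0 request skips both branches and only goes through the debug
 * checks and the refcount/kasan/page_owner bookkeeping.
 */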
1161
Mel Gorman56fd56b2007-10-16 01:25:58 -07001162/*
1163 * Go through the free lists for the given migratetype and remove
1164 * the smallest available page from the freelists
1165 */
Mel Gorman728ec982009-06-16 15:32:04 -07001166static inline
1167struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
Mel Gorman56fd56b2007-10-16 01:25:58 -07001168 int migratetype)
1169{
1170 unsigned int current_order;
Pintu Kumarb8af2942013-09-11 14:20:34 -07001171 struct free_area *area;
Mel Gorman56fd56b2007-10-16 01:25:58 -07001172 struct page *page;
1173
1174 /* Find a page of the appropriate size in the preferred list */
1175 for (current_order = order; current_order < MAX_ORDER; ++current_order) {
1176 area = &(zone->free_area[current_order]);
1177 if (list_empty(&area->free_list[migratetype]))
1178 continue;
1179
1180 page = list_entry(area->free_list[migratetype].next,
1181 struct page, lru);
1182 list_del(&page->lru);
1183 rmv_page_order(page);
1184 area->nr_free--;
Mel Gorman56fd56b2007-10-16 01:25:58 -07001185 expand(zone, page, order, current_order, area, migratetype);
Vlastimil Babka5bcc9f82014-06-04 16:07:22 -07001186 set_freepage_migratetype(page, migratetype);
Mel Gorman56fd56b2007-10-16 01:25:58 -07001187 return page;
1188 }
1189
1190 return NULL;
1191}
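/*
 * Illustrative sketch of the smallest-first search above (the concrete
 * orders are an example only): suppose an order-1 MIGRATE_MOVABLE request
 * arrives while the movable free lists only hold an order-3 block. The
 * loop skips the empty order-1 and order-2 lists, takes the order-3 page,
 * and expand() hands the unused remainder (one order-2 and one order-1
 * buddy) back to the corresponding free lists, so only two of the eight
 * pages actually leave the buddy allocator.
 */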
1192
1193
Mel Gormanb2a0ac82007-10-16 01:25:48 -07001194/*
1195 * This array describes the order lists are fallen back to when
1196 * the free lists for the desirable migrate type are depleted
1197 */
Michal Nazarewicz47118af2011-12-29 13:09:50 +01001198static int fallbacks[MIGRATE_TYPES][4] = {
1199 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
1200 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
Michal Nazarewicz47118af2011-12-29 13:09:50 +01001201 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
Joonsoo Kimdc676472015-04-14 15:45:15 -07001202#ifdef CONFIG_CMA
1203 [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */
Michal Nazarewicz47118af2011-12-29 13:09:50 +01001204#endif
Michal Nazarewicz6d4a4912012-01-11 15:31:33 +01001205 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */
Minchan Kim194159f2013-02-22 16:33:58 -08001206#ifdef CONFIG_MEMORY_ISOLATION
Michal Nazarewicz6d4a4912012-01-11 15:31:33 +01001207 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */
Minchan Kim194159f2013-02-22 16:33:58 -08001208#endif
Mel Gormanb2a0ac82007-10-16 01:25:48 -07001209};
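/*
 * Worked example of the table above (sketch only): an unmovable
 * allocation whose own free lists are empty walks
 * fallbacks[MIGRATE_UNMOVABLE] and therefore tries MIGRATE_RECLAIMABLE
 * first, then MIGRATE_MOVABLE, and gives up when it reaches the
 * MIGRATE_RESERVE sentinel. The MIGRATE_CMA and MIGRATE_ISOLATE rows are
 * never indexed because allocations are not started from those
 * migratetypes; CMA fallback is handled separately by
 * __rmqueue_cma_fallback().
 */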
1210
Joonsoo Kimdc676472015-04-14 15:45:15 -07001211#ifdef CONFIG_CMA
1212static struct page *__rmqueue_cma_fallback(struct zone *zone,
1213 unsigned int order)
1214{
1215 return __rmqueue_smallest(zone, order, MIGRATE_CMA);
1216}
1217#else
1218static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
1219 unsigned int order) { return NULL; }
1220#endif
1221
Mel Gormanc361be52007-10-16 01:25:51 -07001222/*
1223 * Move the free pages in a range to the free lists of the requested type.
Mel Gormand9c23402007-10-16 01:26:01 -07001224 * Note that start_page and end_page are not aligned on a pageblock
Mel Gormanc361be52007-10-16 01:25:51 -07001225 * boundary. If alignment is required, use move_freepages_block()
1226 */
Minchan Kim435b4052012-10-08 16:32:16 -07001227int move_freepages(struct zone *zone,
Adrian Bunkb69a7282008-07-23 21:28:12 -07001228 struct page *start_page, struct page *end_page,
1229 int migratetype)
Mel Gormanc361be52007-10-16 01:25:51 -07001230{
1231 struct page *page;
1232 unsigned long order;
Mel Gormand1003132007-10-16 01:26:00 -07001233 int pages_moved = 0;
Mel Gormanc361be52007-10-16 01:25:51 -07001234
1235#ifndef CONFIG_HOLES_IN_ZONE
1236 /*
1237 * page_zone is not safe to call in this context when
1238 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
1239 * anyway as we check zone boundaries in move_freepages_block().
1240 * Remove at a later date when no bug reports exist related to
Mel Gormanac0e5b72007-10-16 01:25:58 -07001241 * grouping pages by mobility
Mel Gormanc361be52007-10-16 01:25:51 -07001242 */
Mel Gorman97ee4ba2014-10-09 15:28:28 -07001243 VM_BUG_ON(page_zone(start_page) != page_zone(end_page));
Mel Gormanc361be52007-10-16 01:25:51 -07001244#endif
1245
1246 for (page = start_page; page <= end_page;) {
Adam Litke344c7902008-09-02 14:35:38 -07001247 /* Make sure we are not inadvertently changing nodes */
Sasha Levin309381fea2014-01-23 15:52:54 -08001248 VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
Adam Litke344c7902008-09-02 14:35:38 -07001249
Mel Gormanc361be52007-10-16 01:25:51 -07001250 if (!pfn_valid_within(page_to_pfn(page))) {
1251 page++;
1252 continue;
1253 }
1254
1255 if (!PageBuddy(page)) {
1256 page++;
1257 continue;
1258 }
1259
1260 order = page_order(page);
Kirill A. Shutemov84be48d2011-03-22 16:33:41 -07001261 list_move(&page->lru,
1262 &zone->free_area[order].free_list[migratetype]);
Minchan Kim95e34412012-10-08 16:32:11 -07001263 set_freepage_migratetype(page, migratetype);
Mel Gormanc361be52007-10-16 01:25:51 -07001264 page += 1 << order;
Mel Gormand1003132007-10-16 01:26:00 -07001265 pages_moved += 1 << order;
Mel Gormanc361be52007-10-16 01:25:51 -07001266 }
1267
Mel Gormand1003132007-10-16 01:26:00 -07001268 return pages_moved;
Mel Gormanc361be52007-10-16 01:25:51 -07001269}
1270
Minchan Kimee6f5092012-07-31 16:43:50 -07001271int move_freepages_block(struct zone *zone, struct page *page,
Linus Torvalds68e3e922012-06-03 20:05:57 -07001272 int migratetype)
Mel Gormanc361be52007-10-16 01:25:51 -07001273{
1274 unsigned long start_pfn, end_pfn;
1275 struct page *start_page, *end_page;
1276
1277 start_pfn = page_to_pfn(page);
Mel Gormand9c23402007-10-16 01:26:01 -07001278 start_pfn = start_pfn & ~(pageblock_nr_pages-1);
Mel Gormanc361be52007-10-16 01:25:51 -07001279 start_page = pfn_to_page(start_pfn);
Mel Gormand9c23402007-10-16 01:26:01 -07001280 end_page = start_page + pageblock_nr_pages - 1;
1281 end_pfn = start_pfn + pageblock_nr_pages - 1;
Mel Gormanc361be52007-10-16 01:25:51 -07001282
1283 /* Do not cross zone boundaries */
Cody P Schafer108bcc92013-02-22 16:35:23 -08001284 if (!zone_spans_pfn(zone, start_pfn))
Mel Gormanc361be52007-10-16 01:25:51 -07001285 start_page = page;
Cody P Schafer108bcc92013-02-22 16:35:23 -08001286 if (!zone_spans_pfn(zone, end_pfn))
Mel Gormanc361be52007-10-16 01:25:51 -07001287 return 0;
1288
1289 return move_freepages(zone, start_page, end_page, migratetype);
1290}
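/*
 * Worked example of the alignment above (sketch; assumes pageblock_order
 * is 9, i.e. pageblock_nr_pages is 512, as on a typical x86-64 config):
 * for a page at pfn 1000,
 *	start_pfn = 1000 & ~(512 - 1) = 512
 * so the block spans pfns 512..1023 and move_freepages() walks exactly
 * that pageblock, returning the number of base pages it moved.
 */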
1291
Mel Gorman2f66a682009-09-21 17:02:31 -07001292static void change_pageblock_range(struct page *pageblock_page,
1293 int start_order, int migratetype)
1294{
1295 int nr_pageblocks = 1 << (start_order - pageblock_order);
1296
1297 while (nr_pageblocks--) {
1298 set_pageblock_migratetype(pageblock_page, migratetype);
1299 pageblock_page += pageblock_nr_pages;
1300 }
1301}
1302
Srivatsa S. Bhatfef903e2013-09-11 14:20:35 -07001303/*
Vlastimil Babka9c0415e2015-02-11 15:28:21 -08001304 * When we are falling back to another migratetype during allocation, try to
1305 * steal extra free pages from the same pageblocks to satisfy further
1306 * allocations, instead of polluting multiple pageblocks.
1307 *
1308 * If we are stealing a relatively large buddy page, it is likely there will
1309 * be more free pages in the pageblock, so try to steal them all. For
1310 * reclaimable and unmovable allocations, we steal regardless of page size,
1311 * as fragmentation caused by those allocations polluting movable pageblocks
1312 * is worse than movable allocations stealing from unmovable and reclaimable
1313 * pageblocks.
Srivatsa S. Bhatfef903e2013-09-11 14:20:35 -07001314 */
Joonsoo Kim4eb7dce2015-04-14 15:45:18 -07001315static bool can_steal_fallback(unsigned int order, int start_mt)
1316{
1317 /*
1318	 * This order check is left in intentionally, even though the next
1319	 * check applies a more relaxed one. The reason is that we can steal
1320	 * a whole pageblock when this condition is met, whereas the check
1321	 * below does not guarantee that and is only a heuristic, so it could
1322	 * be changed at any time.
1323 */
1324 if (order >= pageblock_order)
1325 return true;
1326
1327 if (order >= pageblock_order / 2 ||
1328 start_mt == MIGRATE_RECLAIMABLE ||
1329 start_mt == MIGRATE_UNMOVABLE ||
1330 page_group_by_mobility_disabled)
1331 return true;
1332
1333 return false;
1334}
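/*
 * Sketch of the thresholds above, assuming pageblock_order is 9 (a common
 * configuration, not guaranteed): an order >= 9 request may always steal,
 * since it can take over a whole pageblock; otherwise stealing is allowed
 * for order >= 4 requests, for any MIGRATE_RECLAIMABLE or
 * MIGRATE_UNMOVABLE request, or when grouping pages by mobility is
 * disabled.
 */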
1335
1336/*
1337 * This function implements the actual steal behaviour. If the order is
1338 * large enough, we can steal the whole pageblock. If not, we first move
1339 * the free pages in this pageblock and check whether at least half of the
1340 * pages were moved. If so, we can change the migratetype of the pageblock
1341 * and permanently use its pages as the requested migratetype in the future.
1342 */
1343static void steal_suitable_fallback(struct zone *zone, struct page *page,
1344 int start_type)
Srivatsa S. Bhatfef903e2013-09-11 14:20:35 -07001345{
1346 int current_order = page_order(page);
Joonsoo Kim4eb7dce2015-04-14 15:45:18 -07001347 int pages;
Srivatsa S. Bhatfef903e2013-09-11 14:20:35 -07001348
Srivatsa S. Bhatfef903e2013-09-11 14:20:35 -07001349 /* Take ownership for orders >= pageblock_order */
1350 if (current_order >= pageblock_order) {
1351 change_pageblock_range(page, current_order, start_type);
Vlastimil Babka3a1086f2015-02-11 15:28:18 -08001352 return;
Srivatsa S. Bhatfef903e2013-09-11 14:20:35 -07001353 }
1354
Joonsoo Kim4eb7dce2015-04-14 15:45:18 -07001355 pages = move_freepages_block(zone, page, start_type);
Srivatsa S. Bhatfef903e2013-09-11 14:20:35 -07001356
Joonsoo Kim4eb7dce2015-04-14 15:45:18 -07001357 /* Claim the whole block if over half of it is free */
1358 if (pages >= (1 << (pageblock_order-1)) ||
1359 page_group_by_mobility_disabled)
1360 set_pageblock_migratetype(page, start_type);
1361}
Srivatsa S. Bhatfef903e2013-09-11 14:20:35 -07001362
Joonsoo Kim2149cda2015-04-14 15:45:21 -07001363/*
1364 * Check whether there is a suitable fallback freepage with requested order.
1365 * If only_stealable is true, this function returns fallback_mt only if
1366 * we can steal other freepages all together. This would help to reduce
1367 * fragmentation due to mixed migratetype pages in one pageblock.
1368 */
1369int find_suitable_fallback(struct free_area *area, unsigned int order,
1370 int migratetype, bool only_stealable, bool *can_steal)
Joonsoo Kim4eb7dce2015-04-14 15:45:18 -07001371{
1372 int i;
1373 int fallback_mt;
1374
1375 if (area->nr_free == 0)
1376 return -1;
1377
1378 *can_steal = false;
1379 for (i = 0;; i++) {
1380 fallback_mt = fallbacks[migratetype][i];
1381 if (fallback_mt == MIGRATE_RESERVE)
1382 break;
1383
1384 if (list_empty(&area->free_list[fallback_mt]))
1385 continue;
1386
1387 if (can_steal_fallback(order, migratetype))
1388 *can_steal = true;
1389
Joonsoo Kim2149cda2015-04-14 15:45:21 -07001390 if (!only_stealable)
1391 return fallback_mt;
1392
1393 if (*can_steal)
1394 return fallback_mt;
Srivatsa S. Bhatfef903e2013-09-11 14:20:35 -07001395 }
Joonsoo Kim4eb7dce2015-04-14 15:45:18 -07001396
1397 return -1;
Srivatsa S. Bhatfef903e2013-09-11 14:20:35 -07001398}
1399
Mel Gormanb2a0ac82007-10-16 01:25:48 -07001400/* Remove an element from the buddy allocator from the fallback list */
Mel Gorman0ac3a402009-06-16 15:32:06 -07001401static inline struct page *
Mel Gorman7aeb09f2014-06-04 16:10:21 -07001402__rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
Mel Gormanb2a0ac82007-10-16 01:25:48 -07001403{
Pintu Kumarb8af2942013-09-11 14:20:34 -07001404 struct free_area *area;
Mel Gorman7aeb09f2014-06-04 16:10:21 -07001405 unsigned int current_order;
Mel Gormanb2a0ac82007-10-16 01:25:48 -07001406 struct page *page;
Joonsoo Kim4eb7dce2015-04-14 15:45:18 -07001407 int fallback_mt;
1408 bool can_steal;
Mel Gormanb2a0ac82007-10-16 01:25:48 -07001409
1410 /* Find the largest possible block of pages in the other list */
Mel Gorman7aeb09f2014-06-04 16:10:21 -07001411 for (current_order = MAX_ORDER-1;
1412 current_order >= order && current_order <= MAX_ORDER-1;
1413 --current_order) {
Joonsoo Kim4eb7dce2015-04-14 15:45:18 -07001414 area = &(zone->free_area[current_order]);
1415 fallback_mt = find_suitable_fallback(area, current_order,
Joonsoo Kim2149cda2015-04-14 15:45:21 -07001416 start_migratetype, false, &can_steal);
Joonsoo Kim4eb7dce2015-04-14 15:45:18 -07001417 if (fallback_mt == -1)
1418 continue;
Mel Gormanb2a0ac82007-10-16 01:25:48 -07001419
Joonsoo Kim4eb7dce2015-04-14 15:45:18 -07001420 page = list_entry(area->free_list[fallback_mt].next,
1421 struct page, lru);
1422 if (can_steal)
1423 steal_suitable_fallback(zone, page, start_migratetype);
Mel Gormane0104872007-10-16 01:25:53 -07001424
Joonsoo Kim4eb7dce2015-04-14 15:45:18 -07001425 /* Remove the page from the freelists */
1426 area->nr_free--;
1427 list_del(&page->lru);
1428 rmv_page_order(page);
Mel Gormanb2a0ac82007-10-16 01:25:48 -07001429
Joonsoo Kim4eb7dce2015-04-14 15:45:18 -07001430 expand(zone, page, order, current_order, area,
1431 start_migratetype);
1432 /*
1433 * The freepage_migratetype may differ from pageblock's
1434 * migratetype depending on the decisions in
1435		 * steal_suitable_fallback(). This is OK as long as it
1436 * does not differ for MIGRATE_CMA pageblocks. For CMA
1437 * we need to make sure unallocated pages flushed from
1438 * pcp lists are returned to the correct freelist.
1439 */
1440 set_freepage_migratetype(page, start_migratetype);
Mel Gormanb2a0ac82007-10-16 01:25:48 -07001441
Joonsoo Kim4eb7dce2015-04-14 15:45:18 -07001442 trace_mm_page_alloc_extfrag(page, order, current_order,
1443 start_migratetype, fallback_mt);
Mel Gormanb2a0ac82007-10-16 01:25:48 -07001444
Joonsoo Kim4eb7dce2015-04-14 15:45:18 -07001445 return page;
Mel Gormanb2a0ac82007-10-16 01:25:48 -07001446 }
1447
Mel Gorman728ec982009-06-16 15:32:04 -07001448 return NULL;
Mel Gormanb2a0ac82007-10-16 01:25:48 -07001449}
1450
Mel Gorman56fd56b2007-10-16 01:25:58 -07001451/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07001452 * Do the hard work of removing an element from the buddy allocator.
1453 * Call me with the zone->lock already held.
1454 */
Mel Gormanb2a0ac82007-10-16 01:25:48 -07001455static struct page *__rmqueue(struct zone *zone, unsigned int order,
1456 int migratetype)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001457{
Linus Torvalds1da177e2005-04-16 15:20:36 -07001458 struct page *page;
1459
Mel Gorman728ec982009-06-16 15:32:04 -07001460retry_reserve:
Mel Gorman56fd56b2007-10-16 01:25:58 -07001461 page = __rmqueue_smallest(zone, order, migratetype);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001462
Mel Gorman728ec982009-06-16 15:32:04 -07001463 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
Joonsoo Kimdc676472015-04-14 15:45:15 -07001464 if (migratetype == MIGRATE_MOVABLE)
1465 page = __rmqueue_cma_fallback(zone, order);
1466
1467 if (!page)
1468 page = __rmqueue_fallback(zone, order, migratetype);
Mel Gormanb2a0ac82007-10-16 01:25:48 -07001469
Mel Gorman728ec982009-06-16 15:32:04 -07001470 /*
1471 * Use MIGRATE_RESERVE rather than fail an allocation. goto
1472 * is used because __rmqueue_smallest is an inline function
1473 * and we want just one call site
1474 */
1475 if (!page) {
1476 migratetype = MIGRATE_RESERVE;
1477 goto retry_reserve;
1478 }
1479 }
1480
Mel Gorman0d3d0622009-09-21 17:02:44 -07001481 trace_mm_page_alloc_zone_locked(page, order, migratetype);
Mel Gormanb2a0ac82007-10-16 01:25:48 -07001482 return page;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001483}
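/*
 * Summary sketch of the fall-through order implemented above: try
 * __rmqueue_smallest() for the requested migratetype first; if that fails
 * and the request is MIGRATE_MOVABLE, try the CMA free lists via
 * __rmqueue_cma_fallback(); then try to steal from another migratetype
 * with __rmqueue_fallback(); and finally retry with MIGRATE_RESERVE
 * rather than fail outright.
 */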
1484
Michal Nazarewicz5f63b722012-01-11 15:16:11 +01001485/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07001486 * Obtain a specified number of elements from the buddy allocator, all under
1487 * a single hold of the lock, for efficiency. Add them to the supplied list.
1488 * Returns the number of new pages which were placed at *list.
1489 */
Michal Nazarewicz5f63b722012-01-11 15:16:11 +01001490static int rmqueue_bulk(struct zone *zone, unsigned int order,
Mel Gormanb2a0ac82007-10-16 01:25:48 -07001491 unsigned long count, struct list_head *list,
Mel Gormanb745bc82014-06-04 16:10:22 -07001492 int migratetype, bool cold)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001493{
Vlastimil Babka5bcc9f82014-06-04 16:07:22 -07001494 int i;
Michal Nazarewicz5f63b722012-01-11 15:16:11 +01001495
Nick Pigginc54ad302006-01-06 00:10:56 -08001496 spin_lock(&zone->lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001497 for (i = 0; i < count; ++i) {
Mel Gormanb2a0ac82007-10-16 01:25:48 -07001498 struct page *page = __rmqueue(zone, order, migratetype);
Nick Piggin085cc7d2006-01-06 00:11:01 -08001499 if (unlikely(page == NULL))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001500 break;
Mel Gorman81eabcb2007-12-17 16:20:05 -08001501
1502 /*
1503 * Split buddy pages returned by expand() are received here
1504		 * in physical page order. The page is added to the caller's
1505		 * list and the list head then moves forward. From the caller's
1506		 * perspective, the linked list is ordered by page number under
1507		 * some conditions. This is useful for IO devices that can
1508 * merge IO requests if the physical pages are ordered
1509 * properly.
1510 */
Mel Gormanb745bc82014-06-04 16:10:22 -07001511 if (likely(!cold))
Mel Gormane084b2d2009-07-29 15:02:04 -07001512 list_add(&page->lru, list);
1513 else
1514 list_add_tail(&page->lru, list);
Mel Gorman81eabcb2007-12-17 16:20:05 -08001515 list = &page->lru;
Vlastimil Babka5bcc9f82014-06-04 16:07:22 -07001516 if (is_migrate_cma(get_freepage_migratetype(page)))
Bartlomiej Zolnierkiewiczd1ce7492012-10-08 16:32:02 -07001517 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
1518 -(1 << order));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001519 }
Mel Gormanf2260e62009-06-16 15:32:13 -07001520 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
Nick Pigginc54ad302006-01-06 00:10:56 -08001521 spin_unlock(&zone->lock);
Nick Piggin085cc7d2006-01-06 00:11:01 -08001522 return i;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001523}
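/*
 * Usage note (sketch): the per-cpu fast path refills an empty pcp list
 * with a single locked call, e.g.
 *	rmqueue_bulk(zone, 0, pcp->batch, &pcp->lists[migratetype],
 *		     migratetype, cold);
 * so zone->lock is taken once per batch instead of once per page.
 */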
1524
Christoph Lameter4ae7c032005-06-21 17:14:57 -07001525#ifdef CONFIG_NUMA
Christoph Lameter8fce4d82006-03-09 17:33:54 -08001526/*
Christoph Lameter4037d452007-05-09 02:35:14 -07001527 * Called from the vmstat counter updater to drain pagesets of this
1528 * currently executing processor on remote nodes after they have
1529 * expired.
1530 *
Christoph Lameter879336c2006-03-22 00:09:08 -08001531 * Note that this function must be called with the thread pinned to
1532 * a single processor.
Christoph Lameter8fce4d82006-03-09 17:33:54 -08001533 */
Christoph Lameter4037d452007-05-09 02:35:14 -07001534void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
Christoph Lameter4ae7c032005-06-21 17:14:57 -07001535{
Christoph Lameter4ae7c032005-06-21 17:14:57 -07001536 unsigned long flags;
Michal Nazarewicz7be12fc2014-08-06 16:05:15 -07001537 int to_drain, batch;
Christoph Lameter4ae7c032005-06-21 17:14:57 -07001538
Christoph Lameter4037d452007-05-09 02:35:14 -07001539 local_irq_save(flags);
Jason Low4db0c3c2015-04-15 16:14:08 -07001540 batch = READ_ONCE(pcp->batch);
Michal Nazarewicz7be12fc2014-08-06 16:05:15 -07001541 to_drain = min(pcp->count, batch);
KOSAKI Motohiro2a135152012-07-31 16:42:53 -07001542 if (to_drain > 0) {
1543 free_pcppages_bulk(zone, to_drain, pcp);
1544 pcp->count -= to_drain;
1545 }
Christoph Lameter4037d452007-05-09 02:35:14 -07001546 local_irq_restore(flags);
Christoph Lameter4ae7c032005-06-21 17:14:57 -07001547}
1548#endif
1549
Christoph Lameter9f8f2172008-02-04 22:29:11 -08001550/*
Vlastimil Babka93481ff2014-12-10 15:43:01 -08001551 * Drain pcplists of the indicated processor and zone.
1552 *
1553 * The processor must either be the current processor and the
1554 * thread pinned to the current processor or a processor that
1555 * is not online.
1556 */
1557static void drain_pages_zone(unsigned int cpu, struct zone *zone)
1558{
1559 unsigned long flags;
1560 struct per_cpu_pageset *pset;
1561 struct per_cpu_pages *pcp;
1562
1563 local_irq_save(flags);
1564 pset = per_cpu_ptr(zone->pageset, cpu);
1565
1566 pcp = &pset->pcp;
1567 if (pcp->count) {
1568 free_pcppages_bulk(zone, pcp->count, pcp);
1569 pcp->count = 0;
1570 }
1571 local_irq_restore(flags);
1572}
1573
1574/*
1575 * Drain pcplists of all zones on the indicated processor.
Christoph Lameter9f8f2172008-02-04 22:29:11 -08001576 *
1577 * The processor must either be the current processor and the
1578 * thread pinned to the current processor or a processor that
1579 * is not online.
1580 */
1581static void drain_pages(unsigned int cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001582{
1583 struct zone *zone;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001584
KOSAKI Motohiroee99c712009-03-31 15:19:31 -07001585 for_each_populated_zone(zone) {
Vlastimil Babka93481ff2014-12-10 15:43:01 -08001586 drain_pages_zone(cpu, zone);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001587 }
1588}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001589
Christoph Lameter9f8f2172008-02-04 22:29:11 -08001590/*
1591 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
Vlastimil Babka93481ff2014-12-10 15:43:01 -08001592 *
1593 * The CPU has to be pinned. When zone parameter is non-NULL, spill just
1594 * the single zone's pages.
Christoph Lameter9f8f2172008-02-04 22:29:11 -08001595 */
Vlastimil Babka93481ff2014-12-10 15:43:01 -08001596void drain_local_pages(struct zone *zone)
Christoph Lameter9f8f2172008-02-04 22:29:11 -08001597{
Vlastimil Babka93481ff2014-12-10 15:43:01 -08001598 int cpu = smp_processor_id();
1599
1600 if (zone)
1601 drain_pages_zone(cpu, zone);
1602 else
1603 drain_pages(cpu);
Christoph Lameter9f8f2172008-02-04 22:29:11 -08001604}
1605
1606/*
Gilad Ben-Yossef74046492012-03-28 14:42:45 -07001607 * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
1608 *
Vlastimil Babka93481ff2014-12-10 15:43:01 -08001609 * When zone parameter is non-NULL, spill just the single zone's pages.
1610 *
Gilad Ben-Yossef74046492012-03-28 14:42:45 -07001611 * Note that this code is protected against sending an IPI to an offline
1612 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
1613 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
1614 * nothing keeps CPUs from showing up after we populated the cpumask and
1615 * before the call to on_each_cpu_mask().
Christoph Lameter9f8f2172008-02-04 22:29:11 -08001616 */
Vlastimil Babka93481ff2014-12-10 15:43:01 -08001617void drain_all_pages(struct zone *zone)
Christoph Lameter9f8f2172008-02-04 22:29:11 -08001618{
Gilad Ben-Yossef74046492012-03-28 14:42:45 -07001619 int cpu;
Gilad Ben-Yossef74046492012-03-28 14:42:45 -07001620
1621 /*
1622	 * Allocate in the BSS so we won't require allocation in
1623 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
1624 */
1625 static cpumask_t cpus_with_pcps;
1626
1627 /*
1628 * We don't care about racing with CPU hotplug event
1629 * as offline notification will cause the notified
1630 * cpu to drain that CPU pcps and on_each_cpu_mask
1631 * disables preemption as part of its processing
1632 */
1633 for_each_online_cpu(cpu) {
Vlastimil Babka93481ff2014-12-10 15:43:01 -08001634 struct per_cpu_pageset *pcp;
1635 struct zone *z;
Gilad Ben-Yossef74046492012-03-28 14:42:45 -07001636 bool has_pcps = false;
Vlastimil Babka93481ff2014-12-10 15:43:01 -08001637
1638 if (zone) {
Gilad Ben-Yossef74046492012-03-28 14:42:45 -07001639 pcp = per_cpu_ptr(zone->pageset, cpu);
Vlastimil Babka93481ff2014-12-10 15:43:01 -08001640 if (pcp->pcp.count)
Gilad Ben-Yossef74046492012-03-28 14:42:45 -07001641 has_pcps = true;
Vlastimil Babka93481ff2014-12-10 15:43:01 -08001642 } else {
1643 for_each_populated_zone(z) {
1644 pcp = per_cpu_ptr(z->pageset, cpu);
1645 if (pcp->pcp.count) {
1646 has_pcps = true;
1647 break;
1648 }
Gilad Ben-Yossef74046492012-03-28 14:42:45 -07001649 }
1650 }
Vlastimil Babka93481ff2014-12-10 15:43:01 -08001651
Gilad Ben-Yossef74046492012-03-28 14:42:45 -07001652 if (has_pcps)
1653 cpumask_set_cpu(cpu, &cpus_with_pcps);
1654 else
1655 cpumask_clear_cpu(cpu, &cpus_with_pcps);
1656 }
Vlastimil Babka93481ff2014-12-10 15:43:01 -08001657 on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages,
1658 zone, 1);
Christoph Lameter9f8f2172008-02-04 22:29:11 -08001659}
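/*
 * Usage sketch: drain_all_pages(NULL) runs drain_local_pages() on every
 * CPU that currently holds per-cpu pages in any populated zone, while
 * drain_all_pages(zone) restricts both the cpumask scan and the drain to
 * that single zone, keeping the interruption cheap for callers that only
 * care about one zone.
 */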
1660
Rafael J. Wysocki296699d2007-07-29 23:27:18 +02001661#ifdef CONFIG_HIBERNATION
Linus Torvalds1da177e2005-04-16 15:20:36 -07001662
1663void mark_free_pages(struct zone *zone)
1664{
Rafael J. Wysockif623f0d2006-09-25 23:32:49 -07001665 unsigned long pfn, max_zone_pfn;
1666 unsigned long flags;
Mel Gorman7aeb09f2014-06-04 16:10:21 -07001667 unsigned int order, t;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001668 struct list_head *curr;
1669
Xishi Qiu8080fc02013-09-11 14:21:45 -07001670 if (zone_is_empty(zone))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001671 return;
1672
1673 spin_lock_irqsave(&zone->lock, flags);
Rafael J. Wysockif623f0d2006-09-25 23:32:49 -07001674
Cody P Schafer108bcc92013-02-22 16:35:23 -08001675 max_zone_pfn = zone_end_pfn(zone);
Rafael J. Wysockif623f0d2006-09-25 23:32:49 -07001676 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1677 if (pfn_valid(pfn)) {
1678 struct page *page = pfn_to_page(pfn);
1679
Rafael J. Wysocki7be98232007-05-06 14:50:42 -07001680 if (!swsusp_page_is_forbidden(page))
1681 swsusp_unset_page_free(page);
Rafael J. Wysockif623f0d2006-09-25 23:32:49 -07001682 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001683
Mel Gormanb2a0ac82007-10-16 01:25:48 -07001684 for_each_migratetype_order(order, t) {
1685 list_for_each(curr, &zone->free_area[order].free_list[t]) {
Rafael J. Wysockif623f0d2006-09-25 23:32:49 -07001686 unsigned long i;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001687
Rafael J. Wysockif623f0d2006-09-25 23:32:49 -07001688 pfn = page_to_pfn(list_entry(curr, struct page, lru));
1689 for (i = 0; i < (1UL << order); i++)
Rafael J. Wysocki7be98232007-05-06 14:50:42 -07001690 swsusp_set_page_free(pfn_to_page(pfn + i));
Rafael J. Wysockif623f0d2006-09-25 23:32:49 -07001691 }
Mel Gormanb2a0ac82007-10-16 01:25:48 -07001692 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001693 spin_unlock_irqrestore(&zone->lock, flags);
1694}
Mel Gormane2c55dc2007-10-16 01:25:50 -07001695#endif /* CONFIG_HIBERNATION */
Linus Torvalds1da177e2005-04-16 15:20:36 -07001696
1697/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07001698 * Free a 0-order page
Mel Gormanb745bc82014-06-04 16:10:22 -07001699 * cold == true ? free a cold page : free a hot page
Linus Torvalds1da177e2005-04-16 15:20:36 -07001700 */
Mel Gormanb745bc82014-06-04 16:10:22 -07001701void free_hot_cold_page(struct page *page, bool cold)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001702{
1703 struct zone *zone = page_zone(page);
1704 struct per_cpu_pages *pcp;
1705 unsigned long flags;
Mel Gormandc4b0ca2014-06-04 16:10:17 -07001706 unsigned long pfn = page_to_pfn(page);
Mel Gorman5f8dcc22009-09-21 17:03:19 -07001707 int migratetype;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001708
KOSAKI Motohiroec95f532010-05-24 14:32:38 -07001709 if (!free_pages_prepare(page, 0))
Hugh Dickins689bceb2005-11-21 21:32:20 -08001710 return;
1711
Mel Gormandc4b0ca2014-06-04 16:10:17 -07001712 migratetype = get_pfnblock_migratetype(page, pfn);
Minchan Kimb12c4ad2012-10-08 16:32:08 -07001713 set_freepage_migratetype(page, migratetype);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001714 local_irq_save(flags);
Christoph Lameterf8891e52006-06-30 01:55:45 -07001715 __count_vm_event(PGFREE);
Mel Gormanda456f12009-06-16 15:32:08 -07001716
Mel Gorman5f8dcc22009-09-21 17:03:19 -07001717 /*
1718 * We only track unmovable, reclaimable and movable on pcp lists.
1719 * Free ISOLATE pages back to the allocator because they are being
1720	 * offlined, but treat RESERVE as movable pages so we can get those
1721	 * areas back if necessary. Otherwise, we may have to free
1722	 * excessively into the page allocator.
1723 */
1724 if (migratetype >= MIGRATE_PCPTYPES) {
Minchan Kim194159f2013-02-22 16:33:58 -08001725 if (unlikely(is_migrate_isolate(migratetype))) {
Mel Gormandc4b0ca2014-06-04 16:10:17 -07001726 free_one_page(zone, page, pfn, 0, migratetype);
Mel Gorman5f8dcc22009-09-21 17:03:19 -07001727 goto out;
1728 }
1729 migratetype = MIGRATE_MOVABLE;
1730 }
1731
Christoph Lameter99dcc3e2010-01-05 15:34:51 +09001732 pcp = &this_cpu_ptr(zone->pageset)->pcp;
Mel Gormanb745bc82014-06-04 16:10:22 -07001733 if (!cold)
Mel Gorman5f8dcc22009-09-21 17:03:19 -07001734 list_add(&page->lru, &pcp->lists[migratetype]);
Mel Gormanb745bc82014-06-04 16:10:22 -07001735 else
1736 list_add_tail(&page->lru, &pcp->lists[migratetype]);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001737 pcp->count++;
Nick Piggin48db57f2006-01-08 01:00:42 -08001738 if (pcp->count >= pcp->high) {
Jason Low4db0c3c2015-04-15 16:14:08 -07001739 unsigned long batch = READ_ONCE(pcp->batch);
Cody P Schafer998d39cb2013-07-03 15:01:32 -07001740 free_pcppages_bulk(zone, batch, pcp);
1741 pcp->count -= batch;
Nick Piggin48db57f2006-01-08 01:00:42 -08001742 }
Mel Gorman5f8dcc22009-09-21 17:03:19 -07001743
1744out:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001745 local_irq_restore(flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001746}
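/*
 * Worked example of the trimming above (sketch; the numbers are typical
 * tuning values, not guaranteed): with pcp->high == 186 and
 * pcp->batch == 31, the free that pushes pcp->count to 186 triggers
 * free_pcppages_bulk() for a batch of 31 pages, dropping pcp->count back
 * to 155 and leaving room for further frees to stay on the per-cpu fast
 * path.
 */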
1747
Nick Piggin8dfcc9b2006-03-22 00:08:05 -08001748/*
Konstantin Khlebnikovcc598502012-01-10 15:07:04 -08001749 * Free a list of 0-order pages
1750 */
Mel Gormanb745bc82014-06-04 16:10:22 -07001751void free_hot_cold_page_list(struct list_head *list, bool cold)
Konstantin Khlebnikovcc598502012-01-10 15:07:04 -08001752{
1753 struct page *page, *next;
1754
1755 list_for_each_entry_safe(page, next, list, lru) {
Konstantin Khlebnikovb413d482012-01-10 15:07:09 -08001756 trace_mm_page_free_batched(page, cold);
Konstantin Khlebnikovcc598502012-01-10 15:07:04 -08001757 free_hot_cold_page(page, cold);
1758 }
1759}
1760
1761/*
Nick Piggin8dfcc9b2006-03-22 00:08:05 -08001762 * split_page takes a non-compound higher-order page, and splits it into
1763 * n (1<<order) sub-pages: page[0..n]
1764 * Each sub-page must be freed individually.
1765 *
1766 * Note: this is probably too low level an operation for use in drivers.
1767 * Please consult with lkml before using this in your driver.
1768 */
1769void split_page(struct page *page, unsigned int order)
1770{
1771 int i;
1772
Sasha Levin309381fea2014-01-23 15:52:54 -08001773 VM_BUG_ON_PAGE(PageCompound(page), page);
1774 VM_BUG_ON_PAGE(!page_count(page), page);
Vegard Nossumb1eeab62008-11-25 16:55:53 +01001775
1776#ifdef CONFIG_KMEMCHECK
1777 /*
1778 * Split shadow pages too, because free(page[0]) would
1779 * otherwise free the whole shadow.
1780 */
1781 if (kmemcheck_page_is_tracked(page))
1782 split_page(virt_to_page(page[0].shadow), order);
1783#endif
1784
Joonsoo Kim48c96a32014-12-12 16:56:01 -08001785 set_page_owner(page, 0, 0);
1786 for (i = 1; i < (1 << order); i++) {
Nick Piggin7835e982006-03-22 00:08:40 -08001787 set_page_refcounted(page + i);
Joonsoo Kim48c96a32014-12-12 16:56:01 -08001788 set_page_owner(page + i, 0, 0);
1789 }
Nick Piggin8dfcc9b2006-03-22 00:08:05 -08001790}
K. Y. Srinivasan5853ff22013-03-25 15:47:38 -07001791EXPORT_SYMBOL_GPL(split_page);
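/*
 * Usage sketch (hypothetical caller): a driver that allocated an order-2,
 * non-compound page can hand out the pieces individually:
 *	page = alloc_pages(GFP_KERNEL, 2);
 *	split_page(page, 2);
 * Afterwards page[0]..page[3] each carry their own reference count and
 * must be freed one by one, e.g. with __free_page().
 */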
Nick Piggin8dfcc9b2006-03-22 00:08:05 -08001792
Joonsoo Kim3c605092014-11-13 15:19:21 -08001793int __isolate_free_page(struct page *page, unsigned int order)
Mel Gorman748446b2010-05-24 14:32:27 -07001794{
Mel Gorman748446b2010-05-24 14:32:27 -07001795 unsigned long watermark;
1796 struct zone *zone;
Bartlomiej Zolnierkiewicz2139cbe2012-10-08 16:32:00 -07001797 int mt;
Mel Gorman748446b2010-05-24 14:32:27 -07001798
1799 BUG_ON(!PageBuddy(page));
1800
1801 zone = page_zone(page);
Marek Szyprowski2e30abd2012-12-11 16:02:57 -08001802 mt = get_pageblock_migratetype(page);
Mel Gorman748446b2010-05-24 14:32:27 -07001803
Minchan Kim194159f2013-02-22 16:33:58 -08001804 if (!is_migrate_isolate(mt)) {
Marek Szyprowski2e30abd2012-12-11 16:02:57 -08001805 /* Obey watermarks as if the page was being allocated */
1806 watermark = low_wmark_pages(zone) + (1 << order);
1807 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
1808 return 0;
1809
Mel Gorman8fb74b92013-01-11 14:32:16 -08001810 __mod_zone_freepage_state(zone, -(1UL << order), mt);
Marek Szyprowski2e30abd2012-12-11 16:02:57 -08001811 }
Mel Gorman748446b2010-05-24 14:32:27 -07001812
1813 /* Remove page from free list */
1814 list_del(&page->lru);
1815 zone->free_area[order].nr_free--;
1816 rmv_page_order(page);
Bartlomiej Zolnierkiewicz2139cbe2012-10-08 16:32:00 -07001817
Mel Gorman8fb74b92013-01-11 14:32:16 -08001818 /* Set the pageblock if the isolated page is at least a pageblock */
Mel Gorman748446b2010-05-24 14:32:27 -07001819 if (order >= pageblock_order - 1) {
1820 struct page *endpage = page + (1 << order) - 1;
Michal Nazarewicz47118af2011-12-29 13:09:50 +01001821 for (; page < endpage; page += pageblock_nr_pages) {
1822 int mt = get_pageblock_migratetype(page);
Minchan Kim194159f2013-02-22 16:33:58 -08001823 if (!is_migrate_isolate(mt) && !is_migrate_cma(mt))
Michal Nazarewicz47118af2011-12-29 13:09:50 +01001824 set_pageblock_migratetype(page,
1825 MIGRATE_MOVABLE);
1826 }
Mel Gorman748446b2010-05-24 14:32:27 -07001827 }
1828
Joonsoo Kim48c96a32014-12-12 16:56:01 -08001829 set_page_owner(page, order, 0);
Mel Gorman8fb74b92013-01-11 14:32:16 -08001830 return 1UL << order;
Mel Gorman1fb3f8c2012-10-08 16:29:12 -07001831}
1832
1833/*
1834 * Similar to split_page except the page is already free. As this is only
1835 * being used for migration, the migratetype of the block also changes.
1836 * As this is called with interrupts disabled, the caller is responsible
1837 * for calling arch_alloc_page() and kernel_map_pages() after interrupts
1838 * are enabled.
1839 *
1840 * Note: this is probably too low level an operation for use in drivers.
1841 * Please consult with lkml before using this in your driver.
1842 */
1843int split_free_page(struct page *page)
1844{
1845 unsigned int order;
1846 int nr_pages;
1847
Mel Gorman1fb3f8c2012-10-08 16:29:12 -07001848 order = page_order(page);
1849
Mel Gorman8fb74b92013-01-11 14:32:16 -08001850 nr_pages = __isolate_free_page(page, order);
Mel Gorman1fb3f8c2012-10-08 16:29:12 -07001851 if (!nr_pages)
1852 return 0;
1853
1854 /* Split into individual pages */
1855 set_page_refcounted(page);
1856 split_page(page, order);
1857 return nr_pages;
Mel Gorman748446b2010-05-24 14:32:27 -07001858}
1859
1860/*
Vlastimil Babka75379192015-02-11 15:25:38 -08001861 * Allocate a page from the given zone. Use pcplists for order-0 allocations.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001862 */
Mel Gorman0a15c3e2009-06-16 15:32:05 -07001863static inline
1864struct page *buffered_rmqueue(struct zone *preferred_zone,
Mel Gorman7aeb09f2014-06-04 16:10:21 -07001865 struct zone *zone, unsigned int order,
1866 gfp_t gfp_flags, int migratetype)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001867{
1868 unsigned long flags;
Hugh Dickins689bceb2005-11-21 21:32:20 -08001869 struct page *page;
Mel Gormanb745bc82014-06-04 16:10:22 -07001870 bool cold = ((gfp_flags & __GFP_COLD) != 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001871
Nick Piggin48db57f2006-01-08 01:00:42 -08001872 if (likely(order == 0)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001873 struct per_cpu_pages *pcp;
Mel Gorman5f8dcc22009-09-21 17:03:19 -07001874 struct list_head *list;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001875
Linus Torvalds1da177e2005-04-16 15:20:36 -07001876 local_irq_save(flags);
Christoph Lameter99dcc3e2010-01-05 15:34:51 +09001877 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1878 list = &pcp->lists[migratetype];
Mel Gorman5f8dcc22009-09-21 17:03:19 -07001879 if (list_empty(list)) {
Mel Gorman535131e62007-10-16 01:25:49 -07001880 pcp->count += rmqueue_bulk(zone, 0,
Mel Gorman5f8dcc22009-09-21 17:03:19 -07001881 pcp->batch, list,
Mel Gormane084b2d2009-07-29 15:02:04 -07001882 migratetype, cold);
Mel Gorman5f8dcc22009-09-21 17:03:19 -07001883 if (unlikely(list_empty(list)))
Shaohua Li6fb332f2009-09-21 17:01:17 -07001884 goto failed;
Mel Gorman535131e62007-10-16 01:25:49 -07001885 }
Mel Gormanb92a6ed2007-10-16 01:25:50 -07001886
Mel Gorman5f8dcc22009-09-21 17:03:19 -07001887 if (cold)
1888 page = list_entry(list->prev, struct page, lru);
1889 else
1890 page = list_entry(list->next, struct page, lru);
1891
Mel Gormanb92a6ed2007-10-16 01:25:50 -07001892 list_del(&page->lru);
1893 pcp->count--;
Rohit Seth7fb1d9f2005-11-13 16:06:43 -08001894 } else {
Andrew Mortondab48da2009-06-16 15:32:37 -07001895 if (unlikely(gfp_flags & __GFP_NOFAIL)) {
1896 /*
1897 * __GFP_NOFAIL is not to be used in new code.
1898 *
1899 * All __GFP_NOFAIL callers should be fixed so that they
1900 * properly detect and handle allocation failures.
1901 *
1902 * We most definitely don't want callers attempting to
Linus Torvalds4923abf2009-06-24 12:16:49 -07001903 * allocate greater than order-1 page units with
Andrew Mortondab48da2009-06-16 15:32:37 -07001904 * __GFP_NOFAIL.
1905 */
Linus Torvalds4923abf2009-06-24 12:16:49 -07001906 WARN_ON_ONCE(order > 1);
Andrew Mortondab48da2009-06-16 15:32:37 -07001907 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001908 spin_lock_irqsave(&zone->lock, flags);
Mel Gormanb2a0ac82007-10-16 01:25:48 -07001909 page = __rmqueue(zone, order, migratetype);
Nick Piggina74609f2006-01-06 00:11:20 -08001910 spin_unlock(&zone->lock);
1911 if (!page)
1912 goto failed;
Bartlomiej Zolnierkiewiczd1ce7492012-10-08 16:32:02 -07001913 __mod_zone_freepage_state(zone, -(1 << order),
Vlastimil Babka5bcc9f82014-06-04 16:07:22 -07001914 get_freepage_migratetype(page));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001915 }
1916
Johannes Weiner3a025762014-04-07 15:37:48 -07001917 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
Johannes Weinerabe5f972014-10-02 16:21:10 -07001918 if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 &&
Johannes Weiner57054652014-10-09 15:28:17 -07001919 !test_bit(ZONE_FAIR_DEPLETED, &zone->flags))
1920 set_bit(ZONE_FAIR_DEPLETED, &zone->flags);
Johannes Weiner27329362014-03-03 15:38:41 -08001921
Christoph Lameterf8891e52006-06-30 01:55:45 -07001922 __count_zone_vm_events(PGALLOC, zone, 1 << order);
Andi Kleen78afd562011-03-22 16:33:12 -07001923 zone_statistics(preferred_zone, zone, gfp_flags);
Nick Piggina74609f2006-01-06 00:11:20 -08001924 local_irq_restore(flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001925
Sasha Levin309381fea2014-01-23 15:52:54 -08001926 VM_BUG_ON_PAGE(bad_range(zone, page), page);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001927 return page;
Nick Piggina74609f2006-01-06 00:11:20 -08001928
1929failed:
1930 local_irq_restore(flags);
Nick Piggina74609f2006-01-06 00:11:20 -08001931 return NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001932}
1933
Akinobu Mita933e3122006-12-08 02:39:45 -08001934#ifdef CONFIG_FAIL_PAGE_ALLOC
1935
Akinobu Mitab2588c42011-07-26 16:09:03 -07001936static struct {
Akinobu Mita933e3122006-12-08 02:39:45 -08001937 struct fault_attr attr;
1938
1939 u32 ignore_gfp_highmem;
1940 u32 ignore_gfp_wait;
Akinobu Mita54114992007-07-15 23:40:23 -07001941 u32 min_order;
Akinobu Mita933e3122006-12-08 02:39:45 -08001942} fail_page_alloc = {
1943 .attr = FAULT_ATTR_INITIALIZER,
Don Mullis6b1b60f2006-12-08 02:39:53 -08001944 .ignore_gfp_wait = 1,
1945 .ignore_gfp_highmem = 1,
Akinobu Mita54114992007-07-15 23:40:23 -07001946 .min_order = 1,
Akinobu Mita933e3122006-12-08 02:39:45 -08001947};
1948
1949static int __init setup_fail_page_alloc(char *str)
1950{
1951 return setup_fault_attr(&fail_page_alloc.attr, str);
1952}
1953__setup("fail_page_alloc=", setup_fail_page_alloc);
1954
Gavin Shandeaf3862012-07-31 16:41:51 -07001955static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
Akinobu Mita933e3122006-12-08 02:39:45 -08001956{
Akinobu Mita54114992007-07-15 23:40:23 -07001957 if (order < fail_page_alloc.min_order)
Gavin Shandeaf3862012-07-31 16:41:51 -07001958 return false;
Akinobu Mita933e3122006-12-08 02:39:45 -08001959 if (gfp_mask & __GFP_NOFAIL)
Gavin Shandeaf3862012-07-31 16:41:51 -07001960 return false;
Akinobu Mita933e3122006-12-08 02:39:45 -08001961 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
Gavin Shandeaf3862012-07-31 16:41:51 -07001962 return false;
Akinobu Mita933e3122006-12-08 02:39:45 -08001963 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
Gavin Shandeaf3862012-07-31 16:41:51 -07001964 return false;
Akinobu Mita933e3122006-12-08 02:39:45 -08001965
1966 return should_fail(&fail_page_alloc.attr, 1 << order);
1967}
1968
1969#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
1970
1971static int __init fail_page_alloc_debugfs(void)
1972{
Al Virof4ae40a62011-07-24 04:33:43 -04001973 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
Akinobu Mita933e3122006-12-08 02:39:45 -08001974 struct dentry *dir;
Akinobu Mita933e3122006-12-08 02:39:45 -08001975
Akinobu Mitadd48c082011-08-03 16:21:01 -07001976 dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
1977 &fail_page_alloc.attr);
1978 if (IS_ERR(dir))
1979 return PTR_ERR(dir);
Akinobu Mita933e3122006-12-08 02:39:45 -08001980
Akinobu Mitab2588c42011-07-26 16:09:03 -07001981 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
1982 &fail_page_alloc.ignore_gfp_wait))
1983 goto fail;
1984 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
1985 &fail_page_alloc.ignore_gfp_highmem))
1986 goto fail;
1987 if (!debugfs_create_u32("min-order", mode, dir,
1988 &fail_page_alloc.min_order))
1989 goto fail;
Akinobu Mita933e3122006-12-08 02:39:45 -08001990
Akinobu Mitab2588c42011-07-26 16:09:03 -07001991 return 0;
1992fail:
Akinobu Mitadd48c082011-08-03 16:21:01 -07001993 debugfs_remove_recursive(dir);
Akinobu Mita933e3122006-12-08 02:39:45 -08001994
Akinobu Mitab2588c42011-07-26 16:09:03 -07001995 return -ENOMEM;
Akinobu Mita933e3122006-12-08 02:39:45 -08001996}
1997
1998late_initcall(fail_page_alloc_debugfs);
1999
2000#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
2001
2002#else /* CONFIG_FAIL_PAGE_ALLOC */
2003
Gavin Shandeaf3862012-07-31 16:41:51 -07002004static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
Akinobu Mita933e3122006-12-08 02:39:45 -08002005{
Gavin Shandeaf3862012-07-31 16:41:51 -07002006 return false;
Akinobu Mita933e3122006-12-08 02:39:45 -08002007}
2008
2009#endif /* CONFIG_FAIL_PAGE_ALLOC */
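/*
 * Usage sketch for the fault injection above (paths assume the usual
 * debugfs mount point): booting with
 * "fail_page_alloc=<interval>,<probability>,<space>,<times>" feeds the
 * shared fault_attr parser, and the knobs created above show up under
 * /sys/kernel/debug/fail_page_alloc/ as ignore-gfp-wait,
 * ignore-gfp-highmem and min-order, next to the common fault_attr files.
 */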
2010
Linus Torvalds1da177e2005-04-16 15:20:36 -07002011/*
Mel Gorman88f5acf2011-01-13 15:45:41 -08002012 * Return true if free pages are above 'mark'. This takes into account the order
Linus Torvalds1da177e2005-04-16 15:20:36 -07002013 * of the allocation.
2014 */
Mel Gorman7aeb09f2014-06-04 16:10:21 -07002015static bool __zone_watermark_ok(struct zone *z, unsigned int order,
2016 unsigned long mark, int classzone_idx, int alloc_flags,
2017 long free_pages)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002018{
Wei Yuan26086de2014-12-10 15:44:44 -08002019 /* free_pages may go negative - that's OK */
Christoph Lameterd23ad422007-02-10 01:43:02 -08002020 long min = mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002021 int o;
Tomasz Stanislawski026b0812013-06-12 14:05:02 -07002022 long free_cma = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002023
Michal Hockodf0a6da2012-01-10 15:08:02 -08002024 free_pages -= (1 << order) - 1;
Rohit Seth7fb1d9f2005-11-13 16:06:43 -08002025 if (alloc_flags & ALLOC_HIGH)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002026 min -= min / 2;
Rohit Seth7fb1d9f2005-11-13 16:06:43 -08002027 if (alloc_flags & ALLOC_HARDER)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002028 min -= min / 4;
Bartlomiej Zolnierkiewiczd95ea5d2012-10-08 16:32:05 -07002029#ifdef CONFIG_CMA
2030 /* If allocation can't use CMA areas don't use free CMA pages */
2031 if (!(alloc_flags & ALLOC_CMA))
Tomasz Stanislawski026b0812013-06-12 14:05:02 -07002032 free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
Bartlomiej Zolnierkiewiczd95ea5d2012-10-08 16:32:05 -07002033#endif
Tomasz Stanislawski026b0812013-06-12 14:05:02 -07002034
Mel Gorman3484b2d2014-08-06 16:07:14 -07002035 if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx])
Mel Gorman88f5acf2011-01-13 15:45:41 -08002036 return false;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002037 for (o = 0; o < order; o++) {
2038 /* At the next order, this order's pages become unavailable */
2039 free_pages -= z->free_area[o].nr_free << o;
2040
2041 /* Require fewer higher order pages to be free */
2042 min >>= 1;
2043
2044 if (free_pages <= min)
Mel Gorman88f5acf2011-01-13 15:45:41 -08002045 return false;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002046 }
Mel Gorman88f5acf2011-01-13 15:45:41 -08002047 return true;
2048}
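/*
 * Worked example of the arithmetic above (sketch, made-up numbers): take
 * mark = 1000 pages and an order-2 request with ALLOC_HARDER set. Then
 * min becomes 1000 - 1000/4 = 750 and free_pages is reduced by
 * (1 << 2) - 1 = 3 before the first comparison against
 * min + lowmem_reserve (after subtracting free CMA pages the request
 * cannot use). If that passes, the loop removes the pages sitting in
 * free_area[0] and free_area[1], since they cannot satisfy an order-2
 * request, while halving min at each step, so higher orders only need a
 * progressively smaller reserve of suitably large blocks.
 */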
2049
Mel Gorman7aeb09f2014-06-04 16:10:21 -07002050bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
Mel Gorman88f5acf2011-01-13 15:45:41 -08002051 int classzone_idx, int alloc_flags)
2052{
2053 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
2054 zone_page_state(z, NR_FREE_PAGES));
2055}
2056
Mel Gorman7aeb09f2014-06-04 16:10:21 -07002057bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
2058 unsigned long mark, int classzone_idx, int alloc_flags)
Mel Gorman88f5acf2011-01-13 15:45:41 -08002059{
2060 long free_pages = zone_page_state(z, NR_FREE_PAGES);
2061
2062 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
2063 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
2064
2065 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
2066 free_pages);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002067}
2068
Paul Jackson9276b1bc2006-12-06 20:31:48 -08002069#ifdef CONFIG_NUMA
2070/*
2071 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to
2072 * skip over zones that are not allowed by the cpuset, or that have
2073 * been recently (in last second) found to be nearly full. See further
2074 * comments in mmzone.h. Reduces cache footprint of zonelist scans
Simon Arlott183ff222007-10-20 01:27:18 +02002075 * that have to skip over a lot of full or unallowed zones.
Paul Jackson9276b1bc2006-12-06 20:31:48 -08002076 *
Zhi Yong Wua1aeb652013-11-12 15:08:29 -08002077 * If the zonelist cache is present in the passed zonelist, then
Paul Jackson9276b1bc2006-12-06 20:31:48 -08002078 * returns a pointer to the allowed node mask (either the current
Lai Jiangshan4b0ef1fe2012-12-12 13:51:46 -08002079 * task's mems_allowed, or node_states[N_MEMORY].)
Paul Jackson9276b1bc2006-12-06 20:31:48 -08002080 *
2081 * If the zonelist cache is not available for this zonelist, does
2082 * nothing and returns NULL.
2083 *
2084 * If the fullzones BITMAP in the zonelist cache is stale (more than
2085 * a second since last zap'd) then we zap it out (clear its bits.)
2086 *
2087 * We hold off even calling zlc_setup, until after we've checked the
2088 * first zone in the zonelist, on the theory that most allocations will
2089 * be satisfied from that first zone, so best to examine that zone as
2090 * quickly as we can.
2091 */
2092static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
2093{
2094 struct zonelist_cache *zlc; /* cached zonelist speedup info */
2095 nodemask_t *allowednodes; /* zonelist_cache approximation */
2096
2097 zlc = zonelist->zlcache_ptr;
2098 if (!zlc)
2099 return NULL;
2100
S.Caglar Onurf05111f2008-04-28 02:12:38 -07002101 if (time_after(jiffies, zlc->last_full_zap + HZ)) {
Paul Jackson9276b1bc2006-12-06 20:31:48 -08002102 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
2103 zlc->last_full_zap = jiffies;
2104 }
2105
2106 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
2107 &cpuset_current_mems_allowed :
Lai Jiangshan4b0ef1fe2012-12-12 13:51:46 -08002108 &node_states[N_MEMORY];
Paul Jackson9276b1bc2006-12-06 20:31:48 -08002109 return allowednodes;
2110}
2111
2112/*
2113 * Given 'z' scanning a zonelist, run a couple of quick checks to see
2114 * if it is worth looking at further for free memory:
2115 * 1) Check that the zone isn't thought to be full (doesn't have its
2116 * bit set in the zonelist_cache fullzones BITMAP).
2117 * 2) Check that the zone's node (obtained from the zonelist_cache
2118 * z_to_n[] mapping) is allowed in the passed in allowednodes mask.
2119 * Return true (non-zero) if zone is worth looking at further, or
2120 * else return false (zero) if it is not.
2121 *
2122 * This check -ignores- the distinction between various watermarks,
2123 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is
2124 * found to be full for any variation of these watermarks, it will
2125 * be considered full for up to one second by all requests, unless
2126 * we are so low on memory on all allowed nodes that we are forced
2127 * into the second scan of the zonelist.
2128 *
2129 * In the second scan we ignore this zonelist cache and exactly
2130 * apply the watermarks to all zones, even if it is slower to do so.
2131 * We are low on memory in the second scan, and should leave no stone
2132 * unturned looking for a free page.
2133 */
Mel Gormandd1a2392008-04-28 02:12:17 -07002134static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
Paul Jackson9276b1bc2006-12-06 20:31:48 -08002135 nodemask_t *allowednodes)
2136{
2137 struct zonelist_cache *zlc; /* cached zonelist speedup info */
2138 int i; /* index of *z in zonelist zones */
2139 int n; /* node that zone *z is on */
2140
2141 zlc = zonelist->zlcache_ptr;
2142 if (!zlc)
2143 return 1;
2144
Mel Gormandd1a2392008-04-28 02:12:17 -07002145 i = z - zonelist->_zonerefs;
Paul Jackson9276b1bc2006-12-06 20:31:48 -08002146 n = zlc->z_to_n[i];
2147
2148 /* This zone is worth trying if it is allowed but not full */
2149 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
2150}
2151
2152/*
2153 * Given 'z' scanning a zonelist, set the corresponding bit in
2154 * zlc->fullzones, so that subsequent attempts to allocate a page
2155 * from that zone don't waste time re-examining it.
2156 */
Mel Gormandd1a2392008-04-28 02:12:17 -07002157static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
Paul Jackson9276b1bc2006-12-06 20:31:48 -08002158{
2159 struct zonelist_cache *zlc; /* cached zonelist speedup info */
2160 int i; /* index of *z in zonelist zones */
2161
2162 zlc = zonelist->zlcache_ptr;
2163 if (!zlc)
2164 return;
2165
Mel Gormandd1a2392008-04-28 02:12:17 -07002166 i = z - zonelist->_zonerefs;
Paul Jackson9276b1bc2006-12-06 20:31:48 -08002167
2168 set_bit(i, zlc->fullzones);
2169}
2170
Mel Gorman76d3fbf2011-07-25 17:12:30 -07002171/*
2172 * clear all zones full, called after direct reclaim makes progress so that
2173 * a zone that was recently full is not skipped over for up to a second
2174 */
2175static void zlc_clear_zones_full(struct zonelist *zonelist)
2176{
2177 struct zonelist_cache *zlc; /* cached zonelist speedup info */
2178
2179 zlc = zonelist->zlcache_ptr;
2180 if (!zlc)
2181 return;
2182
2183 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
2184}
2185
Johannes Weiner81c0a2b2013-09-11 14:20:47 -07002186static bool zone_local(struct zone *local_zone, struct zone *zone)
2187{
Johannes Weinerfff4068c2013-12-20 14:54:12 +00002188 return local_zone->node == zone->node;
Johannes Weiner81c0a2b2013-09-11 14:20:47 -07002189}
2190
David Rientjes957f8222012-10-08 16:33:24 -07002191static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
2192{
Mel Gorman5f7a75a2014-06-04 16:07:15 -07002193 return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <
2194 RECLAIM_DISTANCE;
David Rientjes957f8222012-10-08 16:33:24 -07002195}
2196
Paul Jackson9276b1bc2006-12-06 20:31:48 -08002197#else /* CONFIG_NUMA */
2198
2199static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
2200{
2201 return NULL;
2202}
2203
Mel Gormandd1a2392008-04-28 02:12:17 -07002204static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
Paul Jackson9276b1bc2006-12-06 20:31:48 -08002205 nodemask_t *allowednodes)
2206{
2207 return 1;
2208}
2209
Mel Gormandd1a2392008-04-28 02:12:17 -07002210static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
Paul Jackson9276b1bc2006-12-06 20:31:48 -08002211{
2212}
Mel Gorman76d3fbf2011-07-25 17:12:30 -07002213
2214static void zlc_clear_zones_full(struct zonelist *zonelist)
2215{
2216}
David Rientjes957f8222012-10-08 16:33:24 -07002217
Johannes Weiner81c0a2b2013-09-11 14:20:47 -07002218static bool zone_local(struct zone *local_zone, struct zone *zone)
2219{
2220 return true;
2221}
2222
David Rientjes957f8222012-10-08 16:33:24 -07002223static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
2224{
2225 return true;
2226}
2227
Paul Jackson9276b1bc2006-12-06 20:31:48 -08002228#endif /* CONFIG_NUMA */
2229
Mel Gorman4ffeaf32014-08-06 16:07:22 -07002230static void reset_alloc_batches(struct zone *preferred_zone)
2231{
2232 struct zone *zone = preferred_zone->zone_pgdat->node_zones;
2233
2234 do {
2235 mod_zone_page_state(zone, NR_ALLOC_BATCH,
2236 high_wmark_pages(zone) - low_wmark_pages(zone) -
2237 atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
Johannes Weiner57054652014-10-09 15:28:17 -07002238 clear_bit(ZONE_FAIR_DEPLETED, &zone->flags);
Mel Gorman4ffeaf32014-08-06 16:07:22 -07002239 } while (zone++ != preferred_zone);
2240}
2241
Rohit Seth7fb1d9f2005-11-13 16:06:43 -08002242/*
Paul Jackson0798e512006-12-06 20:31:38 -08002243 * get_page_from_freelist goes through the zonelist trying to allocate
Rohit Seth7fb1d9f2005-11-13 16:06:43 -08002244 * a page.
2245 */
2246static struct page *
Vlastimil Babkaa9263752015-02-11 15:25:41 -08002247get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
2248 const struct alloc_context *ac)
Martin Hicks753ee722005-06-21 17:14:41 -07002249{
Vlastimil Babkaa9263752015-02-11 15:25:41 -08002250 struct zonelist *zonelist = ac->zonelist;
Mel Gormandd1a2392008-04-28 02:12:17 -07002251 struct zoneref *z;
Rohit Seth7fb1d9f2005-11-13 16:06:43 -08002252 struct page *page = NULL;
Mel Gorman5117f452009-06-16 15:31:59 -07002253 struct zone *zone;
Paul Jackson9276b1bc2006-12-06 20:31:48 -08002254 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
2255 int zlc_active = 0; /* set if using zonelist_cache */
2256 int did_zlc_setup = 0; /* just call zlc_setup() one time */
Mel Gormana6e21b142014-06-04 16:10:12 -07002257 bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) &&
2258 (gfp_mask & __GFP_WRITE);
Mel Gorman4ffeaf32014-08-06 16:07:22 -07002259 int nr_fair_skipped = 0;
2260 bool zonelist_rescan;
Mel Gorman54a6eb52008-04-28 02:12:16 -07002261
Paul Jackson9276b1bc2006-12-06 20:31:48 -08002262zonelist_scan:
Mel Gorman4ffeaf32014-08-06 16:07:22 -07002263 zonelist_rescan = false;
2264
Rohit Seth7fb1d9f2005-11-13 16:06:43 -08002265 /*
Paul Jackson9276b1bc2006-12-06 20:31:48 -08002266 * Scan zonelist, looking for a zone with enough free.
Vladimir Davydov344736f2014-10-20 15:50:30 +04002267 * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
Rohit Seth7fb1d9f2005-11-13 16:06:43 -08002268 */
Vlastimil Babkaa9263752015-02-11 15:25:41 -08002269 for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
2270 ac->nodemask) {
Johannes Weinere085dbc2013-09-11 14:20:46 -07002271 unsigned long mark;
2272
Kirill A. Shutemove5adfff2012-12-11 16:00:29 -08002273 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
Paul Jackson9276b1bc2006-12-06 20:31:48 -08002274 !zlc_zone_worth_trying(zonelist, z, allowednodes))
2275 continue;
Mel Gorman664eedd2014-06-04 16:10:08 -07002276 if (cpusets_enabled() &&
2277 (alloc_flags & ALLOC_CPUSET) &&
Vladimir Davydov344736f2014-10-20 15:50:30 +04002278 !cpuset_zone_allowed(zone, gfp_mask))
Mel Gormancd38b112011-07-25 17:12:29 -07002279 continue;
Johannes Weinera756cf52012-01-10 15:07:49 -08002280 /*
Johannes Weiner81c0a2b2013-09-11 14:20:47 -07002281 * Distribute pages in proportion to the individual
2282 * zone size to ensure fair page aging. The zone a
2283 * page was allocated in should have no effect on the
2284 * time the page has in memory before being reclaimed.
Johannes Weiner81c0a2b2013-09-11 14:20:47 -07002285 */
Johannes Weiner3a025762014-04-07 15:37:48 -07002286 if (alloc_flags & ALLOC_FAIR) {
Vlastimil Babkaa9263752015-02-11 15:25:41 -08002287 if (!zone_local(ac->preferred_zone, zone))
Mel Gormanf7b5d642014-08-06 16:07:20 -07002288 break;
Johannes Weiner57054652014-10-09 15:28:17 -07002289 if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) {
Mel Gorman4ffeaf32014-08-06 16:07:22 -07002290 nr_fair_skipped++;
Johannes Weiner3a025762014-04-07 15:37:48 -07002291 continue;
Mel Gorman4ffeaf32014-08-06 16:07:22 -07002292 }
Johannes Weiner81c0a2b2013-09-11 14:20:47 -07002293 }
2294 /*
Johannes Weinera756cf52012-01-10 15:07:49 -08002295 * When allocating a page cache page for writing, we
2296 * want to get it from a zone that is within its dirty
2297 * limit, such that no single zone holds more than its
2298 * proportional share of globally allowed dirty pages.
2299 * The dirty limits take into account the zone's
2300 * lowmem reserves and high watermark so that kswapd
2301 * should be able to balance it without having to
2302 * write pages from its LRU list.
2303 *
2304 * This may look like it could increase pressure on
2305 * lower zones by failing allocations in higher zones
2306 * before they are full. But the pages that do spill
2307 * over are limited as the lower zones are protected
2308 * by this very same mechanism. It should not become
2309 * a practical burden to them.
2310 *
2311 * XXX: For now, allow allocations to potentially
2312 * exceed the per-zone dirty limit in the slowpath
2313 * (ALLOC_WMARK_LOW unset) before going into reclaim,
2314 * which is important when on a NUMA setup the allowed
2315 * zones are together not big enough to reach the
2316 * global limit. The proper fix for these situations
2317 * will require awareness of zones in the
2318 * dirty-throttling and the flusher threads.
2319 */
Mel Gormana6e21b142014-06-04 16:10:12 -07002320 if (consider_zone_dirty && !zone_dirty_ok(zone))
Mel Gorman800a1e72014-06-04 16:10:06 -07002321 continue;
Rohit Seth7fb1d9f2005-11-13 16:06:43 -08002322
Johannes Weinere085dbc2013-09-11 14:20:46 -07002323 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
2324 if (!zone_watermark_ok(zone, order, mark,
Vlastimil Babkaa9263752015-02-11 15:25:41 -08002325 ac->classzone_idx, alloc_flags)) {
Mel Gormanfa5e0842009-06-16 15:33:22 -07002326 int ret;
2327
Mel Gorman5dab2912014-06-04 16:10:14 -07002328 /* Checked here to keep the fast path fast */
2329 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
2330 if (alloc_flags & ALLOC_NO_WATERMARKS)
2331 goto try_this_zone;
2332
Kirill A. Shutemove5adfff2012-12-11 16:00:29 -08002333 if (IS_ENABLED(CONFIG_NUMA) &&
2334 !did_zlc_setup && nr_online_nodes > 1) {
Mel Gormancd38b112011-07-25 17:12:29 -07002335 /*
2336 * we do zlc_setup if there are multiple nodes
2337 * and before considering the first zone allowed
2338 * by the cpuset.
2339 */
2340 allowednodes = zlc_setup(zonelist, alloc_flags);
2341 zlc_active = 1;
2342 did_zlc_setup = 1;
2343 }
2344
David Rientjes957f8222012-10-08 16:33:24 -07002345 if (zone_reclaim_mode == 0 ||
Vlastimil Babkaa9263752015-02-11 15:25:41 -08002346 !zone_allows_reclaim(ac->preferred_zone, zone))
Mel Gormanfa5e0842009-06-16 15:33:22 -07002347 goto this_zone_full;
2348
Mel Gormancd38b112011-07-25 17:12:29 -07002349 /*
2350 * As we may have just activated ZLC, check if the first
2351 * eligible zone has failed zone_reclaim recently.
2352 */
Kirill A. Shutemove5adfff2012-12-11 16:00:29 -08002353 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
Mel Gormancd38b112011-07-25 17:12:29 -07002354 !zlc_zone_worth_trying(zonelist, z, allowednodes))
2355 continue;
2356
Mel Gormanfa5e0842009-06-16 15:33:22 -07002357 ret = zone_reclaim(zone, gfp_mask, order);
2358 switch (ret) {
2359 case ZONE_RECLAIM_NOSCAN:
2360 /* did not scan */
Mel Gormancd38b112011-07-25 17:12:29 -07002361 continue;
Mel Gormanfa5e0842009-06-16 15:33:22 -07002362 case ZONE_RECLAIM_FULL:
2363 /* scanned but unreclaimable */
Mel Gormancd38b112011-07-25 17:12:29 -07002364 continue;
Mel Gormanfa5e0842009-06-16 15:33:22 -07002365 default:
2366 /* did we reclaim enough */
Mel Gormanfed27192013-04-29 15:07:57 -07002367 if (zone_watermark_ok(zone, order, mark,
Vlastimil Babkaa9263752015-02-11 15:25:41 -08002368 ac->classzone_idx, alloc_flags))
Mel Gormanfed27192013-04-29 15:07:57 -07002369 goto try_this_zone;
2370
2371 /*
2372 * Failed to reclaim enough to meet watermark.
2373 * Only mark the zone full if checking the min
2374 * watermark or if we failed to reclaim just
2375 * 1<<order pages or else the page allocator
2376 * fastpath will prematurely mark zones full
2377 * when the watermark is between the low and
2378 * min watermarks.
2379 */
2380 if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) ||
2381 ret == ZONE_RECLAIM_SOME)
Paul Jackson9276b1bc2006-12-06 20:31:48 -08002382 goto this_zone_full;
Mel Gormanfed27192013-04-29 15:07:57 -07002383
2384 continue;
Paul Jackson0798e512006-12-06 20:31:38 -08002385 }
Rohit Seth7fb1d9f2005-11-13 16:06:43 -08002386 }
2387
Mel Gormanfa5e0842009-06-16 15:33:22 -07002388try_this_zone:
Vlastimil Babkaa9263752015-02-11 15:25:41 -08002389 page = buffered_rmqueue(ac->preferred_zone, zone, order,
2390 gfp_mask, ac->migratetype);
Vlastimil Babka75379192015-02-11 15:25:38 -08002391 if (page) {
2392 if (prep_new_page(page, order, gfp_mask, alloc_flags))
2393 goto try_this_zone;
2394 return page;
2395 }
Paul Jackson9276b1bc2006-12-06 20:31:48 -08002396this_zone_full:
Mel Gorman65bb3712014-06-04 16:10:05 -07002397 if (IS_ENABLED(CONFIG_NUMA) && zlc_active)
Paul Jackson9276b1bc2006-12-06 20:31:48 -08002398 zlc_mark_zone_full(zonelist, z);
Mel Gorman54a6eb52008-04-28 02:12:16 -07002399 }
Paul Jackson9276b1bc2006-12-06 20:31:48 -08002400
Mel Gorman4ffeaf32014-08-06 16:07:22 -07002401 /*
2402 * The first pass makes sure allocations are spread fairly within the
2403 * local node. However, the local node might have free pages left
2404 * after the fairness batches are exhausted, and remote zones haven't
2405 * even been considered yet. Try once more without fairness, and
2406 * include remote zones now, before entering the slowpath and waking
2407 * kswapd: prefer spilling to a remote zone over swapping locally.
2408 */
2409 if (alloc_flags & ALLOC_FAIR) {
2410 alloc_flags &= ~ALLOC_FAIR;
2411 if (nr_fair_skipped) {
2412 zonelist_rescan = true;
Vlastimil Babkaa9263752015-02-11 15:25:41 -08002413 reset_alloc_batches(ac->preferred_zone);
Mel Gorman4ffeaf32014-08-06 16:07:22 -07002414 }
2415 if (nr_online_nodes > 1)
2416 zonelist_rescan = true;
2417 }
2418
2419 if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) {
2420 /* Disable zlc cache for second zonelist scan */
2421 zlc_active = 0;
2422 zonelist_rescan = true;
2423 }
2424
2425 if (zonelist_rescan)
2426 goto zonelist_scan;
2427
2428 return NULL;
Martin Hicks753ee722005-06-21 17:14:41 -07002429}
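
/*
 * Illustrative sketch, not part of the original source: how a page cache
 * write path might opt in to the per-zone dirty limit check performed in
 * the scan above.  The helper name and the gfp flags chosen here are
 * assumptions for illustration only.
 */
static inline struct page *example_pagecache_write_page(gfp_t gfp)
{
	/* __GFP_WRITE marks the page as one that will likely be dirtied */
	return alloc_pages(gfp | __GFP_WRITE, 0);
}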
2430
David Rientjes29423e772011-03-22 16:30:47 -07002431/*
2432 * Large machines with many possible nodes should not always dump per-node
2433 * meminfo in irq context.
2434 */
2435static inline bool should_suppress_show_mem(void)
2436{
2437 bool ret = false;
2438
2439#if NODES_SHIFT > 8
2440 ret = in_interrupt();
2441#endif
2442 return ret;
2443}
2444
Dave Hansena238ab52011-05-24 17:12:16 -07002445static DEFINE_RATELIMIT_STATE(nopage_rs,
2446 DEFAULT_RATELIMIT_INTERVAL,
2447 DEFAULT_RATELIMIT_BURST);
2448
2449void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
2450{
Dave Hansena238ab52011-05-24 17:12:16 -07002451 unsigned int filter = SHOW_MEM_FILTER_NODES;
2452
Stanislaw Gruszkac0a32fc2012-01-10 15:07:28 -08002453 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
2454 debug_guardpage_minorder() > 0)
Dave Hansena238ab52011-05-24 17:12:16 -07002455 return;
2456
2457 /*
2458 * This documents exceptions given to allocations in certain
2459 * contexts that are allowed to allocate outside current's set
2460 * of allowed nodes.
2461 */
2462 if (!(gfp_mask & __GFP_NOMEMALLOC))
2463 if (test_thread_flag(TIF_MEMDIE) ||
2464 (current->flags & (PF_MEMALLOC | PF_EXITING)))
2465 filter &= ~SHOW_MEM_FILTER_NODES;
2466 if (in_interrupt() || !(gfp_mask & __GFP_WAIT))
2467 filter &= ~SHOW_MEM_FILTER_NODES;
2468
2469 if (fmt) {
Joe Perches3ee9a4f2011-10-31 17:08:35 -07002470 struct va_format vaf;
2471 va_list args;
2472
Dave Hansena238ab52011-05-24 17:12:16 -07002473 va_start(args, fmt);
Joe Perches3ee9a4f2011-10-31 17:08:35 -07002474
2475 vaf.fmt = fmt;
2476 vaf.va = &args;
2477
2478 pr_warn("%pV", &vaf);
2479
Dave Hansena238ab52011-05-24 17:12:16 -07002480 va_end(args);
2481 }
2482
Joe Perches3ee9a4f2011-10-31 17:08:35 -07002483 pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n",
2484 current->comm, order, gfp_mask);
Dave Hansena238ab52011-05-24 17:12:16 -07002485
2486 dump_stack();
2487 if (!should_suppress_show_mem())
2488 show_mem(filter);
2489}
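
/*
 * Illustrative sketch, not part of the original source: how a failing
 * allocation site might report through warn_alloc_failed().  The message
 * text and the size parameter are hypothetical.
 */
static inline void example_report_failure(gfp_t gfp_mask, int order,
					  size_t size)
{
	warn_alloc_failed(gfp_mask, order,
			  "example caller: %zu bytes not available\n", size);
}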
2490
Mel Gorman11e33f62009-06-16 15:31:57 -07002491static inline struct page *
2492__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
Vlastimil Babkaa9263752015-02-11 15:25:41 -08002493 const struct alloc_context *ac, unsigned long *did_some_progress)
Mel Gorman11e33f62009-06-16 15:31:57 -07002494{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002495 struct page *page;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002496
Johannes Weiner9879de72015-01-26 12:58:32 -08002497 *did_some_progress = 0;
2498
Johannes Weiner9879de72015-01-26 12:58:32 -08002499 /*
Johannes Weinerdc564012015-06-24 16:57:19 -07002500 * Acquire the oom lock. If that fails, somebody else is
2501 * making progress for us.
Johannes Weiner9879de72015-01-26 12:58:32 -08002502 */
Johannes Weinerdc564012015-06-24 16:57:19 -07002503 if (!mutex_trylock(&oom_lock)) {
Johannes Weiner9879de72015-01-26 12:58:32 -08002504 *did_some_progress = 1;
Mel Gorman11e33f62009-06-16 15:31:57 -07002505 schedule_timeout_uninterruptible(1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002506 return NULL;
2507 }
Jens Axboe6b1de912005-11-17 21:35:02 +01002508
Mel Gorman11e33f62009-06-16 15:31:57 -07002509 /*
2510	 * Go through the zonelist yet one more time, keeping a very high
2511	 * watermark here; this is only to catch a parallel OOM killing, and we
2512	 * must fail if we're still under heavy pressure.
2513 */
Vlastimil Babkaa9263752015-02-11 15:25:41 -08002514 page = get_page_from_freelist(gfp_mask | __GFP_HARDWALL, order,
2515 ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
Rohit Seth7fb1d9f2005-11-13 16:06:43 -08002516 if (page)
Mel Gorman11e33f62009-06-16 15:31:57 -07002517 goto out;
2518
KAMEZAWA Hiroyuki4365a562009-12-15 16:45:33 -08002519 if (!(gfp_mask & __GFP_NOFAIL)) {
Johannes Weiner9879de72015-01-26 12:58:32 -08002520 /* Coredumps can quickly deplete all memory reserves */
2521 if (current->flags & PF_DUMPCORE)
2522 goto out;
KAMEZAWA Hiroyuki4365a562009-12-15 16:45:33 -08002523 /* The OOM killer will not help higher order allocs */
2524 if (order > PAGE_ALLOC_COSTLY_ORDER)
2525 goto out;
David Rientjes03668b32010-08-09 17:18:54 -07002526 /* The OOM killer does not needlessly kill tasks for lowmem */
Vlastimil Babkaa9263752015-02-11 15:25:41 -08002527 if (ac->high_zoneidx < ZONE_NORMAL)
David Rientjes03668b32010-08-09 17:18:54 -07002528 goto out;
Johannes Weiner90839052015-06-24 16:57:21 -07002529 /* The OOM killer does not compensate for IO-less reclaim */
Johannes Weinercc873172015-02-27 15:52:09 -08002530 if (!(gfp_mask & __GFP_FS)) {
2531 /*
2532 * XXX: Page reclaim didn't yield anything,
2533 * and the OOM killer can't be invoked, but
Johannes Weiner90839052015-06-24 16:57:21 -07002534 * keep looping as per tradition.
Johannes Weinercc873172015-02-27 15:52:09 -08002535 */
2536 *did_some_progress = 1;
Johannes Weiner9879de72015-01-26 12:58:32 -08002537 goto out;
Johannes Weinercc873172015-02-27 15:52:09 -08002538 }
Johannes Weiner90839052015-06-24 16:57:21 -07002539 if (pm_suspended_storage())
2540 goto out;
David Rientjes4167e9b2015-04-14 15:46:55 -07002541 /* The OOM killer may not free memory on a specific node */
KAMEZAWA Hiroyuki4365a562009-12-15 16:45:33 -08002542 if (gfp_mask & __GFP_THISNODE)
2543 goto out;
2544 }
Mel Gorman11e33f62009-06-16 15:31:57 -07002545 /* Exhausted what can be done so it's blamo time */
Michal Hockoe009d5d2015-03-12 16:25:52 -07002546 if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false)
2547 || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
Michal Hockoc32b3cb2015-02-11 15:26:24 -08002548 *did_some_progress = 1;
Mel Gorman11e33f62009-06-16 15:31:57 -07002549out:
Johannes Weinerdc564012015-06-24 16:57:19 -07002550 mutex_unlock(&oom_lock);
Mel Gorman11e33f62009-06-16 15:31:57 -07002551 return page;
2552}
2553
Mel Gorman56de7262010-05-24 14:32:30 -07002554#ifdef CONFIG_COMPACTION
2555/* Try memory compaction for high-order allocations before reclaim */
2556static struct page *
2557__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
Vlastimil Babkaa9263752015-02-11 15:25:41 -08002558 int alloc_flags, const struct alloc_context *ac,
2559 enum migrate_mode mode, int *contended_compaction,
2560 bool *deferred_compaction)
Mel Gorman56de7262010-05-24 14:32:30 -07002561{
Vlastimil Babka53853e22014-10-09 15:27:02 -07002562 unsigned long compact_result;
Vlastimil Babka98dd3b42014-10-09 15:27:04 -07002563 struct page *page;
Vlastimil Babka53853e22014-10-09 15:27:02 -07002564
Mel Gorman66199712012-01-12 17:19:41 -08002565 if (!order)
Mel Gorman56de7262010-05-24 14:32:30 -07002566 return NULL;
2567
Andrew Mortonc06b1fc2011-01-13 15:47:32 -08002568 current->flags |= PF_MEMALLOC;
Vlastimil Babka1a6d53a2015-02-11 15:25:44 -08002569 compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
2570 mode, contended_compaction);
Andrew Mortonc06b1fc2011-01-13 15:47:32 -08002571 current->flags &= ~PF_MEMALLOC;
Mel Gorman56de7262010-05-24 14:32:30 -07002572
Vlastimil Babka98dd3b42014-10-09 15:27:04 -07002573 switch (compact_result) {
2574 case COMPACT_DEFERRED:
Vlastimil Babka53853e22014-10-09 15:27:02 -07002575 *deferred_compaction = true;
Vlastimil Babka98dd3b42014-10-09 15:27:04 -07002576 /* fall-through */
2577 case COMPACT_SKIPPED:
2578 return NULL;
2579 default:
2580 break;
Mel Gorman56de7262010-05-24 14:32:30 -07002581 }
2582
Vlastimil Babka98dd3b42014-10-09 15:27:04 -07002583 /*
2584	 * In at least one zone, compaction wasn't deferred or skipped, so
2585	 * count this as a compaction stall.
2586 */
2587 count_vm_event(COMPACTSTALL);
2588
Vlastimil Babkaa9263752015-02-11 15:25:41 -08002589 page = get_page_from_freelist(gfp_mask, order,
2590 alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
Vlastimil Babka98dd3b42014-10-09 15:27:04 -07002591
2592 if (page) {
2593 struct zone *zone = page_zone(page);
2594
2595 zone->compact_blockskip_flush = false;
2596 compaction_defer_reset(zone, order, true);
2597 count_vm_event(COMPACTSUCCESS);
2598 return page;
2599 }
2600
2601 /*
Vlastimil Babka98dd3b42014-10-09 15:27:04 -07002602	 * It's bad if a compaction run occurs and fails. The most likely reason
2603 * is that pages exist, but not enough to satisfy watermarks.
2604 */
2605 count_vm_event(COMPACTFAIL);
2606
2607 cond_resched();
2608
Mel Gorman56de7262010-05-24 14:32:30 -07002609 return NULL;
2610}
2611#else
2612static inline struct page *
2613__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
Vlastimil Babkaa9263752015-02-11 15:25:41 -08002614 int alloc_flags, const struct alloc_context *ac,
2615 enum migrate_mode mode, int *contended_compaction,
2616 bool *deferred_compaction)
Mel Gorman56de7262010-05-24 14:32:30 -07002617{
2618 return NULL;
2619}
2620#endif /* CONFIG_COMPACTION */
2621
Marek Szyprowskibba90712012-01-25 12:09:52 +01002622/* Perform direct synchronous page reclaim */
2623static int
Vlastimil Babkaa9263752015-02-11 15:25:41 -08002624__perform_reclaim(gfp_t gfp_mask, unsigned int order,
2625 const struct alloc_context *ac)
Mel Gorman11e33f62009-06-16 15:31:57 -07002626{
Mel Gorman11e33f62009-06-16 15:31:57 -07002627 struct reclaim_state reclaim_state;
Marek Szyprowskibba90712012-01-25 12:09:52 +01002628 int progress;
Mel Gorman11e33f62009-06-16 15:31:57 -07002629
2630 cond_resched();
2631
2632 /* We now go into synchronous reclaim */
2633 cpuset_memory_pressure_bump();
Andrew Mortonc06b1fc2011-01-13 15:47:32 -08002634 current->flags |= PF_MEMALLOC;
Mel Gorman11e33f62009-06-16 15:31:57 -07002635 lockdep_set_current_reclaim_state(gfp_mask);
2636 reclaim_state.reclaimed_slab = 0;
Andrew Mortonc06b1fc2011-01-13 15:47:32 -08002637 current->reclaim_state = &reclaim_state;
Mel Gorman11e33f62009-06-16 15:31:57 -07002638
Vlastimil Babkaa9263752015-02-11 15:25:41 -08002639 progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
2640 ac->nodemask);
Mel Gorman11e33f62009-06-16 15:31:57 -07002641
Andrew Mortonc06b1fc2011-01-13 15:47:32 -08002642 current->reclaim_state = NULL;
Mel Gorman11e33f62009-06-16 15:31:57 -07002643 lockdep_clear_current_reclaim_state();
Andrew Mortonc06b1fc2011-01-13 15:47:32 -08002644 current->flags &= ~PF_MEMALLOC;
Mel Gorman11e33f62009-06-16 15:31:57 -07002645
2646 cond_resched();
2647
Marek Szyprowskibba90712012-01-25 12:09:52 +01002648 return progress;
2649}
2650
2651/* The really slow allocator path where we enter direct reclaim */
2652static inline struct page *
2653__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
Vlastimil Babkaa9263752015-02-11 15:25:41 -08002654 int alloc_flags, const struct alloc_context *ac,
2655 unsigned long *did_some_progress)
Marek Szyprowskibba90712012-01-25 12:09:52 +01002656{
2657 struct page *page = NULL;
2658 bool drained = false;
2659
Vlastimil Babkaa9263752015-02-11 15:25:41 -08002660 *did_some_progress = __perform_reclaim(gfp_mask, order, ac);
Mel Gorman9ee493c2010-09-09 16:38:18 -07002661 if (unlikely(!(*did_some_progress)))
2662 return NULL;
Mel Gorman11e33f62009-06-16 15:31:57 -07002663
Mel Gorman76d3fbf2011-07-25 17:12:30 -07002664 /* After successful reclaim, reconsider all zones for allocation */
Kirill A. Shutemove5adfff2012-12-11 16:00:29 -08002665 if (IS_ENABLED(CONFIG_NUMA))
Vlastimil Babkaa9263752015-02-11 15:25:41 -08002666 zlc_clear_zones_full(ac->zonelist);
Mel Gorman76d3fbf2011-07-25 17:12:30 -07002667
Mel Gorman9ee493c2010-09-09 16:38:18 -07002668retry:
Vlastimil Babkaa9263752015-02-11 15:25:41 -08002669 page = get_page_from_freelist(gfp_mask, order,
2670 alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
Mel Gorman9ee493c2010-09-09 16:38:18 -07002671
2672 /*
2673 * If an allocation failed after direct reclaim, it could be because
2674 * pages are pinned on the per-cpu lists. Drain them and try again
2675 */
2676 if (!page && !drained) {
Vlastimil Babka93481ff2014-12-10 15:43:01 -08002677 drain_all_pages(NULL);
Mel Gorman9ee493c2010-09-09 16:38:18 -07002678 drained = true;
2679 goto retry;
2680 }
2681
Mel Gorman11e33f62009-06-16 15:31:57 -07002682 return page;
2683}
2684
Mel Gorman11e33f62009-06-16 15:31:57 -07002685/*
2686 * This is called in the allocator slow-path if the allocation request is of
2687 * sufficient urgency to ignore watermarks and take other desperate measures
2688 */
2689static inline struct page *
2690__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
Vlastimil Babkaa9263752015-02-11 15:25:41 -08002691 const struct alloc_context *ac)
Mel Gorman11e33f62009-06-16 15:31:57 -07002692{
2693 struct page *page;
2694
2695 do {
Vlastimil Babkaa9263752015-02-11 15:25:41 -08002696 page = get_page_from_freelist(gfp_mask, order,
2697 ALLOC_NO_WATERMARKS, ac);
Mel Gorman11e33f62009-06-16 15:31:57 -07002698
2699 if (!page && gfp_mask & __GFP_NOFAIL)
Vlastimil Babkaa9263752015-02-11 15:25:41 -08002700 wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC,
2701 HZ/50);
Mel Gorman11e33f62009-06-16 15:31:57 -07002702 } while (!page && (gfp_mask & __GFP_NOFAIL));
2703
2704 return page;
2705}
2706
Vlastimil Babkaa9263752015-02-11 15:25:41 -08002707static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
Mel Gorman11e33f62009-06-16 15:31:57 -07002708{
2709 struct zoneref *z;
2710 struct zone *zone;
2711
Vlastimil Babkaa9263752015-02-11 15:25:41 -08002712 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
2713 ac->high_zoneidx, ac->nodemask)
2714 wakeup_kswapd(zone, order, zone_idx(ac->preferred_zone));
Mel Gorman11e33f62009-06-16 15:31:57 -07002715}
2716
Peter Zijlstra341ce062009-06-16 15:32:02 -07002717static inline int
2718gfp_to_alloc_flags(gfp_t gfp_mask)
2719{
Peter Zijlstra341ce062009-06-16 15:32:02 -07002720 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
David Rientjesb104a352014-07-30 16:08:24 -07002721 const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD));
Peter Zijlstra341ce062009-06-16 15:32:02 -07002722
Mel Gormana56f57f2009-06-16 15:32:02 -07002723 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
Namhyung Kime6223a32010-10-26 14:21:59 -07002724 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
Mel Gormana56f57f2009-06-16 15:32:02 -07002725
Peter Zijlstra341ce062009-06-16 15:32:02 -07002726 /*
2727 * The caller may dip into page reserves a bit more if the caller
2728	 * cannot run direct reclaim, or if the caller has a realtime scheduling
2729 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
David Rientjesb104a352014-07-30 16:08:24 -07002730 * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH).
Peter Zijlstra341ce062009-06-16 15:32:02 -07002731 */
Namhyung Kime6223a32010-10-26 14:21:59 -07002732 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
Peter Zijlstra341ce062009-06-16 15:32:02 -07002733
David Rientjesb104a352014-07-30 16:08:24 -07002734 if (atomic) {
Andrea Arcangeli5c3240d2011-01-13 15:46:49 -08002735 /*
David Rientjesb104a352014-07-30 16:08:24 -07002736 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
2737 * if it can't schedule.
Andrea Arcangeli5c3240d2011-01-13 15:46:49 -08002738 */
David Rientjesb104a352014-07-30 16:08:24 -07002739 if (!(gfp_mask & __GFP_NOMEMALLOC))
Andrea Arcangeli5c3240d2011-01-13 15:46:49 -08002740 alloc_flags |= ALLOC_HARDER;
Peter Zijlstra341ce062009-06-16 15:32:02 -07002741 /*
David Rientjesb104a352014-07-30 16:08:24 -07002742 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
Vladimir Davydov344736f2014-10-20 15:50:30 +04002743 * comment for __cpuset_node_allowed().
Peter Zijlstra341ce062009-06-16 15:32:02 -07002744 */
2745 alloc_flags &= ~ALLOC_CPUSET;
Andrew Mortonc06b1fc2011-01-13 15:47:32 -08002746 } else if (unlikely(rt_task(current)) && !in_interrupt())
Peter Zijlstra341ce062009-06-16 15:32:02 -07002747 alloc_flags |= ALLOC_HARDER;
2748
Mel Gormanb37f1dd2012-07-31 16:44:03 -07002749 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
2750 if (gfp_mask & __GFP_MEMALLOC)
2751 alloc_flags |= ALLOC_NO_WATERMARKS;
Mel Gorman907aed42012-07-31 16:44:07 -07002752 else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
2753 alloc_flags |= ALLOC_NO_WATERMARKS;
2754 else if (!in_interrupt() &&
2755 ((current->flags & PF_MEMALLOC) ||
2756 unlikely(test_thread_flag(TIF_MEMDIE))))
Peter Zijlstra341ce062009-06-16 15:32:02 -07002757 alloc_flags |= ALLOC_NO_WATERMARKS;
2758 }
Bartlomiej Zolnierkiewiczd95ea5d2012-10-08 16:32:05 -07002759#ifdef CONFIG_CMA
David Rientjes43e7a342014-10-09 15:27:25 -07002760 if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
Bartlomiej Zolnierkiewiczd95ea5d2012-10-08 16:32:05 -07002761 alloc_flags |= ALLOC_CMA;
2762#endif
Peter Zijlstra341ce062009-06-16 15:32:02 -07002763 return alloc_flags;
2764}
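
/*
 * Illustrative worked example, not part of the original source, assuming
 * GFP_ATOMIC is defined as just __GFP_HIGH in this tree: neither __GFP_WAIT
 * nor __GFP_NO_KSWAPD is set, so "atomic" is true.  Starting from
 * ALLOC_WMARK_MIN | ALLOC_CPUSET, __GFP_HIGH adds ALLOC_HIGH, and the atomic
 * branch adds ALLOC_HARDER while clearing ALLOC_CPUSET, giving roughly
 * ALLOC_WMARK_MIN | ALLOC_HIGH | ALLOC_HARDER for a GFP_ATOMIC request.
 */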
2765
Mel Gorman072bb0a2012-07-31 16:43:58 -07002766bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
2767{
Mel Gormanb37f1dd2012-07-31 16:44:03 -07002768 return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
Mel Gorman072bb0a2012-07-31 16:43:58 -07002769}
2770
Mel Gorman11e33f62009-06-16 15:31:57 -07002771static inline struct page *
2772__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
Vlastimil Babkaa9263752015-02-11 15:25:41 -08002773 struct alloc_context *ac)
Mel Gorman11e33f62009-06-16 15:31:57 -07002774{
2775 const gfp_t wait = gfp_mask & __GFP_WAIT;
2776 struct page *page = NULL;
2777 int alloc_flags;
2778 unsigned long pages_reclaimed = 0;
2779 unsigned long did_some_progress;
David Rientjese0b9dae2014-06-04 16:08:28 -07002780 enum migrate_mode migration_mode = MIGRATE_ASYNC;
Mel Gorman66199712012-01-12 17:19:41 -08002781 bool deferred_compaction = false;
Vlastimil Babka1f9efde2014-10-09 15:27:14 -07002782 int contended_compaction = COMPACT_CONTENDED_NONE;
Rohit Seth7fb1d9f2005-11-13 16:06:43 -08002783
Christoph Lameter952f3b52006-12-06 20:33:26 -08002784 /*
Mel Gorman72807a72009-06-16 15:32:18 -07002785 * In the slowpath, we sanity check order to avoid ever trying to
2786 * reclaim >= MAX_ORDER areas which will never succeed. Callers may
2787 * be using allocators in order of preference for an area that is
2788 * too large.
2789 */
Mel Gorman1fc28b72009-07-29 15:04:08 -07002790 if (order >= MAX_ORDER) {
2791 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
Mel Gorman72807a72009-06-16 15:32:18 -07002792 return NULL;
Mel Gorman1fc28b72009-07-29 15:04:08 -07002793 }
Rohit Seth7fb1d9f2005-11-13 16:06:43 -08002794
Christoph Lameter952f3b52006-12-06 20:33:26 -08002795 /*
David Rientjes4167e9b2015-04-14 15:46:55 -07002796 * If this allocation cannot block and it is for a specific node, then
2797 * fail early. There's no need to wakeup kswapd or retry for a
2798 * speculative node-specific allocation.
Christoph Lameter952f3b52006-12-06 20:33:26 -08002799 */
David Rientjes4167e9b2015-04-14 15:46:55 -07002800 if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !wait)
Christoph Lameter952f3b52006-12-06 20:33:26 -08002801 goto nopage;
2802
Johannes Weiner9879de72015-01-26 12:58:32 -08002803retry:
Johannes Weiner3a025762014-04-07 15:37:48 -07002804 if (!(gfp_mask & __GFP_NO_KSWAPD))
Vlastimil Babkaa9263752015-02-11 15:25:41 -08002805 wake_all_kswapds(order, ac);
Rohit Seth7fb1d9f2005-11-13 16:06:43 -08002806
Paul Jackson9bf22292005-09-06 15:18:12 -07002807 /*
Rohit Seth7fb1d9f2005-11-13 16:06:43 -08002808 * OK, we're below the kswapd watermark and have kicked background
2809 * reclaim. Now things get more complex, so set up alloc_flags according
2810 * to how we want to proceed.
Paul Jackson9bf22292005-09-06 15:18:12 -07002811 */
Peter Zijlstra341ce062009-06-16 15:32:02 -07002812 alloc_flags = gfp_to_alloc_flags(gfp_mask);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002813
David Rientjesf33261d2011-01-25 15:07:20 -08002814 /*
2815 * Find the true preferred zone if the allocation is unconstrained by
2816 * cpusets.
2817 */
Vlastimil Babkaa9263752015-02-11 15:25:41 -08002818 if (!(alloc_flags & ALLOC_CPUSET) && !ac->nodemask) {
Mel Gormand8846372014-06-04 16:10:33 -07002819 struct zoneref *preferred_zoneref;
Vlastimil Babkaa9263752015-02-11 15:25:41 -08002820 preferred_zoneref = first_zones_zonelist(ac->zonelist,
2821 ac->high_zoneidx, NULL, &ac->preferred_zone);
2822 ac->classzone_idx = zonelist_zone_idx(preferred_zoneref);
Mel Gormand8846372014-06-04 16:10:33 -07002823 }
David Rientjesf33261d2011-01-25 15:07:20 -08002824
Peter Zijlstra341ce062009-06-16 15:32:02 -07002825 /* This is the last chance, in general, before the goto nopage. */
Vlastimil Babkaa9263752015-02-11 15:25:41 -08002826 page = get_page_from_freelist(gfp_mask, order,
2827 alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
Rohit Seth7fb1d9f2005-11-13 16:06:43 -08002828 if (page)
2829 goto got_pg;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002830
Mel Gorman11e33f62009-06-16 15:31:57 -07002831 /* Allocate without watermarks if the context allows */
Peter Zijlstra341ce062009-06-16 15:32:02 -07002832 if (alloc_flags & ALLOC_NO_WATERMARKS) {
Mel Gorman183f6372012-07-31 16:44:12 -07002833 /*
2834		 * Ignore mempolicies if ALLOC_NO_WATERMARKS is set, on the grounds
2835		 * that the allocation is high priority and these types of
2836		 * allocations are system rather than user oriented
2837 */
Vlastimil Babkaa9263752015-02-11 15:25:41 -08002838 ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
Mel Gorman183f6372012-07-31 16:44:12 -07002839
Vlastimil Babkaa9263752015-02-11 15:25:41 -08002840 page = __alloc_pages_high_priority(gfp_mask, order, ac);
2841
Mel Gormancfd19c52012-07-31 16:44:10 -07002842 if (page) {
Peter Zijlstra341ce062009-06-16 15:32:02 -07002843 goto got_pg;
Mel Gormancfd19c52012-07-31 16:44:10 -07002844 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002845 }
2846
2847 /* Atomic allocations - we can't balance anything */
David Rientjesaed0a0e2014-01-21 15:51:12 -08002848 if (!wait) {
2849 /*
2850 * All existing users of the deprecated __GFP_NOFAIL are
2851 * blockable, so warn of any new users that actually allow this
2852 * type of allocation to fail.
2853 */
2854 WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002855 goto nopage;
David Rientjesaed0a0e2014-01-21 15:51:12 -08002856 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002857
Peter Zijlstra341ce062009-06-16 15:32:02 -07002858 /* Avoid recursion of direct reclaim */
Andrew Mortonc06b1fc2011-01-13 15:47:32 -08002859 if (current->flags & PF_MEMALLOC)
Peter Zijlstra341ce062009-06-16 15:32:02 -07002860 goto nopage;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002861
David Rientjes6583bb62009-07-29 15:02:06 -07002862 /* Avoid allocations with no watermarks from looping endlessly */
2863 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
2864 goto nopage;
2865
Mel Gorman77f1fe62011-01-13 15:45:57 -08002866 /*
2867 * Try direct compaction. The first pass is asynchronous. Subsequent
2868 * attempts after direct reclaim are synchronous
2869 */
Vlastimil Babkaa9263752015-02-11 15:25:41 -08002870 page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
2871 migration_mode,
2872 &contended_compaction,
Vlastimil Babka53853e22014-10-09 15:27:02 -07002873 &deferred_compaction);
Mel Gorman56de7262010-05-24 14:32:30 -07002874 if (page)
2875 goto got_pg;
David Rientjes75f30862014-06-04 16:08:30 -07002876
Vlastimil Babka1f9efde2014-10-09 15:27:14 -07002877 /* Checks for THP-specific high-order allocations */
2878 if ((gfp_mask & GFP_TRANSHUGE) == GFP_TRANSHUGE) {
2879 /*
2880 * If compaction is deferred for high-order allocations, it is
2881 * because sync compaction recently failed. If this is the case
2882 * and the caller requested a THP allocation, we do not want
2883 * to heavily disrupt the system, so we fail the allocation
2884 * instead of entering direct reclaim.
2885 */
2886 if (deferred_compaction)
2887 goto nopage;
2888
2889 /*
2890 * In all zones where compaction was attempted (and not
2891 * deferred or skipped), lock contention has been detected.
2892 * For THP allocation we do not want to disrupt the others
2893 * so we fallback to base pages instead.
2894 */
2895 if (contended_compaction == COMPACT_CONTENDED_LOCK)
2896 goto nopage;
2897
2898 /*
2899 * If compaction was aborted due to need_resched(), we do not
2900 * want to further increase allocation latency, unless it is
2901 * khugepaged trying to collapse.
2902 */
2903 if (contended_compaction == COMPACT_CONTENDED_SCHED
2904 && !(current->flags & PF_KTHREAD))
2905 goto nopage;
2906 }
Mel Gorman66199712012-01-12 17:19:41 -08002907
David Rientjes8fe78042014-08-06 16:07:54 -07002908 /*
2909 * It can become very expensive to allocate transparent hugepages at
2910 * fault, so use asynchronous memory compaction for THP unless it is
2911 * khugepaged trying to collapse.
2912 */
2913 if ((gfp_mask & GFP_TRANSHUGE) != GFP_TRANSHUGE ||
2914 (current->flags & PF_KTHREAD))
2915 migration_mode = MIGRATE_SYNC_LIGHT;
2916
Mel Gorman11e33f62009-06-16 15:31:57 -07002917 /* Try direct reclaim and then allocating */
Vlastimil Babkaa9263752015-02-11 15:25:41 -08002918 page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
2919 &did_some_progress);
Mel Gorman11e33f62009-06-16 15:31:57 -07002920 if (page)
2921 goto got_pg;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002922
Johannes Weiner90839052015-06-24 16:57:21 -07002923 /* Do not loop if specifically requested */
2924 if (gfp_mask & __GFP_NORETRY)
2925 goto noretry;
2926
2927 /* Keep reclaiming pages as long as there is reasonable progress */
Nishanth Aravamudana41f24e2008-04-29 00:58:25 -07002928 pages_reclaimed += did_some_progress;
Johannes Weiner90839052015-06-24 16:57:21 -07002929 if ((did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) ||
2930 ((gfp_mask & __GFP_REPEAT) && pages_reclaimed < (1 << order))) {
Mel Gorman11e33f62009-06-16 15:31:57 -07002931 /* Wait for some write requests to complete then retry */
Vlastimil Babkaa9263752015-02-11 15:25:41 -08002932 wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50);
Johannes Weiner9879de72015-01-26 12:58:32 -08002933 goto retry;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002934 }
2935
Johannes Weiner90839052015-06-24 16:57:21 -07002936 /* Reclaim has failed us, start killing things */
2937 page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
2938 if (page)
2939 goto got_pg;
2940
2941 /* Retry as long as the OOM killer is making progress */
2942 if (did_some_progress)
2943 goto retry;
2944
2945noretry:
2946 /*
2947 * High-order allocations do not necessarily loop after
2948 * direct reclaim and reclaim/compaction depends on compaction
2949 * being called after reclaim so call directly if necessary
2950 */
2951 page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags,
2952 ac, migration_mode,
2953 &contended_compaction,
2954 &deferred_compaction);
2955 if (page)
2956 goto got_pg;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002957nopage:
Dave Hansena238ab52011-05-24 17:12:16 -07002958 warn_alloc_failed(gfp_mask, order, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002959got_pg:
Mel Gorman072bb0a2012-07-31 16:43:58 -07002960 return page;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002961}
Mel Gorman11e33f62009-06-16 15:31:57 -07002962
2963/*
2964 * This is the 'heart' of the zoned buddy allocator.
2965 */
2966struct page *
2967__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2968 struct zonelist *zonelist, nodemask_t *nodemask)
2969{
Mel Gormand8846372014-06-04 16:10:33 -07002970 struct zoneref *preferred_zoneref;
Mel Gormancc9a6c82012-03-21 16:34:11 -07002971 struct page *page = NULL;
Mel Gormancc9a6c82012-03-21 16:34:11 -07002972 unsigned int cpuset_mems_cookie;
Johannes Weiner3a025762014-04-07 15:37:48 -07002973 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
Andrew Morton91fbdc02015-02-11 15:25:04 -08002974 gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
Vlastimil Babkaa9263752015-02-11 15:25:41 -08002975 struct alloc_context ac = {
2976 .high_zoneidx = gfp_zone(gfp_mask),
2977 .nodemask = nodemask,
2978 .migratetype = gfpflags_to_migratetype(gfp_mask),
2979 };
Mel Gorman11e33f62009-06-16 15:31:57 -07002980
Benjamin Herrenschmidtdcce2842009-06-18 13:24:12 +10002981 gfp_mask &= gfp_allowed_mask;
2982
Mel Gorman11e33f62009-06-16 15:31:57 -07002983 lockdep_trace_alloc(gfp_mask);
2984
2985 might_sleep_if(gfp_mask & __GFP_WAIT);
2986
2987 if (should_fail_alloc_page(gfp_mask, order))
2988 return NULL;
2989
2990 /*
2991	 * Check that the zones suitable for the gfp_mask contain at least one
2992	 * valid zone. It's possible to have an empty zonelist as a result
David Rientjes4167e9b2015-04-14 15:46:55 -07002993	 * of __GFP_THISNODE and a memoryless node.
Mel Gorman11e33f62009-06-16 15:31:57 -07002994 */
2995 if (unlikely(!zonelist->_zonerefs->zone))
2996 return NULL;
2997
Vlastimil Babkaa9263752015-02-11 15:25:41 -08002998 if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE)
Vlastimil Babka21bb9bd2014-10-09 15:26:51 -07002999 alloc_flags |= ALLOC_CMA;
3000
Mel Gormancc9a6c82012-03-21 16:34:11 -07003001retry_cpuset:
Mel Gormand26914d2014-04-03 14:47:24 -07003002 cpuset_mems_cookie = read_mems_allowed_begin();
Mel Gormancc9a6c82012-03-21 16:34:11 -07003003
Vlastimil Babkaa9263752015-02-11 15:25:41 -08003004 /* We set it here, as __alloc_pages_slowpath might have changed it */
3005 ac.zonelist = zonelist;
Mel Gorman5117f452009-06-16 15:31:59 -07003006 /* The preferred zone is used for statistics later */
Vlastimil Babkaa9263752015-02-11 15:25:41 -08003007 preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx,
3008 ac.nodemask ? : &cpuset_current_mems_allowed,
3009 &ac.preferred_zone);
3010 if (!ac.preferred_zone)
Mel Gormancc9a6c82012-03-21 16:34:11 -07003011 goto out;
Vlastimil Babkaa9263752015-02-11 15:25:41 -08003012 ac.classzone_idx = zonelist_zone_idx(preferred_zoneref);
Mel Gorman5117f452009-06-16 15:31:59 -07003013
3014 /* First allocation attempt */
Andrew Morton91fbdc02015-02-11 15:25:04 -08003015 alloc_mask = gfp_mask|__GFP_HARDWALL;
Vlastimil Babkaa9263752015-02-11 15:25:41 -08003016 page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
Ming Lei21caf2f2013-02-22 16:34:08 -08003017 if (unlikely(!page)) {
3018 /*
3019 * Runtime PM, block IO and its error handling path
3020 * can deadlock because I/O on the device might not
3021 * complete.
3022 */
Andrew Morton91fbdc02015-02-11 15:25:04 -08003023 alloc_mask = memalloc_noio_flags(gfp_mask);
3024
Vlastimil Babkaa9263752015-02-11 15:25:41 -08003025 page = __alloc_pages_slowpath(alloc_mask, order, &ac);
Ming Lei21caf2f2013-02-22 16:34:08 -08003026 }
Mel Gorman11e33f62009-06-16 15:31:57 -07003027
Xishi Qiu23f086f2015-02-11 15:25:07 -08003028 if (kmemcheck_enabled && page)
3029 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
3030
Vlastimil Babkaa9263752015-02-11 15:25:41 -08003031 trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
Mel Gormancc9a6c82012-03-21 16:34:11 -07003032
3033out:
3034 /*
3035 * When updating a task's mems_allowed, it is possible to race with
3036 * parallel threads in such a way that an allocation can fail while
3037 * the mask is being updated. If a page allocation is about to fail,
3038 * check if the cpuset changed during allocation and if so, retry.
3039 */
Mel Gormand26914d2014-04-03 14:47:24 -07003040 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
Mel Gormancc9a6c82012-03-21 16:34:11 -07003041 goto retry_cpuset;
3042
Mel Gorman11e33f62009-06-16 15:31:57 -07003043 return page;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003044}
Mel Gormand2391712009-06-16 15:31:52 -07003045EXPORT_SYMBOL(__alloc_pages_nodemask);
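
/*
 * Illustrative sketch, not part of the original source: a typical caller
 * reaches __alloc_pages_nodemask() through the alloc_pages() wrapper.  The
 * order value and error handling here are hypothetical.
 */
static inline void *example_alloc_buffer(void)
{
	struct page *page = alloc_pages(GFP_KERNEL, 2);	/* 4 contiguous pages */

	if (!page)
		return NULL;
	return page_address(page);	/* pair with __free_pages(page, 2) */
}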
Linus Torvalds1da177e2005-04-16 15:20:36 -07003046
3047/*
3048 * Common helper functions.
3049 */
Harvey Harrison920c7a52008-02-04 22:29:26 -08003050unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003051{
Akinobu Mita945a1112009-09-21 17:01:47 -07003052 struct page *page;
3053
3054 /*
3055 * __get_free_pages() returns a 32-bit address, which cannot represent
3056 * a highmem page
3057 */
3058 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
3059
Linus Torvalds1da177e2005-04-16 15:20:36 -07003060 page = alloc_pages(gfp_mask, order);
3061 if (!page)
3062 return 0;
3063 return (unsigned long) page_address(page);
3064}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003065EXPORT_SYMBOL(__get_free_pages);
3066
Harvey Harrison920c7a52008-02-04 22:29:26 -08003067unsigned long get_zeroed_page(gfp_t gfp_mask)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003068{
Akinobu Mita945a1112009-09-21 17:01:47 -07003069 return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003070}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003071EXPORT_SYMBOL(get_zeroed_page);
3072
Harvey Harrison920c7a52008-02-04 22:29:26 -08003073void __free_pages(struct page *page, unsigned int order)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003074{
Nick Pigginb5810032005-10-29 18:16:12 -07003075 if (put_page_testzero(page)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003076 if (order == 0)
Mel Gormanb745bc82014-06-04 16:10:22 -07003077 free_hot_cold_page(page, false);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003078 else
3079 __free_pages_ok(page, order);
3080 }
3081}
3082
3083EXPORT_SYMBOL(__free_pages);
3084
Harvey Harrison920c7a52008-02-04 22:29:26 -08003085void free_pages(unsigned long addr, unsigned int order)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003086{
3087 if (addr != 0) {
Nick Piggin725d7042006-09-25 23:30:55 -07003088 VM_BUG_ON(!virt_addr_valid((void *)addr));
Linus Torvalds1da177e2005-04-16 15:20:36 -07003089 __free_pages(virt_to_page((void *)addr), order);
3090 }
3091}
3092
3093EXPORT_SYMBOL(free_pages);
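
/*
 * Illustrative sketch, not part of the original source: the usual pairing
 * of the address-based helpers above.  The allocation order is hypothetical.
 */
static inline void example_free_pages_usage(void)
{
	unsigned long addr = __get_free_pages(GFP_KERNEL, 1);	/* 2 pages */

	if (addr)
		free_pages(addr, 1);	/* order must match the allocation */
}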
3094
Glauber Costa6a1a0d32012-12-18 14:22:00 -08003095/*
Alexander Duyckb63ae8c2015-05-06 21:11:57 -07003096 * Page Fragment:
3097 * An arbitrary-length arbitrary-offset area of memory which resides
3098 * within a 0 or higher order page. Multiple fragments within that page
3099 * are individually refcounted, in the page's reference counter.
3100 *
3101 * The page_frag functions below provide a simple allocation framework for
3102 * page fragments. This is used by the network stack and network device
3103 * drivers to provide a backing region of memory for use either as an
3104 * sk_buff->head or in the "frags" portion of skb_shared_info.
3105 */
3106static struct page *__page_frag_refill(struct page_frag_cache *nc,
3107 gfp_t gfp_mask)
3108{
3109 struct page *page = NULL;
3110 gfp_t gfp = gfp_mask;
3111
3112#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
3113 gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
3114 __GFP_NOMEMALLOC;
3115 page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
3116 PAGE_FRAG_CACHE_MAX_ORDER);
3117 nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
3118#endif
3119 if (unlikely(!page))
3120 page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
3121
3122 nc->va = page ? page_address(page) : NULL;
3123
3124 return page;
3125}
3126
3127void *__alloc_page_frag(struct page_frag_cache *nc,
3128 unsigned int fragsz, gfp_t gfp_mask)
3129{
3130 unsigned int size = PAGE_SIZE;
3131 struct page *page;
3132 int offset;
3133
3134 if (unlikely(!nc->va)) {
3135refill:
3136 page = __page_frag_refill(nc, gfp_mask);
3137 if (!page)
3138 return NULL;
3139
3140#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
3141 /* if size can vary use size else just use PAGE_SIZE */
3142 size = nc->size;
3143#endif
3144 /* Even if we own the page, we do not use atomic_set().
3145 * This would break get_page_unless_zero() users.
3146 */
3147 atomic_add(size - 1, &page->_count);
3148
3149 /* reset page count bias and offset to start of new frag */
3150 nc->pfmemalloc = page->pfmemalloc;
3151 nc->pagecnt_bias = size;
3152 nc->offset = size;
3153 }
3154
3155 offset = nc->offset - fragsz;
3156 if (unlikely(offset < 0)) {
3157 page = virt_to_page(nc->va);
3158
3159 if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count))
3160 goto refill;
3161
3162#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
3163 /* if size can vary use size else just use PAGE_SIZE */
3164 size = nc->size;
3165#endif
3166 /* OK, page count is 0, we can safely set it */
3167 atomic_set(&page->_count, size);
3168
3169 /* reset page count bias and offset to start of new frag */
3170 nc->pagecnt_bias = size;
3171 offset = size - fragsz;
3172 }
3173
3174 nc->pagecnt_bias--;
3175 nc->offset = offset;
3176
3177 return nc->va + offset;
3178}
3179EXPORT_SYMBOL(__alloc_page_frag);
3180
3181/*
3182 * Frees a page fragment allocated out of either a compound or order 0 page.
3183 */
3184void __free_page_frag(void *addr)
3185{
3186 struct page *page = virt_to_head_page(addr);
3187
3188 if (unlikely(put_page_testzero(page)))
3189 __free_pages_ok(page, compound_order(page));
3190}
3191EXPORT_SYMBOL(__free_page_frag);
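
/*
 * Illustrative sketch, not part of the original source: one way a driver
 * might carve small receive buffers out of a page_frag_cache.  The cache
 * is assumed to start out zeroed, and the fragment size is hypothetical.
 */
static inline void *example_frag_alloc(struct page_frag_cache *nc)
{
	/* GFP_ATOMIC because such callers typically run in softirq context */
	return __alloc_page_frag(nc, 256, GFP_ATOMIC);
}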
3192
3193/*
Vladimir Davydov52383432014-06-04 16:06:39 -07003194 * alloc_kmem_pages charges newly allocated pages to the kmem resource counter
3195 * of the current memory cgroup.
Glauber Costa6a1a0d32012-12-18 14:22:00 -08003196 *
Vladimir Davydov52383432014-06-04 16:06:39 -07003197 * It should be used when the caller would like to use kmalloc, but since the
3198 * allocation is large, it has to fall back to the page allocator.
Glauber Costa6a1a0d32012-12-18 14:22:00 -08003199 */
Vladimir Davydov52383432014-06-04 16:06:39 -07003200struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order)
3201{
3202 struct page *page;
3203 struct mem_cgroup *memcg = NULL;
3204
3205 if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
3206 return NULL;
3207 page = alloc_pages(gfp_mask, order);
3208 memcg_kmem_commit_charge(page, memcg, order);
3209 return page;
3210}
3211
3212struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
3213{
3214 struct page *page;
3215 struct mem_cgroup *memcg = NULL;
3216
3217 if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
3218 return NULL;
3219 page = alloc_pages_node(nid, gfp_mask, order);
3220 memcg_kmem_commit_charge(page, memcg, order);
3221 return page;
3222}
3223
3224/*
3225 * __free_kmem_pages and free_kmem_pages will free pages allocated with
3226 * alloc_kmem_pages.
3227 */
3228void __free_kmem_pages(struct page *page, unsigned int order)
Glauber Costa6a1a0d32012-12-18 14:22:00 -08003229{
3230 memcg_kmem_uncharge_pages(page, order);
3231 __free_pages(page, order);
3232}
3233
Vladimir Davydov52383432014-06-04 16:06:39 -07003234void free_kmem_pages(unsigned long addr, unsigned int order)
Glauber Costa6a1a0d32012-12-18 14:22:00 -08003235{
3236 if (addr != 0) {
3237 VM_BUG_ON(!virt_addr_valid((void *)addr));
Vladimir Davydov52383432014-06-04 16:06:39 -07003238 __free_kmem_pages(virt_to_page((void *)addr), order);
Glauber Costa6a1a0d32012-12-18 14:22:00 -08003239 }
3240}
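
/*
 * Illustrative sketch, not part of the original source: how a kmalloc-style
 * caller that fell back to the page allocator might use the kmem helpers
 * above.  The order value is hypothetical.
 */
static inline void *example_kmem_alloc(gfp_t gfp_mask, unsigned int order)
{
	struct page *page = alloc_kmem_pages(gfp_mask, order);

	/* the address returned here is freed with free_kmem_pages() */
	return page ? page_address(page) : NULL;
}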
3241
Andi Kleenee85c2e2011-05-11 15:13:34 -07003242static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
3243{
3244 if (addr) {
3245 unsigned long alloc_end = addr + (PAGE_SIZE << order);
3246 unsigned long used = addr + PAGE_ALIGN(size);
3247
3248 split_page(virt_to_page((void *)addr), order);
3249 while (used < alloc_end) {
3250 free_page(used);
3251 used += PAGE_SIZE;
3252 }
3253 }
3254 return (void *)addr;
3255}
3256
Timur Tabi2be0ffe2008-07-23 21:28:11 -07003257/**
3258 * alloc_pages_exact - allocate an exact number of physically-contiguous pages.
3259 * @size: the number of bytes to allocate
3260 * @gfp_mask: GFP flags for the allocation
3261 *
3262 * This function is similar to alloc_pages(), except that it allocates the
3263 * minimum number of pages to satisfy the request. alloc_pages() can only
3264 * allocate memory in power-of-two pages.
3265 *
3266 * This function is also limited by MAX_ORDER.
3267 *
3268 * Memory allocated by this function must be released by free_pages_exact().
3269 */
3270void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
3271{
3272 unsigned int order = get_order(size);
3273 unsigned long addr;
3274
3275 addr = __get_free_pages(gfp_mask, order);
Andi Kleenee85c2e2011-05-11 15:13:34 -07003276 return make_alloc_exact(addr, order, size);
Timur Tabi2be0ffe2008-07-23 21:28:11 -07003277}
3278EXPORT_SYMBOL(alloc_pages_exact);
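
/*
 * Illustrative worked example, not part of the original source, assuming
 * 4 KiB pages: alloc_pages_exact(10000, GFP_KERNEL) rounds the request up
 * to get_order(10000) == 2, i.e. a 16 KiB block, then make_alloc_exact()
 * splits it and frees everything past PAGE_ALIGN(10000) == 12288, so the
 * caller keeps three pages and the fourth goes back to the free lists.
 */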
3279
3280/**
Andi Kleenee85c2e2011-05-11 15:13:34 -07003281 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
3282 * pages on a node.
Randy Dunlapb5e6ab52011-05-16 13:16:54 -07003283 * @nid: the preferred node ID where memory should be allocated
Andi Kleenee85c2e2011-05-11 15:13:34 -07003284 * @size: the number of bytes to allocate
3285 * @gfp_mask: GFP flags for the allocation
3286 *
3287 * Like alloc_pages_exact(), but tries to allocate on node nid first before
3288 * falling back.
3289 * Note this is not alloc_pages_exact_node() which allocates on a specific node,
3290 * but is not exact.
3291 */
Fabian Fredericke1931812014-08-06 16:04:59 -07003292void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
Andi Kleenee85c2e2011-05-11 15:13:34 -07003293{
3294 unsigned order = get_order(size);
3295 struct page *p = alloc_pages_node(nid, gfp_mask, order);
3296 if (!p)
3297 return NULL;
3298 return make_alloc_exact((unsigned long)page_address(p), order, size);
3299}
Andi Kleenee85c2e2011-05-11 15:13:34 -07003300
3301/**
Timur Tabi2be0ffe2008-07-23 21:28:11 -07003302 * free_pages_exact - release memory allocated via alloc_pages_exact()
3303 * @virt: the value returned by alloc_pages_exact.
3304 * @size: size of allocation, same value as passed to alloc_pages_exact().
3305 *
3306 * Release the memory allocated by a previous call to alloc_pages_exact.
3307 */
3308void free_pages_exact(void *virt, size_t size)
3309{
3310 unsigned long addr = (unsigned long)virt;
3311 unsigned long end = addr + PAGE_ALIGN(size);
3312
3313 while (addr < end) {
3314 free_page(addr);
3315 addr += PAGE_SIZE;
3316 }
3317}
3318EXPORT_SYMBOL(free_pages_exact);
3319
Zhang Yanfeie0fb5812013-02-22 16:35:54 -08003320/**
3321 * nr_free_zone_pages - count number of pages beyond high watermark
3322 * @offset: The zone index of the highest zone
3323 *
3324 * nr_free_zone_pages() counts the number of pages which are beyond the
3325 * high watermark within all zones at or below a given zone index. For each
3326 * zone, the number of pages is calculated as:
Jiang Liu834405c2013-07-03 15:03:04 -07003327 * managed_pages - high_pages
Zhang Yanfeie0fb5812013-02-22 16:35:54 -08003328 */
Zhang Yanfeiebec3862013-02-22 16:35:43 -08003329static unsigned long nr_free_zone_pages(int offset)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003330{
Mel Gormandd1a2392008-04-28 02:12:17 -07003331 struct zoneref *z;
Mel Gorman54a6eb52008-04-28 02:12:16 -07003332 struct zone *zone;
3333
Martin J. Blighe310fd42005-07-29 22:59:18 -07003334 /* Just pick one node, since fallback list is circular */
Zhang Yanfeiebec3862013-02-22 16:35:43 -08003335 unsigned long sum = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003336
Mel Gorman0e884602008-04-28 02:12:14 -07003337 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003338
Mel Gorman54a6eb52008-04-28 02:12:16 -07003339 for_each_zone_zonelist(zone, z, zonelist, offset) {
Jiang Liub40da042013-02-22 16:33:52 -08003340 unsigned long size = zone->managed_pages;
Mel Gorman41858962009-06-16 15:32:12 -07003341 unsigned long high = high_wmark_pages(zone);
Martin J. Blighe310fd42005-07-29 22:59:18 -07003342 if (size > high)
3343 sum += size - high;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003344 }
3345
3346 return sum;
3347}
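
/*
 * Illustrative worked example, not part of the original source: a zone with
 * managed_pages == 1000000 and a high watermark of 2000 pages contributes
 * 998000 pages to the sum above, while a zone whose managed_pages do not
 * exceed its high watermark contributes nothing.
 */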
3348
Zhang Yanfeie0fb5812013-02-22 16:35:54 -08003349/**
3350 * nr_free_buffer_pages - count number of pages beyond high watermark
3351 *
3352 * nr_free_buffer_pages() counts the number of pages which are beyond the high
3353 * watermark within ZONE_DMA and ZONE_NORMAL.
Linus Torvalds1da177e2005-04-16 15:20:36 -07003354 */
Zhang Yanfeiebec3862013-02-22 16:35:43 -08003355unsigned long nr_free_buffer_pages(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003356{
Al Viroaf4ca452005-10-21 02:55:38 -04003357 return nr_free_zone_pages(gfp_zone(GFP_USER));
Linus Torvalds1da177e2005-04-16 15:20:36 -07003358}
Meelap Shahc2f1a552007-07-17 04:04:39 -07003359EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003360
Zhang Yanfeie0fb5812013-02-22 16:35:54 -08003361/**
3362 * nr_free_pagecache_pages - count number of pages beyond high watermark
3363 *
3364 * nr_free_pagecache_pages() counts the number of pages which are beyond the
3365 * high watermark within all zones.
Linus Torvalds1da177e2005-04-16 15:20:36 -07003366 */
Zhang Yanfeiebec3862013-02-22 16:35:43 -08003367unsigned long nr_free_pagecache_pages(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003368{
Mel Gorman2a1e2742007-07-17 04:03:12 -07003369 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
Linus Torvalds1da177e2005-04-16 15:20:36 -07003370}
Christoph Lameter08e0f6a2006-09-27 01:50:06 -07003371
3372static inline void show_node(struct zone *zone)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003373{
Kirill A. Shutemove5adfff2012-12-11 16:00:29 -08003374 if (IS_ENABLED(CONFIG_NUMA))
Andy Whitcroft25ba77c2006-12-06 20:33:03 -08003375 printk("Node %d ", zone_to_nid(zone));
Linus Torvalds1da177e2005-04-16 15:20:36 -07003376}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003377
Linus Torvalds1da177e2005-04-16 15:20:36 -07003378void si_meminfo(struct sysinfo *val)
3379{
3380 val->totalram = totalram_pages;
Rafael Aquinicc7452b2014-08-06 16:06:38 -07003381 val->sharedram = global_page_state(NR_SHMEM);
Christoph Lameterd23ad422007-02-10 01:43:02 -08003382 val->freeram = global_page_state(NR_FREE_PAGES);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003383 val->bufferram = nr_blockdev_pages();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003384 val->totalhigh = totalhigh_pages;
3385 val->freehigh = nr_free_highpages();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003386 val->mem_unit = PAGE_SIZE;
3387}
3388
3389EXPORT_SYMBOL(si_meminfo);
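
/*
 * Illustrative sketch, not part of the original source: a reader of the
 * system-wide counters filled in above.  The conversion assumes mem_unit
 * is PAGE_SIZE, as set by si_meminfo().
 */
static inline void example_print_meminfo(void)
{
	struct sysinfo si;

	si_meminfo(&si);
	pr_info("total %lu KiB, free %lu KiB\n",
		(unsigned long)si.totalram * (si.mem_unit >> 10),
		(unsigned long)si.freeram * (si.mem_unit >> 10));
}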
3390
3391#ifdef CONFIG_NUMA
3392void si_meminfo_node(struct sysinfo *val, int nid)
3393{
Jiang Liucdd91a72013-07-03 15:03:27 -07003394 int zone_type; /* needs to be signed */
3395 unsigned long managed_pages = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003396 pg_data_t *pgdat = NODE_DATA(nid);
3397
Jiang Liucdd91a72013-07-03 15:03:27 -07003398 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
3399 managed_pages += pgdat->node_zones[zone_type].managed_pages;
3400 val->totalram = managed_pages;
Rafael Aquinicc7452b2014-08-06 16:06:38 -07003401 val->sharedram = node_page_state(nid, NR_SHMEM);
Christoph Lameterd23ad422007-02-10 01:43:02 -08003402 val->freeram = node_page_state(nid, NR_FREE_PAGES);
Christoph Lameter98d2b0e2006-09-25 23:31:12 -07003403#ifdef CONFIG_HIGHMEM
Jiang Liub40da042013-02-22 16:33:52 -08003404 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
Christoph Lameterd23ad422007-02-10 01:43:02 -08003405 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
3406 NR_FREE_PAGES);
Christoph Lameter98d2b0e2006-09-25 23:31:12 -07003407#else
3408 val->totalhigh = 0;
3409 val->freehigh = 0;
3410#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07003411 val->mem_unit = PAGE_SIZE;
3412}
3413#endif
3414
David Rientjesddd588b2011-03-22 16:30:46 -07003415/*
David Rientjes7bf02ea2011-05-24 17:11:16 -07003416 * Determine whether the node should be displayed or not, depending on whether
3417 * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
David Rientjesddd588b2011-03-22 16:30:46 -07003418 */
David Rientjes7bf02ea2011-05-24 17:11:16 -07003419bool skip_free_areas_node(unsigned int flags, int nid)
David Rientjesddd588b2011-03-22 16:30:46 -07003420{
3421 bool ret = false;
Mel Gormancc9a6c82012-03-21 16:34:11 -07003422 unsigned int cpuset_mems_cookie;
David Rientjesddd588b2011-03-22 16:30:46 -07003423
3424 if (!(flags & SHOW_MEM_FILTER_NODES))
3425 goto out;
3426
Mel Gormancc9a6c82012-03-21 16:34:11 -07003427 do {
Mel Gormand26914d2014-04-03 14:47:24 -07003428 cpuset_mems_cookie = read_mems_allowed_begin();
Mel Gormancc9a6c82012-03-21 16:34:11 -07003429 ret = !node_isset(nid, cpuset_current_mems_allowed);
Mel Gormand26914d2014-04-03 14:47:24 -07003430 } while (read_mems_allowed_retry(cpuset_mems_cookie));
David Rientjesddd588b2011-03-22 16:30:46 -07003431out:
3432 return ret;
3433}
3434
Linus Torvalds1da177e2005-04-16 15:20:36 -07003435#define K(x) ((x) << (PAGE_SHIFT-10))
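/*
 * Illustrative note, not part of the original source: K() converts a page
 * count to KiB.  With PAGE_SHIFT == 12 (4 KiB pages) it is x << 2, so for
 * example K(256) == 1024, i.e. 256 pages are reported as 1024 kB.
 */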
3436
Rabin Vincent377e4f12012-12-11 16:00:24 -08003437static void show_migration_types(unsigned char type)
3438{
3439 static const char types[MIGRATE_TYPES] = {
3440 [MIGRATE_UNMOVABLE] = 'U',
3441 [MIGRATE_RECLAIMABLE] = 'E',
3442 [MIGRATE_MOVABLE] = 'M',
3443 [MIGRATE_RESERVE] = 'R',
3444#ifdef CONFIG_CMA
3445 [MIGRATE_CMA] = 'C',
3446#endif
Minchan Kim194159f2013-02-22 16:33:58 -08003447#ifdef CONFIG_MEMORY_ISOLATION
Rabin Vincent377e4f12012-12-11 16:00:24 -08003448 [MIGRATE_ISOLATE] = 'I',
Minchan Kim194159f2013-02-22 16:33:58 -08003449#endif
Rabin Vincent377e4f12012-12-11 16:00:24 -08003450 };
3451 char tmp[MIGRATE_TYPES + 1];
3452 char *p = tmp;
3453 int i;
3454
3455 for (i = 0; i < MIGRATE_TYPES; i++) {
3456 if (type & (1 << i))
3457 *p++ = types[i];
3458 }
3459
3460 *p = '\0';
3461 printk("(%s) ", tmp);
3462}
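
/*
 * Illustrative worked example, not part of the original source: a type mask
 * with the MIGRATE_UNMOVABLE and MIGRATE_MOVABLE bits set would be printed
 * by show_migration_types() as "(UM) ".
 */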
3463
Linus Torvalds1da177e2005-04-16 15:20:36 -07003464/*
3465 * Show free area list (used inside shift_scroll-lock stuff)
3466 * We also calculate the percentage fragmentation. We do this by counting the
3467 * memory on each free list with the exception of the first item on the list.
Konstantin Khlebnikovd1bfcdb2015-04-14 15:45:30 -07003468 *
3469 * Bits in @filter:
3470 * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
3471 * cpuset.
Linus Torvalds1da177e2005-04-16 15:20:36 -07003472 */
David Rientjes7bf02ea2011-05-24 17:11:16 -07003473void show_free_areas(unsigned int filter)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003474{
Konstantin Khlebnikovd1bfcdb2015-04-14 15:45:30 -07003475 unsigned long free_pcp = 0;
Jes Sorensenc7241912006-09-27 01:50:05 -07003476 int cpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003477 struct zone *zone;
3478
KOSAKI Motohiroee99c712009-03-31 15:19:31 -07003479 for_each_populated_zone(zone) {
David Rientjes7bf02ea2011-05-24 17:11:16 -07003480 if (skip_free_areas_node(filter, zone_to_nid(zone)))
David Rientjesddd588b2011-03-22 16:30:46 -07003481 continue;
Konstantin Khlebnikovd1bfcdb2015-04-14 15:45:30 -07003482
Konstantin Khlebnikov761b0672015-04-14 15:45:32 -07003483 for_each_online_cpu(cpu)
3484 free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003485 }
3486
KOSAKI Motohiroa7312862009-09-21 17:01:37 -07003487 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
3488 " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
Konstantin Khlebnikovd1bfcdb2015-04-14 15:45:30 -07003489 " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n"
3490 " slab_reclaimable:%lu slab_unreclaimable:%lu\n"
Bartlomiej Zolnierkiewiczd1ce7492012-10-08 16:32:02 -07003491 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
Konstantin Khlebnikovd1bfcdb2015-04-14 15:45:30 -07003492 " free:%lu free_pcp:%lu free_cma:%lu\n",
Rik van Riel4f98a2f2008-10-18 20:26:32 -07003493 global_page_state(NR_ACTIVE_ANON),
Rik van Riel4f98a2f2008-10-18 20:26:32 -07003494 global_page_state(NR_INACTIVE_ANON),
KOSAKI Motohiroa7312862009-09-21 17:01:37 -07003495 global_page_state(NR_ISOLATED_ANON),
3496 global_page_state(NR_ACTIVE_FILE),
Rik van Riel4f98a2f2008-10-18 20:26:32 -07003497 global_page_state(NR_INACTIVE_FILE),
KOSAKI Motohiroa7312862009-09-21 17:01:37 -07003498 global_page_state(NR_ISOLATED_FILE),
Lee Schermerhorn7b854122008-10-18 20:26:40 -07003499 global_page_state(NR_UNEVICTABLE),
Christoph Lameterb1e7a8f2006-06-30 01:55:39 -07003500 global_page_state(NR_FILE_DIRTY),
Christoph Lameterce866b32006-06-30 01:55:40 -07003501 global_page_state(NR_WRITEBACK),
Christoph Lameterfd39fc82006-06-30 01:55:40 -07003502 global_page_state(NR_UNSTABLE_NFS),
KOSAKI Motohiro3701b032009-09-21 17:01:29 -07003503 global_page_state(NR_SLAB_RECLAIMABLE),
3504 global_page_state(NR_SLAB_UNRECLAIMABLE),
Christoph Lameter65ba55f2006-06-30 01:55:34 -07003505 global_page_state(NR_FILE_MAPPED),
KOSAKI Motohiro4b021082009-09-21 17:01:33 -07003506 global_page_state(NR_SHMEM),
Andrew Mortona25700a2007-02-08 14:20:40 -08003507 global_page_state(NR_PAGETABLE),
Bartlomiej Zolnierkiewiczd1ce7492012-10-08 16:32:02 -07003508 global_page_state(NR_BOUNCE),
Konstantin Khlebnikovd1bfcdb2015-04-14 15:45:30 -07003509 global_page_state(NR_FREE_PAGES),
3510 free_pcp,
Bartlomiej Zolnierkiewiczd1ce7492012-10-08 16:32:02 -07003511 global_page_state(NR_FREE_CMA_PAGES));
Linus Torvalds1da177e2005-04-16 15:20:36 -07003512
KOSAKI Motohiroee99c712009-03-31 15:19:31 -07003513 for_each_populated_zone(zone) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003514 int i;
3515
David Rientjes7bf02ea2011-05-24 17:11:16 -07003516 if (skip_free_areas_node(filter, zone_to_nid(zone)))
David Rientjesddd588b2011-03-22 16:30:46 -07003517 continue;
Konstantin Khlebnikovd1bfcdb2015-04-14 15:45:30 -07003518
3519 free_pcp = 0;
3520 for_each_online_cpu(cpu)
3521 free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
3522
Linus Torvalds1da177e2005-04-16 15:20:36 -07003523 show_node(zone);
3524 printk("%s"
3525 " free:%lukB"
3526 " min:%lukB"
3527 " low:%lukB"
3528 " high:%lukB"
Rik van Riel4f98a2f2008-10-18 20:26:32 -07003529 " active_anon:%lukB"
3530 " inactive_anon:%lukB"
3531 " active_file:%lukB"
3532 " inactive_file:%lukB"
Lee Schermerhorn7b854122008-10-18 20:26:40 -07003533 " unevictable:%lukB"
KOSAKI Motohiroa7312862009-09-21 17:01:37 -07003534 " isolated(anon):%lukB"
3535 " isolated(file):%lukB"
Linus Torvalds1da177e2005-04-16 15:20:36 -07003536 " present:%lukB"
Jiang Liu9feedc92012-12-12 13:52:12 -08003537 " managed:%lukB"
KOSAKI Motohiro4a0aa732009-09-21 17:01:30 -07003538 " mlocked:%lukB"
3539 " dirty:%lukB"
3540 " writeback:%lukB"
3541 " mapped:%lukB"
KOSAKI Motohiro4b021082009-09-21 17:01:33 -07003542 " shmem:%lukB"
KOSAKI Motohiro4a0aa732009-09-21 17:01:30 -07003543 " slab_reclaimable:%lukB"
3544 " slab_unreclaimable:%lukB"
KOSAKI Motohiroc6a7f572009-09-21 17:01:32 -07003545 " kernel_stack:%lukB"
KOSAKI Motohiro4a0aa732009-09-21 17:01:30 -07003546 " pagetables:%lukB"
3547 " unstable:%lukB"
3548 " bounce:%lukB"
Konstantin Khlebnikovd1bfcdb2015-04-14 15:45:30 -07003549 " free_pcp:%lukB"
3550 " local_pcp:%ukB"
Bartlomiej Zolnierkiewiczd1ce7492012-10-08 16:32:02 -07003551 " free_cma:%lukB"
KOSAKI Motohiro4a0aa732009-09-21 17:01:30 -07003552 " writeback_tmp:%lukB"
Linus Torvalds1da177e2005-04-16 15:20:36 -07003553 " pages_scanned:%lu"
3554 " all_unreclaimable? %s"
3555 "\n",
3556 zone->name,
Mel Gorman88f5acf2011-01-13 15:45:41 -08003557 K(zone_page_state(zone, NR_FREE_PAGES)),
Mel Gorman41858962009-06-16 15:32:12 -07003558 K(min_wmark_pages(zone)),
3559 K(low_wmark_pages(zone)),
3560 K(high_wmark_pages(zone)),
Rik van Riel4f98a2f2008-10-18 20:26:32 -07003561 K(zone_page_state(zone, NR_ACTIVE_ANON)),
3562 K(zone_page_state(zone, NR_INACTIVE_ANON)),
3563 K(zone_page_state(zone, NR_ACTIVE_FILE)),
3564 K(zone_page_state(zone, NR_INACTIVE_FILE)),
Lee Schermerhorn7b854122008-10-18 20:26:40 -07003565 K(zone_page_state(zone, NR_UNEVICTABLE)),
KOSAKI Motohiroa7312862009-09-21 17:01:37 -07003566 K(zone_page_state(zone, NR_ISOLATED_ANON)),
3567 K(zone_page_state(zone, NR_ISOLATED_FILE)),
Linus Torvalds1da177e2005-04-16 15:20:36 -07003568 K(zone->present_pages),
Jiang Liu9feedc92012-12-12 13:52:12 -08003569 K(zone->managed_pages),
KOSAKI Motohiro4a0aa732009-09-21 17:01:30 -07003570 K(zone_page_state(zone, NR_MLOCK)),
3571 K(zone_page_state(zone, NR_FILE_DIRTY)),
3572 K(zone_page_state(zone, NR_WRITEBACK)),
3573 K(zone_page_state(zone, NR_FILE_MAPPED)),
KOSAKI Motohiro4b021082009-09-21 17:01:33 -07003574 K(zone_page_state(zone, NR_SHMEM)),
KOSAKI Motohiro4a0aa732009-09-21 17:01:30 -07003575 K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
3576 K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
KOSAKI Motohiroc6a7f572009-09-21 17:01:32 -07003577 zone_page_state(zone, NR_KERNEL_STACK) *
3578 THREAD_SIZE / 1024,
KOSAKI Motohiro4a0aa732009-09-21 17:01:30 -07003579 K(zone_page_state(zone, NR_PAGETABLE)),
3580 K(zone_page_state(zone, NR_UNSTABLE_NFS)),
3581 K(zone_page_state(zone, NR_BOUNCE)),
Konstantin Khlebnikovd1bfcdb2015-04-14 15:45:30 -07003582 K(free_pcp),
3583 K(this_cpu_read(zone->pageset->pcp.count)),
Bartlomiej Zolnierkiewiczd1ce7492012-10-08 16:32:02 -07003584 K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
KOSAKI Motohiro4a0aa732009-09-21 17:01:30 -07003585 K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
Mel Gorman0d5d8232014-08-06 16:07:16 -07003586 K(zone_page_state(zone, NR_PAGES_SCANNED)),
Lisa Du6e543d52013-09-11 14:22:36 -07003587 (!zone_reclaimable(zone) ? "yes" : "no")
Linus Torvalds1da177e2005-04-16 15:20:36 -07003588 );
3589 printk("lowmem_reserve[]:");
3590 for (i = 0; i < MAX_NR_ZONES; i++)
Mel Gorman3484b2d2014-08-06 16:07:14 -07003591 printk(" %ld", zone->lowmem_reserve[i]);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003592 printk("\n");
3593 }
3594
KOSAKI Motohiroee99c712009-03-31 15:19:31 -07003595 for_each_populated_zone(zone) {
Pintu Kumarb8af2942013-09-11 14:20:34 -07003596 unsigned long nr[MAX_ORDER], flags, order, total = 0;
Rabin Vincent377e4f12012-12-11 16:00:24 -08003597 unsigned char types[MAX_ORDER];
Linus Torvalds1da177e2005-04-16 15:20:36 -07003598
David Rientjes7bf02ea2011-05-24 17:11:16 -07003599 if (skip_free_areas_node(filter, zone_to_nid(zone)))
David Rientjesddd588b2011-03-22 16:30:46 -07003600 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003601 show_node(zone);
3602 printk("%s: ", zone->name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003603
3604 spin_lock_irqsave(&zone->lock, flags);
3605 for (order = 0; order < MAX_ORDER; order++) {
Rabin Vincent377e4f12012-12-11 16:00:24 -08003606 struct free_area *area = &zone->free_area[order];
3607 int type;
3608
3609 nr[order] = area->nr_free;
Kirill Korotaev8f9de512006-06-23 02:03:50 -07003610 total += nr[order] << order;
Rabin Vincent377e4f12012-12-11 16:00:24 -08003611
3612 types[order] = 0;
3613 for (type = 0; type < MIGRATE_TYPES; type++) {
3614 if (!list_empty(&area->free_list[type]))
3615 types[order] |= 1 << type;
3616 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003617 }
3618 spin_unlock_irqrestore(&zone->lock, flags);
Rabin Vincent377e4f12012-12-11 16:00:24 -08003619 for (order = 0; order < MAX_ORDER; order++) {
Kirill Korotaev8f9de512006-06-23 02:03:50 -07003620 printk("%lu*%lukB ", nr[order], K(1UL) << order);
Rabin Vincent377e4f12012-12-11 16:00:24 -08003621 if (nr[order])
3622 show_migration_types(types[order]);
3623 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003624 printk("= %lukB\n", K(total));
3625 }
3626
David Rientjes949f7ec2013-04-29 15:07:48 -07003627 hugetlb_show_meminfo();
3628
Larry Woodmane6f36022008-02-04 22:29:30 -08003629 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES));
3630
Linus Torvalds1da177e2005-04-16 15:20:36 -07003631 show_swap_cache_info();
3632}
3633
Mel Gorman19770b32008-04-28 02:12:18 -07003634static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
3635{
3636 zoneref->zone = zone;
3637 zoneref->zone_idx = zone_idx(zone);
3638}
3639
Linus Torvalds1da177e2005-04-16 15:20:36 -07003640/*
3641 * Builds allocation fallback zone lists.
Christoph Lameter1a932052006-01-06 00:11:16 -08003642 *
3643 * Add all populated zones of a node to the zonelist.
Linus Torvalds1da177e2005-04-16 15:20:36 -07003644 */
KAMEZAWA Hiroyukif0c0b2b2007-07-15 23:38:01 -07003645static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
Zhang Yanfeibc732f12013-07-08 16:00:06 -07003646 int nr_zones)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003647{
Christoph Lameter1a932052006-01-06 00:11:16 -08003648 struct zone *zone;
Zhang Yanfeibc732f12013-07-08 16:00:06 -07003649 enum zone_type zone_type = MAX_NR_ZONES;
Christoph Lameter02a68a52006-01-06 00:11:18 -08003650
3651 do {
Christoph Lameter2f6726e2006-09-25 23:31:18 -07003652 zone_type--;
Christoph Lameter070f8032006-01-06 00:11:19 -08003653 zone = pgdat->node_zones + zone_type;
Christoph Lameter1a932052006-01-06 00:11:16 -08003654 if (populated_zone(zone)) {
Mel Gormandd1a2392008-04-28 02:12:17 -07003655 zoneref_set_zone(zone,
3656 &zonelist->_zonerefs[nr_zones++]);
Christoph Lameter070f8032006-01-06 00:11:19 -08003657 check_highest_zone(zone_type);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003658 }
Christoph Lameter2f6726e2006-09-25 23:31:18 -07003659 } while (zone_type);
Zhang Yanfeibc732f12013-07-08 16:00:06 -07003660
Christoph Lameter070f8032006-01-06 00:11:19 -08003661 return nr_zones;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003662}
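/*
 * Worked example (illustrative; the zone layout is configuration dependent):
 * on a node whose populated zones are ZONE_DMA, ZONE_DMA32 and ZONE_NORMAL,
 * the loop above walks zone_type from the highest index down to 0, so the
 * zonerefs appended for this node are ordered Normal, DMA32, DMA. Allocations
 * therefore spill into the more constrained zones only once the higher zones
 * are exhausted.
 */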
3663
KAMEZAWA Hiroyukif0c0b2b2007-07-15 23:38:01 -07003664
3665/*
3666 * zonelist_order:
3667 * 0 = automatic detection of better ordering.
3668 * 1 = order by ([node] distance, -zonetype)
3669 * 2 = order by (-zonetype, [node] distance)
3670 *
3671 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
3672 * the same zonelist. So only NUMA can configure this param.
3673 */
3674#define ZONELIST_ORDER_DEFAULT 0
3675#define ZONELIST_ORDER_NODE 1
3676#define ZONELIST_ORDER_ZONE 2
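/*
 * Illustrative comparison (assumes a 2-node machine, node 0 with DMA and
 * Normal populated, node 1 with Normal only):
 *
 *   ZONELIST_ORDER_NODE: Normal(0), DMA(0), Normal(1)
 *   ZONELIST_ORDER_ZONE: Normal(0), Normal(1), DMA(0)
 *
 * Node ordering keeps allocations local at the risk of exhausting the DMA
 * zone; zone ordering preserves DMA until Normal memory on all nodes is used.
 */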
3677
3678/* zonelist order in the kernel.
3679 * set_zonelist_order() will set this to NODE or ZONE.
3680 */
3681static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
3682static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
3683
3684
Linus Torvalds1da177e2005-04-16 15:20:36 -07003685#ifdef CONFIG_NUMA
KAMEZAWA Hiroyukif0c0b2b2007-07-15 23:38:01 -07003686/* The zonelist order the user specified; may be changed via boot param or sysctl */
3687static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
3688/* string for sysctl */
3689#define NUMA_ZONELIST_ORDER_LEN 16
3690char numa_zonelist_order[16] = "default";
3691
3692/*
3693 * interface to configure zonelist ordering.
3694 * command line option "numa_zonelist_order"
3695 * = "[dD]efault - default, automatic configuration.
3696 * = "[nN]ode - order by node locality, then by zone within node
3697 * = "[zZ]one - order by zone, then by locality within zone
3698 */
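/*
 * Usage sketch (illustrative): the order can be selected at boot with the
 * "numa_zonelist_order=" parameter, e.g. "numa_zonelist_order=zone", or at
 * runtime through the sysctl handled later in this file, typically exposed
 * as /proc/sys/vm/numa_zonelist_order, e.g.
 *   echo node > /proc/sys/vm/numa_zonelist_order
 * Only the first character of the value is significant, as parsed by
 * __parse_numa_zonelist_order() below.
 */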
3699
3700static int __parse_numa_zonelist_order(char *s)
3701{
3702 if (*s == 'd' || *s == 'D') {
3703 user_zonelist_order = ZONELIST_ORDER_DEFAULT;
3704 } else if (*s == 'n' || *s == 'N') {
3705 user_zonelist_order = ZONELIST_ORDER_NODE;
3706 } else if (*s == 'z' || *s == 'Z') {
3707 user_zonelist_order = ZONELIST_ORDER_ZONE;
3708 } else {
3709 printk(KERN_WARNING
3710 "Ignoring invalid numa_zonelist_order value: "
3711 "%s\n", s);
3712 return -EINVAL;
3713 }
3714 return 0;
3715}
3716
3717static __init int setup_numa_zonelist_order(char *s)
3718{
Volodymyr G. Lukiianykecb256f2011-01-13 15:46:26 -08003719 int ret;
3720
3721 if (!s)
3722 return 0;
3723
3724 ret = __parse_numa_zonelist_order(s);
3725 if (ret == 0)
3726 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
3727
3728 return ret;
KAMEZAWA Hiroyukif0c0b2b2007-07-15 23:38:01 -07003729}
3730early_param("numa_zonelist_order", setup_numa_zonelist_order);
3731
3732/*
3733 * sysctl handler for numa_zonelist_order
3734 */
Joe Perchescccad5b2014-06-06 14:38:09 -07003735int numa_zonelist_order_handler(struct ctl_table *table, int write,
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07003736 void __user *buffer, size_t *length,
KAMEZAWA Hiroyukif0c0b2b2007-07-15 23:38:01 -07003737 loff_t *ppos)
3738{
3739 char saved_string[NUMA_ZONELIST_ORDER_LEN];
3740 int ret;
Andi Kleen443c6f12009-12-23 21:00:47 +01003741 static DEFINE_MUTEX(zl_order_mutex);
KAMEZAWA Hiroyukif0c0b2b2007-07-15 23:38:01 -07003742
Andi Kleen443c6f12009-12-23 21:00:47 +01003743 mutex_lock(&zl_order_mutex);
Chen Gangdacbde02013-07-03 15:02:35 -07003744 if (write) {
3745 if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) {
3746 ret = -EINVAL;
3747 goto out;
3748 }
3749 strcpy(saved_string, (char *)table->data);
3750 }
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07003751 ret = proc_dostring(table, write, buffer, length, ppos);
KAMEZAWA Hiroyukif0c0b2b2007-07-15 23:38:01 -07003752 if (ret)
Andi Kleen443c6f12009-12-23 21:00:47 +01003753 goto out;
KAMEZAWA Hiroyukif0c0b2b2007-07-15 23:38:01 -07003754 if (write) {
3755 int oldval = user_zonelist_order;
Chen Gangdacbde02013-07-03 15:02:35 -07003756
3757 ret = __parse_numa_zonelist_order((char *)table->data);
3758 if (ret) {
KAMEZAWA Hiroyukif0c0b2b2007-07-15 23:38:01 -07003759 /*
3760 * bogus value. restore saved string
3761 */
Chen Gangdacbde02013-07-03 15:02:35 -07003762 strncpy((char *)table->data, saved_string,
KAMEZAWA Hiroyukif0c0b2b2007-07-15 23:38:01 -07003763 NUMA_ZONELIST_ORDER_LEN);
3764 user_zonelist_order = oldval;
Haicheng Li4eaf3f62010-05-24 14:32:52 -07003765 } else if (oldval != user_zonelist_order) {
3766 mutex_lock(&zonelists_mutex);
Jiang Liu9adb62a2012-07-31 16:43:28 -07003767 build_all_zonelists(NULL, NULL);
Haicheng Li4eaf3f62010-05-24 14:32:52 -07003768 mutex_unlock(&zonelists_mutex);
3769 }
KAMEZAWA Hiroyukif0c0b2b2007-07-15 23:38:01 -07003770 }
Andi Kleen443c6f12009-12-23 21:00:47 +01003771out:
3772 mutex_unlock(&zl_order_mutex);
3773 return ret;
KAMEZAWA Hiroyukif0c0b2b2007-07-15 23:38:01 -07003774}
3775
3776
Christoph Lameter62bc62a2009-06-16 15:32:15 -07003777#define MAX_NODE_LOAD (nr_online_nodes)
KAMEZAWA Hiroyukif0c0b2b2007-07-15 23:38:01 -07003778static int node_load[MAX_NUMNODES];
3779
Linus Torvalds1da177e2005-04-16 15:20:36 -07003780/**
Pavel Pisa4dc3b162005-05-01 08:59:25 -07003781 * find_next_best_node - find the next node that should appear in a given node's fallback list
Linus Torvalds1da177e2005-04-16 15:20:36 -07003782 * @node: node whose fallback list we're appending
3783 * @used_node_mask: nodemask_t of already used nodes
3784 *
3785 * We use a number of factors to determine which is the next node that should
3786 * appear on a given node's fallback list. The node should not have appeared
3787 * already in @node's fallback list, and it should be the next closest node
3788 * according to the distance array (which contains arbitrary distance values
3789 * from each node to each node in the system); we also prefer nodes
3790 * with no CPUs, since presumably they'll have very little allocation pressure
3791 * on them otherwise.
3792 * It returns -1 if no node is found.
3793 */
KAMEZAWA Hiroyukif0c0b2b2007-07-15 23:38:01 -07003794static int find_next_best_node(int node, nodemask_t *used_node_mask)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003795{
Linus Torvalds4cf808eb2006-02-17 20:38:21 +01003796 int n, val;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003797 int min_val = INT_MAX;
David Rientjes00ef2d22013-02-22 16:35:36 -08003798 int best_node = NUMA_NO_NODE;
Rusty Russella70f7302009-03-13 14:49:46 +10303799 const struct cpumask *tmp = cpumask_of_node(0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003800
Linus Torvalds4cf808eb2006-02-17 20:38:21 +01003801 /* Use the local node if we haven't already */
3802 if (!node_isset(node, *used_node_mask)) {
3803 node_set(node, *used_node_mask);
3804 return node;
3805 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003806
Lai Jiangshan4b0ef1fe2012-12-12 13:51:46 -08003807 for_each_node_state(n, N_MEMORY) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003808
3809 /* Don't want a node to appear more than once */
3810 if (node_isset(n, *used_node_mask))
3811 continue;
3812
Linus Torvalds1da177e2005-04-16 15:20:36 -07003813 /* Use the distance array to find the distance */
3814 val = node_distance(node, n);
3815
Linus Torvalds4cf808eb2006-02-17 20:38:21 +01003816 /* Penalize nodes under us ("prefer the next node") */
3817 val += (n < node);
3818
Linus Torvalds1da177e2005-04-16 15:20:36 -07003819 /* Give preference to headless and unused nodes */
Rusty Russella70f7302009-03-13 14:49:46 +10303820 tmp = cpumask_of_node(n);
3821 if (!cpumask_empty(tmp))
Linus Torvalds1da177e2005-04-16 15:20:36 -07003822 val += PENALTY_FOR_NODE_WITH_CPUS;
3823
3824 /* Slight preference for less loaded node */
3825 val *= (MAX_NODE_LOAD*MAX_NUMNODES);
3826 val += node_load[n];
3827
3828 if (val < min_val) {
3829 min_val = val;
3830 best_node = n;
3831 }
3832 }
3833
3834 if (best_node >= 0)
3835 node_set(best_node, *used_node_mask);
3836
3837 return best_node;
3838}
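/*
 * Worked example (illustrative numbers): suppose node 0 is choosing between
 * node 1 (distance 20, has CPUs) and node 2 (distance 20, memory-only), with
 * no accumulated node_load and assuming PENALTY_FOR_NODE_WITH_CPUS == 1.
 * Neither candidate is numbered below node 0, so no "node under us" penalty
 * applies; node 1 scores (20 + 1) * MAX_NODE_LOAD * MAX_NUMNODES while
 * node 2 scores 20 * MAX_NODE_LOAD * MAX_NUMNODES, so the memory-only node 2
 * wins, matching the "prefer headless nodes" rule documented above.
 */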
3839
KAMEZAWA Hiroyukif0c0b2b2007-07-15 23:38:01 -07003840
3841/*
3842 * Build zonelists ordered by node and zones within node.
3843 * This results in maximum locality--normal zone overflows into local
3844 * DMA zone, if any--but risks exhausting DMA zone.
3845 */
3846static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003847{
KAMEZAWA Hiroyukif0c0b2b2007-07-15 23:38:01 -07003848 int j;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003849 struct zonelist *zonelist;
KAMEZAWA Hiroyukif0c0b2b2007-07-15 23:38:01 -07003850
Mel Gorman54a6eb52008-04-28 02:12:16 -07003851 zonelist = &pgdat->node_zonelists[0];
Mel Gormandd1a2392008-04-28 02:12:17 -07003852 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
Mel Gorman54a6eb52008-04-28 02:12:16 -07003853 ;
Zhang Yanfeibc732f12013-07-08 16:00:06 -07003854 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
Mel Gormandd1a2392008-04-28 02:12:17 -07003855 zonelist->_zonerefs[j].zone = NULL;
3856 zonelist->_zonerefs[j].zone_idx = 0;
KAMEZAWA Hiroyukif0c0b2b2007-07-15 23:38:01 -07003857}
3858
3859/*
Christoph Lameter523b9452007-10-16 01:25:37 -07003860 * Build gfp_thisnode zonelists
3861 */
3862static void build_thisnode_zonelists(pg_data_t *pgdat)
3863{
Christoph Lameter523b9452007-10-16 01:25:37 -07003864 int j;
3865 struct zonelist *zonelist;
3866
Mel Gorman54a6eb52008-04-28 02:12:16 -07003867 zonelist = &pgdat->node_zonelists[1];
Zhang Yanfeibc732f12013-07-08 16:00:06 -07003868 j = build_zonelists_node(pgdat, zonelist, 0);
Mel Gormandd1a2392008-04-28 02:12:17 -07003869 zonelist->_zonerefs[j].zone = NULL;
3870 zonelist->_zonerefs[j].zone_idx = 0;
Christoph Lameter523b9452007-10-16 01:25:37 -07003871}
3872
3873/*
KAMEZAWA Hiroyukif0c0b2b2007-07-15 23:38:01 -07003874 * Build zonelists ordered by zone and nodes within zones.
3875 * This results in conserving DMA zone[s] until all Normal memory is
3876 * exhausted, but results in overflowing to remote node while memory
3877 * may still exist in local DMA zone.
3878 */
3879static int node_order[MAX_NUMNODES];
3880
3881static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
3882{
KAMEZAWA Hiroyukif0c0b2b2007-07-15 23:38:01 -07003883 int pos, j, node;
3884 int zone_type; /* needs to be signed */
3885 struct zone *z;
3886 struct zonelist *zonelist;
3887
Mel Gorman54a6eb52008-04-28 02:12:16 -07003888 zonelist = &pgdat->node_zonelists[0];
3889 pos = 0;
3890 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
3891 for (j = 0; j < nr_nodes; j++) {
3892 node = node_order[j];
3893 z = &NODE_DATA(node)->node_zones[zone_type];
3894 if (populated_zone(z)) {
Mel Gormandd1a2392008-04-28 02:12:17 -07003895 zoneref_set_zone(z,
3896 &zonelist->_zonerefs[pos++]);
Mel Gorman54a6eb52008-04-28 02:12:16 -07003897 check_highest_zone(zone_type);
KAMEZAWA Hiroyukif0c0b2b2007-07-15 23:38:01 -07003898 }
3899 }
KAMEZAWA Hiroyukif0c0b2b2007-07-15 23:38:01 -07003900 }
Mel Gormandd1a2392008-04-28 02:12:17 -07003901 zonelist->_zonerefs[pos].zone = NULL;
3902 zonelist->_zonerefs[pos].zone_idx = 0;
KAMEZAWA Hiroyukif0c0b2b2007-07-15 23:38:01 -07003903}
3904
Mel Gorman31939132014-10-09 15:28:30 -07003905#if defined(CONFIG_64BIT)
3906/*
3907 * Devices that require DMA32/DMA are relatively rare and do not justify a
3908 * penalty to every machine in case the specialised case applies. Default
3909 * to Node-ordering on 64-bit NUMA machines
3910 */
KAMEZAWA Hiroyukif0c0b2b2007-07-15 23:38:01 -07003911static int default_zonelist_order(void)
3912{
Mel Gorman31939132014-10-09 15:28:30 -07003913 return ZONELIST_ORDER_NODE;
3914}
3915#else
3916/*
3917 * On 32-bit, the Normal zone needs to be preserved for allocations accessible
3918 * by the kernel. If processes running on node 0 deplete the low memory zone
3919 * then reclaim will occur more frequently, increasing stalls and potentially
3920 * making OOM more likely if a large percentage of the zone is under writeback or
3921 * dirty. The problem is significantly worse if CONFIG_HIGHPTE is not set.
3922 * Hence, default to zone ordering on 32-bit.
3923 */
3924static int default_zonelist_order(void)
3925{
KAMEZAWA Hiroyukif0c0b2b2007-07-15 23:38:01 -07003926 return ZONELIST_ORDER_ZONE;
3927}
Mel Gorman31939132014-10-09 15:28:30 -07003928#endif /* CONFIG_64BIT */
KAMEZAWA Hiroyukif0c0b2b2007-07-15 23:38:01 -07003929
3930static void set_zonelist_order(void)
3931{
3932 if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
3933 current_zonelist_order = default_zonelist_order();
3934 else
3935 current_zonelist_order = user_zonelist_order;
3936}
3937
3938static void build_zonelists(pg_data_t *pgdat)
3939{
3940 int j, node, load;
3941 enum zone_type i;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003942 nodemask_t used_mask;
KAMEZAWA Hiroyukif0c0b2b2007-07-15 23:38:01 -07003943 int local_node, prev_node;
3944 struct zonelist *zonelist;
3945 int order = current_zonelist_order;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003946
3947 /* initialize zonelists */
Christoph Lameter523b9452007-10-16 01:25:37 -07003948 for (i = 0; i < MAX_ZONELISTS; i++) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003949 zonelist = pgdat->node_zonelists + i;
Mel Gormandd1a2392008-04-28 02:12:17 -07003950 zonelist->_zonerefs[0].zone = NULL;
3951 zonelist->_zonerefs[0].zone_idx = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003952 }
3953
3954 /* NUMA-aware ordering of nodes */
3955 local_node = pgdat->node_id;
Christoph Lameter62bc62a2009-06-16 15:32:15 -07003956 load = nr_online_nodes;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003957 prev_node = local_node;
3958 nodes_clear(used_mask);
KAMEZAWA Hiroyukif0c0b2b2007-07-15 23:38:01 -07003959
KAMEZAWA Hiroyukif0c0b2b2007-07-15 23:38:01 -07003960 memset(node_order, 0, sizeof(node_order));
3961 j = 0;
3962
Linus Torvalds1da177e2005-04-16 15:20:36 -07003963 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
3964 /*
3965 * We don't want to pressure a particular node.
3966 * So adding penalty to the first node in same
3967 * distance group to make it round-robin.
3968 */
David Rientjes957f8222012-10-08 16:33:24 -07003969 if (node_distance(local_node, node) !=
3970 node_distance(local_node, prev_node))
KAMEZAWA Hiroyukif0c0b2b2007-07-15 23:38:01 -07003971 node_load[node] = load;
3972
Linus Torvalds1da177e2005-04-16 15:20:36 -07003973 prev_node = node;
3974 load--;
KAMEZAWA Hiroyukif0c0b2b2007-07-15 23:38:01 -07003975 if (order == ZONELIST_ORDER_NODE)
3976 build_zonelists_in_node_order(pgdat, node);
3977 else
3978 node_order[j++] = node; /* remember order */
3979 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003980
KAMEZAWA Hiroyukif0c0b2b2007-07-15 23:38:01 -07003981 if (order == ZONELIST_ORDER_ZONE) {
3982 /* calculate node order -- i.e., DMA last! */
3983 build_zonelists_in_zone_order(pgdat, j);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003984 }
Christoph Lameter523b9452007-10-16 01:25:37 -07003985
3986 build_thisnode_zonelists(pgdat);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003987}
3988
Paul Jackson9276b1bc2006-12-06 20:31:48 -08003989/* Construct the zonelist performance cache - see further mmzone.h */
KAMEZAWA Hiroyukif0c0b2b2007-07-15 23:38:01 -07003990static void build_zonelist_cache(pg_data_t *pgdat)
Paul Jackson9276b1bc2006-12-06 20:31:48 -08003991{
Mel Gorman54a6eb52008-04-28 02:12:16 -07003992 struct zonelist *zonelist;
3993 struct zonelist_cache *zlc;
Mel Gormandd1a2392008-04-28 02:12:17 -07003994 struct zoneref *z;
Paul Jackson9276b1bc2006-12-06 20:31:48 -08003995
Mel Gorman54a6eb52008-04-28 02:12:16 -07003996 zonelist = &pgdat->node_zonelists[0];
3997 zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
3998 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
Mel Gormandd1a2392008-04-28 02:12:17 -07003999 for (z = zonelist->_zonerefs; z->zone; z++)
4000 zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
Paul Jackson9276b1bc2006-12-06 20:31:48 -08004001}
4002
Lee Schermerhorn7aac7892010-05-26 14:45:00 -07004003#ifdef CONFIG_HAVE_MEMORYLESS_NODES
4004/*
4005 * Return node id of node used for "local" allocations.
4006 * I.e., the node id of the first zone in the argument node's generic zonelist.
4007 * Used for initializing percpu 'numa_mem', which is used primarily
4008 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
4009 */
4010int local_memory_node(int node)
4011{
4012 struct zone *zone;
4013
4014 (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
4015 gfp_zone(GFP_KERNEL),
4016 NULL,
4017 &zone);
4018 return zone->node;
4019}
4020#endif
KAMEZAWA Hiroyukif0c0b2b2007-07-15 23:38:01 -07004021
Linus Torvalds1da177e2005-04-16 15:20:36 -07004022#else /* CONFIG_NUMA */
4023
KAMEZAWA Hiroyukif0c0b2b2007-07-15 23:38:01 -07004024static void set_zonelist_order(void)
4025{
4026 current_zonelist_order = ZONELIST_ORDER_ZONE;
4027}
4028
4029static void build_zonelists(pg_data_t *pgdat)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004030{
Christoph Lameter19655d32006-09-25 23:31:19 -07004031 int node, local_node;
Mel Gorman54a6eb52008-04-28 02:12:16 -07004032 enum zone_type j;
4033 struct zonelist *zonelist;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004034
4035 local_node = pgdat->node_id;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004036
Mel Gorman54a6eb52008-04-28 02:12:16 -07004037 zonelist = &pgdat->node_zonelists[0];
Zhang Yanfeibc732f12013-07-08 16:00:06 -07004038 j = build_zonelists_node(pgdat, zonelist, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004039
Mel Gorman54a6eb52008-04-28 02:12:16 -07004040 /*
4041 * Now we build the zonelist so that it contains the zones
4042 * of all the other nodes.
4043 * We don't want to pressure a particular node, so when
4044 * building the zones for node N, we make sure that the
4045 * zones coming right after the local ones are those from
4046 * node N+1 (modulo N)
4047 */
4048 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
4049 if (!node_online(node))
4050 continue;
Zhang Yanfeibc732f12013-07-08 16:00:06 -07004051 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004052 }
Mel Gorman54a6eb52008-04-28 02:12:16 -07004053 for (node = 0; node < local_node; node++) {
4054 if (!node_online(node))
4055 continue;
Zhang Yanfeibc732f12013-07-08 16:00:06 -07004056 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
Mel Gorman54a6eb52008-04-28 02:12:16 -07004057 }
4058
Mel Gormandd1a2392008-04-28 02:12:17 -07004059 zonelist->_zonerefs[j].zone = NULL;
4060 zonelist->_zonerefs[j].zone_idx = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004061}
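/*
 * Illustrative example: with four online nodes and local_node == 2, the two
 * loops above append the remote nodes in the order 3, 0, 1, giving each node
 * a different starting point in the rotation so no single remote node absorbs
 * everyone's fallback pressure.
 */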
4062
Paul Jackson9276b1bc2006-12-06 20:31:48 -08004063/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
KAMEZAWA Hiroyukif0c0b2b2007-07-15 23:38:01 -07004064static void build_zonelist_cache(pg_data_t *pgdat)
Paul Jackson9276b1bc2006-12-06 20:31:48 -08004065{
Mel Gorman54a6eb52008-04-28 02:12:16 -07004066 pgdat->node_zonelists[0].zlcache_ptr = NULL;
Paul Jackson9276b1bc2006-12-06 20:31:48 -08004067}
4068
Linus Torvalds1da177e2005-04-16 15:20:36 -07004069#endif /* CONFIG_NUMA */
4070
Christoph Lameter99dcc3e2010-01-05 15:34:51 +09004071/*
4072 * Boot pageset table. One per cpu which is going to be used for all
4073 * zones and all nodes. The parameters will be set in such a way
4074 * that an item put on a list will immediately be handed over to
4075 * the buddy list. This is safe since pageset manipulation is done
4076 * with interrupts disabled.
4077 *
4078 * The boot_pagesets must be kept even after bootup is complete for
4079 * unused processors and/or zones. They do play a role for bootstrapping
4080 * hotplugged processors.
4081 *
4082 * zoneinfo_show() and maybe other functions do
4083 * not check if the processor is online before following the pageset pointer.
4084 * Other parts of the kernel may not check if the zone is available.
4085 */
4086static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
4087static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
Haicheng Li1f522502010-05-24 14:32:51 -07004088static void setup_zone_pageset(struct zone *zone);
Christoph Lameter99dcc3e2010-01-05 15:34:51 +09004089
Haicheng Li4eaf3f62010-05-24 14:32:52 -07004090/*
4091 * Global mutex to protect against size modification of zonelists
4092 * as well as to serialize pageset setup for the new populated zone.
4093 */
4094DEFINE_MUTEX(zonelists_mutex);
4095
Rusty Russell9b1a4d32008-07-28 12:16:30 -05004096/* The int return value exists only because stop_machine() requires one */
Jiang Liu4ed7e022012-07-31 16:43:35 -07004097static int __build_all_zonelists(void *data)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004098{
Yasunori Goto68113782006-06-23 02:03:11 -07004099 int nid;
Christoph Lameter99dcc3e2010-01-05 15:34:51 +09004100 int cpu;
Jiang Liu9adb62a2012-07-31 16:43:28 -07004101 pg_data_t *self = data;
Paul Jackson9276b1bc2006-12-06 20:31:48 -08004102
Bo Liu7f9cfb32009-08-18 14:11:19 -07004103#ifdef CONFIG_NUMA
4104 memset(node_load, 0, sizeof(node_load));
4105#endif
Jiang Liu9adb62a2012-07-31 16:43:28 -07004106
4107 if (self && !node_online(self->node_id)) {
4108 build_zonelists(self);
4109 build_zonelist_cache(self);
4110 }
4111
Paul Jackson9276b1bc2006-12-06 20:31:48 -08004112 for_each_online_node(nid) {
Christoph Lameter7ea15302007-10-16 01:25:29 -07004113 pg_data_t *pgdat = NODE_DATA(nid);
4114
4115 build_zonelists(pgdat);
4116 build_zonelist_cache(pgdat);
Paul Jackson9276b1bc2006-12-06 20:31:48 -08004117 }
Christoph Lameter99dcc3e2010-01-05 15:34:51 +09004118
4119 /*
4120 * Initialize the boot_pagesets that are going to be used
4121 * for bootstrapping processors. The real pagesets for
4122 * each zone will be allocated later when the per cpu
4123 * allocator is available.
4124 *
4125 * boot_pagesets are used also for bootstrapping offline
4126 * cpus if the system is already booted because the pagesets
4127 * are needed to initialize allocators on a specific cpu too.
4128 * E.g. the percpu allocator needs the page allocator, which
4129 * needs the percpu allocator in order to allocate its pagesets
4130 * (a chicken-egg dilemma).
4131 */
Lee Schermerhorn7aac7892010-05-26 14:45:00 -07004132 for_each_possible_cpu(cpu) {
Christoph Lameter99dcc3e2010-01-05 15:34:51 +09004133 setup_pageset(&per_cpu(boot_pageset, cpu), 0);
4134
Lee Schermerhorn7aac7892010-05-26 14:45:00 -07004135#ifdef CONFIG_HAVE_MEMORYLESS_NODES
4136 /*
4137 * We now know the "local memory node" for each node--
4138 * i.e., the node of the first zone in the generic zonelist.
4139 * Set up numa_mem percpu variable for on-line cpus. During
4140 * boot, only the boot cpu should be on-line; we'll init the
4141 * secondary cpus' numa_mem as they come on-line. During
4142 * node/memory hotplug, we'll fixup all on-line cpus.
4143 */
4144 if (cpu_online(cpu))
4145 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
4146#endif
4147 }
4148
Yasunori Goto68113782006-06-23 02:03:11 -07004149 return 0;
4150}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004151
Rasmus Villemoes061f67b2015-02-12 15:00:06 -08004152static noinline void __init
4153build_all_zonelists_init(void)
4154{
4155 __build_all_zonelists(NULL);
4156 mminit_verify_zonelist();
4157 cpuset_init_current_mems_allowed();
4158}
4159
Haicheng Li4eaf3f62010-05-24 14:32:52 -07004160/*
4161 * Called with zonelists_mutex held always
4162 * unless system_state == SYSTEM_BOOTING.
Rasmus Villemoes061f67b2015-02-12 15:00:06 -08004163 *
4164 * __ref due to (1) call of __meminit annotated setup_zone_pageset
4165 * [we're only called with non-NULL zone through __meminit paths] and
4166 * (2) call of __init annotated helper build_all_zonelists_init
4167 * [protected by SYSTEM_BOOTING].
Haicheng Li4eaf3f62010-05-24 14:32:52 -07004168 */
Jiang Liu9adb62a2012-07-31 16:43:28 -07004169void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
Yasunori Goto68113782006-06-23 02:03:11 -07004170{
KAMEZAWA Hiroyukif0c0b2b2007-07-15 23:38:01 -07004171 set_zonelist_order();
4172
Yasunori Goto68113782006-06-23 02:03:11 -07004173 if (system_state == SYSTEM_BOOTING) {
Rasmus Villemoes061f67b2015-02-12 15:00:06 -08004174 build_all_zonelists_init();
Yasunori Goto68113782006-06-23 02:03:11 -07004175 } else {
KAMEZAWA Hiroyukie9959f02010-11-24 12:57:09 -08004176#ifdef CONFIG_MEMORY_HOTPLUG
Jiang Liu9adb62a2012-07-31 16:43:28 -07004177 if (zone)
4178 setup_zone_pageset(zone);
KAMEZAWA Hiroyukie9959f02010-11-24 12:57:09 -08004179#endif
Cody P Schaferdd1895e2013-07-03 15:01:36 -07004180 /* we have to stop all cpus to guarantee there are no users
4181 of the zonelists */
Jiang Liu9adb62a2012-07-31 16:43:28 -07004182 stop_machine(__build_all_zonelists, pgdat, NULL);
Yasunori Goto68113782006-06-23 02:03:11 -07004183 /* cpuset refresh routine should be here */
4184 }
Andrew Mortonbd1e22b2006-06-23 02:03:47 -07004185 vm_total_pages = nr_free_pagecache_pages();
Mel Gorman9ef9acb2007-10-16 01:25:54 -07004186 /*
4187 * Disable grouping by mobility if the number of pages in the
4188 * system is too low to allow the mechanism to work. It would be
4189 * more accurate, but expensive to check per-zone. This check is
4190 * made on memory-hotadd so a system can start with mobility
4191 * disabled and enable it later
4192 */
Mel Gormand9c23402007-10-16 01:26:01 -07004193 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
Mel Gorman9ef9acb2007-10-16 01:25:54 -07004194 page_group_by_mobility_disabled = 1;
4195 else
4196 page_group_by_mobility_disabled = 0;
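	/*
	 * Worked example (illustrative, configuration dependent): with 4K
	 * pages and pageblock_order == 9, pageblock_nr_pages is 512; assuming
	 * MIGRATE_TYPES == 6, grouping is disabled when the system has fewer
	 * than 3072 pages, i.e. roughly 12MB of memory.
	 */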
4197
Anton Blanchardf88dfff2014-12-10 15:42:53 -08004198 pr_info("Built %i zonelists in %s order, mobility grouping %s. "
Mel Gorman9ef9acb2007-10-16 01:25:54 -07004199 "Total pages: %ld\n",
Christoph Lameter62bc62a2009-06-16 15:32:15 -07004200 nr_online_nodes,
KAMEZAWA Hiroyukif0c0b2b2007-07-15 23:38:01 -07004201 zonelist_order_name[current_zonelist_order],
Mel Gorman9ef9acb2007-10-16 01:25:54 -07004202 page_group_by_mobility_disabled ? "off" : "on",
KAMEZAWA Hiroyukif0c0b2b2007-07-15 23:38:01 -07004203 vm_total_pages);
4204#ifdef CONFIG_NUMA
Anton Blanchardf88dfff2014-12-10 15:42:53 -08004205 pr_info("Policy zone: %s\n", zone_names[policy_zone]);
KAMEZAWA Hiroyukif0c0b2b2007-07-15 23:38:01 -07004206#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07004207}
4208
4209/*
4210 * Helper functions to size the waitqueue hash table.
4211 * Essentially these want to choose hash table sizes sufficiently
4212 * large so that collisions trying to wait on pages are rare.
4213 * But in fact, the number of active page waitqueues on typical
4214 * systems is ridiculously low, less than 200. So this is even
4215 * conservative, even though it seems large.
4216 *
4217 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
4218 * waitqueues, i.e. the size of the waitq table given the number of pages.
4219 */
4220#define PAGES_PER_WAITQUEUE 256
4221
Yasunori Gotocca448f2006-06-23 02:03:10 -07004222#ifndef CONFIG_MEMORY_HOTPLUG
Yasunori Goto02b694d2006-06-23 02:03:08 -07004223static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004224{
4225 unsigned long size = 1;
4226
4227 pages /= PAGES_PER_WAITQUEUE;
4228
4229 while (size < pages)
4230 size <<= 1;
4231
4232 /*
4233 * Once we have dozens or even hundreds of threads sleeping
4234 * on IO we've got bigger problems than wait queue collision.
4235 * Limit the size of the wait table to a reasonable size.
4236 */
4237 size = min(size, 4096UL);
4238
4239 return max(size, 4UL);
4240}
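/*
 * Worked example (illustrative): a 1GiB zone with 4K pages has 262144 pages;
 * 262144 / PAGES_PER_WAITQUEUE = 1024, which is already a power of two, so
 * the table gets 1024 entries. Zones of 1M pages (4GiB) or more hit the
 * 4096-entry clamp.
 */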
Yasunori Gotocca448f2006-06-23 02:03:10 -07004241#else
4242/*
4243 * A zone's size might be changed by hot-add, so it is not possible to determine
4244 * a suitable size for its wait_table. So we use the maximum size now.
4245 *
4246 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie:
4247 *
4248 * i386 (preemption config) : 4096 x 16 = 64Kbyte.
4249 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
4250 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte.
4251 *
4252 * The maximum number of entries is reached, by the traditional formula above,
4253 * once a zone's memory is (512K + 256) pages or more, which corresponds to:
4254 *
4255 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte.
4256 * ia64(16K page size) : = ( 8G + 4M)byte.
4257 * powerpc (64K page size) : = (32G +16M)byte.
4258 */
4259static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
4260{
4261 return 4096UL;
4262}
4263#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07004264
4265/*
4266 * This is an integer logarithm so that shifts can be used later
4267 * to extract the more random high bits from the multiplicative
4268 * hash function before the remainder is taken.
4269 */
4270static inline unsigned long wait_table_bits(unsigned long size)
4271{
4272 return ffz(~size);
4273}
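/*
 * Worked example (illustrative): for a 4096-entry table, ~size has its only
 * zero bit at position 12, so wait_table_bits() returns 12 and the hash can
 * shift out 12 bits of the key.
 */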
4274
Mel Gorman56fd56b2007-10-16 01:25:58 -07004275/*
Arve Hjønnevåg6d3163c2011-05-24 17:12:24 -07004276 * Check if a pageblock contains reserved pages
4277 */
4278static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
4279{
4280 unsigned long pfn;
4281
4282 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
4283 if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn)))
4284 return 1;
4285 }
4286 return 0;
4287}
4288
4289/*
Mel Gormand9c23402007-10-16 01:26:01 -07004290 * Mark a number of pageblocks as MIGRATE_RESERVE. The number
Mel Gorman41858962009-06-16 15:32:12 -07004291 * of blocks reserved is based on min_wmark_pages(zone). The memory within
4292 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
Mel Gorman56fd56b2007-10-16 01:25:58 -07004293 * higher will lead to a bigger reserve which will get freed as contiguous
4294 * blocks as reclaim kicks in
4295 */
4296static void setup_zone_migrate_reserve(struct zone *zone)
4297{
Arve Hjønnevåg6d3163c2011-05-24 17:12:24 -07004298 unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
Mel Gorman56fd56b2007-10-16 01:25:58 -07004299 struct page *page;
Mel Gorman78986a62009-09-21 17:03:02 -07004300 unsigned long block_migratetype;
4301 int reserve;
Yasuaki Ishimatsu943dca12014-01-21 15:49:06 -08004302 int old_reserve;
Mel Gorman56fd56b2007-10-16 01:25:58 -07004303
Michal Hockod02156382011-12-08 14:34:27 -08004304 /*
4305 * Get the start pfn, end pfn and the number of blocks to reserve.
4306 * We have to be careful to be aligned to pageblock_nr_pages to
4307 * make sure that we always check pfn_valid for the first page in
4308 * the block.
4309 */
Mel Gorman56fd56b2007-10-16 01:25:58 -07004310 start_pfn = zone->zone_start_pfn;
Cody P Schafer108bcc92013-02-22 16:35:23 -08004311 end_pfn = zone_end_pfn(zone);
Michal Hockod02156382011-12-08 14:34:27 -08004312 start_pfn = roundup(start_pfn, pageblock_nr_pages);
Mel Gorman41858962009-06-16 15:32:12 -07004313 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
Mel Gormand9c23402007-10-16 01:26:01 -07004314 pageblock_order;
Mel Gorman56fd56b2007-10-16 01:25:58 -07004315
Mel Gorman78986a62009-09-21 17:03:02 -07004316 /*
4317 * Reserve blocks are generally in place to help high-order atomic
4318 * allocations that are short-lived. A min_free_kbytes value that
4319 * would result in more than 2 reserve blocks for atomic allocations
4320 * is assumed to be in place to help anti-fragmentation for the
4321 * future allocation of hugepages at runtime.
4322 */
4323 reserve = min(2, reserve);
Yasuaki Ishimatsu943dca12014-01-21 15:49:06 -08004324 old_reserve = zone->nr_migrate_reserve_block;
4325
4326 /* On memory hot-add, we almost always need to do nothing */
4327 if (reserve == old_reserve)
4328 return;
4329 zone->nr_migrate_reserve_block = reserve;
Mel Gorman78986a62009-09-21 17:03:02 -07004330
Mel Gormand9c23402007-10-16 01:26:01 -07004331 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
Mel Gorman56fd56b2007-10-16 01:25:58 -07004332 if (!pfn_valid(pfn))
4333 continue;
4334 page = pfn_to_page(pfn);
4335
Adam Litke344c7902008-09-02 14:35:38 -07004336 /* Watch out for overlapping nodes */
4337 if (page_to_nid(page) != zone_to_nid(zone))
4338 continue;
4339
Mel Gorman56fd56b2007-10-16 01:25:58 -07004340 block_migratetype = get_pageblock_migratetype(page);
4341
Mel Gorman938929f2012-01-10 15:07:14 -08004342 /* Only test what is necessary when the reserves are not met */
4343 if (reserve > 0) {
4344 /*
4345 * Blocks with reserved pages will never be freed; skip
4346 * them.
4347 */
4348 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
4349 if (pageblock_is_reserved(pfn, block_end_pfn))
4350 continue;
Mel Gorman56fd56b2007-10-16 01:25:58 -07004351
Mel Gorman938929f2012-01-10 15:07:14 -08004352 /* If this block is reserved, account for it */
4353 if (block_migratetype == MIGRATE_RESERVE) {
4354 reserve--;
4355 continue;
4356 }
4357
4358 /* Suitable for reserving if this block is movable */
4359 if (block_migratetype == MIGRATE_MOVABLE) {
4360 set_pageblock_migratetype(page,
4361 MIGRATE_RESERVE);
4362 move_freepages_block(zone, page,
4363 MIGRATE_RESERVE);
4364 reserve--;
4365 continue;
4366 }
Yasuaki Ishimatsu943dca12014-01-21 15:49:06 -08004367 } else if (!old_reserve) {
4368 /*
4369 * At boot time we don't need to scan the whole zone
4370 * for turning off MIGRATE_RESERVE.
4371 */
4372 break;
Mel Gorman56fd56b2007-10-16 01:25:58 -07004373 }
4374
4375 /*
4376 * If the reserve is met and this is a previous reserved block,
4377 * take it back
4378 */
4379 if (block_migratetype == MIGRATE_RESERVE) {
4380 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
4381 move_freepages_block(zone, page, MIGRATE_MOVABLE);
4382 }
4383 }
4384}
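/*
 * Worked example (illustrative numbers): with pageblock_nr_pages == 512 and
 * min_wmark_pages(zone) == 1000, roundup(1000, 512) is 1024, i.e. two
 * pageblocks, which also matches the cap applied by min(2, reserve) above.
 * A much larger min_free_kbytes would still leave at most two MIGRATE_RESERVE
 * blocks in this zone.
 */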
Mel Gormanac0e5b72007-10-16 01:25:58 -07004385
Linus Torvalds1da177e2005-04-16 15:20:36 -07004386/*
4387 * Initially all pages are reserved - free ones are freed
4388 * up by free_all_bootmem() once the early boot process is
4389 * done. Non-atomic initialization, single-pass.
4390 */
Matt Tolentinoc09b4242006-01-17 07:03:44 +01004391void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
Dave Hansena2f3aa022007-01-10 23:15:30 -08004392 unsigned long start_pfn, enum memmap_context context)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004393{
Mel Gorman3a80a7f2015-06-30 14:57:02 -07004394 pg_data_t *pgdat = NODE_DATA(nid);
Andy Whitcroft29751f62005-06-23 00:08:00 -07004395 unsigned long end_pfn = start_pfn + size;
4396 unsigned long pfn;
KAMEZAWA Hiroyuki86051ca2008-04-29 00:58:21 -07004397 struct zone *z;
Mel Gorman3a80a7f2015-06-30 14:57:02 -07004398 unsigned long nr_initialised = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004399
Hugh Dickins22b31ee2009-01-06 14:40:09 -08004400 if (highest_memmap_pfn < end_pfn - 1)
4401 highest_memmap_pfn = end_pfn - 1;
4402
Mel Gorman3a80a7f2015-06-30 14:57:02 -07004403 z = &pgdat->node_zones[zone];
Greg Ungerercbe8dd42006-01-12 01:05:24 -08004404 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
Dave Hansena2f3aa022007-01-10 23:15:30 -08004405 /*
4406 * There can be holes in boot-time mem_map[]s
4407 * handed to this function. They do not
4408 * exist on hotplugged memory.
4409 */
4410 if (context == MEMMAP_EARLY) {
4411 if (!early_pfn_valid(pfn))
4412 continue;
4413 if (!early_pfn_in_nid(pfn, nid))
4414 continue;
Mel Gorman3a80a7f2015-06-30 14:57:02 -07004415 if (!update_defer_init(pgdat, pfn, end_pfn,
4416 &nr_initialised))
4417 break;
Dave Hansena2f3aa022007-01-10 23:15:30 -08004418 }
Robin Holt1e8ce832015-06-30 14:56:45 -07004419 __init_single_pfn(pfn, zone, nid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004420 }
4421}
4422
Andi Kleen1e548de2008-02-04 22:29:26 -08004423static void __meminit zone_init_free_lists(struct zone *zone)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004424{
Mel Gorman7aeb09f2014-06-04 16:10:21 -07004425 unsigned int order, t;
Mel Gormanb2a0ac82007-10-16 01:25:48 -07004426 for_each_migratetype_order(order, t) {
4427 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004428 zone->free_area[order].nr_free = 0;
4429 }
4430}
4431
4432#ifndef __HAVE_ARCH_MEMMAP_INIT
4433#define memmap_init(size, nid, zone, start_pfn) \
Dave Hansena2f3aa022007-01-10 23:15:30 -08004434 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004435#endif
4436
David Rientjes7cd2b0a2014-06-23 13:22:04 -07004437static int zone_batchsize(struct zone *zone)
Christoph Lametere7c8d5c2005-06-21 17:14:47 -07004438{
David Howells3a6be872009-05-06 16:03:03 -07004439#ifdef CONFIG_MMU
Christoph Lametere7c8d5c2005-06-21 17:14:47 -07004440 int batch;
4441
4442 /*
4443 * The per-cpu-pages pools are set to around 1/1000th of the
Seth, Rohitba56e912005-10-29 18:15:47 -07004444 * size of the zone, but to no more than 1/2 of a meg.
Christoph Lametere7c8d5c2005-06-21 17:14:47 -07004445 *
4446 * OK, so we don't know how big the cache is. So guess.
4447 */
Jiang Liub40da042013-02-22 16:33:52 -08004448 batch = zone->managed_pages / 1024;
Seth, Rohitba56e912005-10-29 18:15:47 -07004449 if (batch * PAGE_SIZE > 512 * 1024)
4450 batch = (512 * 1024) / PAGE_SIZE;
Christoph Lametere7c8d5c2005-06-21 17:14:47 -07004451 batch /= 4; /* We effectively *= 4 below */
4452 if (batch < 1)
4453 batch = 1;
4454
4455 /*
Nick Piggin0ceaacc2005-12-04 13:55:25 +11004456 * Clamp the batch to a 2^n - 1 value. Having a power
4457 * of 2 value was found to be more likely to have
4458 * suboptimal cache aliasing properties in some cases.
Christoph Lametere7c8d5c2005-06-21 17:14:47 -07004459 *
Nick Piggin0ceaacc2005-12-04 13:55:25 +11004460 * For example if 2 tasks are alternately allocating
4461 * batches of pages, one task can end up with a lot
4462 * of pages of one half of the possible page colors
4463 * and the other with pages of the other colors.
Christoph Lametere7c8d5c2005-06-21 17:14:47 -07004464 */
David Howells91552032009-05-06 16:03:02 -07004465 batch = rounddown_pow_of_two(batch + batch/2) - 1;
Seth, Rohitba56e912005-10-29 18:15:47 -07004466
Christoph Lametere7c8d5c2005-06-21 17:14:47 -07004467 return batch;
David Howells3a6be872009-05-06 16:03:03 -07004468
4469#else
4470 /* The deferral and batching of frees should be suppressed under NOMMU
4471 * conditions.
4472 *
4473 * The problem is that NOMMU needs to be able to allocate large chunks
4474 * of contiguous memory as there's no hardware page translation to
4475 * assemble apparent contiguous memory from discontiguous pages.
4476 *
4477 * Queueing large contiguous runs of pages for batching, however,
4478 * causes the pages to actually be freed in smaller chunks. As there
4479 * can be a significant delay between the individual batches being
4480 * recycled, this leads to the once large chunks of space being
4481 * fragmented and becoming unavailable for high-order allocations.
4482 */
4483 return 0;
4484#endif
Christoph Lametere7c8d5c2005-06-21 17:14:47 -07004485}
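/*
 * Worked example for the MMU case (illustrative): a zone with 1,000,000
 * managed 4K pages starts with batch = 976, is clamped to 512KB worth of
 * pages (128), becomes 32 after the "/= 4", and rounddown_pow_of_two(48) - 1
 * finally yields a batch of 31.
 */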
4486
Cody P Schafer8d7a8fa2013-07-03 15:01:31 -07004487/*
4488 * pcp->high and pcp->batch values are related and dependent on one another:
4489 * ->batch must never be higher than ->high.
4490 * The following function updates them in a safe manner without read side
4491 * locking.
4492 *
4493 * Any new users of pcp->batch and pcp->high should ensure they can cope with
4494 * those fields changing asynchronously (according to the above rule).
4495 *
4496 * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
4497 * outside of boot time (or some other assurance that no concurrent updaters
4498 * exist).
4499 */
4500static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
4501 unsigned long batch)
4502{
4503 /* start with a fail safe value for batch */
4504 pcp->batch = 1;
4505 smp_wmb();
4506
4507 /* Update high, then batch, in order */
4508 pcp->high = high;
4509 smp_wmb();
4510
4511 pcp->batch = batch;
4512}
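/*
 * Illustrative scenario (an interpretation, not from the original source):
 * when high is being lowered, e.g. from 6 * 186 to 4, writing batch = 1
 * first means a CPU that frees pages mid-update works with the fail-safe
 * batch rather than a stale large batch paired with the new, smaller high.
 */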
4513
Cody P Schafer36640332013-07-03 15:01:40 -07004514/* a companion to pageset_set_high() */
Cody P Schafer4008bab2013-07-03 15:01:28 -07004515static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch)
4516{
Cody P Schafer8d7a8fa2013-07-03 15:01:31 -07004517 pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch));
Cody P Schafer4008bab2013-07-03 15:01:28 -07004518}
4519
Cody P Schafer88c90db2013-07-03 15:01:35 -07004520static void pageset_init(struct per_cpu_pageset *p)
Christoph Lameter2caaad42005-06-21 17:15:00 -07004521{
4522 struct per_cpu_pages *pcp;
Mel Gorman5f8dcc22009-09-21 17:03:19 -07004523 int migratetype;
Christoph Lameter2caaad42005-06-21 17:15:00 -07004524
Magnus Damm1c6fe942005-10-26 01:58:59 -07004525 memset(p, 0, sizeof(*p));
4526
Christoph Lameter3dfa5722008-02-04 22:29:19 -08004527 pcp = &p->pcp;
Christoph Lameter2caaad42005-06-21 17:15:00 -07004528 pcp->count = 0;
Mel Gorman5f8dcc22009-09-21 17:03:19 -07004529 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
4530 INIT_LIST_HEAD(&pcp->lists[migratetype]);
Christoph Lameter2caaad42005-06-21 17:15:00 -07004531}
4532
Cody P Schafer88c90db2013-07-03 15:01:35 -07004533static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
4534{
4535 pageset_init(p);
4536 pageset_set_batch(p, batch);
4537}
4538
Rohit Seth8ad4b1f2006-01-08 01:00:40 -08004539/*
Cody P Schafer36640332013-07-03 15:01:40 -07004540 * pageset_set_high() sets the high water mark for hot per_cpu_pagelist
Rohit Seth8ad4b1f2006-01-08 01:00:40 -08004541 * to the value high for the pageset p.
4542 */
Cody P Schafer36640332013-07-03 15:01:40 -07004543static void pageset_set_high(struct per_cpu_pageset *p,
Rohit Seth8ad4b1f2006-01-08 01:00:40 -08004544 unsigned long high)
4545{
Cody P Schafer8d7a8fa2013-07-03 15:01:31 -07004546 unsigned long batch = max(1UL, high / 4);
4547 if ((high / 4) > (PAGE_SHIFT * 8))
4548 batch = PAGE_SHIFT * 8;
Rohit Seth8ad4b1f2006-01-08 01:00:40 -08004549
Cody P Schafer8d7a8fa2013-07-03 15:01:31 -07004550 pageset_update(&p->pcp, high, batch);
Rohit Seth8ad4b1f2006-01-08 01:00:40 -08004551}
4552
David Rientjes7cd2b0a2014-06-23 13:22:04 -07004553static void pageset_set_high_and_batch(struct zone *zone,
4554 struct per_cpu_pageset *pcp)
Cody P Schafer56cef2b2013-07-03 15:01:38 -07004555{
Cody P Schafer56cef2b2013-07-03 15:01:38 -07004556 if (percpu_pagelist_fraction)
Cody P Schafer36640332013-07-03 15:01:40 -07004557 pageset_set_high(pcp,
Cody P Schafer56cef2b2013-07-03 15:01:38 -07004558 (zone->managed_pages /
4559 percpu_pagelist_fraction));
4560 else
4561 pageset_set_batch(pcp, zone_batchsize(zone));
4562}
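/*
 * Worked example (illustrative): with percpu_pagelist_fraction == 8 and
 * 262144 managed pages, pageset_set_high() gets high = 32768; high / 4 is
 * 8192, which exceeds PAGE_SHIFT * 8 (96 with 4K pages), so batch is capped
 * at 96. With the fraction unset, the zone_batchsize() path above is used
 * instead.
 */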
4563
Cody P Schafer169f6c12013-07-03 15:01:41 -07004564static void __meminit zone_pageset_init(struct zone *zone, int cpu)
4565{
4566 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
4567
4568 pageset_init(pcp);
4569 pageset_set_high_and_batch(zone, pcp);
4570}
4571
Jiang Liu4ed7e022012-07-31 16:43:35 -07004572static void __meminit setup_zone_pageset(struct zone *zone)
Wu Fengguang319774e2010-05-24 14:32:49 -07004573{
4574 int cpu;
Wu Fengguang319774e2010-05-24 14:32:49 -07004575 zone->pageset = alloc_percpu(struct per_cpu_pageset);
Cody P Schafer56cef2b2013-07-03 15:01:38 -07004576 for_each_possible_cpu(cpu)
4577 zone_pageset_init(zone, cpu);
Wu Fengguang319774e2010-05-24 14:32:49 -07004578}
4579
Christoph Lametere7c8d5c2005-06-21 17:14:47 -07004580/*
Christoph Lameter99dcc3e2010-01-05 15:34:51 +09004581 * Allocate per cpu pagesets and initialize them.
4582 * Before this call only boot pagesets were available.
Christoph Lameter2caaad42005-06-21 17:15:00 -07004583 */
Al Viro78d99552005-12-15 09:18:25 +00004584void __init setup_per_cpu_pageset(void)
Christoph Lametere7c8d5c2005-06-21 17:14:47 -07004585{
Christoph Lameter99dcc3e2010-01-05 15:34:51 +09004586 struct zone *zone;
Christoph Lametere7c8d5c2005-06-21 17:14:47 -07004587
Wu Fengguang319774e2010-05-24 14:32:49 -07004588 for_each_populated_zone(zone)
4589 setup_zone_pageset(zone);
Christoph Lametere7c8d5c2005-06-21 17:14:47 -07004590}
4591
Sam Ravnborg577a32f2007-05-17 23:29:25 +02004592static noinline __init_refok
Yasunori Gotocca448f2006-06-23 02:03:10 -07004593int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
Dave Hansened8ece22005-10-29 18:16:50 -07004594{
4595 int i;
Yasunori Gotocca448f2006-06-23 02:03:10 -07004596 size_t alloc_size;
Dave Hansened8ece22005-10-29 18:16:50 -07004597
4598 /*
4599 * The per-page waitqueue mechanism uses hashed waitqueues
4600 * per zone.
4601 */
Yasunori Goto02b694d2006-06-23 02:03:08 -07004602 zone->wait_table_hash_nr_entries =
4603 wait_table_hash_nr_entries(zone_size_pages);
4604 zone->wait_table_bits =
4605 wait_table_bits(zone->wait_table_hash_nr_entries);
Yasunori Gotocca448f2006-06-23 02:03:10 -07004606 alloc_size = zone->wait_table_hash_nr_entries
4607 * sizeof(wait_queue_head_t);
4608
Heiko Carstenscd94b9d2008-05-23 13:04:52 -07004609 if (!slab_is_available()) {
Yasunori Gotocca448f2006-06-23 02:03:10 -07004610 zone->wait_table = (wait_queue_head_t *)
Santosh Shilimkar67828322014-01-21 15:50:25 -08004611 memblock_virt_alloc_node_nopanic(
4612 alloc_size, zone->zone_pgdat->node_id);
Yasunori Gotocca448f2006-06-23 02:03:10 -07004613 } else {
4614 /*
4615 * This case means that a zone whose size was 0 gets new memory
4616 * via memory hot-add.
4617 * But it may be the case that a new node was hot-added. In
4618 * this case vmalloc() will not be able to use this new node's
4619 * memory - this wait_table must be initialized to use this new
4620 * node itself as well.
4621 * To use this new node's memory, further consideration will be
4622 * necessary.
4623 */
Jesper Juhl8691f3a2007-10-16 01:24:49 -07004624 zone->wait_table = vmalloc(alloc_size);
Yasunori Gotocca448f2006-06-23 02:03:10 -07004625 }
4626 if (!zone->wait_table)
4627 return -ENOMEM;
Dave Hansened8ece22005-10-29 18:16:50 -07004628
Pintu Kumarb8af2942013-09-11 14:20:34 -07004629 for (i = 0; i < zone->wait_table_hash_nr_entries; ++i)
Dave Hansened8ece22005-10-29 18:16:50 -07004630 init_waitqueue_head(zone->wait_table + i);
Yasunori Gotocca448f2006-06-23 02:03:10 -07004631
4632 return 0;
Dave Hansened8ece22005-10-29 18:16:50 -07004633}
4634
Matt Tolentinoc09b4242006-01-17 07:03:44 +01004635static __meminit void zone_pcp_init(struct zone *zone)
Dave Hansened8ece22005-10-29 18:16:50 -07004636{
Christoph Lameter99dcc3e2010-01-05 15:34:51 +09004637 /*
4638 * per cpu subsystem is not up at this point. The following code
4639 * relies on the ability of the linker to provide the
4640 * offset of a (static) per cpu variable into the per cpu area.
4641 */
4642 zone->pageset = &boot_pageset;
Dave Hansened8ece22005-10-29 18:16:50 -07004643
Xishi Qiub38a8722013-11-12 15:07:20 -08004644 if (populated_zone(zone))
Christoph Lameter99dcc3e2010-01-05 15:34:51 +09004645 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
4646 zone->name, zone->present_pages,
4647 zone_batchsize(zone));
Dave Hansened8ece22005-10-29 18:16:50 -07004648}
4649
Jiang Liu4ed7e022012-07-31 16:43:35 -07004650int __meminit init_currently_empty_zone(struct zone *zone,
Yasunori Goto718127c2006-06-23 02:03:10 -07004651 unsigned long zone_start_pfn,
Dave Hansena2f3aa022007-01-10 23:15:30 -08004652 unsigned long size,
4653 enum memmap_context context)
Dave Hansened8ece22005-10-29 18:16:50 -07004654{
4655 struct pglist_data *pgdat = zone->zone_pgdat;
Yasunori Gotocca448f2006-06-23 02:03:10 -07004656 int ret;
4657 ret = zone_wait_table_init(zone, size);
4658 if (ret)
4659 return ret;
Dave Hansened8ece22005-10-29 18:16:50 -07004660 pgdat->nr_zones = zone_idx(zone) + 1;
4661
Dave Hansened8ece22005-10-29 18:16:50 -07004662 zone->zone_start_pfn = zone_start_pfn;
4663
Mel Gorman708614e2008-07-23 21:26:51 -07004664 mminit_dprintk(MMINIT_TRACE, "memmap_init",
4665 "Initialising map node %d zone %lu pfns %lu -> %lu\n",
4666 pgdat->node_id,
4667 (unsigned long)zone_idx(zone),
4668 zone_start_pfn, (zone_start_pfn + size));
4669
Andi Kleen1e548de2008-02-04 22:29:26 -08004670 zone_init_free_lists(zone);
Yasunori Goto718127c2006-06-23 02:03:10 -07004671
4672 return 0;
Dave Hansened8ece22005-10-29 18:16:50 -07004673}
4674
Tejun Heo0ee332c2011-12-08 10:22:09 -08004675#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
Mel Gormanc7132162006-09-27 01:49:43 -07004676#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
Mel Gorman8a942fd2015-06-30 14:56:55 -07004677
Mel Gormanc7132162006-09-27 01:49:43 -07004678/*
4679 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
Mel Gormanc7132162006-09-27 01:49:43 -07004680 */
Mel Gorman8a942fd2015-06-30 14:56:55 -07004681int __meminit __early_pfn_to_nid(unsigned long pfn,
4682 struct mminit_pfnnid_cache *state)
Mel Gormanc7132162006-09-27 01:49:43 -07004683{
Tejun Heoc13291a2011-07-12 10:46:30 +02004684 unsigned long start_pfn, end_pfn;
Yinghai Lue76b63f2013-09-11 14:22:17 -07004685 int nid;
Russ Anderson7c243c72013-04-29 15:07:59 -07004686
Mel Gorman8a942fd2015-06-30 14:56:55 -07004687 if (state->last_start <= pfn && pfn < state->last_end)
4688 return state->last_nid;
Mel Gormanc7132162006-09-27 01:49:43 -07004689
Yinghai Lue76b63f2013-09-11 14:22:17 -07004690 nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
4691 if (nid != -1) {
Mel Gorman8a942fd2015-06-30 14:56:55 -07004692 state->last_start = start_pfn;
4693 state->last_end = end_pfn;
4694 state->last_nid = nid;
Yinghai Lue76b63f2013-09-11 14:22:17 -07004695 }
4696
4697 return nid;
Mel Gormanc7132162006-09-27 01:49:43 -07004698}
4699#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
4700
Mel Gormanc7132162006-09-27 01:49:43 -07004701/**
Santosh Shilimkar67828322014-01-21 15:50:25 -08004702 * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range
Randy Dunlap88ca3b92006-10-04 02:15:25 -07004703 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
Santosh Shilimkar67828322014-01-21 15:50:25 -08004704 * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid
Mel Gormanc7132162006-09-27 01:49:43 -07004705 *
Zhang Zhen7d018172014-06-04 16:10:53 -07004706 * If an architecture guarantees that all ranges registered contain no holes
 4707 * and may be freed, this function may be used instead of calling
4708 * memblock_free_early_nid() manually.
Mel Gormanc7132162006-09-27 01:49:43 -07004709 */
Tejun Heoc13291a2011-07-12 10:46:30 +02004710void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
Mel Gormanc7132162006-09-27 01:49:43 -07004711{
Tejun Heoc13291a2011-07-12 10:46:30 +02004712 unsigned long start_pfn, end_pfn;
4713 int i, this_nid;
Mel Gormanc7132162006-09-27 01:49:43 -07004714
Tejun Heoc13291a2011-07-12 10:46:30 +02004715 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) {
4716 start_pfn = min(start_pfn, max_low_pfn);
4717 end_pfn = min(end_pfn, max_low_pfn);
Mel Gormanc7132162006-09-27 01:49:43 -07004718
Tejun Heoc13291a2011-07-12 10:46:30 +02004719 if (start_pfn < end_pfn)
Santosh Shilimkar67828322014-01-21 15:50:25 -08004720 memblock_free_early_nid(PFN_PHYS(start_pfn),
4721 (end_pfn - start_pfn) << PAGE_SHIFT,
4722 this_nid);
Mel Gormanc7132162006-09-27 01:49:43 -07004723 }
4724}
4725
4726/**
4727 * sparse_memory_present_with_active_regions - Call memory_present for each active range
Randy Dunlap88ca3b92006-10-04 02:15:25 -07004728 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
Mel Gormanc7132162006-09-27 01:49:43 -07004729 *
Zhang Zhen7d018172014-06-04 16:10:53 -07004730 * If an architecture guarantees that all ranges registered contain no holes and may
4731 * be freed, this function may be used instead of calling memory_present() manually.
Mel Gormanc7132162006-09-27 01:49:43 -07004732 */
4733void __init sparse_memory_present_with_active_regions(int nid)
4734{
Tejun Heoc13291a2011-07-12 10:46:30 +02004735 unsigned long start_pfn, end_pfn;
4736 int i, this_nid;
Mel Gormanc7132162006-09-27 01:49:43 -07004737
Tejun Heoc13291a2011-07-12 10:46:30 +02004738 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
4739 memory_present(this_nid, start_pfn, end_pfn);
Mel Gormanc7132162006-09-27 01:49:43 -07004740}
4741
4742/**
4743 * get_pfn_range_for_nid - Return the start and end page frames for a node
Randy Dunlap88ca3b92006-10-04 02:15:25 -07004744 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
4745 * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
4746 * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
Mel Gormanc7132162006-09-27 01:49:43 -07004747 *
4748 * It returns the start and end page frame of a node based on information
Zhang Zhen7d018172014-06-04 16:10:53 -07004749 * provided by memblock_set_node(). If called for a node
Mel Gormanc7132162006-09-27 01:49:43 -07004750 * with no available memory, a warning is printed and the start and end
Randy Dunlap88ca3b92006-10-04 02:15:25 -07004751 * PFNs will be 0.
Mel Gormanc7132162006-09-27 01:49:43 -07004752 */
Yasunori Gotoa3142c82007-05-08 00:23:07 -07004753void __meminit get_pfn_range_for_nid(unsigned int nid,
Mel Gormanc7132162006-09-27 01:49:43 -07004754 unsigned long *start_pfn, unsigned long *end_pfn)
4755{
Tejun Heoc13291a2011-07-12 10:46:30 +02004756 unsigned long this_start_pfn, this_end_pfn;
Mel Gormanc7132162006-09-27 01:49:43 -07004757 int i;
Tejun Heoc13291a2011-07-12 10:46:30 +02004758
Mel Gormanc7132162006-09-27 01:49:43 -07004759 *start_pfn = -1UL;
4760 *end_pfn = 0;
4761
Tejun Heoc13291a2011-07-12 10:46:30 +02004762 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
4763 *start_pfn = min(*start_pfn, this_start_pfn);
4764 *end_pfn = max(*end_pfn, this_end_pfn);
Mel Gormanc7132162006-09-27 01:49:43 -07004765 }
4766
Christoph Lameter633c0662007-10-16 01:25:37 -07004767 if (*start_pfn == -1UL)
Mel Gormanc7132162006-09-27 01:49:43 -07004768 *start_pfn = 0;
Mel Gormanc7132162006-09-27 01:49:43 -07004769}
4770
4771/*
Mel Gorman2a1e2742007-07-17 04:03:12 -07004772 * This finds a zone that can be used for ZONE_MOVABLE pages. The
 4773 * assumption is made that zones within a node are ordered in monotonically
4774 * increasing memory addresses so that the "highest" populated zone is used
4775 */
Adrian Bunkb69a7282008-07-23 21:28:12 -07004776static void __init find_usable_zone_for_movable(void)
Mel Gorman2a1e2742007-07-17 04:03:12 -07004777{
4778 int zone_index;
4779 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
4780 if (zone_index == ZONE_MOVABLE)
4781 continue;
4782
4783 if (arch_zone_highest_possible_pfn[zone_index] >
4784 arch_zone_lowest_possible_pfn[zone_index])
4785 break;
4786 }
4787
4788 VM_BUG_ON(zone_index == -1);
4789 movable_zone = zone_index;
4790}
4791
4792/*
4793 * The zone ranges provided by the architecture do not include ZONE_MOVABLE
Lucas De Marchi25985ed2011-03-30 22:57:33 -03004794 * because it is sized independently of the architecture. Unlike the other zones,
Mel Gorman2a1e2742007-07-17 04:03:12 -07004795 * the starting point for ZONE_MOVABLE is not fixed. It may be different
4796 * in each node depending on the size of each node and how evenly kernelcore
4797 * is distributed. This helper function adjusts the zone ranges
4798 * provided by the architecture for a given node by using the end of the
4799 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
 4800 * zones within a node are in order of monotonically increasing memory addresses
4801 */
Adrian Bunkb69a7282008-07-23 21:28:12 -07004802static void __meminit adjust_zone_range_for_zone_movable(int nid,
Mel Gorman2a1e2742007-07-17 04:03:12 -07004803 unsigned long zone_type,
4804 unsigned long node_start_pfn,
4805 unsigned long node_end_pfn,
4806 unsigned long *zone_start_pfn,
4807 unsigned long *zone_end_pfn)
4808{
4809 /* Only adjust if ZONE_MOVABLE is on this node */
4810 if (zone_movable_pfn[nid]) {
4811 /* Size ZONE_MOVABLE */
4812 if (zone_type == ZONE_MOVABLE) {
4813 *zone_start_pfn = zone_movable_pfn[nid];
4814 *zone_end_pfn = min(node_end_pfn,
4815 arch_zone_highest_possible_pfn[movable_zone]);
4816
4817 /* Adjust for ZONE_MOVABLE starting within this range */
4818 } else if (*zone_start_pfn < zone_movable_pfn[nid] &&
4819 *zone_end_pfn > zone_movable_pfn[nid]) {
4820 *zone_end_pfn = zone_movable_pfn[nid];
4821
4822 /* Check if this whole range is within ZONE_MOVABLE */
4823 } else if (*zone_start_pfn >= zone_movable_pfn[nid])
4824 *zone_start_pfn = *zone_end_pfn;
4825 }
4826}
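
/*
 * Worked example of the adjustment above: assume a node spanning PFNs
 * [0x100000, 0x800000) with zone_movable_pfn[nid] == 0x600000 and the
 * highest usable zone extending beyond the node end.  ZONE_MOVABLE then
 * becomes [0x600000, 0x800000); a kernel zone clamped to the whole node
 * has its end pulled back to 0x600000, leaving [0x100000, 0x600000); and a
 * zone lying entirely at or above 0x600000 has its start set equal to its
 * end, i.e. it is empty on this node.  (Illustrative numbers only.)
 */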
4827
4828/*
Mel Gormanc7132162006-09-27 01:49:43 -07004829 * Return the number of pages a zone spans in a node, including holes
4830 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
4831 */
Paul Mundt6ea6e682007-07-15 23:38:20 -07004832static unsigned long __meminit zone_spanned_pages_in_node(int nid,
Mel Gormanc7132162006-09-27 01:49:43 -07004833 unsigned long zone_type,
Zhang Yanfei7960aed2013-07-08 15:59:52 -07004834 unsigned long node_start_pfn,
4835 unsigned long node_end_pfn,
Mel Gormanc7132162006-09-27 01:49:43 -07004836 unsigned long *ignored)
4837{
Mel Gormanc7132162006-09-27 01:49:43 -07004838 unsigned long zone_start_pfn, zone_end_pfn;
4839
Zhang Yanfei7960aed2013-07-08 15:59:52 -07004840 /* Get the start and end of the zone */
Mel Gormanc7132162006-09-27 01:49:43 -07004841 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
4842 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
Mel Gorman2a1e2742007-07-17 04:03:12 -07004843 adjust_zone_range_for_zone_movable(nid, zone_type,
4844 node_start_pfn, node_end_pfn,
4845 &zone_start_pfn, &zone_end_pfn);
Mel Gormanc7132162006-09-27 01:49:43 -07004846
4847 /* Check that this node has pages within the zone's required range */
4848 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
4849 return 0;
4850
4851 /* Move the zone boundaries inside the node if necessary */
4852 zone_end_pfn = min(zone_end_pfn, node_end_pfn);
4853 zone_start_pfn = max(zone_start_pfn, node_start_pfn);
4854
4855 /* Return the spanned pages */
4856 return zone_end_pfn - zone_start_pfn;
4857}
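
/*
 * Worked example of the clipping above (illustrative numbers): if the
 * architecture limits this zone to PFNs [0x100000, 0x800000) and a node
 * without ZONE_MOVABLE spans [0x400000, 0xa00000), the ranges overlap and
 * the zone is clipped to [0x400000, 0x800000), so 0x400000 pages are
 * reported as spanned.  Had the node instead started above 0x800000, the
 * "pages within the zone's required range" check would return 0.
 */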
4858
4859/*
4860 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
Randy Dunlap88ca3b92006-10-04 02:15:25 -07004861 * then all holes in the requested range will be accounted for.
Mel Gormanc7132162006-09-27 01:49:43 -07004862 */
Yinghai Lu32996252009-12-15 17:59:02 -08004863unsigned long __meminit __absent_pages_in_range(int nid,
Mel Gormanc7132162006-09-27 01:49:43 -07004864 unsigned long range_start_pfn,
4865 unsigned long range_end_pfn)
4866{
Tejun Heo96e907d2011-07-12 10:46:29 +02004867 unsigned long nr_absent = range_end_pfn - range_start_pfn;
4868 unsigned long start_pfn, end_pfn;
4869 int i;
Mel Gormanc7132162006-09-27 01:49:43 -07004870
Tejun Heo96e907d2011-07-12 10:46:29 +02004871 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
4872 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
4873 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
4874 nr_absent -= end_pfn - start_pfn;
Mel Gormanc7132162006-09-27 01:49:43 -07004875 }
Tejun Heo96e907d2011-07-12 10:46:29 +02004876 return nr_absent;
Mel Gormanc7132162006-09-27 01:49:43 -07004877}
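
/*
 * Worked example (illustrative numbers): for a requested range of
 * [0, 0x100000) whose memblock ranges are [0x100, 0x80000) and
 * [0x90000, 0x100000), nr_absent starts at 0x100000 and the loop subtracts
 * the 0x7ff00 and 0x70000 present pages, leaving 0x10100 pages of holes:
 * the 0x100 pages below the first range plus the 0x10000-page gap between
 * the two ranges.
 */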
4878
4879/**
4880 * absent_pages_in_range - Return number of page frames in holes within a range
4881 * @start_pfn: The start PFN to start searching for holes
4882 * @end_pfn: The end PFN to stop searching for holes
4883 *
Randy Dunlap88ca3b92006-10-04 02:15:25 -07004884 * It returns the number of page frames in memory holes within a range.
Mel Gormanc7132162006-09-27 01:49:43 -07004885 */
4886unsigned long __init absent_pages_in_range(unsigned long start_pfn,
4887 unsigned long end_pfn)
4888{
4889 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
4890}
4891
4892/* Return the number of page frames in holes in a zone on a node */
Paul Mundt6ea6e682007-07-15 23:38:20 -07004893static unsigned long __meminit zone_absent_pages_in_node(int nid,
Mel Gormanc7132162006-09-27 01:49:43 -07004894 unsigned long zone_type,
Zhang Yanfei7960aed2013-07-08 15:59:52 -07004895 unsigned long node_start_pfn,
4896 unsigned long node_end_pfn,
Mel Gormanc7132162006-09-27 01:49:43 -07004897 unsigned long *ignored)
4898{
Tejun Heo96e907d2011-07-12 10:46:29 +02004899 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
4900 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
Mel Gorman9c7cd682006-09-27 01:49:58 -07004901 unsigned long zone_start_pfn, zone_end_pfn;
4902
Tejun Heo96e907d2011-07-12 10:46:29 +02004903 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
4904 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
Mel Gorman9c7cd682006-09-27 01:49:58 -07004905
Mel Gorman2a1e2742007-07-17 04:03:12 -07004906 adjust_zone_range_for_zone_movable(nid, zone_type,
4907 node_start_pfn, node_end_pfn,
4908 &zone_start_pfn, &zone_end_pfn);
Mel Gorman9c7cd682006-09-27 01:49:58 -07004909 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
Mel Gormanc7132162006-09-27 01:49:43 -07004910}
Mel Gorman0e0b8642006-09-27 01:49:56 -07004911
Tejun Heo0ee332c2011-12-08 10:22:09 -08004912#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
Paul Mundt6ea6e682007-07-15 23:38:20 -07004913static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
Mel Gormanc7132162006-09-27 01:49:43 -07004914 unsigned long zone_type,
Zhang Yanfei7960aed2013-07-08 15:59:52 -07004915 unsigned long node_start_pfn,
4916 unsigned long node_end_pfn,
Mel Gormanc7132162006-09-27 01:49:43 -07004917 unsigned long *zones_size)
4918{
4919 return zones_size[zone_type];
4920}
4921
Paul Mundt6ea6e682007-07-15 23:38:20 -07004922static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
Mel Gormanc7132162006-09-27 01:49:43 -07004923 unsigned long zone_type,
Zhang Yanfei7960aed2013-07-08 15:59:52 -07004924 unsigned long node_start_pfn,
4925 unsigned long node_end_pfn,
Mel Gormanc7132162006-09-27 01:49:43 -07004926 unsigned long *zholes_size)
4927{
4928 if (!zholes_size)
4929 return 0;
4930
4931 return zholes_size[zone_type];
4932}
Yinghai Lu20e69262013-03-01 14:51:27 -08004933
Tejun Heo0ee332c2011-12-08 10:22:09 -08004934#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
Mel Gormanc7132162006-09-27 01:49:43 -07004935
Yasunori Gotoa3142c82007-05-08 00:23:07 -07004936static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
Zhang Yanfei7960aed2013-07-08 15:59:52 -07004937 unsigned long node_start_pfn,
4938 unsigned long node_end_pfn,
4939 unsigned long *zones_size,
4940 unsigned long *zholes_size)
Mel Gormanc7132162006-09-27 01:49:43 -07004941{
Gu Zhengfebd5942015-06-24 16:57:02 -07004942 unsigned long realtotalpages = 0, totalpages = 0;
Mel Gormanc7132162006-09-27 01:49:43 -07004943 enum zone_type i;
4944
Gu Zhengfebd5942015-06-24 16:57:02 -07004945 for (i = 0; i < MAX_NR_ZONES; i++) {
4946 struct zone *zone = pgdat->node_zones + i;
4947 unsigned long size, real_size;
Mel Gormanc7132162006-09-27 01:49:43 -07004948
Gu Zhengfebd5942015-06-24 16:57:02 -07004949 size = zone_spanned_pages_in_node(pgdat->node_id, i,
4950 node_start_pfn,
4951 node_end_pfn,
4952 zones_size);
4953 real_size = size - zone_absent_pages_in_node(pgdat->node_id, i,
Zhang Yanfei7960aed2013-07-08 15:59:52 -07004954 node_start_pfn, node_end_pfn,
4955 zholes_size);
Gu Zhengfebd5942015-06-24 16:57:02 -07004956 zone->spanned_pages = size;
4957 zone->present_pages = real_size;
4958
4959 totalpages += size;
4960 realtotalpages += real_size;
4961 }
4962
4963 pgdat->node_spanned_pages = totalpages;
Mel Gormanc7132162006-09-27 01:49:43 -07004964 pgdat->node_present_pages = realtotalpages;
4965 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
4966 realtotalpages);
4967}
4968
Mel Gorman835c1342007-10-16 01:25:47 -07004969#ifndef CONFIG_SPARSEMEM
4970/*
 4971 * Calculate the size of the zone->pageblock_flags bitmap, rounded up to an unsigned long.
Mel Gormand9c23402007-10-16 01:26:01 -07004972 * Start by making sure zonesize is a multiple of pageblock_nr_pages by rounding
 4973 * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally
Mel Gorman835c1342007-10-16 01:25:47 -07004974 * round what is now in bits up to the nearest long in bits, then return it in
4975 * bytes.
4976 */
Linus Torvalds7c455122013-02-18 09:58:02 -08004977static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
Mel Gorman835c1342007-10-16 01:25:47 -07004978{
4979 unsigned long usemapsize;
4980
Linus Torvalds7c455122013-02-18 09:58:02 -08004981 zonesize += zone_start_pfn & (pageblock_nr_pages-1);
Mel Gormand9c23402007-10-16 01:26:01 -07004982 usemapsize = roundup(zonesize, pageblock_nr_pages);
4983 usemapsize = usemapsize >> pageblock_order;
Mel Gorman835c1342007-10-16 01:25:47 -07004984 usemapsize *= NR_PAGEBLOCK_BITS;
4985 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
4986
4987 return usemapsize / 8;
4988}
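
/*
 * Worked example, assuming the common values pageblock_order == 9 (512-page
 * pageblocks), NR_PAGEBLOCK_BITS == 4 and 64-bit longs: a 1GiB zone of 4KiB
 * pages (262144 pages) starting on a pageblock boundary covers
 * 262144 >> 9 = 512 pageblocks, needing 512 * 4 = 2048 bits, which is
 * already a multiple of 64, so 2048 / 8 = 256 bytes are returned.  If the
 * same zone started 100 pages into a pageblock, zonesize is first bumped to
 * 262244 and rounded up to 262656, giving 513 pageblocks, 2052 bits,
 * rounded up to 2112 bits, i.e. 264 bytes.
 */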
4989
4990static void __init setup_usemap(struct pglist_data *pgdat,
Linus Torvalds7c455122013-02-18 09:58:02 -08004991 struct zone *zone,
4992 unsigned long zone_start_pfn,
4993 unsigned long zonesize)
Mel Gorman835c1342007-10-16 01:25:47 -07004994{
Linus Torvalds7c455122013-02-18 09:58:02 -08004995 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
Mel Gorman835c1342007-10-16 01:25:47 -07004996 zone->pageblock_flags = NULL;
Julia Lawall58a01a42009-01-06 14:39:28 -08004997 if (usemapsize)
Santosh Shilimkar67828322014-01-21 15:50:25 -08004998 zone->pageblock_flags =
4999 memblock_virt_alloc_node_nopanic(usemapsize,
5000 pgdat->node_id);
Mel Gorman835c1342007-10-16 01:25:47 -07005001}
5002#else
Linus Torvalds7c455122013-02-18 09:58:02 -08005003static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
5004 unsigned long zone_start_pfn, unsigned long zonesize) {}
Mel Gorman835c1342007-10-16 01:25:47 -07005005#endif /* CONFIG_SPARSEMEM */
5006
Mel Gormand9c23402007-10-16 01:26:01 -07005007#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
Mel Gormanba72cb82007-11-28 16:21:13 -08005008
Mel Gormand9c23402007-10-16 01:26:01 -07005009/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
Chen Gang15ca2202013-09-11 14:20:27 -07005010void __paginginit set_pageblock_order(void)
Mel Gormand9c23402007-10-16 01:26:01 -07005011{
Andrew Morton955c1cd2012-05-29 15:06:31 -07005012 unsigned int order;
5013
Mel Gormand9c23402007-10-16 01:26:01 -07005014 /* Check that pageblock_nr_pages has not already been setup */
5015 if (pageblock_order)
5016 return;
5017
Andrew Morton955c1cd2012-05-29 15:06:31 -07005018 if (HPAGE_SHIFT > PAGE_SHIFT)
5019 order = HUGETLB_PAGE_ORDER;
5020 else
5021 order = MAX_ORDER - 1;
5022
Mel Gormand9c23402007-10-16 01:26:01 -07005023 /*
5024 * Assume the largest contiguous order of interest is a huge page.
Andrew Morton955c1cd2012-05-29 15:06:31 -07005025 * This value may be variable depending on boot parameters on IA64 and
5026 * powerpc.
Mel Gormand9c23402007-10-16 01:26:01 -07005027 */
5028 pageblock_order = order;
5029}
5030#else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
5031
Mel Gormanba72cb82007-11-28 16:21:13 -08005032/*
5033 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
Andrew Morton955c1cd2012-05-29 15:06:31 -07005034 * is unused as pageblock_order is set at compile-time. See
5035 * include/linux/pageblock-flags.h for the values of pageblock_order based on
5036 * the kernel config
Mel Gormanba72cb82007-11-28 16:21:13 -08005037 */
Chen Gang15ca2202013-09-11 14:20:27 -07005038void __paginginit set_pageblock_order(void)
Mel Gormanba72cb82007-11-28 16:21:13 -08005039{
Mel Gormanba72cb82007-11-28 16:21:13 -08005040}
Mel Gormand9c23402007-10-16 01:26:01 -07005041
5042#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
5043
Jiang Liu01cefae2012-12-12 13:52:19 -08005044static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
5045 unsigned long present_pages)
5046{
5047 unsigned long pages = spanned_pages;
5048
5049 /*
5050 * Provide a more accurate estimation if there are holes within
5051 * the zone and SPARSEMEM is in use. If there are holes within the
5052 * zone, each populated memory region may cost us one or two extra
 5053 * memmap pages due to alignment because the memmap pages for each
 5054 * populated region may not be naturally aligned on a page boundary.
5055 * So the (present_pages >> 4) heuristic is a tradeoff for that.
5056 */
5057 if (spanned_pages > present_pages + (present_pages >> 4) &&
5058 IS_ENABLED(CONFIG_SPARSEMEM))
5059 pages = present_pages;
5060
5061 return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
5062}
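
/*
 * Worked example, assuming SPARSEMEM, 4KiB pages and a 64-byte struct page
 * (typical on 64-bit configurations): a zone spanning 1048576 PFNs of which
 * only 786432 are present exceeds present_pages + present_pages/16
 * (835584), so the estimate uses the present pages:
 * 786432 * 64 bytes = 48MiB, i.e. 12288 memmap pages.  With only small
 * holes, say 800000 spanned pages, the spanned value would be used instead.
 */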
5063
Linus Torvalds1da177e2005-04-16 15:20:36 -07005064/*
5065 * Set up the zone data structures:
5066 * - mark all pages reserved
5067 * - mark all memory queues empty
5068 * - clear the memory bitmaps
Minchan Kim6527af52012-07-31 16:46:16 -07005069 *
5070 * NOTE: pgdat should get zeroed by caller.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005071 */
Alexander van Heukelumb5a0e012008-02-23 15:24:06 -08005072static void __paginginit free_area_init_core(struct pglist_data *pgdat,
Gu Zhengfebd5942015-06-24 16:57:02 -07005073 unsigned long node_start_pfn, unsigned long node_end_pfn)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005074{
Christoph Lameter2f1b6242006-09-25 23:31:13 -07005075 enum zone_type j;
Dave Hansened8ece22005-10-29 18:16:50 -07005076 int nid = pgdat->node_id;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005077 unsigned long zone_start_pfn = pgdat->node_start_pfn;
Yasunori Goto718127c2006-06-23 02:03:10 -07005078 int ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005079
Dave Hansen208d54e2005-10-29 18:16:52 -07005080 pgdat_resize_init(pgdat);
Andrea Arcangeli8177a422012-03-23 20:56:34 +01005081#ifdef CONFIG_NUMA_BALANCING
5082 spin_lock_init(&pgdat->numabalancing_migrate_lock);
5083 pgdat->numabalancing_migrate_nr_pages = 0;
5084 pgdat->numabalancing_migrate_next_window = jiffies;
5085#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07005086 init_waitqueue_head(&pgdat->kswapd_wait);
Mel Gorman55150612012-07-31 16:44:35 -07005087 init_waitqueue_head(&pgdat->pfmemalloc_wait);
Joonsoo Kimeefa864b2014-12-12 16:55:46 -08005088 pgdat_page_ext_init(pgdat);
Michal Nazarewicz5f63b722012-01-11 15:16:11 +01005089
Linus Torvalds1da177e2005-04-16 15:20:36 -07005090 for (j = 0; j < MAX_NR_ZONES; j++) {
5091 struct zone *zone = pgdat->node_zones + j;
Jiang Liu9feedc92012-12-12 13:52:12 -08005092 unsigned long size, realsize, freesize, memmap_pages;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005093
Gu Zhengfebd5942015-06-24 16:57:02 -07005094 size = zone->spanned_pages;
5095 realsize = freesize = zone->present_pages;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005096
Mel Gorman0e0b8642006-09-27 01:49:56 -07005097 /*
Jiang Liu9feedc92012-12-12 13:52:12 -08005098 * Adjust freesize so that it accounts for how much memory
Mel Gorman0e0b8642006-09-27 01:49:56 -07005099 * is used by this zone for memmap. This affects the watermark
5100 * and per-cpu initialisations
5101 */
Jiang Liu01cefae2012-12-12 13:52:19 -08005102 memmap_pages = calc_memmap_size(size, realsize);
Zhong Hongboba914f42014-12-12 16:56:21 -08005103 if (!is_highmem_idx(j)) {
5104 if (freesize >= memmap_pages) {
5105 freesize -= memmap_pages;
5106 if (memmap_pages)
5107 printk(KERN_DEBUG
5108 " %s zone: %lu pages used for memmap\n",
5109 zone_names[j], memmap_pages);
5110 } else
5111 printk(KERN_WARNING
5112 " %s zone: %lu pages exceeds freesize %lu\n",
5113 zone_names[j], memmap_pages, freesize);
5114 }
Mel Gorman0e0b8642006-09-27 01:49:56 -07005115
Christoph Lameter62672762007-02-10 01:43:07 -08005116 /* Account for reserved pages */
Jiang Liu9feedc92012-12-12 13:52:12 -08005117 if (j == 0 && freesize > dma_reserve) {
5118 freesize -= dma_reserve;
Yinghai Lud903ef92008-10-18 20:27:06 -07005119 printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
Christoph Lameter62672762007-02-10 01:43:07 -08005120 zone_names[0], dma_reserve);
Mel Gorman0e0b8642006-09-27 01:49:56 -07005121 }
5122
Christoph Lameter98d2b0e2006-09-25 23:31:12 -07005123 if (!is_highmem_idx(j))
Jiang Liu9feedc92012-12-12 13:52:12 -08005124 nr_kernel_pages += freesize;
Jiang Liu01cefae2012-12-12 13:52:19 -08005125 /* Charge for highmem memmap if there are enough kernel pages */
5126 else if (nr_kernel_pages > memmap_pages * 2)
5127 nr_kernel_pages -= memmap_pages;
Jiang Liu9feedc92012-12-12 13:52:12 -08005128 nr_all_pages += freesize;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005129
Jiang Liu9feedc92012-12-12 13:52:12 -08005130 /*
 5131 * Set an approximate value for lowmem here; it will be adjusted
5132 * when the bootmem allocator frees pages into the buddy system.
5133 * And all highmem pages will be managed by the buddy system.
5134 */
5135 zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
Christoph Lameter96146342006-07-03 00:24:13 -07005136#ifdef CONFIG_NUMA
Christoph Lameterd5f541e2006-09-27 01:50:08 -07005137 zone->node = nid;
Jiang Liu9feedc92012-12-12 13:52:12 -08005138 zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
Christoph Lameter96146342006-07-03 00:24:13 -07005139 / 100;
Jiang Liu9feedc92012-12-12 13:52:12 -08005140 zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
Christoph Lameter96146342006-07-03 00:24:13 -07005141#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07005142 zone->name = zone_names[j];
5143 spin_lock_init(&zone->lock);
5144 spin_lock_init(&zone->lru_lock);
Dave Hansenbdc8cb92005-10-29 18:16:53 -07005145 zone_seqlock_init(zone);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005146 zone->zone_pgdat = pgdat;
Dave Hansened8ece22005-10-29 18:16:50 -07005147 zone_pcp_init(zone);
Johannes Weiner81c0a2b2013-09-11 14:20:47 -07005148
5149 /* For bootup, initialized properly in watermark setup */
5150 mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages);
5151
Hugh Dickinsbea8c152012-11-16 14:14:54 -08005152 lruvec_init(&zone->lruvec);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005153 if (!size)
5154 continue;
5155
Andrew Morton955c1cd2012-05-29 15:06:31 -07005156 set_pageblock_order();
Linus Torvalds7c455122013-02-18 09:58:02 -08005157 setup_usemap(pgdat, zone, zone_start_pfn, size);
Dave Hansena2f3aa022007-01-10 23:15:30 -08005158 ret = init_currently_empty_zone(zone, zone_start_pfn,
5159 size, MEMMAP_EARLY);
Yasunori Goto718127c2006-06-23 02:03:10 -07005160 BUG_ON(ret);
Heiko Carstens76cdd582008-05-14 16:05:52 -07005161 memmap_init(size, nid, j, zone_start_pfn);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005162 zone_start_pfn += size;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005163 }
5164}
5165
Sam Ravnborg577a32f2007-05-17 23:29:25 +02005166static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005167{
Linus Torvalds1da177e2005-04-16 15:20:36 -07005168 /* Skip empty nodes */
5169 if (!pgdat->node_spanned_pages)
5170 return;
5171
Andy Whitcroftd41dee32005-06-23 00:07:54 -07005172#ifdef CONFIG_FLAT_NODE_MEM_MAP
Linus Torvalds1da177e2005-04-16 15:20:36 -07005173 /* ia64 gets its own node_mem_map, before this, without bootmem */
5174 if (!pgdat->node_mem_map) {
Bob Piccoe984bb42006-05-20 15:00:31 -07005175 unsigned long size, start, end;
Andy Whitcroftd41dee32005-06-23 00:07:54 -07005176 struct page *map;
5177
Bob Piccoe984bb42006-05-20 15:00:31 -07005178 /*
5179 * The zone's endpoints aren't required to be MAX_ORDER
 5180 * aligned, but the node_mem_map endpoints must be MAX_ORDER aligned in order
5181 * for the buddy allocator to function correctly.
5182 */
5183 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
Cody P Schafer108bcc92013-02-22 16:35:23 -08005184 end = pgdat_end_pfn(pgdat);
Bob Piccoe984bb42006-05-20 15:00:31 -07005185 end = ALIGN(end, MAX_ORDER_NR_PAGES);
5186 size = (end - start) * sizeof(struct page);
Dave Hansen6f167ec2005-06-23 00:07:39 -07005187 map = alloc_remap(pgdat->node_id, size);
5188 if (!map)
Santosh Shilimkar67828322014-01-21 15:50:25 -08005189 map = memblock_virt_alloc_node_nopanic(size,
5190 pgdat->node_id);
Bob Piccoe984bb42006-05-20 15:00:31 -07005191 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005192 }
Roman Zippel12d810c2007-05-31 00:40:54 -07005193#ifndef CONFIG_NEED_MULTIPLE_NODES
Linus Torvalds1da177e2005-04-16 15:20:36 -07005194 /*
5195 * With no DISCONTIG, the global mem_map is just set as node 0's
5196 */
Mel Gormanc7132162006-09-27 01:49:43 -07005197 if (pgdat == NODE_DATA(0)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005198 mem_map = NODE_DATA(0)->node_mem_map;
Tejun Heo0ee332c2011-12-08 10:22:09 -08005199#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
Mel Gormanc7132162006-09-27 01:49:43 -07005200 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
Thomas Bogendoerfer467bc462008-01-08 15:33:11 -08005201 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
Tejun Heo0ee332c2011-12-08 10:22:09 -08005202#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
Mel Gormanc7132162006-09-27 01:49:43 -07005203 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005204#endif
Andy Whitcroftd41dee32005-06-23 00:07:54 -07005205#endif /* CONFIG_FLAT_NODE_MEM_MAP */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005206}
5207
Johannes Weiner9109fb72008-07-23 21:27:20 -07005208void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
5209 unsigned long node_start_pfn, unsigned long *zholes_size)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005210{
Johannes Weiner9109fb72008-07-23 21:27:20 -07005211 pg_data_t *pgdat = NODE_DATA(nid);
Zhang Yanfei7960aed2013-07-08 15:59:52 -07005212 unsigned long start_pfn = 0;
5213 unsigned long end_pfn = 0;
Johannes Weiner9109fb72008-07-23 21:27:20 -07005214
Minchan Kim88fdf752012-07-31 16:46:14 -07005215 /* pg_data_t should be reset to zero when it's allocated */
Linus Torvalds8783b6e2012-08-02 10:37:03 -07005216 WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
Minchan Kim88fdf752012-07-31 16:46:14 -07005217
Mel Gorman3a80a7f2015-06-30 14:57:02 -07005218 reset_deferred_meminit(pgdat);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005219 pgdat->node_id = nid;
5220 pgdat->node_start_pfn = node_start_pfn;
Zhang Yanfei7960aed2013-07-08 15:59:52 -07005221#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
5222 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
Juergen Gross8d29e182015-02-11 15:26:01 -08005223 pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
5224 (u64)start_pfn << PAGE_SHIFT, ((u64)end_pfn << PAGE_SHIFT) - 1);
Zhang Yanfei7960aed2013-07-08 15:59:52 -07005225#endif
5226 calculate_node_totalpages(pgdat, start_pfn, end_pfn,
5227 zones_size, zholes_size);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005228
5229 alloc_node_mem_map(pgdat);
Yinghai Lue8c27ac2008-06-01 13:15:22 -07005230#ifdef CONFIG_FLAT_NODE_MEM_MAP
5231 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
5232 nid, (unsigned long)pgdat,
5233 (unsigned long)pgdat->node_mem_map);
5234#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07005235
Gu Zhengfebd5942015-06-24 16:57:02 -07005236 free_area_init_core(pgdat, start_pfn, end_pfn);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005237}
5238
Tejun Heo0ee332c2011-12-08 10:22:09 -08005239#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
Miklos Szeredi418508c2007-05-23 13:57:55 -07005240
5241#if MAX_NUMNODES > 1
5242/*
5243 * Figure out the number of possible node ids.
5244 */
Cody P Schaferf9872ca2013-04-29 15:08:01 -07005245void __init setup_nr_node_ids(void)
Miklos Szeredi418508c2007-05-23 13:57:55 -07005246{
5247 unsigned int node;
5248 unsigned int highest = 0;
5249
5250 for_each_node_mask(node, node_possible_map)
5251 highest = node;
5252 nr_node_ids = highest + 1;
5253}
Miklos Szeredi418508c2007-05-23 13:57:55 -07005254#endif
5255
Mel Gormanc7132162006-09-27 01:49:43 -07005256/**
Tejun Heo1e019792011-07-12 09:45:34 +02005257 * node_map_pfn_alignment - determine the maximum internode alignment
5258 *
5259 * This function should be called after node map is populated and sorted.
5260 * It calculates the maximum power of two alignment which can distinguish
5261 * all the nodes.
5262 *
5263 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
5264 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
 5265 * nodes are all shifted by 256MiB, it would indicate 256MiB. Note that if only
 5266 * the last node is shifted, 1GiB is enough and this function will indicate so.
5267 *
5268 * This is used to test whether pfn -> nid mapping of the chosen memory
5269 * model has fine enough granularity to avoid incorrect mapping for the
5270 * populated node map.
5271 *
5272 * Returns the determined alignment in pfn's. 0 if there is no alignment
5273 * requirement (single node).
5274 */
5275unsigned long __init node_map_pfn_alignment(void)
5276{
5277 unsigned long accl_mask = 0, last_end = 0;
Tejun Heoc13291a2011-07-12 10:46:30 +02005278 unsigned long start, end, mask;
Tejun Heo1e019792011-07-12 09:45:34 +02005279 int last_nid = -1;
Tejun Heoc13291a2011-07-12 10:46:30 +02005280 int i, nid;
Tejun Heo1e019792011-07-12 09:45:34 +02005281
Tejun Heoc13291a2011-07-12 10:46:30 +02005282 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
Tejun Heo1e019792011-07-12 09:45:34 +02005283 if (!start || last_nid < 0 || last_nid == nid) {
5284 last_nid = nid;
5285 last_end = end;
5286 continue;
5287 }
5288
5289 /*
5290 * Start with a mask granular enough to pin-point to the
5291 * start pfn and tick off bits one-by-one until it becomes
5292 * too coarse to separate the current node from the last.
5293 */
5294 mask = ~((1 << __ffs(start)) - 1);
5295 while (mask && last_end <= (start & (mask << 1)))
5296 mask <<= 1;
5297
5298 /* accumulate all internode masks */
5299 accl_mask |= mask;
5300 }
5301
5302 /* convert mask to number of pages */
5303 return ~accl_mask + 1;
5304}
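
/*
 * Worked example (illustrative numbers): with node 0 covering PFNs
 * [0, 0x60000) and node 1 covering [0x60000, 0xc0000), the loop starts from
 * the lowest set bit of 0x60000, i.e. an alignment of 0x20000, and tries to
 * widen it to 0x40000.  That fails because the 0x40000-aligned block
 * [0x40000, 0x80000) would contain pages of both nodes, so the function
 * returns 0x20000 PFNs (512MiB with 4KiB pages), the coarsest granularity
 * at which every block still maps to a single node.
 */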
5305
Mel Gormana6af2bc2007-02-10 01:42:57 -08005306/* Find the lowest pfn for a node */
Adrian Bunkb69a7282008-07-23 21:28:12 -07005307static unsigned long __init find_min_pfn_for_node(int nid)
Mel Gormanc7132162006-09-27 01:49:43 -07005308{
Mel Gormana6af2bc2007-02-10 01:42:57 -08005309 unsigned long min_pfn = ULONG_MAX;
Tejun Heoc13291a2011-07-12 10:46:30 +02005310 unsigned long start_pfn;
5311 int i;
Mel Gorman1abbfb42006-11-23 12:01:41 +00005312
Tejun Heoc13291a2011-07-12 10:46:30 +02005313 for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
5314 min_pfn = min(min_pfn, start_pfn);
Mel Gormanc7132162006-09-27 01:49:43 -07005315
Mel Gormana6af2bc2007-02-10 01:42:57 -08005316 if (min_pfn == ULONG_MAX) {
5317 printk(KERN_WARNING
Paul Jackson2bc0d2612008-06-22 07:22:17 -07005318 "Could not find start_pfn for node %d\n", nid);
Mel Gormana6af2bc2007-02-10 01:42:57 -08005319 return 0;
5320 }
5321
5322 return min_pfn;
Mel Gormanc7132162006-09-27 01:49:43 -07005323}
5324
5325/**
5326 * find_min_pfn_with_active_regions - Find the minimum PFN registered
5327 *
5328 * It returns the minimum PFN based on information provided via
Zhang Zhen7d018172014-06-04 16:10:53 -07005329 * memblock_set_node().
Mel Gormanc7132162006-09-27 01:49:43 -07005330 */
5331unsigned long __init find_min_pfn_with_active_regions(void)
5332{
5333 return find_min_pfn_for_node(MAX_NUMNODES);
5334}
5335
Lee Schermerhorn37b07e42007-10-16 01:25:39 -07005336/*
5337 * early_calculate_totalpages()
5338 * Sum pages in active regions for movable zone.
Lai Jiangshan4b0ef1fe2012-12-12 13:51:46 -08005339 * Populate N_MEMORY for calculating usable_nodes.
Lee Schermerhorn37b07e42007-10-16 01:25:39 -07005340 */
Adrian Bunk484f51f2007-10-16 01:26:03 -07005341static unsigned long __init early_calculate_totalpages(void)
Mel Gorman7e63efe2007-07-17 04:03:15 -07005342{
Mel Gorman7e63efe2007-07-17 04:03:15 -07005343 unsigned long totalpages = 0;
Tejun Heoc13291a2011-07-12 10:46:30 +02005344 unsigned long start_pfn, end_pfn;
5345 int i, nid;
Mel Gorman7e63efe2007-07-17 04:03:15 -07005346
Tejun Heoc13291a2011-07-12 10:46:30 +02005347 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
5348 unsigned long pages = end_pfn - start_pfn;
5349
Lee Schermerhorn37b07e42007-10-16 01:25:39 -07005350 totalpages += pages;
5351 if (pages)
Lai Jiangshan4b0ef1fe2012-12-12 13:51:46 -08005352 node_set_state(nid, N_MEMORY);
Lee Schermerhorn37b07e42007-10-16 01:25:39 -07005353 }
Pintu Kumarb8af2942013-09-11 14:20:34 -07005354 return totalpages;
Mel Gorman7e63efe2007-07-17 04:03:15 -07005355}
5356
Mel Gorman2a1e2742007-07-17 04:03:12 -07005357/*
5358 * Find the PFN the Movable zone begins in each node. Kernel memory
5359 * is spread evenly between nodes as long as the nodes have enough
5360 * memory. When they don't, some nodes will have more kernelcore than
5361 * others
5362 */
Kautuk Consulb224ef82012-03-21 16:34:15 -07005363static void __init find_zone_movable_pfns_for_nodes(void)
Mel Gorman2a1e2742007-07-17 04:03:12 -07005364{
5365 int i, nid;
5366 unsigned long usable_startpfn;
5367 unsigned long kernelcore_node, kernelcore_remaining;
Yinghai Lu66918dc2009-06-30 11:41:37 -07005368 /* save the state before borrow the nodemask */
Lai Jiangshan4b0ef1fe2012-12-12 13:51:46 -08005369 nodemask_t saved_node_state = node_states[N_MEMORY];
Lee Schermerhorn37b07e42007-10-16 01:25:39 -07005370 unsigned long totalpages = early_calculate_totalpages();
Lai Jiangshan4b0ef1fe2012-12-12 13:51:46 -08005371 int usable_nodes = nodes_weight(node_states[N_MEMORY]);
Emil Medve136199f2014-04-07 15:37:52 -07005372 struct memblock_region *r;
Tang Chenb2f3eeb2014-01-21 15:49:38 -08005373
5374 /* Need to find movable_zone earlier when movable_node is specified. */
5375 find_usable_zone_for_movable();
Mel Gorman2a1e2742007-07-17 04:03:12 -07005376
Mel Gorman7e63efe2007-07-17 04:03:15 -07005377 /*
Tang Chenb2f3eeb2014-01-21 15:49:38 -08005378 * If movable_node is specified, ignore kernelcore and movablecore
5379 * options.
5380 */
5381 if (movable_node_is_enabled()) {
Emil Medve136199f2014-04-07 15:37:52 -07005382 for_each_memblock(memory, r) {
5383 if (!memblock_is_hotpluggable(r))
Tang Chenb2f3eeb2014-01-21 15:49:38 -08005384 continue;
5385
Emil Medve136199f2014-04-07 15:37:52 -07005386 nid = r->nid;
Tang Chenb2f3eeb2014-01-21 15:49:38 -08005387
Emil Medve136199f2014-04-07 15:37:52 -07005388 usable_startpfn = PFN_DOWN(r->base);
Tang Chenb2f3eeb2014-01-21 15:49:38 -08005389 zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
5390 min(usable_startpfn, zone_movable_pfn[nid]) :
5391 usable_startpfn;
5392 }
5393
5394 goto out2;
5395 }
5396
5397 /*
5398 * If movablecore=nn[KMG] was specified, calculate what size of
Mel Gorman7e63efe2007-07-17 04:03:15 -07005399 * kernelcore it corresponds to, so that memory usable for
5400 * any allocation type is evenly spread. If both kernelcore
5401 * and movablecore are specified, then the value of kernelcore
5402 * will be used for required_kernelcore if it's greater than
5403 * what movablecore would have allowed.
5404 */
5405 if (required_movablecore) {
Mel Gorman7e63efe2007-07-17 04:03:15 -07005406 unsigned long corepages;
5407
5408 /*
5409 * Round-up so that ZONE_MOVABLE is at least as large as what
5410 * was requested by the user
5411 */
5412 required_movablecore =
5413 roundup(required_movablecore, MAX_ORDER_NR_PAGES);
5414 corepages = totalpages - required_movablecore;
5415
5416 required_kernelcore = max(required_kernelcore, corepages);
5417 }
5418
Yinghai Lu20e69262013-03-01 14:51:27 -08005419 /* If kernelcore was not specified, there is no ZONE_MOVABLE */
5420 if (!required_kernelcore)
Yinghai Lu66918dc2009-06-30 11:41:37 -07005421 goto out;
Mel Gorman2a1e2742007-07-17 04:03:12 -07005422
5423 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
Mel Gorman2a1e2742007-07-17 04:03:12 -07005424 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
5425
5426restart:
5427 /* Spread kernelcore memory as evenly as possible throughout nodes */
5428 kernelcore_node = required_kernelcore / usable_nodes;
Lai Jiangshan4b0ef1fe2012-12-12 13:51:46 -08005429 for_each_node_state(nid, N_MEMORY) {
Tejun Heoc13291a2011-07-12 10:46:30 +02005430 unsigned long start_pfn, end_pfn;
5431
Mel Gorman2a1e2742007-07-17 04:03:12 -07005432 /*
5433 * Recalculate kernelcore_node if the division per node
5434 * now exceeds what is necessary to satisfy the requested
5435 * amount of memory for the kernel
5436 */
5437 if (required_kernelcore < kernelcore_node)
5438 kernelcore_node = required_kernelcore / usable_nodes;
5439
5440 /*
5441 * As the map is walked, we track how much memory is usable
5442 * by the kernel using kernelcore_remaining. When it is
5443 * 0, the rest of the node is usable by ZONE_MOVABLE
5444 */
5445 kernelcore_remaining = kernelcore_node;
5446
5447 /* Go through each range of PFNs within this node */
Tejun Heoc13291a2011-07-12 10:46:30 +02005448 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
Mel Gorman2a1e2742007-07-17 04:03:12 -07005449 unsigned long size_pages;
5450
Tejun Heoc13291a2011-07-12 10:46:30 +02005451 start_pfn = max(start_pfn, zone_movable_pfn[nid]);
Mel Gorman2a1e2742007-07-17 04:03:12 -07005452 if (start_pfn >= end_pfn)
5453 continue;
5454
5455 /* Account for what is only usable for kernelcore */
5456 if (start_pfn < usable_startpfn) {
5457 unsigned long kernel_pages;
5458 kernel_pages = min(end_pfn, usable_startpfn)
5459 - start_pfn;
5460
5461 kernelcore_remaining -= min(kernel_pages,
5462 kernelcore_remaining);
5463 required_kernelcore -= min(kernel_pages,
5464 required_kernelcore);
5465
5466 /* Continue if range is now fully accounted */
5467 if (end_pfn <= usable_startpfn) {
5468
5469 /*
5470 * Push zone_movable_pfn to the end so
5471 * that if we have to rebalance
5472 * kernelcore across nodes, we will
5473 * not double account here
5474 */
5475 zone_movable_pfn[nid] = end_pfn;
5476 continue;
5477 }
5478 start_pfn = usable_startpfn;
5479 }
5480
5481 /*
5482 * The usable PFN range for ZONE_MOVABLE is from
5483 * start_pfn->end_pfn. Calculate size_pages as the
5484 * number of pages used as kernelcore
5485 */
5486 size_pages = end_pfn - start_pfn;
5487 if (size_pages > kernelcore_remaining)
5488 size_pages = kernelcore_remaining;
5489 zone_movable_pfn[nid] = start_pfn + size_pages;
5490
5491 /*
5492 * Some kernelcore has been met, update counts and
5493 * break if the kernelcore for this node has been
Pintu Kumarb8af2942013-09-11 14:20:34 -07005494 * satisfied
Mel Gorman2a1e2742007-07-17 04:03:12 -07005495 */
5496 required_kernelcore -= min(required_kernelcore,
5497 size_pages);
5498 kernelcore_remaining -= size_pages;
5499 if (!kernelcore_remaining)
5500 break;
5501 }
5502 }
5503
5504 /*
5505 * If there is still required_kernelcore, we do another pass with one
5506 * less node in the count. This will push zone_movable_pfn[nid] further
5507 * along on the nodes that still have memory until kernelcore is
Pintu Kumarb8af2942013-09-11 14:20:34 -07005508 * satisfied
Mel Gorman2a1e2742007-07-17 04:03:12 -07005509 */
5510 usable_nodes--;
5511 if (usable_nodes && required_kernelcore > usable_nodes)
5512 goto restart;
5513
Tang Chenb2f3eeb2014-01-21 15:49:38 -08005514out2:
Mel Gorman2a1e2742007-07-17 04:03:12 -07005515 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
5516 for (nid = 0; nid < MAX_NUMNODES; nid++)
5517 zone_movable_pfn[nid] =
5518 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
Yinghai Lu66918dc2009-06-30 11:41:37 -07005519
Yinghai Lu20e69262013-03-01 14:51:27 -08005520out:
Yinghai Lu66918dc2009-06-30 11:41:37 -07005521 /* restore the node_state */
Lai Jiangshan4b0ef1fe2012-12-12 13:51:46 -08005522 node_states[N_MEMORY] = saved_node_state;
Mel Gorman2a1e2742007-07-17 04:03:12 -07005523}
5524
Lai Jiangshan4b0ef1fe2012-12-12 13:51:46 -08005525/* Any regular or high memory on that node ? */
5526static void check_for_memory(pg_data_t *pgdat, int nid)
Lee Schermerhorn37b07e42007-10-16 01:25:39 -07005527{
Lee Schermerhorn37b07e42007-10-16 01:25:39 -07005528 enum zone_type zone_type;
5529
Lai Jiangshan4b0ef1fe2012-12-12 13:51:46 -08005530 if (N_MEMORY == N_NORMAL_MEMORY)
5531 return;
5532
5533 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
Lee Schermerhorn37b07e42007-10-16 01:25:39 -07005534 struct zone *zone = &pgdat->node_zones[zone_type];
Xishi Qiub38a8722013-11-12 15:07:20 -08005535 if (populated_zone(zone)) {
Lai Jiangshan4b0ef1fe2012-12-12 13:51:46 -08005536 node_set_state(nid, N_HIGH_MEMORY);
5537 if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
5538 zone_type <= ZONE_NORMAL)
5539 node_set_state(nid, N_NORMAL_MEMORY);
Bob Liud0048b02012-01-12 17:19:07 -08005540 break;
5541 }
Lee Schermerhorn37b07e42007-10-16 01:25:39 -07005542 }
Lee Schermerhorn37b07e42007-10-16 01:25:39 -07005543}
5544
Mel Gormanc7132162006-09-27 01:49:43 -07005545/**
5546 * free_area_init_nodes - Initialise all pg_data_t and zone data
Randy Dunlap88ca3b92006-10-04 02:15:25 -07005547 * @max_zone_pfn: an array of max PFNs for each zone
Mel Gormanc7132162006-09-27 01:49:43 -07005548 *
5549 * This will call free_area_init_node() for each active node in the system.
Zhang Zhen7d018172014-06-04 16:10:53 -07005550 * Using the page ranges provided by memblock_set_node(), the size of each
Mel Gormanc7132162006-09-27 01:49:43 -07005551 * zone in each node and their holes is calculated. If the maximum PFN
5552 * between two adjacent zones match, it is assumed that the zone is empty.
5553 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
5554 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
5555 * starts where the previous one ended. For example, ZONE_DMA32 starts
5556 * at arch_max_dma_pfn.
5557 */
5558void __init free_area_init_nodes(unsigned long *max_zone_pfn)
5559{
Tejun Heoc13291a2011-07-12 10:46:30 +02005560 unsigned long start_pfn, end_pfn;
5561 int i, nid;
Mel Gormana6af2bc2007-02-10 01:42:57 -08005562
Mel Gormanc7132162006-09-27 01:49:43 -07005563 /* Record where the zone boundaries are */
5564 memset(arch_zone_lowest_possible_pfn, 0,
5565 sizeof(arch_zone_lowest_possible_pfn));
5566 memset(arch_zone_highest_possible_pfn, 0,
5567 sizeof(arch_zone_highest_possible_pfn));
5568 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
5569 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
5570 for (i = 1; i < MAX_NR_ZONES; i++) {
Mel Gorman2a1e2742007-07-17 04:03:12 -07005571 if (i == ZONE_MOVABLE)
5572 continue;
Mel Gormanc7132162006-09-27 01:49:43 -07005573 arch_zone_lowest_possible_pfn[i] =
5574 arch_zone_highest_possible_pfn[i-1];
5575 arch_zone_highest_possible_pfn[i] =
5576 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
5577 }
Mel Gorman2a1e2742007-07-17 04:03:12 -07005578 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
5579 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
5580
5581 /* Find the PFNs that ZONE_MOVABLE begins at in each node */
5582 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
Kautuk Consulb224ef82012-03-21 16:34:15 -07005583 find_zone_movable_pfns_for_nodes();
Mel Gormanc7132162006-09-27 01:49:43 -07005584
Mel Gormanc7132162006-09-27 01:49:43 -07005585 /* Print out the zone ranges */
Anton Blanchardf88dfff2014-12-10 15:42:53 -08005586 pr_info("Zone ranges:\n");
Mel Gorman2a1e2742007-07-17 04:03:12 -07005587 for (i = 0; i < MAX_NR_ZONES; i++) {
5588 if (i == ZONE_MOVABLE)
5589 continue;
Anton Blanchardf88dfff2014-12-10 15:42:53 -08005590 pr_info(" %-8s ", zone_names[i]);
David Rientjes72f0ba02010-03-05 13:42:14 -08005591 if (arch_zone_lowest_possible_pfn[i] ==
5592 arch_zone_highest_possible_pfn[i])
Anton Blanchardf88dfff2014-12-10 15:42:53 -08005593 pr_cont("empty\n");
David Rientjes72f0ba02010-03-05 13:42:14 -08005594 else
Juergen Gross8d29e182015-02-11 15:26:01 -08005595 pr_cont("[mem %#018Lx-%#018Lx]\n",
5596 (u64)arch_zone_lowest_possible_pfn[i]
5597 << PAGE_SHIFT,
5598 ((u64)arch_zone_highest_possible_pfn[i]
Bjorn Helgaasa62e2f42012-05-29 15:06:30 -07005599 << PAGE_SHIFT) - 1);
Mel Gorman2a1e2742007-07-17 04:03:12 -07005600 }
5601
5602 /* Print out the PFNs ZONE_MOVABLE begins at in each node */
Anton Blanchardf88dfff2014-12-10 15:42:53 -08005603 pr_info("Movable zone start for each node\n");
Mel Gorman2a1e2742007-07-17 04:03:12 -07005604 for (i = 0; i < MAX_NUMNODES; i++) {
5605 if (zone_movable_pfn[i])
Juergen Gross8d29e182015-02-11 15:26:01 -08005606 pr_info(" Node %d: %#018Lx\n", i,
5607 (u64)zone_movable_pfn[i] << PAGE_SHIFT);
Mel Gorman2a1e2742007-07-17 04:03:12 -07005608 }
Mel Gormanc7132162006-09-27 01:49:43 -07005609
Wanpeng Lif2d52fe2012-10-08 16:32:24 -07005610 /* Print out the early node map */
Anton Blanchardf88dfff2014-12-10 15:42:53 -08005611 pr_info("Early memory node ranges\n");
Tejun Heoc13291a2011-07-12 10:46:30 +02005612 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
Juergen Gross8d29e182015-02-11 15:26:01 -08005613 pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid,
5614 (u64)start_pfn << PAGE_SHIFT,
5615 ((u64)end_pfn << PAGE_SHIFT) - 1);
Mel Gormanc7132162006-09-27 01:49:43 -07005616
5617 /* Initialise every node */
Mel Gorman708614e2008-07-23 21:26:51 -07005618 mminit_verify_pageflags_layout();
Christoph Lameter8ef82862007-02-20 13:57:52 -08005619 setup_nr_node_ids();
Mel Gormanc7132162006-09-27 01:49:43 -07005620 for_each_online_node(nid) {
5621 pg_data_t *pgdat = NODE_DATA(nid);
Johannes Weiner9109fb72008-07-23 21:27:20 -07005622 free_area_init_node(nid, NULL,
Mel Gormanc7132162006-09-27 01:49:43 -07005623 find_min_pfn_for_node(nid), NULL);
Lee Schermerhorn37b07e42007-10-16 01:25:39 -07005624
5625 /* Any memory on that node */
5626 if (pgdat->node_present_pages)
Lai Jiangshan4b0ef1fe2012-12-12 13:51:46 -08005627 node_set_state(nid, N_MEMORY);
5628 check_for_memory(pgdat, nid);
Mel Gormanc7132162006-09-27 01:49:43 -07005629 }
5630}
Mel Gorman2a1e2742007-07-17 04:03:12 -07005631
Mel Gorman7e63efe2007-07-17 04:03:15 -07005632static int __init cmdline_parse_core(char *p, unsigned long *core)
Mel Gorman2a1e2742007-07-17 04:03:12 -07005633{
5634 unsigned long long coremem;
5635 if (!p)
5636 return -EINVAL;
5637
5638 coremem = memparse(p, &p);
Mel Gorman7e63efe2007-07-17 04:03:15 -07005639 *core = coremem >> PAGE_SHIFT;
Mel Gorman2a1e2742007-07-17 04:03:12 -07005640
Mel Gorman7e63efe2007-07-17 04:03:15 -07005641 /* Paranoid check that UL is enough for the coremem value */
Mel Gorman2a1e2742007-07-17 04:03:12 -07005642 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
5643
5644 return 0;
5645}
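
/*
 * Worked example, assuming 4KiB pages: booting with "kernelcore=512M" makes
 * memparse() return 536870912, which shifted right by PAGE_SHIFT (12) sets
 * required_kernelcore to 131072 pages; "movablecore=2G" likewise sets
 * required_movablecore to 524288 pages via the handlers below.
 */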
Mel Gormaned7ed362007-07-17 04:03:14 -07005646
Mel Gorman7e63efe2007-07-17 04:03:15 -07005647/*
5648 * kernelcore=size sets the amount of memory for use for allocations that
5649 * cannot be reclaimed or migrated.
5650 */
5651static int __init cmdline_parse_kernelcore(char *p)
5652{
5653 return cmdline_parse_core(p, &required_kernelcore);
5654}
5655
5656/*
5657 * movablecore=size sets the amount of memory for use for allocations that
5658 * can be reclaimed or migrated.
5659 */
5660static int __init cmdline_parse_movablecore(char *p)
5661{
5662 return cmdline_parse_core(p, &required_movablecore);
5663}
5664
Mel Gormaned7ed362007-07-17 04:03:14 -07005665early_param("kernelcore", cmdline_parse_kernelcore);
Mel Gorman7e63efe2007-07-17 04:03:15 -07005666early_param("movablecore", cmdline_parse_movablecore);
Mel Gormaned7ed362007-07-17 04:03:14 -07005667
Tejun Heo0ee332c2011-12-08 10:22:09 -08005668#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
Mel Gormanc7132162006-09-27 01:49:43 -07005669
Jiang Liuc3d5f5f2013-07-03 15:03:14 -07005670void adjust_managed_page_count(struct page *page, long count)
5671{
5672 spin_lock(&managed_page_count_lock);
5673 page_zone(page)->managed_pages += count;
5674 totalram_pages += count;
Jiang Liu3dcc0572013-07-03 15:03:21 -07005675#ifdef CONFIG_HIGHMEM
5676 if (PageHighMem(page))
5677 totalhigh_pages += count;
5678#endif
Jiang Liuc3d5f5f2013-07-03 15:03:14 -07005679 spin_unlock(&managed_page_count_lock);
5680}
Jiang Liu3dcc0572013-07-03 15:03:21 -07005681EXPORT_SYMBOL(adjust_managed_page_count);
Jiang Liuc3d5f5f2013-07-03 15:03:14 -07005682
Jiang Liu11199692013-07-03 15:02:48 -07005683unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
Jiang Liu69afade2013-04-29 15:06:21 -07005684{
Jiang Liu11199692013-07-03 15:02:48 -07005685 void *pos;
5686 unsigned long pages = 0;
Jiang Liu69afade2013-04-29 15:06:21 -07005687
Jiang Liu11199692013-07-03 15:02:48 -07005688 start = (void *)PAGE_ALIGN((unsigned long)start);
5689 end = (void *)((unsigned long)end & PAGE_MASK);
5690 for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
Jiang Liudbe67df2013-07-03 15:02:51 -07005691 if ((unsigned int)poison <= 0xFF)
Jiang Liu11199692013-07-03 15:02:48 -07005692 memset(pos, poison, PAGE_SIZE);
5693 free_reserved_page(virt_to_page(pos));
Jiang Liu69afade2013-04-29 15:06:21 -07005694 }
5695
5696 if (pages && s)
Jiang Liu11199692013-07-03 15:02:48 -07005697 pr_info("Freeing %s memory: %ldK (%p - %p)\n",
Jiang Liu69afade2013-04-29 15:06:21 -07005698 s, pages << (PAGE_SHIFT - 10), start, end);
5699
5700 return pages;
5701}
Jiang Liu11199692013-07-03 15:02:48 -07005702EXPORT_SYMBOL(free_reserved_area);
Jiang Liu69afade2013-04-29 15:06:21 -07005703
Jiang Liucfa11e02013-04-29 15:07:00 -07005704#ifdef CONFIG_HIGHMEM
5705void free_highmem_page(struct page *page)
5706{
5707 __free_reserved_page(page);
5708 totalram_pages++;
Jiang Liu7b4b2a02013-07-03 15:03:11 -07005709 page_zone(page)->managed_pages++;
Jiang Liucfa11e02013-04-29 15:07:00 -07005710 totalhigh_pages++;
5711}
5712#endif
5713
Jiang Liu7ee3d4e2013-07-03 15:03:41 -07005714
5715void __init mem_init_print_info(const char *str)
5716{
5717 unsigned long physpages, codesize, datasize, rosize, bss_size;
5718 unsigned long init_code_size, init_data_size;
5719
5720 physpages = get_num_physpages();
5721 codesize = _etext - _stext;
5722 datasize = _edata - _sdata;
5723 rosize = __end_rodata - __start_rodata;
5724 bss_size = __bss_stop - __bss_start;
5725 init_data_size = __init_end - __init_begin;
5726 init_code_size = _einittext - _sinittext;
5727
5728 /*
5729 * Detect special cases and adjust section sizes accordingly:
5730 * 1) .init.* may be embedded into .data sections
5731 * 2) .init.text.* may be out of [__init_begin, __init_end],
5732 * please refer to arch/tile/kernel/vmlinux.lds.S.
5733 * 3) .rodata.* may be embedded into .text or .data sections.
5734 */
5735#define adj_init_size(start, end, size, pos, adj) \
Pintu Kumarb8af2942013-09-11 14:20:34 -07005736 do { \
5737 if (start <= pos && pos < end && size > adj) \
5738 size -= adj; \
5739 } while (0)
Jiang Liu7ee3d4e2013-07-03 15:03:41 -07005740
5741 adj_init_size(__init_begin, __init_end, init_data_size,
5742 _sinittext, init_code_size);
5743 adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
5744 adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
5745 adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
5746 adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
5747
5748#undef adj_init_size
5749
Anton Blanchardf88dfff2014-12-10 15:42:53 -08005750 pr_info("Memory: %luK/%luK available "
Jiang Liu7ee3d4e2013-07-03 15:03:41 -07005751 "(%luK kernel code, %luK rwdata, %luK rodata, "
Pintu Kumare48322a2014-12-18 16:17:15 -08005752 "%luK init, %luK bss, %luK reserved, %luK cma-reserved"
Jiang Liu7ee3d4e2013-07-03 15:03:41 -07005753#ifdef CONFIG_HIGHMEM
5754 ", %luK highmem"
5755#endif
5756 "%s%s)\n",
5757 nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10),
5758 codesize >> 10, datasize >> 10, rosize >> 10,
5759 (init_data_size + init_code_size) >> 10, bss_size >> 10,
Pintu Kumare48322a2014-12-18 16:17:15 -08005760 (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT-10),
5761 totalcma_pages << (PAGE_SHIFT-10),
Jiang Liu7ee3d4e2013-07-03 15:03:41 -07005762#ifdef CONFIG_HIGHMEM
5763 totalhigh_pages << (PAGE_SHIFT-10),
5764#endif
5765 str ? ", " : "", str ? str : "");
5766}
5767
Mel Gorman0e0b8642006-09-27 01:49:56 -07005768/**
Randy Dunlap88ca3b92006-10-04 02:15:25 -07005769 * set_dma_reserve - set the specified number of pages reserved in the first zone
5770 * @new_dma_reserve: The number of pages to mark reserved
Mel Gorman0e0b8642006-09-27 01:49:56 -07005771 *
5772 * The per-cpu batchsize and zone watermarks are determined by present_pages.
5773 * In the DMA zone, a significant percentage may be consumed by kernel image
5774 * and other unfreeable allocations which can skew the watermarks badly. This
Randy Dunlap88ca3b92006-10-04 02:15:25 -07005775 * function may optionally be used to account for unfreeable pages in the
5776 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
5777 * smaller per-cpu batchsize.
Mel Gorman0e0b8642006-09-27 01:49:56 -07005778 */
5779void __init set_dma_reserve(unsigned long new_dma_reserve)
5780{
5781 dma_reserve = new_dma_reserve;
5782}
5783
Linus Torvalds1da177e2005-04-16 15:20:36 -07005784void __init free_area_init(unsigned long *zones_size)
5785{
Johannes Weiner9109fb72008-07-23 21:27:20 -07005786 free_area_init_node(0, zones_size,
Linus Torvalds1da177e2005-04-16 15:20:36 -07005787 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
5788}
Linus Torvalds1da177e2005-04-16 15:20:36 -07005789
Linus Torvalds1da177e2005-04-16 15:20:36 -07005790static int page_alloc_cpu_notify(struct notifier_block *self,
5791 unsigned long action, void *hcpu)
5792{
5793 int cpu = (unsigned long)hcpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005794
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07005795 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
Konstantin Khlebnikovf0cb3c72012-03-21 16:34:06 -07005796 lru_add_drain_cpu(cpu);
Christoph Lameter9f8f2172008-02-04 22:29:11 -08005797 drain_pages(cpu);
5798
5799 /*
5800 * Spill the event counters of the dead processor
5801 * into the current processors event counters.
5802 * This artificially elevates the count of the current
5803 * processor.
5804 */
Christoph Lameterf8891e52006-06-30 01:55:45 -07005805 vm_events_fold_cpu(cpu);
Christoph Lameter9f8f2172008-02-04 22:29:11 -08005806
5807 /*
5808 * Zero the differential counters of the dead processor
5809 * so that the vm statistics are consistent.
5810 *
5811 * This is only okay since the processor is dead and cannot
5812 * race with what we are doing.
5813 */
Christoph Lameter2bb921e2013-09-11 14:21:30 -07005814 cpu_vm_stats_fold(cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005815 }
5816 return NOTIFY_OK;
5817}
Linus Torvalds1da177e2005-04-16 15:20:36 -07005818
5819void __init page_alloc_init(void)
5820{
5821 hotcpu_notifier(page_alloc_cpu_notify, 0);
5822}
5823
5824/*
Hideo AOKIcb45b0e2006-04-10 22:52:59 -07005825 * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
5826 * or min_free_kbytes changes.
5827 */
5828static void calculate_totalreserve_pages(void)
5829{
5830 struct pglist_data *pgdat;
5831 unsigned long reserve_pages = 0;
Christoph Lameter2f6726e2006-09-25 23:31:18 -07005832 enum zone_type i, j;
Hideo AOKIcb45b0e2006-04-10 22:52:59 -07005833
5834 for_each_online_pgdat(pgdat) {
5835 for (i = 0; i < MAX_NR_ZONES; i++) {
5836 struct zone *zone = pgdat->node_zones + i;
Mel Gorman3484b2d2014-08-06 16:07:14 -07005837 long max = 0;
Hideo AOKIcb45b0e2006-04-10 22:52:59 -07005838
5839 /* Find valid and maximum lowmem_reserve in the zone */
5840 for (j = i; j < MAX_NR_ZONES; j++) {
5841 if (zone->lowmem_reserve[j] > max)
5842 max = zone->lowmem_reserve[j];
5843 }
5844
Mel Gorman41858962009-06-16 15:32:12 -07005845 /* we treat the high watermark as reserved pages. */
5846 max += high_wmark_pages(zone);
Hideo AOKIcb45b0e2006-04-10 22:52:59 -07005847
Jiang Liub40da042013-02-22 16:33:52 -08005848 if (max > zone->managed_pages)
5849 max = zone->managed_pages;
Hideo AOKIcb45b0e2006-04-10 22:52:59 -07005850 reserve_pages += max;
Johannes Weinerab8fabd2012-01-10 15:07:42 -08005851 /*
5852 * Lowmem reserves are not available to
5853 * GFP_HIGHUSER page cache allocations and
5854 * kswapd tries to balance zones to their high
5855 * watermark. As a result, neither should be
5856 * regarded as dirtyable memory, to prevent a
5857 * situation where reclaim has to clean pages
5858 * in order to balance the zones.
5859 */
5860 zone->dirty_balance_reserve = max;
Hideo AOKIcb45b0e2006-04-10 22:52:59 -07005861 }
5862 }
Johannes Weinerab8fabd2012-01-10 15:07:42 -08005863 dirty_balance_reserve = reserve_pages;
Hideo AOKIcb45b0e2006-04-10 22:52:59 -07005864 totalreserve_pages = reserve_pages;
5865}
5866
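/*
 * Editor's illustrative sketch, not part of the original file: the per-zone
 * step of calculate_totalreserve_pages() above, with hypothetical numbers.
 * For a node with a DMA zone (4,000 managed pages, largest lowmem_reserve
 * entry 784, high watermark 90) and a Normal zone (200,000 managed pages,
 * no lowmem_reserve above it, high watermark 3,000), the loop adds
 * min(784 + 90, 4000) + min(0 + 3000, 200000) = 874 + 3000 = 3,874 pages
 * to totalreserve_pages.
 */
static inline unsigned long example_zone_reserve(unsigned long max_lowmem_reserve,
						 unsigned long high_wmark,
						 unsigned long managed_pages)
{
	unsigned long max = max_lowmem_reserve + high_wmark;

	/* A zone can never reserve more pages than it manages. */
	return max > managed_pages ? managed_pages : max;
}
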
5867/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07005868 * setup_per_zone_lowmem_reserve - called whenever
 5869 * sysctl_lowmem_reserve_ratio changes. Ensures that each zone
5870 * has a correct pages reserved value, so an adequate number of
5871 * pages are left in the zone after a successful __alloc_pages().
5872 */
5873static void setup_per_zone_lowmem_reserve(void)
5874{
5875 struct pglist_data *pgdat;
Christoph Lameter2f6726e2006-09-25 23:31:18 -07005876 enum zone_type j, idx;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005877
KAMEZAWA Hiroyukiec936fc2006-03-27 01:15:59 -08005878 for_each_online_pgdat(pgdat) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005879 for (j = 0; j < MAX_NR_ZONES; j++) {
5880 struct zone *zone = pgdat->node_zones + j;
Jiang Liub40da042013-02-22 16:33:52 -08005881 unsigned long managed_pages = zone->managed_pages;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005882
5883 zone->lowmem_reserve[j] = 0;
5884
Christoph Lameter2f6726e2006-09-25 23:31:18 -07005885 idx = j;
5886 while (idx) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005887 struct zone *lower_zone;
5888
Christoph Lameter2f6726e2006-09-25 23:31:18 -07005889 idx--;
5890
Linus Torvalds1da177e2005-04-16 15:20:36 -07005891 if (sysctl_lowmem_reserve_ratio[idx] < 1)
5892 sysctl_lowmem_reserve_ratio[idx] = 1;
5893
5894 lower_zone = pgdat->node_zones + idx;
Jiang Liub40da042013-02-22 16:33:52 -08005895 lower_zone->lowmem_reserve[j] = managed_pages /
Linus Torvalds1da177e2005-04-16 15:20:36 -07005896 sysctl_lowmem_reserve_ratio[idx];
Jiang Liub40da042013-02-22 16:33:52 -08005897 managed_pages += lower_zone->managed_pages;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005898 }
5899 }
5900 }
Hideo AOKIcb45b0e2006-04-10 22:52:59 -07005901
5902 /* update totalreserve_pages */
5903 calculate_totalreserve_pages();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005904}
5905
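/*
 * Editor's illustrative sketch, not part of the original file: the nested
 * loop of setup_per_zone_lowmem_reserve() above, written out for a fixed
 * three-zone (DMA/Normal/HighMem) layout.  Assuming the default
 * sysctl_lowmem_reserve_ratio values of 256 for DMA and 32 for Normal,
 * and managed pages of DMA = 4,000, Normal = 192,000, HighMem = 256,000:
 *
 *   DMA->lowmem_reserve[Normal]     = 192,000 / 256            = 750
 *   Normal->lowmem_reserve[HighMem] = 256,000 / 32              = 8,000
 *   DMA->lowmem_reserve[HighMem]    = (256,000 + 192,000) / 256 = 1,750
 */
static void example_lowmem_reserve(const unsigned long managed[3],
				   const unsigned long ratio[3],
				   unsigned long reserve[3][3])
{
	int j, idx;

	for (j = 0; j < 3; j++) {
		/* Pages "above" each lower zone, starting with zone j itself. */
		unsigned long pages = managed[j];

		reserve[j][j] = 0;	/* a zone does not reserve against itself */
		for (idx = j - 1; idx >= 0; idx--) {
			reserve[idx][j] = pages / ratio[idx];
			pages += managed[idx];
		}
	}
}
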
Mel Gormancfd3da12011-04-25 21:36:42 +00005906static void __setup_per_zone_wmarks(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005907{
5908 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
5909 unsigned long lowmem_pages = 0;
5910 struct zone *zone;
5911 unsigned long flags;
5912
5913 /* Calculate total number of !ZONE_HIGHMEM pages */
5914 for_each_zone(zone) {
5915 if (!is_highmem(zone))
Jiang Liub40da042013-02-22 16:33:52 -08005916 lowmem_pages += zone->managed_pages;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005917 }
5918
5919 for_each_zone(zone) {
Andrew Mortonac924c62006-05-15 09:43:59 -07005920 u64 tmp;
5921
Gerald Schaefer1125b4e2008-10-18 20:27:11 -07005922 spin_lock_irqsave(&zone->lock, flags);
Jiang Liub40da042013-02-22 16:33:52 -08005923 tmp = (u64)pages_min * zone->managed_pages;
Andrew Mortonac924c62006-05-15 09:43:59 -07005924 do_div(tmp, lowmem_pages);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005925 if (is_highmem(zone)) {
5926 /*
Nick Piggin669ed172005-11-13 16:06:45 -08005927 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
5928 * need highmem pages, so cap pages_min to a small
5929 * value here.
5930 *
Mel Gorman41858962009-06-16 15:32:12 -07005931 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
Yaowei Bai42ff2702015-04-14 15:47:14 -07005932 * deltas control async page reclaim, and so should
Nick Piggin669ed172005-11-13 16:06:45 -08005933 * not be capped for highmem.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005934 */
Andrew Morton90ae8d62013-02-22 16:32:22 -08005935 unsigned long min_pages;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005936
Jiang Liub40da042013-02-22 16:33:52 -08005937 min_pages = zone->managed_pages / 1024;
Andrew Morton90ae8d62013-02-22 16:32:22 -08005938 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
Mel Gorman41858962009-06-16 15:32:12 -07005939 zone->watermark[WMARK_MIN] = min_pages;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005940 } else {
Nick Piggin669ed172005-11-13 16:06:45 -08005941 /*
5942 * If it's a lowmem zone, reserve a number of pages
Linus Torvalds1da177e2005-04-16 15:20:36 -07005943 * proportionate to the zone's size.
5944 */
Mel Gorman41858962009-06-16 15:32:12 -07005945 zone->watermark[WMARK_MIN] = tmp;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005946 }
5947
Mel Gorman41858962009-06-16 15:32:12 -07005948 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
5949 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
Marek Szyprowski49f223a2012-01-25 12:49:24 +01005950
Johannes Weiner81c0a2b2013-09-11 14:20:47 -07005951 __mod_zone_page_state(zone, NR_ALLOC_BATCH,
Johannes Weinerabe5f972014-10-02 16:21:10 -07005952 high_wmark_pages(zone) - low_wmark_pages(zone) -
5953 atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
Johannes Weiner81c0a2b2013-09-11 14:20:47 -07005954
Mel Gorman56fd56b2007-10-16 01:25:58 -07005955 setup_zone_migrate_reserve(zone);
Gerald Schaefer1125b4e2008-10-18 20:27:11 -07005956 spin_unlock_irqrestore(&zone->lock, flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005957 }
Hideo AOKIcb45b0e2006-04-10 22:52:59 -07005958
5959 /* update totalreserve_pages */
5960 calculate_totalreserve_pages();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005961}
5962
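/*
 * Editor's illustrative sketch, not part of the original file: the lowmem
 * branch of __setup_per_zone_wmarks() above for a single zone.  With 4 KiB
 * pages, min_free_kbytes = 4096 gives pages_min = 1024; a zone holding half
 * of all lowmem pages then ends up with WMARK_MIN = 512, WMARK_LOW = 640
 * and WMARK_HIGH = 768.
 */
static void example_lowmem_watermarks(unsigned long pages_min,
				      unsigned long zone_managed,
				      unsigned long lowmem_pages,
				      unsigned long wmark[3])
{
	u64 tmp = (u64)pages_min * zone_managed;

	do_div(tmp, lowmem_pages);	/* this zone's share of pages_min */
	wmark[0] = tmp;			/* WMARK_MIN */
	wmark[1] = tmp + (tmp >> 2);	/* WMARK_LOW  = min + 25% of min */
	wmark[2] = tmp + (tmp >> 1);	/* WMARK_HIGH = min + 50% of min */
}
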
Mel Gormancfd3da12011-04-25 21:36:42 +00005963/**
5964 * setup_per_zone_wmarks - called when min_free_kbytes changes
5965 * or when memory is hot-{added|removed}
5966 *
5967 * Ensures that the watermark[min,low,high] values for each zone are set
5968 * correctly with respect to min_free_kbytes.
5969 */
5970void setup_per_zone_wmarks(void)
5971{
5972 mutex_lock(&zonelists_mutex);
5973 __setup_per_zone_wmarks();
5974 mutex_unlock(&zonelists_mutex);
5975}
5976
Randy Dunlap55a44622009-09-21 17:01:20 -07005977/*
Rik van Riel556adec2008-10-18 20:26:34 -07005978 * The inactive anon list should be small enough that the VM never has to
5979 * do too much work, but large enough that each inactive page has a chance
5980 * to be referenced again before it is swapped out.
5981 *
5982 * The inactive_anon ratio is the target ratio of ACTIVE_ANON to
5983 * INACTIVE_ANON pages on this zone's LRU, maintained by the
5984 * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of
5985 * the anonymous pages are kept on the inactive list.
5986 *
5987 * total target max
5988 * memory ratio inactive anon
5989 * -------------------------------------
5990 * 10MB 1 5MB
5991 * 100MB 1 50MB
5992 * 1GB 3 250MB
5993 * 10GB 10 0.9GB
5994 * 100GB 31 3GB
5995 * 1TB 101 10GB
5996 * 10TB 320 32GB
5997 */
KOSAKI Motohiro1b79acc2011-05-24 17:11:32 -07005998static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
Minchan Kim96cb4df2009-06-16 15:32:49 -07005999{
6000 unsigned int gb, ratio;
6001
6002 /* Zone size in gigabytes */
Jiang Liub40da042013-02-22 16:33:52 -08006003 gb = zone->managed_pages >> (30 - PAGE_SHIFT);
Minchan Kim96cb4df2009-06-16 15:32:49 -07006004 if (gb)
6005 ratio = int_sqrt(10 * gb);
6006 else
6007 ratio = 1;
6008
6009 zone->inactive_ratio = ratio;
6010}
6011
KOSAKI Motohiro839a4fc2011-05-24 17:11:31 -07006012static void __meminit setup_per_zone_inactive_ratio(void)
Rik van Riel556adec2008-10-18 20:26:34 -07006013{
6014 struct zone *zone;
6015
Minchan Kim96cb4df2009-06-16 15:32:49 -07006016 for_each_zone(zone)
6017 calculate_zone_inactive_ratio(zone);
Rik van Riel556adec2008-10-18 20:26:34 -07006018}
6019
Linus Torvalds1da177e2005-04-16 15:20:36 -07006020/*
6021 * Initialise min_free_kbytes.
6022 *
6023 * For small machines we want it small (128k min). For large machines
6024 * we want it large (64MB max). But it is not linear, because network
6025 * bandwidth does not increase linearly with machine size. We use
6026 *
Pintu Kumarb8af2942013-09-11 14:20:34 -07006027 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
Linus Torvalds1da177e2005-04-16 15:20:36 -07006028 * min_free_kbytes = sqrt(lowmem_kbytes * 16)
6029 *
6030 * which yields
6031 *
6032 * 16MB: 512k
6033 * 32MB: 724k
6034 * 64MB: 1024k
6035 * 128MB: 1448k
6036 * 256MB: 2048k
6037 * 512MB: 2896k
6038 * 1024MB: 4096k
6039 * 2048MB: 5792k
6040 * 4096MB: 8192k
6041 * 8192MB: 11584k
6042 * 16384MB: 16384k
6043 */
KOSAKI Motohiro1b79acc2011-05-24 17:11:32 -07006044int __meminit init_per_zone_wmark_min(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006045{
6046 unsigned long lowmem_kbytes;
Michal Hocko5f127332013-07-08 16:00:40 -07006047 int new_min_free_kbytes;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006048
6049 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
Michal Hocko5f127332013-07-08 16:00:40 -07006050 new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006051
Michal Hocko5f127332013-07-08 16:00:40 -07006052 if (new_min_free_kbytes > user_min_free_kbytes) {
6053 min_free_kbytes = new_min_free_kbytes;
6054 if (min_free_kbytes < 128)
6055 min_free_kbytes = 128;
6056 if (min_free_kbytes > 65536)
6057 min_free_kbytes = 65536;
6058 } else {
6059 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
6060 new_min_free_kbytes, user_min_free_kbytes);
6061 }
Minchan Kimbc75d332009-06-16 15:32:48 -07006062 setup_per_zone_wmarks();
KOSAKI Motohiroa6cccdc2011-05-24 17:11:33 -07006063 refresh_zone_stat_thresholds();
Linus Torvalds1da177e2005-04-16 15:20:36 -07006064 setup_per_zone_lowmem_reserve();
Rik van Riel556adec2008-10-18 20:26:34 -07006065 setup_per_zone_inactive_ratio();
Linus Torvalds1da177e2005-04-16 15:20:36 -07006066 return 0;
6067}
Minchan Kimbc75d332009-06-16 15:32:48 -07006068module_init(init_per_zone_wmark_min)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006069
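/*
 * Editor's illustrative sketch, not part of the original file: the sizing
 * rule above for a machine with 4 GiB of lowmem and 4 KiB pages.
 * lowmem_kbytes is roughly 4,194,304, so int_sqrt(4,194,304 * 16) =
 * int_sqrt(67,108,864) = 8192, i.e. min_free_kbytes = 8192k, matching the
 * "4096MB: 8192k" row of the table above.  The result is then clamped to
 * the [128, 65536] range exactly as in init_per_zone_wmark_min().
 */
static int example_min_free_kbytes(unsigned long lowmem_kbytes)
{
	int kbytes = int_sqrt(lowmem_kbytes * 16);

	return clamp(kbytes, 128, 65536);
}
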
6070/*
Pintu Kumarb8af2942013-09-11 14:20:34 -07006071 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
Linus Torvalds1da177e2005-04-16 15:20:36 -07006072 * that we can call two helper functions whenever min_free_kbytes
6073 * changes.
6074 */
Joe Perchescccad5b2014-06-06 14:38:09 -07006075int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07006076 void __user *buffer, size_t *length, loff_t *ppos)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006077{
Han Pingtianda8c7572014-01-23 15:53:17 -08006078 int rc;
6079
6080 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
6081 if (rc)
6082 return rc;
6083
Michal Hocko5f127332013-07-08 16:00:40 -07006084 if (write) {
6085 user_min_free_kbytes = min_free_kbytes;
Minchan Kimbc75d332009-06-16 15:32:48 -07006086 setup_per_zone_wmarks();
Michal Hocko5f127332013-07-08 16:00:40 -07006087 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07006088 return 0;
6089}
6090
Christoph Lameter96146342006-07-03 00:24:13 -07006091#ifdef CONFIG_NUMA
Joe Perchescccad5b2014-06-06 14:38:09 -07006092int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07006093 void __user *buffer, size_t *length, loff_t *ppos)
Christoph Lameter96146342006-07-03 00:24:13 -07006094{
6095 struct zone *zone;
6096 int rc;
6097
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07006098 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
Christoph Lameter96146342006-07-03 00:24:13 -07006099 if (rc)
6100 return rc;
6101
6102 for_each_zone(zone)
Jiang Liub40da042013-02-22 16:33:52 -08006103 zone->min_unmapped_pages = (zone->managed_pages *
Christoph Lameter96146342006-07-03 00:24:13 -07006104 sysctl_min_unmapped_ratio) / 100;
6105 return 0;
6106}
Christoph Lameter0ff38492006-09-25 23:31:52 -07006107
Joe Perchescccad5b2014-06-06 14:38:09 -07006108int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07006109 void __user *buffer, size_t *length, loff_t *ppos)
Christoph Lameter0ff38492006-09-25 23:31:52 -07006110{
6111 struct zone *zone;
6112 int rc;
6113
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07006114 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
Christoph Lameter0ff38492006-09-25 23:31:52 -07006115 if (rc)
6116 return rc;
6117
6118 for_each_zone(zone)
Jiang Liub40da042013-02-22 16:33:52 -08006119 zone->min_slab_pages = (zone->managed_pages *
Christoph Lameter0ff38492006-09-25 23:31:52 -07006120 sysctl_min_slab_ratio) / 100;
6121 return 0;
6122}
Christoph Lameter96146342006-07-03 00:24:13 -07006123#endif
6124
Linus Torvalds1da177e2005-04-16 15:20:36 -07006125/*
6126 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
6127 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
6128 * whenever sysctl_lowmem_reserve_ratio changes.
6129 *
 6130 * The reserve ratio obviously has absolutely no relation to the
Mel Gorman41858962009-06-16 15:32:12 -07006131 * minimum watermarks. The lowmem reserve ratio only makes sense as a
Linus Torvalds1da177e2005-04-16 15:20:36 -07006132 * function of the boot-time zone sizes.
6133 */
Joe Perchescccad5b2014-06-06 14:38:09 -07006134int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07006135 void __user *buffer, size_t *length, loff_t *ppos)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006136{
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07006137 proc_dointvec_minmax(table, write, buffer, length, ppos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006138 setup_per_zone_lowmem_reserve();
6139 return 0;
6140}
6141
Rohit Seth8ad4b1f2006-01-08 01:00:40 -08006142/*
6143 * percpu_pagelist_fraction - changes the pcp->high for each zone on each
Pintu Kumarb8af2942013-09-11 14:20:34 -07006144 * cpu. It is the fraction of total pages in each zone that a hot per-cpu
 6145 * pagelist can have before it gets flushed back to the buddy allocator.
Rohit Seth8ad4b1f2006-01-08 01:00:40 -08006146 */
Joe Perchescccad5b2014-06-06 14:38:09 -07006147int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07006148 void __user *buffer, size_t *length, loff_t *ppos)
Rohit Seth8ad4b1f2006-01-08 01:00:40 -08006149{
6150 struct zone *zone;
David Rientjes7cd2b0a2014-06-23 13:22:04 -07006151 int old_percpu_pagelist_fraction;
Rohit Seth8ad4b1f2006-01-08 01:00:40 -08006152 int ret;
6153
Cody P Schaferc8e251f2013-07-03 15:01:29 -07006154 mutex_lock(&pcp_batch_high_lock);
David Rientjes7cd2b0a2014-06-23 13:22:04 -07006155 old_percpu_pagelist_fraction = percpu_pagelist_fraction;
6156
6157 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
6158 if (!write || ret < 0)
6159 goto out;
6160
6161 /* Sanity checking to avoid pcp imbalance */
6162 if (percpu_pagelist_fraction &&
6163 percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) {
6164 percpu_pagelist_fraction = old_percpu_pagelist_fraction;
6165 ret = -EINVAL;
6166 goto out;
Rohit Seth8ad4b1f2006-01-08 01:00:40 -08006167 }
David Rientjes7cd2b0a2014-06-23 13:22:04 -07006168
6169 /* No change? */
6170 if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
6171 goto out;
6172
6173 for_each_populated_zone(zone) {
6174 unsigned int cpu;
6175
6176 for_each_possible_cpu(cpu)
6177 pageset_set_high_and_batch(zone,
6178 per_cpu_ptr(zone->pageset, cpu));
6179 }
6180out:
Cody P Schaferc8e251f2013-07-03 15:01:29 -07006181 mutex_unlock(&pcp_batch_high_lock);
David Rientjes7cd2b0a2014-06-23 13:22:04 -07006182 return ret;
Rohit Seth8ad4b1f2006-01-08 01:00:40 -08006183}
6184
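/*
 * Editor's illustrative sketch, not part of the original file, and based on
 * the assumption that pageset_set_high_and_batch() (not shown in this
 * section) divides the zone's managed pages by the fraction, as
 * contemporaneous kernels do: writing 8 to
 * /proc/sys/vm/percpu_pagelist_fraction on a zone with 1,000,000 managed
 * pages would let each CPU's per-cpu pagelist grow to roughly
 * 1,000,000 / 8 = 125,000 pages before being drained back to the buddy
 * allocator.
 */
static inline unsigned long example_pcp_high(unsigned long managed_pages,
					     int fraction)
{
	/* 0 means "fraction unset": fall back to the default batch sizing. */
	return fraction ? managed_pages / fraction : 0;
}
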
Rasmus Villemoesa9919c72015-06-24 16:56:28 -07006185#ifdef CONFIG_NUMA
David S. Millerf034b5d2006-08-24 03:08:07 -07006186int hashdist = HASHDIST_DEFAULT;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006187
Linus Torvalds1da177e2005-04-16 15:20:36 -07006188static int __init set_hashdist(char *str)
6189{
6190 if (!str)
6191 return 0;
6192 hashdist = simple_strtoul(str, &str, 0);
6193 return 1;
6194}
6195__setup("hashdist=", set_hashdist);
6196#endif
6197
6198/*
6199 * allocate a large system hash table from bootmem
6200 * - it is assumed that the hash table must contain an exact power-of-2
6201 * quantity of entries
6202 * - limit is the number of hash buckets, not the total allocation size
6203 */
6204void *__init alloc_large_system_hash(const char *tablename,
6205 unsigned long bucketsize,
6206 unsigned long numentries,
6207 int scale,
6208 int flags,
6209 unsigned int *_hash_shift,
6210 unsigned int *_hash_mask,
Tim Bird31fe62b2012-05-23 13:33:35 +00006211 unsigned long low_limit,
6212 unsigned long high_limit)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006213{
Tim Bird31fe62b2012-05-23 13:33:35 +00006214 unsigned long long max = high_limit;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006215 unsigned long log2qty, size;
6216 void *table = NULL;
6217
6218 /* allow the kernel cmdline to have a say */
6219 if (!numentries) {
6220 /* round applicable memory size up to nearest megabyte */
Andrew Morton04903662006-12-06 20:37:33 -08006221 numentries = nr_kernel_pages;
Jerry Zhoua7e83312013-09-11 14:20:26 -07006222
6223 /* It isn't necessary when PAGE_SIZE >= 1MB */
6224 if (PAGE_SHIFT < 20)
6225 numentries = round_up(numentries, (1<<20)/PAGE_SIZE);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006226
6227 /* limit to 1 bucket per 2^scale bytes of low memory */
6228 if (scale > PAGE_SHIFT)
6229 numentries >>= (scale - PAGE_SHIFT);
6230 else
6231 numentries <<= (PAGE_SHIFT - scale);
Paul Mundt9ab37b82007-01-05 16:36:30 -08006232
6233 /* Make sure we've got at least a 0-order allocation.. */
Jan Beulich2c85f512009-09-21 17:03:07 -07006234 if (unlikely(flags & HASH_SMALL)) {
6235 /* Makes no sense without HASH_EARLY */
6236 WARN_ON(!(flags & HASH_EARLY));
6237 if (!(numentries >> *_hash_shift)) {
6238 numentries = 1UL << *_hash_shift;
6239 BUG_ON(!numentries);
6240 }
6241 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
Paul Mundt9ab37b82007-01-05 16:36:30 -08006242 numentries = PAGE_SIZE / bucketsize;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006243 }
John Hawkes6e692ed2006-03-25 03:08:02 -08006244 numentries = roundup_pow_of_two(numentries);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006245
6246 /* limit allocation size to 1/16 total memory by default */
6247 if (max == 0) {
6248 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
6249 do_div(max, bucketsize);
6250 }
Dimitri Sivanich074b8512012-02-08 12:39:07 -08006251 max = min(max, 0x80000000ULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006252
Tim Bird31fe62b2012-05-23 13:33:35 +00006253 if (numentries < low_limit)
6254 numentries = low_limit;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006255 if (numentries > max)
6256 numentries = max;
6257
David Howellsf0d1b0b2006-12-08 02:37:49 -08006258 log2qty = ilog2(numentries);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006259
6260 do {
6261 size = bucketsize << log2qty;
6262 if (flags & HASH_EARLY)
Santosh Shilimkar67828322014-01-21 15:50:25 -08006263 table = memblock_virt_alloc_nopanic(size, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006264 else if (hashdist)
6265 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
6266 else {
Eric Dumazet1037b832007-07-15 23:38:05 -07006267 /*
 6268 * If bucketsize is not a power of two, we may free
Mel Gormana1dd2682009-06-16 15:32:19 -07006269 * some pages at the end of the hash table, which
 6270 * alloc_pages_exact() does automatically.
Eric Dumazet1037b832007-07-15 23:38:05 -07006271 */
Catalin Marinas264ef8a2009-07-07 10:33:01 +01006272 if (get_order(size) < MAX_ORDER) {
Mel Gormana1dd2682009-06-16 15:32:19 -07006273 table = alloc_pages_exact(size, GFP_ATOMIC);
Catalin Marinas264ef8a2009-07-07 10:33:01 +01006274 kmemleak_alloc(table, size, 1, GFP_ATOMIC);
6275 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07006276 }
6277 } while (!table && size > PAGE_SIZE && --log2qty);
6278
6279 if (!table)
6280 panic("Failed to allocate %s hash table\n", tablename);
6281
Robin Holtf241e6602010-10-07 12:59:26 -07006282 printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -07006283 tablename,
Robin Holtf241e6602010-10-07 12:59:26 -07006284 (1UL << log2qty),
David Howellsf0d1b0b2006-12-08 02:37:49 -08006285 ilog2(size) - PAGE_SHIFT,
Linus Torvalds1da177e2005-04-16 15:20:36 -07006286 size);
6287
6288 if (_hash_shift)
6289 *_hash_shift = log2qty;
6290 if (_hash_mask)
6291 *_hash_mask = (1 << log2qty) - 1;
6292
6293 return table;
6294}
KAMEZAWA Hiroyukia117e662006-03-27 01:15:25 -08006295
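/*
 * Editor's illustrative sketch, not part of the original file: how a
 * hypothetical early caller might size a hash table with
 * alloc_large_system_hash(), and the arithmetic that follows from it.
 * With 4 GiB of kernel pages (1,048,576 pages of 4 KiB), numentries starts
 * at 1,048,576; scale = 14 shifts it down by (14 - 12) = 2 to
 * 262,144 = 2^18 buckets; with 8-byte buckets that is a 2 MiB table,
 * *_hash_shift = 18 and *_hash_mask = 0x3ffff.  The "example" name and the
 * scale value are invented for illustration only.
 */
static void __init example_hash_init(void)
{
	unsigned int example_shift, example_mask;
	void *table;

	table = alloc_large_system_hash("example",
					sizeof(struct hlist_head),
					0,		/* size from nr_kernel_pages */
					14,		/* ~1 bucket per 16 KiB of lowmem */
					HASH_EARLY,	/* allocate from bootmem */
					&example_shift,
					&example_mask,
					0,		/* no lower limit */
					0);		/* default cap: 1/16 of memory */
	(void)table;	/* a real caller would store the table pointer */
}
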
Mel Gorman835c1342007-10-16 01:25:47 -07006296/* Return a pointer to the bitmap storing bits affecting a block of pages */
6297static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
6298 unsigned long pfn)
6299{
6300#ifdef CONFIG_SPARSEMEM
6301 return __pfn_to_section(pfn)->pageblock_flags;
6302#else
6303 return zone->pageblock_flags;
6304#endif /* CONFIG_SPARSEMEM */
6305}
Andrew Morton6220ec72006-10-19 23:29:05 -07006306
Mel Gorman835c1342007-10-16 01:25:47 -07006307static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
6308{
6309#ifdef CONFIG_SPARSEMEM
6310 pfn &= (PAGES_PER_SECTION-1);
Mel Gormand9c23402007-10-16 01:26:01 -07006311 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
Mel Gorman835c1342007-10-16 01:25:47 -07006312#else
Laura Abbottc060f942013-01-11 14:31:51 -08006313 pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages);
Mel Gormand9c23402007-10-16 01:26:01 -07006314 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
Mel Gorman835c1342007-10-16 01:25:47 -07006315#endif /* CONFIG_SPARSEMEM */
6316}
6317
6318/**
Randy Dunlap1aab4d72014-07-27 14:15:33 -07006319 * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
Mel Gorman835c1342007-10-16 01:25:47 -07006320 * @page: The page within the block of interest
Randy Dunlap1aab4d72014-07-27 14:15:33 -07006321 * @pfn: The target page frame number
6322 * @end_bitidx: The last bit of interest to retrieve
6323 * @mask: mask of bits that the caller is interested in
6324 *
6325 * Return: pageblock_bits flags
Mel Gorman835c1342007-10-16 01:25:47 -07006326 */
Mel Gormandc4b0ca2014-06-04 16:10:17 -07006327unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
Mel Gormane58469b2014-06-04 16:10:16 -07006328 unsigned long end_bitidx,
6329 unsigned long mask)
Mel Gorman835c1342007-10-16 01:25:47 -07006330{
6331 struct zone *zone;
6332 unsigned long *bitmap;
Mel Gormandc4b0ca2014-06-04 16:10:17 -07006333 unsigned long bitidx, word_bitidx;
Mel Gormane58469b2014-06-04 16:10:16 -07006334 unsigned long word;
Mel Gorman835c1342007-10-16 01:25:47 -07006335
6336 zone = page_zone(page);
Mel Gorman835c1342007-10-16 01:25:47 -07006337 bitmap = get_pageblock_bitmap(zone, pfn);
6338 bitidx = pfn_to_bitidx(zone, pfn);
Mel Gormane58469b2014-06-04 16:10:16 -07006339 word_bitidx = bitidx / BITS_PER_LONG;
6340 bitidx &= (BITS_PER_LONG-1);
Mel Gorman835c1342007-10-16 01:25:47 -07006341
Mel Gormane58469b2014-06-04 16:10:16 -07006342 word = bitmap[word_bitidx];
6343 bitidx += end_bitidx;
6344 return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
Mel Gorman835c1342007-10-16 01:25:47 -07006345}
6346
6347/**
Mel Gormandc4b0ca2014-06-04 16:10:17 -07006348 * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
Mel Gorman835c1342007-10-16 01:25:47 -07006349 * @page: The page within the block of interest
Mel Gorman835c1342007-10-16 01:25:47 -07006350 * @flags: The flags to set
Randy Dunlap1aab4d72014-07-27 14:15:33 -07006351 * @pfn: The target page frame number
6352 * @end_bitidx: The last bit of interest
6353 * @mask: mask of bits that the caller is interested in
Mel Gorman835c1342007-10-16 01:25:47 -07006354 */
Mel Gormandc4b0ca2014-06-04 16:10:17 -07006355void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
6356 unsigned long pfn,
Mel Gormane58469b2014-06-04 16:10:16 -07006357 unsigned long end_bitidx,
6358 unsigned long mask)
Mel Gorman835c1342007-10-16 01:25:47 -07006359{
6360 struct zone *zone;
6361 unsigned long *bitmap;
Mel Gormandc4b0ca2014-06-04 16:10:17 -07006362 unsigned long bitidx, word_bitidx;
Mel Gormane58469b2014-06-04 16:10:16 -07006363 unsigned long old_word, word;
6364
6365 BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
Mel Gorman835c1342007-10-16 01:25:47 -07006366
6367 zone = page_zone(page);
Mel Gorman835c1342007-10-16 01:25:47 -07006368 bitmap = get_pageblock_bitmap(zone, pfn);
6369 bitidx = pfn_to_bitidx(zone, pfn);
Mel Gormane58469b2014-06-04 16:10:16 -07006370 word_bitidx = bitidx / BITS_PER_LONG;
6371 bitidx &= (BITS_PER_LONG-1);
6372
Sasha Levin309381fea2014-01-23 15:52:54 -08006373 VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page);
Mel Gorman835c1342007-10-16 01:25:47 -07006374
Mel Gormane58469b2014-06-04 16:10:16 -07006375 bitidx += end_bitidx;
6376 mask <<= (BITS_PER_LONG - bitidx - 1);
6377 flags <<= (BITS_PER_LONG - bitidx - 1);
6378
Jason Low4db0c3c2015-04-15 16:14:08 -07006379 word = READ_ONCE(bitmap[word_bitidx]);
Mel Gormane58469b2014-06-04 16:10:16 -07006380 for (;;) {
6381 old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
6382 if (word == old_word)
6383 break;
6384 word = old_word;
6385 }
Mel Gorman835c1342007-10-16 01:25:47 -07006386}
KAMEZAWA Hiroyukia5d76b542007-10-16 01:26:11 -07006387
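/*
 * Editor's illustrative sketch, not part of the original file: the bit
 * arithmetic used by get_pfnblock_flags_mask() above.  Each pageblock owns
 * NR_PAGEBLOCK_BITS = 4 consecutive bits, so with BITS_PER_LONG = 64 one
 * bitmap word covers 16 pageblocks.  For the 6th pageblock in a word
 * (bitidx = 5 * 4 = 20), a caller reading a 3-bit field (end_bitidx = 2,
 * mask = 0x7, e.g. the migratetype in typical callers, which are not shown
 * here) ends up with
 *
 *   (word >> (64 - (20 + 2) - 1)) & 0x7  ==  (word >> 41) & 0x7
 *
 * i.e. bits 43..41 of the word, the upper three bits of that pageblock's
 * 4-bit group.
 */
static inline unsigned long example_extract_pageblock_field(unsigned long word,
							    unsigned long bitidx,
							    unsigned long end_bitidx,
							    unsigned long mask)
{
	/* Same expression as the tail of get_pfnblock_flags_mask(). */
	return (word >> (BITS_PER_LONG - (bitidx + end_bitidx) - 1)) & mask;
}
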
6388/*
Minchan Kim80934512012-07-31 16:43:01 -07006389 * This function checks whether the pageblock includes unmovable pages or not.
 6390 * If @count is not zero, it is okay to include fewer than @count unmovable pages.
 6391 *
Pintu Kumarb8af2942013-09-11 14:20:34 -07006392 * A PageLRU check without isolation or the lru_lock could race, so a
Minchan Kim80934512012-07-31 16:43:01 -07006393 * MIGRATE_MOVABLE block might include unmovable pages. This means the
 6394 * function cannot be expected to be exact.
KAMEZAWA Hiroyukia5d76b542007-10-16 01:26:11 -07006395 */
Wen Congyangb023f462012-12-11 16:00:45 -08006396bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
6397 bool skip_hwpoisoned_pages)
KAMEZAWA Hiroyuki49ac8252010-10-26 14:21:30 -07006398{
6399 unsigned long pfn, iter, found;
Michal Nazarewicz47118af2011-12-29 13:09:50 +01006400 int mt;
6401
KAMEZAWA Hiroyuki49ac8252010-10-26 14:21:30 -07006402 /*
 6403 * To avoid noisy data, lru_add_drain_all() should be called first.
Minchan Kim80934512012-07-31 16:43:01 -07006404 * If the zone is ZONE_MOVABLE, it never contains unmovable pages.
KAMEZAWA Hiroyuki49ac8252010-10-26 14:21:30 -07006405 */
6406 if (zone_idx(zone) == ZONE_MOVABLE)
Minchan Kim80934512012-07-31 16:43:01 -07006407 return false;
Michal Nazarewicz47118af2011-12-29 13:09:50 +01006408 mt = get_pageblock_migratetype(page);
6409 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
Minchan Kim80934512012-07-31 16:43:01 -07006410 return false;
KAMEZAWA Hiroyuki49ac8252010-10-26 14:21:30 -07006411
6412 pfn = page_to_pfn(page);
6413 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
6414 unsigned long check = pfn + iter;
6415
Namhyung Kim29723fc2011-02-25 14:44:25 -08006416 if (!pfn_valid_within(check))
KAMEZAWA Hiroyuki49ac8252010-10-26 14:21:30 -07006417 continue;
Namhyung Kim29723fc2011-02-25 14:44:25 -08006418
KAMEZAWA Hiroyuki49ac8252010-10-26 14:21:30 -07006419 page = pfn_to_page(check);
Naoya Horiguchic8721bb2013-09-11 14:22:09 -07006420
6421 /*
6422 * Hugepages are not in LRU lists, but they're movable.
 6423 * We need not scan over tail pages because we don't
6424 * handle each tail page individually in migration.
6425 */
6426 if (PageHuge(page)) {
6427 iter = round_up(iter + 1, 1<<compound_order(page)) - 1;
6428 continue;
6429 }
6430
Minchan Kim97d255c2012-07-31 16:42:59 -07006431 /*
 6432 * We can't use page_count without pinning the page,
 6433 * because another CPU can free the compound page.
 6434 * This check already skips compound tails of THP
 6435 * because their page->_count is zero at all times.
6436 */
6437 if (!atomic_read(&page->_count)) {
KAMEZAWA Hiroyuki49ac8252010-10-26 14:21:30 -07006438 if (PageBuddy(page))
6439 iter += (1 << page_order(page)) - 1;
6440 continue;
6441 }
Minchan Kim97d255c2012-07-31 16:42:59 -07006442
Wen Congyangb023f462012-12-11 16:00:45 -08006443 /*
6444 * The HWPoisoned page may be not in buddy system, and
6445 * page_count() is not 0.
6446 */
6447 if (skip_hwpoisoned_pages && PageHWPoison(page))
6448 continue;
6449
KAMEZAWA Hiroyuki49ac8252010-10-26 14:21:30 -07006450 if (!PageLRU(page))
6451 found++;
6452 /*
Johannes Weiner6b4f7792014-12-12 16:56:13 -08006453 * If there are RECLAIMABLE pages, we need to check
 6454 * them. But for now, memory offline itself doesn't call
 6455 * shrink_node_slabs(), and this still needs to be fixed.
KAMEZAWA Hiroyuki49ac8252010-10-26 14:21:30 -07006456 */
6457 /*
 6458 * If the page is not RAM, page_count() should be 0 and
 6459 * we don't need any further check. This is a _used_, non-movable page.
 6460 *
 6461 * The problematic thing here is PG_reserved pages. PG_reserved
 6462 * is set on both memory-hole pages and _used_ kernel
 6463 * pages at boot.
6464 */
6465 if (found > count)
Minchan Kim80934512012-07-31 16:43:01 -07006466 return true;
KAMEZAWA Hiroyuki49ac8252010-10-26 14:21:30 -07006467 }
Minchan Kim80934512012-07-31 16:43:01 -07006468 return false;
KAMEZAWA Hiroyuki49ac8252010-10-26 14:21:30 -07006469}
6470
6471bool is_pageblock_removable_nolock(struct page *page)
6472{
Michal Hocko656a0702012-01-20 14:33:58 -08006473 struct zone *zone;
6474 unsigned long pfn;
Michal Hocko687875fb2012-01-20 14:33:55 -08006475
6476 /*
6477 * We have to be careful here because we are iterating over memory
6478 * sections which are not zone aware so we might end up outside of
6479 * the zone but still within the section.
Michal Hocko656a0702012-01-20 14:33:58 -08006480 * We have to take care about the node as well. If the node is offline
6481 * its NODE_DATA will be NULL - see page_zone.
Michal Hocko687875fb2012-01-20 14:33:55 -08006482 */
Michal Hocko656a0702012-01-20 14:33:58 -08006483 if (!node_online(page_to_nid(page)))
6484 return false;
6485
6486 zone = page_zone(page);
6487 pfn = page_to_pfn(page);
Cody P Schafer108bcc92013-02-22 16:35:23 -08006488 if (!zone_spans_pfn(zone, pfn))
Michal Hocko687875fb2012-01-20 14:33:55 -08006489 return false;
6490
Wen Congyangb023f462012-12-11 16:00:45 -08006491 return !has_unmovable_pages(zone, page, 0, true);
KAMEZAWA Hiroyukia5d76b542007-10-16 01:26:11 -07006492}
KAMEZAWA Hiroyuki0c0e6192007-10-16 01:26:12 -07006493
Michal Nazarewicz041d3a82011-12-29 13:09:50 +01006494#ifdef CONFIG_CMA
6495
6496static unsigned long pfn_max_align_down(unsigned long pfn)
6497{
6498 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
6499 pageblock_nr_pages) - 1);
6500}
6501
6502static unsigned long pfn_max_align_up(unsigned long pfn)
6503{
6504 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
6505 pageblock_nr_pages));
6506}
6507
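/*
 * Editor's illustrative sketch, not part of the original file: the two
 * alignment helpers above with typical x86-64 constants (MAX_ORDER = 11,
 * so MAX_ORDER_NR_PAGES = 1024, and pageblock_nr_pages = 512), where the
 * alignment is therefore 1024 pages:
 *
 *   pfn_max_align_down(70000) == 70000 & ~1023 == 69632
 *   pfn_max_align_up(70000)   ==                  70656
 */
static inline bool example_range_is_max_aligned(unsigned long start,
						unsigned long end)
{
	return start == pfn_max_align_down(start) &&
	       end == pfn_max_align_up(end);
}
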
Michal Nazarewicz041d3a82011-12-29 13:09:50 +01006508/* [start, end) must belong to a single zone. */
Mel Gormanbb13ffe2012-10-08 16:32:41 -07006509static int __alloc_contig_migrate_range(struct compact_control *cc,
6510 unsigned long start, unsigned long end)
Michal Nazarewicz041d3a82011-12-29 13:09:50 +01006511{
6512 /* This function is based on compact_zone() from compaction.c. */
Minchan Kimbeb51ea2012-10-08 16:33:51 -07006513 unsigned long nr_reclaimed;
Michal Nazarewicz041d3a82011-12-29 13:09:50 +01006514 unsigned long pfn = start;
6515 unsigned int tries = 0;
6516 int ret = 0;
6517
Marek Szyprowskibe49a6e2012-12-12 13:51:19 -08006518 migrate_prep();
Michal Nazarewicz041d3a82011-12-29 13:09:50 +01006519
Mel Gormanbb13ffe2012-10-08 16:32:41 -07006520 while (pfn < end || !list_empty(&cc->migratepages)) {
Michal Nazarewicz041d3a82011-12-29 13:09:50 +01006521 if (fatal_signal_pending(current)) {
6522 ret = -EINTR;
6523 break;
6524 }
6525
Mel Gormanbb13ffe2012-10-08 16:32:41 -07006526 if (list_empty(&cc->migratepages)) {
6527 cc->nr_migratepages = 0;
Vlastimil Babkaedc2ca62014-10-09 15:27:09 -07006528 pfn = isolate_migratepages_range(cc, pfn, end);
Michal Nazarewicz041d3a82011-12-29 13:09:50 +01006529 if (!pfn) {
6530 ret = -EINTR;
6531 break;
6532 }
6533 tries = 0;
6534 } else if (++tries == 5) {
6535 ret = ret < 0 ? ret : -EBUSY;
6536 break;
6537 }
6538
Minchan Kimbeb51ea2012-10-08 16:33:51 -07006539 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
6540 &cc->migratepages);
6541 cc->nr_migratepages -= nr_reclaimed;
Minchan Kim02c6de82012-10-08 16:31:55 -07006542
Hugh Dickins9c620e22013-02-22 16:35:14 -08006543 ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
David Rientjese0b9dae2014-06-04 16:08:28 -07006544 NULL, 0, cc->mode, MR_CMA);
Michal Nazarewicz041d3a82011-12-29 13:09:50 +01006545 }
Srinivas Pandruvada2a6f5122013-02-22 16:32:09 -08006546 if (ret < 0) {
6547 putback_movable_pages(&cc->migratepages);
6548 return ret;
6549 }
6550 return 0;
Michal Nazarewicz041d3a82011-12-29 13:09:50 +01006551}
6552
6553/**
6554 * alloc_contig_range() -- tries to allocate given range of pages
6555 * @start: start PFN to allocate
6556 * @end: one-past-the-last PFN to allocate
Michal Nazarewicz0815f3d2012-04-03 15:06:15 +02006557 * @migratetype: migratetype of the underlying pageblocks (either
6558 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
6559 * in range must have the same migratetype and it must
6560 * be either of the two.
Michal Nazarewicz041d3a82011-12-29 13:09:50 +01006561 *
6562 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
6563 * aligned, however it's the caller's responsibility to guarantee that
6564 * we are the only thread that changes migrate type of pageblocks the
6565 * pages fall in.
6566 *
6567 * The PFN range must belong to a single zone.
6568 *
6569 * Returns zero on success or negative error code. On success all
 6570 * pages whose PFN is in [start, end) are allocated for the caller and
6571 * need to be freed with free_contig_range().
6572 */
Michal Nazarewicz0815f3d2012-04-03 15:06:15 +02006573int alloc_contig_range(unsigned long start, unsigned long end,
6574 unsigned migratetype)
Michal Nazarewicz041d3a82011-12-29 13:09:50 +01006575{
Michal Nazarewicz041d3a82011-12-29 13:09:50 +01006576 unsigned long outer_start, outer_end;
6577 int ret = 0, order;
6578
Mel Gormanbb13ffe2012-10-08 16:32:41 -07006579 struct compact_control cc = {
6580 .nr_migratepages = 0,
6581 .order = -1,
6582 .zone = page_zone(pfn_to_page(start)),
David Rientjese0b9dae2014-06-04 16:08:28 -07006583 .mode = MIGRATE_SYNC,
Mel Gormanbb13ffe2012-10-08 16:32:41 -07006584 .ignore_skip_hint = true,
6585 };
6586 INIT_LIST_HEAD(&cc.migratepages);
6587
Michal Nazarewicz041d3a82011-12-29 13:09:50 +01006588 /*
6589 * What we do here is we mark all pageblocks in range as
6590 * MIGRATE_ISOLATE. Because pageblock and max order pages may
 6591 * have different sizes, and due to the way the page allocator
 6592 * works, we align the range to the biggest of the two sizes so
 6593 * that the page allocator won't try to merge buddies from
6594 * different pageblocks and change MIGRATE_ISOLATE to some
6595 * other migration type.
6596 *
6597 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
6598 * migrate the pages from an unaligned range (ie. pages that
6599 * we are interested in). This will put all the pages in
6600 * range back to page allocator as MIGRATE_ISOLATE.
6601 *
6602 * When this is done, we take the pages in range from page
6603 * allocator removing them from the buddy system. This way
6604 * page allocator will never consider using them.
6605 *
6606 * This lets us mark the pageblocks back as
6607 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
6608 * aligned range but not in the unaligned, original range are
6609 * put back to page allocator so that buddy can use them.
6610 */
6611
6612 ret = start_isolate_page_range(pfn_max_align_down(start),
Wen Congyangb023f462012-12-11 16:00:45 -08006613 pfn_max_align_up(end), migratetype,
6614 false);
Michal Nazarewicz041d3a82011-12-29 13:09:50 +01006615 if (ret)
Bob Liu86a595f2012-10-25 13:37:56 -07006616 return ret;
Michal Nazarewicz041d3a82011-12-29 13:09:50 +01006617
Mel Gormanbb13ffe2012-10-08 16:32:41 -07006618 ret = __alloc_contig_migrate_range(&cc, start, end);
Michal Nazarewicz041d3a82011-12-29 13:09:50 +01006619 if (ret)
6620 goto done;
6621
6622 /*
6623 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
6624 * aligned blocks that are marked as MIGRATE_ISOLATE. What's
6625 * more, all pages in [start, end) are free in page allocator.
6626 * What we are going to do is to allocate all pages from
6627 * [start, end) (that is remove them from page allocator).
6628 *
6629 * The only problem is that pages at the beginning and at the
 6630 * end of the interesting range may not be aligned with pages that
 6631 * the page allocator holds, i.e. they can be part of higher-order
6632 * pages. Because of this, we reserve the bigger range and
6633 * once this is done free the pages we are not interested in.
6634 *
6635 * We don't have to hold zone->lock here because the pages are
6636 * isolated thus they won't get removed from buddy.
6637 */
6638
6639 lru_add_drain_all();
Vlastimil Babka510f5502014-12-10 15:43:07 -08006640 drain_all_pages(cc.zone);
Michal Nazarewicz041d3a82011-12-29 13:09:50 +01006641
6642 order = 0;
6643 outer_start = start;
6644 while (!PageBuddy(pfn_to_page(outer_start))) {
6645 if (++order >= MAX_ORDER) {
6646 ret = -EBUSY;
6647 goto done;
6648 }
6649 outer_start &= ~0UL << order;
6650 }
6651
6652 /* Make sure the range is really isolated. */
Wen Congyangb023f462012-12-11 16:00:45 -08006653 if (test_pages_isolated(outer_start, end, false)) {
Michal Nazarewiczdae803e2014-11-13 15:19:27 -08006654 pr_info("%s: [%lx, %lx) PFNs busy\n",
6655 __func__, outer_start, end);
Michal Nazarewicz041d3a82011-12-29 13:09:50 +01006656 ret = -EBUSY;
6657 goto done;
6658 }
6659
Marek Szyprowski49f223a2012-01-25 12:49:24 +01006660 /* Grab isolated pages from freelists. */
Mel Gormanbb13ffe2012-10-08 16:32:41 -07006661 outer_end = isolate_freepages_range(&cc, outer_start, end);
Michal Nazarewicz041d3a82011-12-29 13:09:50 +01006662 if (!outer_end) {
6663 ret = -EBUSY;
6664 goto done;
6665 }
6666
6667 /* Free head and tail (if any) */
6668 if (start != outer_start)
6669 free_contig_range(outer_start, start - outer_start);
6670 if (end != outer_end)
6671 free_contig_range(end, outer_end - end);
6672
6673done:
6674 undo_isolate_page_range(pfn_max_align_down(start),
Michal Nazarewicz0815f3d2012-04-03 15:06:15 +02006675 pfn_max_align_up(end), migratetype);
Michal Nazarewicz041d3a82011-12-29 13:09:50 +01006676 return ret;
6677}
6678
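/*
 * Editor's illustrative sketch, not part of the original file: how a
 * CMA-style caller might grab a physically contiguous run of pages from a
 * region whose pageblocks were set to MIGRATE_CMA at boot.  base_pfn and
 * count are hypothetical; the real CMA allocator adds locking, bitmap
 * bookkeeping and retry logic on top of this core call.
 */
static struct page *example_alloc_contig(unsigned long base_pfn,
					 unsigned long count)
{
	int ret;

	ret = alloc_contig_range(base_pfn, base_pfn + count, MIGRATE_CMA);
	if (ret)
		return NULL;	/* e.g. -EBUSY or -EINTR from the paths above */

	/* Release with free_contig_range(base_pfn, count) when done. */
	return pfn_to_page(base_pfn);
}
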
6679void free_contig_range(unsigned long pfn, unsigned nr_pages)
6680{
Marek Szyprowskibcc2b022012-12-20 15:05:18 -08006681 unsigned int count = 0;
6682
6683 for (; nr_pages--; pfn++) {
6684 struct page *page = pfn_to_page(pfn);
6685
6686 count += page_count(page) != 1;
6687 __free_page(page);
6688 }
6689 WARN(count != 0, "%d pages are still in use!\n", count);
Michal Nazarewicz041d3a82011-12-29 13:09:50 +01006690}
6691#endif
6692
Jiang Liu4ed7e022012-07-31 16:43:35 -07006693#ifdef CONFIG_MEMORY_HOTPLUG
Cody P Schafer0a647f32013-07-03 15:01:33 -07006694/*
6695 * The zone indicated has a new number of managed_pages; batch sizes and percpu
 6696 * page high values need to be recalculated.
6697 */
Jiang Liu4ed7e022012-07-31 16:43:35 -07006698void __meminit zone_pcp_update(struct zone *zone)
6699{
Cody P Schafer0a647f32013-07-03 15:01:33 -07006700 unsigned cpu;
Cody P Schaferc8e251f2013-07-03 15:01:29 -07006701 mutex_lock(&pcp_batch_high_lock);
Cody P Schafer0a647f32013-07-03 15:01:33 -07006702 for_each_possible_cpu(cpu)
Cody P Schafer169f6c12013-07-03 15:01:41 -07006703 pageset_set_high_and_batch(zone,
6704 per_cpu_ptr(zone->pageset, cpu));
Cody P Schaferc8e251f2013-07-03 15:01:29 -07006705 mutex_unlock(&pcp_batch_high_lock);
Jiang Liu4ed7e022012-07-31 16:43:35 -07006706}
6707#endif
6708
Jiang Liu340175b2012-07-31 16:43:32 -07006709void zone_pcp_reset(struct zone *zone)
6710{
6711 unsigned long flags;
Minchan Kim5a883812012-10-08 16:33:39 -07006712 int cpu;
6713 struct per_cpu_pageset *pset;
Jiang Liu340175b2012-07-31 16:43:32 -07006714
6715 /* avoid races with drain_pages() */
6716 local_irq_save(flags);
6717 if (zone->pageset != &boot_pageset) {
Minchan Kim5a883812012-10-08 16:33:39 -07006718 for_each_online_cpu(cpu) {
6719 pset = per_cpu_ptr(zone->pageset, cpu);
6720 drain_zonestat(zone, pset);
6721 }
Jiang Liu340175b2012-07-31 16:43:32 -07006722 free_percpu(zone->pageset);
6723 zone->pageset = &boot_pageset;
6724 }
6725 local_irq_restore(flags);
6726}
6727
Wen Congyang6dcd73d2012-12-11 16:01:01 -08006728#ifdef CONFIG_MEMORY_HOTREMOVE
KAMEZAWA Hiroyuki0c0e6192007-10-16 01:26:12 -07006729/*
6730 * All pages in the range must be isolated before calling this.
6731 */
6732void
6733__offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
6734{
6735 struct page *page;
6736 struct zone *zone;
Mel Gorman7aeb09f2014-06-04 16:10:21 -07006737 unsigned int order, i;
KAMEZAWA Hiroyuki0c0e6192007-10-16 01:26:12 -07006738 unsigned long pfn;
6739 unsigned long flags;
6740 /* find the first valid pfn */
6741 for (pfn = start_pfn; pfn < end_pfn; pfn++)
6742 if (pfn_valid(pfn))
6743 break;
6744 if (pfn == end_pfn)
6745 return;
6746 zone = page_zone(pfn_to_page(pfn));
6747 spin_lock_irqsave(&zone->lock, flags);
6748 pfn = start_pfn;
6749 while (pfn < end_pfn) {
6750 if (!pfn_valid(pfn)) {
6751 pfn++;
6752 continue;
6753 }
6754 page = pfn_to_page(pfn);
Wen Congyangb023f462012-12-11 16:00:45 -08006755 /*
6756 * The HWPoisoned page may be not in buddy system, and
6757 * page_count() is not 0.
6758 */
6759 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
6760 pfn++;
6761 SetPageReserved(page);
6762 continue;
6763 }
6764
KAMEZAWA Hiroyuki0c0e6192007-10-16 01:26:12 -07006765 BUG_ON(page_count(page));
6766 BUG_ON(!PageBuddy(page));
6767 order = page_order(page);
6768#ifdef CONFIG_DEBUG_VM
6769 printk(KERN_INFO "remove from free list %lx %d %lx\n",
6770 pfn, 1 << order, end_pfn);
6771#endif
6772 list_del(&page->lru);
6773 rmv_page_order(page);
6774 zone->free_area[order].nr_free--;
KAMEZAWA Hiroyuki0c0e6192007-10-16 01:26:12 -07006775 for (i = 0; i < (1 << order); i++)
6776 SetPageReserved((page+i));
6777 pfn += (1 << order);
6778 }
6779 spin_unlock_irqrestore(&zone->lock, flags);
6780}
6781#endif
Wu Fengguang8d22ba12009-12-16 12:19:58 +01006782
6783#ifdef CONFIG_MEMORY_FAILURE
6784bool is_free_buddy_page(struct page *page)
6785{
6786 struct zone *zone = page_zone(page);
6787 unsigned long pfn = page_to_pfn(page);
6788 unsigned long flags;
Mel Gorman7aeb09f2014-06-04 16:10:21 -07006789 unsigned int order;
Wu Fengguang8d22ba12009-12-16 12:19:58 +01006790
6791 spin_lock_irqsave(&zone->lock, flags);
6792 for (order = 0; order < MAX_ORDER; order++) {
6793 struct page *page_head = page - (pfn & ((1 << order) - 1));
6794
6795 if (PageBuddy(page_head) && page_order(page_head) >= order)
6796 break;
6797 }
6798 spin_unlock_irqrestore(&zone->lock, flags);
6799
6800 return order < MAX_ORDER;
6801}
6802#endif