mm: meminit: finish initialisation of struct pages before basic setup
Waiman Long reported that 24TB machines hit OOM during basic setup when
struct page initialisation was deferred. One approach is to initialise
memory on demand but it interferes with page allocator paths. This patch
creates dedicated threads to initialise memory before basic setup. It
then blocks on a rw_semaphore until completion as a wait_queue and counter
is overkill. This may be slower to boot but it's simplier overall and
also gets rid of a section mangling which existed so kswapd could do the
initialisation.
[akpm@linux-foundation.org: include rwsem.h, use DECLARE_RWSEM, fix comment, remove unneeded cast]
Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Waiman Long <waiman.long@hp.com
Cc: Nathan Zimmer <nzimmer@sgi.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Scott Norton <scott.norton@hp.com>
Tested-by: Daniel J Blueman <daniel@numascale.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5a38e39..506eac8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -18,6 +18,7 @@
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
+#include <linux/rwsem.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/bootmem.h>
@@ -61,6 +62,7 @@
#include <linux/hugetlb.h>
#include <linux/sched/rt.h>
#include <linux/page_owner.h>
+#include <linux/kthread.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
@@ -242,7 +244,7 @@
}
/* Returns true if the struct page for the pfn is uninitialised */
-static inline bool __defermem_init early_page_uninitialised(unsigned long pfn)
+static inline bool __meminit early_page_uninitialised(unsigned long pfn)
{
int nid = early_pfn_to_nid(pfn);
@@ -958,7 +960,7 @@
local_irq_restore(flags);
}
-static void __defer_init __free_pages_boot_core(struct page *page,
+static void __init __free_pages_boot_core(struct page *page,
unsigned long pfn, unsigned int order)
{
unsigned int nr_pages = 1 << order;
@@ -1031,7 +1033,7 @@
#endif
-void __defer_init __free_pages_bootmem(struct page *page, unsigned long pfn,
+void __init __free_pages_bootmem(struct page *page, unsigned long pfn,
unsigned int order)
{
if (early_page_uninitialised(pfn))
@@ -1040,7 +1042,7 @@
}
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-static void __defermem_init deferred_free_range(struct page *page,
+static void __init deferred_free_range(struct page *page,
unsigned long pfn, int nr_pages)
{
int i;
@@ -1060,20 +1062,30 @@
__free_pages_boot_core(page, pfn, 0);
}
+static __initdata DECLARE_RWSEM(pgdat_init_rwsem);
+
/* Initialise remaining memory on a node */
-void __defermem_init deferred_init_memmap(int nid)
+static int __init deferred_init_memmap(void *data)
{
+ pg_data_t *pgdat = data;
+ int nid = pgdat->node_id;
struct mminit_pfnnid_cache nid_init_state = { };
unsigned long start = jiffies;
unsigned long nr_pages = 0;
unsigned long walk_start, walk_end;
int i, zid;
struct zone *zone;
- pg_data_t *pgdat = NODE_DATA(nid);
unsigned long first_init_pfn = pgdat->first_deferred_pfn;
+ const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
- if (first_init_pfn == ULONG_MAX)
- return;
+ if (first_init_pfn == ULONG_MAX) {
+ up_read(&pgdat_init_rwsem);
+ return 0;
+ }
+
+ /* Bind memory initialisation thread to a local node if possible */
+ if (!cpumask_empty(cpumask))
+ set_cpus_allowed_ptr(current, cpumask);
/* Sanity check boundaries */
BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
@@ -1165,8 +1177,24 @@
/* Sanity check that the next zone really is unpopulated */
WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
- pr_info("kswapd %d initialised %lu pages in %ums\n", nid, nr_pages,
+ pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages,
jiffies_to_msecs(jiffies - start));
+ up_read(&pgdat_init_rwsem);
+ return 0;
+}
+
+void __init page_alloc_init_late(void)
+{
+ int nid;
+
+ for_each_node_state(nid, N_MEMORY) {
+ down_read(&pgdat_init_rwsem);
+ kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid);
+ }
+
+ /* Block until all are initialised */
+ down_write(&pgdat_init_rwsem);
+ up_write(&pgdat_init_rwsem);
}
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */