// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/page_reporting.h>
#include <linux/gfp.h>
#include <linux/export.h>
#include <linux/module.h>
#include <linux/delay.h>
#include <linux/scatterlist.h>
#include <linux/mem_relinquish.h>

#include "page_reporting.h"
#include "internal.h"

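/*
 * Lowest page order that will be reported to the device. The MAX_ORDER
 * default is replaced at registration time with the driver's preferred
 * order, or with pageblock_order if the driver does not specify one.
 */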
unsigned int page_reporting_order = MAX_ORDER;
module_param(page_reporting_order, uint, 0644);
MODULE_PARM_DESC(page_reporting_order, "Set page reporting order");

#define PAGE_REPORTING_DELAY (2 * HZ)
static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly;

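/*
 * Reporting state machine (per device):
 *   PAGE_REPORTING_IDLE      - no reporting pass is scheduled or running.
 *   PAGE_REPORTING_REQUESTED - a pass has been scheduled, or more work was
 *                              requested while a pass was in flight.
 *   PAGE_REPORTING_ACTIVE    - the worker is currently walking the free lists.
 */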
enum {
        PAGE_REPORTING_IDLE = 0,
        PAGE_REPORTING_REQUESTED,
        PAGE_REPORTING_ACTIVE
};

/* request page reporting */
static void
__page_reporting_request(struct page_reporting_dev_info *prdev)
{
        unsigned int state;

        /* Check to see if we are in desired state */
        state = atomic_read(&prdev->state);
        if (state == PAGE_REPORTING_REQUESTED)
                return;

        /*
         * Mark reporting as requested. If the previous state was anything
         * other than PAGE_REPORTING_IDLE, a pass is already scheduled or
         * running and will pick up the request, so there is nothing more
         * for us to do here.
         */
        state = atomic_xchg(&prdev->state, PAGE_REPORTING_REQUESTED);
        if (state != PAGE_REPORTING_IDLE)
                return;

        /*
         * Delay the start of work to allow a sizable queue to build. For
         * now we are limiting this to running no more than once every
         * couple of seconds.
         */
        schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY);
}

/* notify prdev of free page reporting request */
void __page_reporting_notify(void)
{
        struct page_reporting_dev_info *prdev;

        /*
         * We use RCU to protect the pr_dev_info pointer. In almost all
         * cases this should be present; however, in the unlikely case of
         * a shutdown it will be NULL and we should exit.
         */
        rcu_read_lock();
        prdev = rcu_dereference(pr_dev_info);
        if (likely(prdev))
                __page_reporting_request(prdev);

        rcu_read_unlock();
}

static void
page_reporting_drain(struct page_reporting_dev_info *prdev,
                     struct scatterlist *sgl, unsigned int nents, bool reported)
{
        struct scatterlist *sg = sgl;

        /*
         * Drain the now reported pages back into their respective
         * free lists/areas. We assume at least one page is populated.
         */
        do {
                struct page *page = sg_page(sg);
                int mt = get_pageblock_migratetype(page);
                unsigned int order = get_order(sg->length);

                __putback_isolated_page(page, order, mt);

                /* If the pages were not reported due to an error, skip flagging */
                if (!reported)
                        continue;

                /*
                 * If the page was not commingled with another page we can
                 * consider the result to be "reported" since the page
                 * hasn't been modified; otherwise we will need to
                 * report on the new larger page when we make our way
                 * up to that higher order.
                 */
                if (PageBuddy(page) && buddy_order(page) == order)
                        __SetPageReported(page);
        } while ((sg = sg_next(sg)));

        /* reinitialize scatterlist now that it is empty */
        sg_init_table(sgl, nents);
}

/*
 * The page reporting cycle consists of 4 stages: fill, report, drain, and
 * idle. We will cycle through the first 3 stages until we cannot obtain a
 * full scatterlist of pages; in that case we will switch to idle.
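 *
 * The scatterlist is filled from the tail toward the head: *offset starts
 * at PAGE_REPORTING_CAPACITY and counts down to the next unused entry, so
 * an offset of zero means the list is full and ready to be reported.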
 */
static int
page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone,
                     unsigned int order, unsigned int mt,
                     struct scatterlist *sgl, unsigned int *offset)
{
        struct free_area *area = &zone->free_area[order];
        struct list_head *list = &area->free_list[mt];
        unsigned int page_len = PAGE_SIZE << order;
        struct page *page, *next;
        long budget;
        int i, err = 0;

        /*
         * Perform an early check: if the free area is empty there is
         * nothing to process, so we can skip this free_list.
         */
        if (list_empty(list))
                return err;

        spin_lock_irq(&zone->lock);

        /*
         * Limit how many calls we will be making to the page reporting
         * device for this list. By doing this we avoid processing any
         * given list for too long.
         *
         * The current value used allows us enough calls to process over a
         * sixteenth of the current list plus one additional call to handle
         * any pages that may have already been present from the previous
         * list processed. This should result in us reporting all pages on
         * an idle system in about 30 seconds.
         *
         * The division here should be cheap since PAGE_REPORTING_CAPACITY
         * should always be a power of 2.
         */
        budget = DIV_ROUND_UP(area->nr_free, PAGE_REPORTING_CAPACITY * 16);

        /* loop through free list adding unreported pages to sg list */
        list_for_each_entry_safe(page, next, list, lru) {
                /* We are going to skip over the reported pages. */
                if (PageReported(page))
                        continue;

                /*
                 * If we fully consumed our budget then update our
                 * state to indicate that we are requesting additional
                 * processing and exit this list.
                 */
                if (budget < 0) {
                        atomic_set(&prdev->state, PAGE_REPORTING_REQUESTED);
                        next = page;
                        break;
                }

                /* Attempt to pull page from list and place in scatterlist */
                if (*offset) {
                        if (!__isolate_free_page(page, order)) {
                                next = page;
                                break;
                        }

                        /* Add page to scatter list */
                        --(*offset);
                        sg_set_page(&sgl[*offset], page, page_len, 0);

                        /* Notify the hypervisor that these pages are reclaimable. */
                        for (i = 0; i < (1 << order); i++)
                                page_relinquish(page + i);

                        continue;
                }

                /*
                 * Make the first non-reported page in the free list
                 * the new head of the free list before we release the
                 * zone lock.
                 */
                if (!list_is_first(&page->lru, list))
                        list_rotate_to_front(&page->lru, list);

                /* release lock before waiting on report processing */
                spin_unlock_irq(&zone->lock);

                /* begin processing pages in local list */
                err = prdev->report(prdev, sgl, PAGE_REPORTING_CAPACITY);

                /* reset offset since the full list was reported */
                *offset = PAGE_REPORTING_CAPACITY;

                /* update budget to reflect call to report function */
                budget--;

                /* reacquire zone lock and resume processing */
                spin_lock_irq(&zone->lock);

                /* flush reported pages from the sg list */
                page_reporting_drain(prdev, sgl, PAGE_REPORTING_CAPACITY, !err);

                /*
                 * Reset next to the first entry; the old next isn't valid
                 * since we dropped the lock to report the pages.
                 */
                next = list_first_entry(list, struct page, lru);

                /* exit on error */
                if (err)
                        break;
        }

        /* Rotate any leftover pages to the head of the freelist */
        if (!list_entry_is_head(next, list, lru) && !list_is_first(&next->lru, list))
                list_rotate_to_front(&next->lru, list);

        spin_unlock_irq(&zone->lock);

        return err;
}

static int
page_reporting_process_zone(struct page_reporting_dev_info *prdev,
                            struct scatterlist *sgl, struct zone *zone)
{
        unsigned int order, mt, leftover, offset = PAGE_REPORTING_CAPACITY;
        unsigned long watermark;
        int err = 0;

        /* Generate minimum watermark to be able to guarantee progress */
        watermark = low_wmark_pages(zone) +
                    (PAGE_REPORTING_CAPACITY << page_reporting_order);

        /*
         * Cancel the request if the zone does not have enough free memory
         * above the low watermark to guarantee forward progress.
         */
        if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
                return err;

        /* Process each free list starting from lowest order/mt */
        for (order = page_reporting_order; order < MAX_ORDER; order++) {
                for (mt = 0; mt < MIGRATE_TYPES; mt++) {
                        /* We do not pull pages from the isolate free list */
                        if (is_migrate_isolate(mt))
                                continue;

                        err = page_reporting_cycle(prdev, zone, order, mt,
                                                   sgl, &offset);
                        if (err)
                                return err;
                }
        }

        /* report the leftover pages before going idle */
        leftover = PAGE_REPORTING_CAPACITY - offset;
        if (leftover) {
                sgl = &sgl[offset];
                err = prdev->report(prdev, sgl, leftover);

                /* flush any remaining pages out from the last report */
                spin_lock_irq(&zone->lock);
                page_reporting_drain(prdev, sgl, leftover, !err);
                spin_unlock_irq(&zone->lock);
        }

        return err;
}

static void page_reporting_process(struct work_struct *work)
{
        struct delayed_work *d_work = to_delayed_work(work);
        struct page_reporting_dev_info *prdev =
                container_of(d_work, struct page_reporting_dev_info, work);
        int err = 0, state = PAGE_REPORTING_ACTIVE;
        struct scatterlist *sgl;
        struct zone *zone;

        /*
         * Change the state to "Active" so that we can track whether anyone
         * requests page reporting after we complete our pass. If the state
         * is not altered by the end of the pass we will switch to idle and
         * quit scheduling reporting runs.
         */
        atomic_set(&prdev->state, state);

        /* allocate scatterlist to store pages being reported on */
        sgl = kmalloc_array(PAGE_REPORTING_CAPACITY, sizeof(*sgl), GFP_KERNEL);
        if (!sgl)
                goto err_out;

        sg_init_table(sgl, PAGE_REPORTING_CAPACITY);

        for_each_zone(zone) {
                err = page_reporting_process_zone(prdev, sgl, zone);
                if (err)
                        break;
        }

        kfree(sgl);
err_out:
        /*
         * If the state has reverted back to requested then there may be
         * additional pages to be processed. We will defer for 2s to allow
         * more pages to accumulate.
         */
        state = atomic_cmpxchg(&prdev->state, state, PAGE_REPORTING_IDLE);
        if (state == PAGE_REPORTING_REQUESTED)
                schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY);
}

static DEFINE_MUTEX(page_reporting_mutex);
DEFINE_STATIC_KEY_FALSE(page_reporting_enabled);

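/**
 * page_reporting_register - register a device to receive free page reports
 * @prdev: device info providing the report() callback and, optionally, a
 *         preferred reporting order
 *
 * Only one reporting device can be registered at a time; a second caller
 * gets -EBUSY. Registration picks the page reporting order, schedules an
 * initial reporting pass and enables the page_reporting_enabled static key.
 *
 * Return: 0 on success, -EBUSY if another device is already registered.
 */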
int page_reporting_register(struct page_reporting_dev_info *prdev)
{
        int err = 0;

        mutex_lock(&page_reporting_mutex);

        /* nothing to do if already in use */
        if (rcu_access_pointer(pr_dev_info)) {
                err = -EBUSY;
                goto err_out;
        }

        /*
         * Update the page reporting order if it's specified by the driver.
         * Otherwise, it falls back to @pageblock_order.
         */
        page_reporting_order = prdev->order ? : pageblock_order;

        /* initialize state and work structures */
        atomic_set(&prdev->state, PAGE_REPORTING_IDLE);
        INIT_DELAYED_WORK(&prdev->work, &page_reporting_process);

        /* Begin initial flush of zones */
        __page_reporting_request(prdev);

        /* Assign device to allow notifications */
        rcu_assign_pointer(pr_dev_info, prdev);

        /* enable page reporting notification */
        if (!static_key_enabled(&page_reporting_enabled)) {
                static_branch_enable(&page_reporting_enabled);
                pr_info("Free page reporting enabled\n");
        }
err_out:
        mutex_unlock(&page_reporting_mutex);

        return err;
}
EXPORT_SYMBOL_GPL(page_reporting_register);

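/**
 * page_reporting_unregister - tear down free page reporting for a device
 * @prdev: device info previously passed to page_reporting_register()
 *
 * If @prdev is the currently registered device, clear the device pointer,
 * wait for RCU readers to finish, and cancel any pending or running
 * reporting work. The page_reporting_enabled static key is left enabled.
 */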
void page_reporting_unregister(struct page_reporting_dev_info *prdev)
{
        mutex_lock(&page_reporting_mutex);

        if (rcu_access_pointer(pr_dev_info) == prdev) {
                /* Disable page reporting notification */
                RCU_INIT_POINTER(pr_dev_info, NULL);
                synchronize_rcu();

                /* Flush any existing work, and lock it out */
                cancel_delayed_work_sync(&prdev->work);
        }

        mutex_unlock(&page_reporting_mutex);
}
EXPORT_SYMBOL_GPL(page_reporting_unregister);