/*
 * Copyright 2013 Red Hat Inc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * Authors: Jérôme Glisse <jglisse@redhat.com>
 */
/*
 * Refer to include/linux/hmm.h for information about heterogeneous memory
 * management or HMM for short.
 */
#include <linux/mm.h>
#include <linux/hmm.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/mmu_notifier.h>


#ifdef CONFIG_HMM
static const struct mmu_notifier_ops hmm_mmu_notifier_ops;

/*
 * struct hmm - HMM per mm struct
 *
 * @mm: mm struct this HMM struct is bound to
 * @lock: lock protecting the ranges list
 * @sequence: we track updates to the CPU page table with a sequence number
 * @ranges: list of ranges being snapshotted
 * @mirrors: list of mirrors for this mm
 * @mmu_notifier: mmu notifier to track updates to the CPU page table
 * @mirrors_sem: read/write semaphore protecting the mirrors list
 */
struct hmm {
	struct mm_struct	*mm;
	spinlock_t		lock;
	atomic_t		sequence;
	struct list_head	ranges;
	struct list_head	mirrors;
	struct mmu_notifier	mmu_notifier;
	struct rw_semaphore	mirrors_sem;
};

/*
 * hmm_register - register HMM against an mm (HMM internal)
 *
 * @mm: mm struct to attach to
 *
 * This is not intended to be used directly by device drivers. It allocates an
 * HMM struct if mm does not have one, and initializes it.
 */
static struct hmm *hmm_register(struct mm_struct *mm)
{
	struct hmm *hmm = READ_ONCE(mm->hmm);
	bool cleanup = false;

	/*
	 * The hmm struct can only be freed once the mm_struct goes away,
	 * hence it is safe to simply return the existing hmm struct if
	 * there is one.
	 */
	if (hmm)
		return hmm;

	hmm = kmalloc(sizeof(*hmm), GFP_KERNEL);
	if (!hmm)
		return NULL;
	INIT_LIST_HEAD(&hmm->mirrors);
	init_rwsem(&hmm->mirrors_sem);
	atomic_set(&hmm->sequence, 0);
	hmm->mmu_notifier.ops = NULL;
	INIT_LIST_HEAD(&hmm->ranges);
	spin_lock_init(&hmm->lock);
	hmm->mm = mm;

	/*
	 * We should only get here if we hold the mmap_sem in write mode,
	 * i.e. on registration of the first mirror through
	 * hmm_mirror_register().
	 */
	hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops;
	if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) {
		kfree(hmm);
		return NULL;
	}

	spin_lock(&mm->page_table_lock);
	if (!mm->hmm)
		mm->hmm = hmm;
	else
		cleanup = true;
	spin_unlock(&mm->page_table_lock);

	if (cleanup) {
		mmu_notifier_unregister(&hmm->mmu_notifier, mm);
		kfree(hmm);
	}

	return mm->hmm;
}

void hmm_mm_destroy(struct mm_struct *mm)
{
	kfree(mm->hmm);
}
#endif /* CONFIG_HMM */

#if IS_ENABLED(CONFIG_HMM_MIRROR)
static void hmm_invalidate_range(struct hmm *hmm,
				 enum hmm_update_type action,
				 unsigned long start,
				 unsigned long end)
{
	struct hmm_mirror *mirror;
	struct hmm_range *range;

	spin_lock(&hmm->lock);
	list_for_each_entry(range, &hmm->ranges, list) {
		unsigned long addr, idx, npages;

		if (end < range->start || start >= range->end)
			continue;

		range->valid = false;
		addr = max(start, range->start);
		idx = (addr - range->start) >> PAGE_SHIFT;
		npages = (min(range->end, end) - addr) >> PAGE_SHIFT;
		memset(&range->pfns[idx], 0, sizeof(*range->pfns) * npages);
	}
	spin_unlock(&hmm->lock);

	down_read(&hmm->mirrors_sem);
	list_for_each_entry(mirror, &hmm->mirrors, list)
		mirror->ops->sync_cpu_device_pagetables(mirror, action,
							start, end);
	up_read(&hmm->mirrors_sem);
}

static void hmm_invalidate_range_start(struct mmu_notifier *mn,
				       struct mm_struct *mm,
				       unsigned long start,
				       unsigned long end)
{
	struct hmm *hmm = mm->hmm;

	VM_BUG_ON(!hmm);

	atomic_inc(&hmm->sequence);
}

static void hmm_invalidate_range_end(struct mmu_notifier *mn,
				     struct mm_struct *mm,
				     unsigned long start,
				     unsigned long end)
{
	struct hmm *hmm = mm->hmm;

	VM_BUG_ON(!hmm);

	hmm_invalidate_range(mm->hmm, HMM_UPDATE_INVALIDATE, start, end);
}

static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
	.invalidate_range_start	= hmm_invalidate_range_start,
	.invalidate_range_end	= hmm_invalidate_range_end,
};
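
/*
 * Illustrative only: a minimal sketch of the driver side of the
 * sync_cpu_device_pagetables() callback invoked from hmm_invalidate_range()
 * above. The callback signature follows that call site; struct
 * hmm_mirror_ops is declared in include/linux/hmm.h. The struct example_dev
 * and example_dev_flush_range() names are hypothetical and stand in for
 * whatever mechanism a real driver uses to invalidate its own page tables
 * for the [start, end) range.
 *
 *	static void example_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
 *						       enum hmm_update_type update,
 *						       unsigned long start,
 *						       unsigned long end)
 *	{
 *		struct example_dev *dev = container_of(mirror,
 *						       struct example_dev,
 *						       mirror);
 *
 *		if (update == HMM_UPDATE_INVALIDATE)
 *			example_dev_flush_range(dev, start, end);
 *	}
 *
 *	static const struct hmm_mirror_ops example_mirror_ops = {
 *		.sync_cpu_device_pagetables = example_sync_cpu_device_pagetables,
 *	};
 */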

/*
 * hmm_mirror_register() - register a mirror against an mm
 *
 * @mirror: new mirror struct to register
 * @mm: mm to register against
 *
 * To start mirroring a process address space, the device driver must register
 * an HMM mirror struct.
 *
 * THE mm->mmap_sem MUST BE HELD IN WRITE MODE!
 */
int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm)
{
	/* Sanity check */
	if (!mm || !mirror || !mirror->ops)
		return -EINVAL;

	mirror->hmm = hmm_register(mm);
	if (!mirror->hmm)
		return -ENOMEM;

	down_write(&mirror->hmm->mirrors_sem);
	list_add(&mirror->list, &mirror->hmm->mirrors);
	up_write(&mirror->hmm->mirrors_sem);

	return 0;
}
EXPORT_SYMBOL(hmm_mirror_register);
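
/*
 * Illustrative only: a minimal sketch of how a driver might call
 * hmm_mirror_register(), assuming the hypothetical example_mirror_ops
 * sketched above and a hypothetical driver-private struct example_dev that
 * embeds a struct hmm_mirror. As required by the comment above, mm->mmap_sem
 * is held in write mode across the call.
 *
 *	static int example_dev_mirror(struct example_dev *dev,
 *				      struct mm_struct *mm)
 *	{
 *		int ret;
 *
 *		dev->mirror.ops = &example_mirror_ops;
 *
 *		down_write(&mm->mmap_sem);
 *		ret = hmm_mirror_register(&dev->mirror, mm);
 *		up_write(&mm->mmap_sem);
 *
 *		return ret;
 *	}
 */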

/*
 * hmm_mirror_unregister() - unregister a mirror
 *
 * @mirror: mirror struct to unregister
 *
 * Stop mirroring a process address space, and clean up.
 */
void hmm_mirror_unregister(struct hmm_mirror *mirror)
{
	struct hmm *hmm = mirror->hmm;

	down_write(&hmm->mirrors_sem);
	list_del(&mirror->list);
	up_write(&hmm->mirrors_sem);
}
EXPORT_SYMBOL(hmm_mirror_unregister);

static void hmm_pfns_special(hmm_pfn_t *pfns,
			     unsigned long addr,
			     unsigned long end)
{
	for (; addr < end; addr += PAGE_SIZE, pfns++)
		*pfns = HMM_PFN_SPECIAL;
}

static int hmm_pfns_bad(unsigned long addr,
			unsigned long end,
			struct mm_walk *walk)
{
	struct hmm_range *range = walk->private;
	hmm_pfn_t *pfns = range->pfns;
	unsigned long i;

	i = (addr - range->start) >> PAGE_SHIFT;
	for (; addr < end; addr += PAGE_SIZE, i++)
		pfns[i] = HMM_PFN_ERROR;

	return 0;
}

static int hmm_vma_walk_hole(unsigned long addr,
			     unsigned long end,
			     struct mm_walk *walk)
{
	struct hmm_range *range = walk->private;
	hmm_pfn_t *pfns = range->pfns;
	unsigned long i;

	i = (addr - range->start) >> PAGE_SHIFT;
	for (; addr < end; addr += PAGE_SIZE, i++)
		pfns[i] = HMM_PFN_EMPTY;

	return 0;
}

static int hmm_vma_walk_clear(unsigned long addr,
			      unsigned long end,
			      struct mm_walk *walk)
{
	struct hmm_range *range = walk->private;
	hmm_pfn_t *pfns = range->pfns;
	unsigned long i;

	i = (addr - range->start) >> PAGE_SHIFT;
	for (; addr < end; addr += PAGE_SIZE, i++)
		pfns[i] = 0;

	return 0;
}

static int hmm_vma_walk_pmd(pmd_t *pmdp,
			    unsigned long start,
			    unsigned long end,
			    struct mm_walk *walk)
{
	struct hmm_range *range = walk->private;
	struct vm_area_struct *vma = walk->vma;
	hmm_pfn_t *pfns = range->pfns;
	unsigned long addr = start, i;
	hmm_pfn_t flag;
	pte_t *ptep;

	i = (addr - range->start) >> PAGE_SHIFT;
	flag = vma->vm_flags & VM_READ ? HMM_PFN_READ : 0;

again:
	if (pmd_none(*pmdp))
		return hmm_vma_walk_hole(start, end, walk);

	if (pmd_huge(*pmdp) && vma->vm_flags & VM_HUGETLB)
		return hmm_pfns_bad(start, end, walk);

	if (pmd_devmap(*pmdp) || pmd_trans_huge(*pmdp)) {
		unsigned long pfn;
		pmd_t pmd;

		/*
		 * No need to take the pmd lock here: even if some other
		 * thread is splitting the huge pmd, we will get that event
		 * through the mmu_notifier callback.
		 *
		 * So just read the pmd value, check again that it is a
		 * transparent huge or device mapping, and compute the
		 * corresponding pfn values.
		 */
		pmd = pmd_read_atomic(pmdp);
		barrier();
		if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
			goto again;
		if (pmd_protnone(pmd))
			return hmm_vma_walk_clear(start, end, walk);

		pfn = pmd_pfn(pmd) + pte_index(addr);
		flag |= pmd_write(pmd) ? HMM_PFN_WRITE : 0;
		for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
			pfns[i] = hmm_pfn_t_from_pfn(pfn) | flag;
		return 0;
	}

	if (pmd_bad(*pmdp))
		return hmm_pfns_bad(start, end, walk);

	ptep = pte_offset_map(pmdp, addr);
	for (; addr < end; addr += PAGE_SIZE, ptep++, i++) {
		pte_t pte = *ptep;

		pfns[i] = 0;

		if (pte_none(pte) || !pte_present(pte)) {
			pfns[i] = HMM_PFN_EMPTY;
			continue;
		}

		pfns[i] = hmm_pfn_t_from_pfn(pte_pfn(pte)) | flag;
		pfns[i] |= pte_write(pte) ? HMM_PFN_WRITE : 0;
	}
	pte_unmap(ptep - 1);

	return 0;
}

/*
 * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual addresses
 * @vma: virtual memory area containing the virtual address range
 * @range: used to track snapshot validity
 * @start: range virtual start address (inclusive)
 * @end: range virtual end address (exclusive)
 * @pfns: array of hmm_pfn_t, provided by the caller and filled in by this
 *        function
 * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, 0 success
 *
 * This snapshots the CPU page table for a range of virtual addresses. Snapshot
 * validity is tracked by the range struct. See hmm_vma_range_done() for
 * further information.
 *
 * The range struct is initialized here. It tracks the CPU page table, but only
 * if the function returns success (0), in which case the caller must then call
 * hmm_vma_range_done() to stop CPU page table update tracking on this range.
 *
 * NOT CALLING hmm_vma_range_done() IF FUNCTION RETURNS 0 WILL LEAD TO SERIOUS
 * MEMORY CORRUPTION! YOU HAVE BEEN WARNED!
 */
int hmm_vma_get_pfns(struct vm_area_struct *vma,
		     struct hmm_range *range,
		     unsigned long start,
		     unsigned long end,
		     hmm_pfn_t *pfns)
{
	struct mm_walk mm_walk;
	struct hmm *hmm;

	/* FIXME support hugetlb fs */
	if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) {
		hmm_pfns_special(pfns, start, end);
		return -EINVAL;
	}

	/* Sanity check, this really should not happen! */
	if (start < vma->vm_start || start >= vma->vm_end)
		return -EINVAL;
	if (end < vma->vm_start || end > vma->vm_end)
		return -EINVAL;

	hmm = hmm_register(vma->vm_mm);
	if (!hmm)
		return -ENOMEM;
	/* Caller must have registered a mirror, via hmm_mirror_register()! */
	if (!hmm->mmu_notifier.ops)
		return -EINVAL;

	/* Initialize range to track CPU page table update */
	range->start = start;
	range->pfns = pfns;
	range->end = end;
	spin_lock(&hmm->lock);
	range->valid = true;
	list_add_rcu(&range->list, &hmm->ranges);
	spin_unlock(&hmm->lock);

	mm_walk.vma = vma;
	mm_walk.mm = vma->vm_mm;
	mm_walk.private = range;
	mm_walk.pte_entry = NULL;
	mm_walk.test_walk = NULL;
	mm_walk.hugetlb_entry = NULL;
	mm_walk.pmd_entry = hmm_vma_walk_pmd;
	mm_walk.pte_hole = hmm_vma_walk_hole;

	walk_page_range(start, end, &mm_walk);

	return 0;
}
EXPORT_SYMBOL(hmm_vma_get_pfns);
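
/*
 * Illustrative only: a minimal sketch of how a caller might inspect the pfns
 * array filled in by hmm_vma_get_pfns(). The flag values (HMM_PFN_ERROR,
 * HMM_PFN_EMPTY, HMM_PFN_WRITE) are the ones set by the walkers above; dev,
 * npages, start and example_dev_map_page() are hypothetical and belong to the
 * caller, which would typically decode each entry (for instance with a helper
 * such as hmm_pfn_t_to_pfn() from include/linux/hmm.h). The snapshot is only
 * trustworthy once hmm_vma_range_done() has confirmed the range is still
 * valid, as described in the next comment block.
 *
 *	unsigned long i;
 *
 *	for (i = 0; i < npages; i++) {
 *		if (pfns[i] & HMM_PFN_ERROR)
 *			return -EFAULT;
 *		if (pfns[i] & HMM_PFN_EMPTY)
 *			continue;
 *		example_dev_map_page(dev, start + (i << PAGE_SHIFT),
 *				     pfns[i], pfns[i] & HMM_PFN_WRITE);
 *	}
 */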

/*
 * hmm_vma_range_done() - stop tracking changes to CPU page table over a range
 * @vma: virtual memory area containing the virtual address range
 * @range: range being tracked
 * Returns: false if range data has been invalidated, true otherwise
 *
 * The range struct is used to track updates to the CPU page table after a call
 * to either hmm_vma_get_pfns() or hmm_vma_fault(). Once the device driver is
 * done using the data, or wants to lock updates to the data it got from those
 * functions, it must call hmm_vma_range_done(), which will then stop tracking
 * CPU page table updates on this range.
 *
 * Note that the device driver must still implement general CPU page table
 * update tracking either by using hmm_mirror (see hmm_mirror_register()) or by
 * using the mmu_notifier API directly.
 *
 * CPU page table update tracking done through hmm_range is only temporary and
 * to be used while trying to duplicate CPU page table contents for a range of
 * virtual addresses.
 *
 * There are two ways to use this:
 * again:
 *   hmm_vma_get_pfns(vma, range, start, end, pfns);
 *   trans = device_build_page_table_update_transaction(pfns);
 *   device_page_table_lock();
 *   if (!hmm_vma_range_done(vma, range)) {
 *     device_page_table_unlock();
 *     goto again;
 *   }
 *   device_commit_transaction(trans);
 *   device_page_table_unlock();
 *
 * Or:
 *   hmm_vma_get_pfns(vma, range, start, end, pfns);
 *   device_page_table_lock();
 *   hmm_vma_range_done(vma, range);
 *   device_update_page_table(pfns);
 *   device_page_table_unlock();
 */
bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range)
{
	unsigned long npages = (range->end - range->start) >> PAGE_SHIFT;
	struct hmm *hmm;

	if (range->end <= range->start) {
		BUG();
		return false;
	}

	hmm = hmm_register(vma->vm_mm);
	if (!hmm) {
		memset(range->pfns, 0, sizeof(*range->pfns) * npages);
		return false;
	}

	spin_lock(&hmm->lock);
	list_del_rcu(&range->list);
	spin_unlock(&hmm->lock);

	return range->valid;
}
EXPORT_SYMBOL(hmm_vma_range_done);
#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */