From ebcbc6ea7d8a604ad8504dae70a6ac1b1e64a0b7 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 14 Feb 2022 18:20:24 -0800 Subject: mm/munlock: delete page_mlock() and all its works We have recommended some applications to mlock their userspace, but that turns out to be counter-productive: when many processes mlock the same file, contention on rmap's i_mmap_rwsem can become intolerable at exit: it is needed for write, to remove any vma mapping that file from rmap's tree; but hogged for read by those with mlocks calling page_mlock() (formerly known as try_to_munlock()) on *each* page mapped from the file (the purpose being to find out whether another process has the page mlocked, so therefore it should not be unmlocked yet). Several optimizations have been made in the past: one is to skip page_mlock() when mapcount tells that nothing else has this page mapped; but that doesn't help at all when others do have it mapped. This time around, I initially intended to add a preliminary search of the rmap tree for overlapping VM_LOCKED ranges; but that gets messy with locking order, when in doubt whether a page is actually present; and risks adding even more contention on the i_mmap_rwsem. A solution would be much easier, if only there were space in struct page for an mlock_count... but actually, most of the time, there is space for it - an mlocked page spends most of its life on an unevictable LRU, but since 3.18 removed the scan_unevictable_pages sysctl, that "LRU" has been redundant. Let's try to reuse its page->lru. But leave that until a later patch: in this patch, clear the ground by removing page_mlock(), and all the infrastructure that has gathered around it - which mostly hinders understanding, and will make reviewing new additions harder. Don't mind those old comments about THPs, they date from before 4.5's refcounting rework: splitting is not a risk here. Just keep a minimal version of munlock_vma_page(), as reminder of what it should attend to (in particular, the odd way PGSTRANDED is counted out of PGMUNLOCKED), and likewise a stub for munlock_vma_pages_range(). Move unchanged __mlock_posix_error_return() out of the way, down to above its caller: this series then makes no further change after mlock_fixup(). After this and each following commit, the kernel builds, boots and runs; but with deficiencies which may show up in testing of mlock and munlock. The system calls succeed or fail as before, and mlock remains effective in preventing page reclaim; but meminfo's Unevictable and Mlocked amounts may be shown too low after mlock, grow, then stay too high after munlock: with previously mlocked pages remaining unevictable for too long, until finally unmapped and freed and counts corrected. Normal service will be resumed in "mm/munlock: mlock_pte_range() when mlocking or munlocking". Signed-off-by: Hugh Dickins Acked-by: Vlastimil Babka Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/rmap.h | 6 - mm/internal.h | 2 +- mm/mlock.c | 375 ++++----------------------------------------------- mm/rmap.c | 80 ----------- 4 files changed, 25 insertions(+), 438 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index e704b1a4c06c..dc48aa8c2c94 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -237,12 +237,6 @@ unsigned long page_address_in_vma(struct page *, struct vm_area_struct *); */ int folio_mkclean(struct folio *); -/* - * called in munlock()/munmap() path to check for other vmas holding - * the page mlocked. - */ -void page_mlock(struct page *page); - void remove_migration_ptes(struct page *old, struct page *new, bool locked); /* diff --git a/mm/internal.h b/mm/internal.h index d80300392a19..e48c486d5ddf 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -409,7 +409,7 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) * must be called with vma's mmap_lock held for read or write, and page locked. */ extern void mlock_vma_page(struct page *page); -extern unsigned int munlock_vma_page(struct page *page); +extern void munlock_vma_page(struct page *page); extern int mlock_future_check(struct mm_struct *mm, unsigned long flags, unsigned long len); diff --git a/mm/mlock.c b/mm/mlock.c index 8f584eddd305..aec4ce7919da 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -46,12 +46,6 @@ EXPORT_SYMBOL(can_do_mlock); * be placed on the LRU "unevictable" list, rather than the [in]active lists. * The unevictable list is an LRU sibling list to the [in]active lists. * PageUnevictable is set to indicate the unevictable state. - * - * When lazy mlocking via vmscan, it is important to ensure that the - * vma's VM_LOCKED status is not concurrently being modified, otherwise we - * may have mlocked a page that is being munlocked. So lazy mlock must take - * the mmap_lock for read, and verify that the vma really is locked - * (see mm/rmap.c). */ /* @@ -106,299 +100,28 @@ void mlock_vma_page(struct page *page) } } -/* - * Finish munlock after successful page isolation - * - * Page must be locked. This is a wrapper for page_mlock() - * and putback_lru_page() with munlock accounting. - */ -static void __munlock_isolated_page(struct page *page) -{ - /* - * Optimization: if the page was mapped just once, that's our mapping - * and we don't need to check all the other vmas. - */ - if (page_mapcount(page) > 1) - page_mlock(page); - - /* Did try_to_unlock() succeed or punt? */ - if (!PageMlocked(page)) - count_vm_events(UNEVICTABLE_PGMUNLOCKED, thp_nr_pages(page)); - - putback_lru_page(page); -} - -/* - * Accounting for page isolation fail during munlock - * - * Performs accounting when page isolation fails in munlock. There is nothing - * else to do because it means some other task has already removed the page - * from the LRU. putback_lru_page() will take care of removing the page from - * the unevictable list, if necessary. vmscan [page_referenced()] will move - * the page back to the unevictable list if some other vma has it mlocked. - */ -static void __munlock_isolation_failed(struct page *page) -{ - int nr_pages = thp_nr_pages(page); - - if (PageUnevictable(page)) - __count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages); - else - __count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages); -} - /** * munlock_vma_page - munlock a vma page * @page: page to be unlocked, either a normal page or THP page head - * - * returns the size of the page as a page mask (0 for normal page, - * HPAGE_PMD_NR - 1 for THP head page) - * - * called from munlock()/munmap() path with page supposedly on the LRU. - * When we munlock a page, because the vma where we found the page is being - * munlock()ed or munmap()ed, we want to check whether other vmas hold the - * page locked so that we can leave it on the unevictable lru list and not - * bother vmscan with it. However, to walk the page's rmap list in - * page_mlock() we must isolate the page from the LRU. If some other - * task has removed the page from the LRU, we won't be able to do that. - * So we clear the PageMlocked as we might not get another chance. If we - * can't isolate the page, we leave it for putback_lru_page() and vmscan - * [page_referenced()/try_to_unmap()] to deal with. */ -unsigned int munlock_vma_page(struct page *page) +void munlock_vma_page(struct page *page) { - int nr_pages; - - /* For page_mlock() and to serialize with page migration */ + /* Serialize with page migration */ BUG_ON(!PageLocked(page)); - VM_BUG_ON_PAGE(PageTail(page), page); - - if (!TestClearPageMlocked(page)) { - /* Potentially, PTE-mapped THP: do not skip the rest PTEs */ - return 0; - } - - nr_pages = thp_nr_pages(page); - mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); - - if (!isolate_lru_page(page)) - __munlock_isolated_page(page); - else - __munlock_isolation_failed(page); - - return nr_pages - 1; -} - -/* - * convert get_user_pages() return value to posix mlock() error - */ -static int __mlock_posix_error_return(long retval) -{ - if (retval == -EFAULT) - retval = -ENOMEM; - else if (retval == -ENOMEM) - retval = -EAGAIN; - return retval; -} - -/* - * Prepare page for fast batched LRU putback via putback_lru_evictable_pagevec() - * - * The fast path is available only for evictable pages with single mapping. - * Then we can bypass the per-cpu pvec and get better performance. - * when mapcount > 1 we need page_mlock() which can fail. - * when !page_evictable(), we need the full redo logic of putback_lru_page to - * avoid leaving evictable page in unevictable list. - * - * In case of success, @page is added to @pvec and @pgrescued is incremented - * in case that the page was previously unevictable. @page is also unlocked. - */ -static bool __putback_lru_fast_prepare(struct page *page, struct pagevec *pvec, - int *pgrescued) -{ - VM_BUG_ON_PAGE(PageLRU(page), page); - VM_BUG_ON_PAGE(!PageLocked(page), page); - - if (page_mapcount(page) <= 1 && page_evictable(page)) { - pagevec_add(pvec, page); - if (TestClearPageUnevictable(page)) - (*pgrescued)++; - unlock_page(page); - return true; - } - - return false; -} -/* - * Putback multiple evictable pages to the LRU - * - * Batched putback of evictable pages that bypasses the per-cpu pvec. Some of - * the pages might have meanwhile become unevictable but that is OK. - */ -static void __putback_lru_fast(struct pagevec *pvec, int pgrescued) -{ - count_vm_events(UNEVICTABLE_PGMUNLOCKED, pagevec_count(pvec)); - /* - *__pagevec_lru_add() calls release_pages() so we don't call - * put_page() explicitly - */ - __pagevec_lru_add(pvec); - count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued); -} - -/* - * Munlock a batch of pages from the same zone - * - * The work is split to two main phases. First phase clears the Mlocked flag - * and attempts to isolate the pages, all under a single zone lru lock. - * The second phase finishes the munlock only for pages where isolation - * succeeded. - * - * Note that the pagevec may be modified during the process. - */ -static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) -{ - int i; - int nr = pagevec_count(pvec); - int delta_munlocked = -nr; - struct pagevec pvec_putback; - struct lruvec *lruvec = NULL; - int pgrescued = 0; - - pagevec_init(&pvec_putback); - - /* Phase 1: page isolation */ - for (i = 0; i < nr; i++) { - struct page *page = pvec->pages[i]; - struct folio *folio = page_folio(page); - - if (TestClearPageMlocked(page)) { - /* - * We already have pin from follow_page_mask() - * so we can spare the get_page() here. - */ - if (TestClearPageLRU(page)) { - lruvec = folio_lruvec_relock_irq(folio, lruvec); - del_page_from_lru_list(page, lruvec); - continue; - } else - __munlock_isolation_failed(page); - } else { - delta_munlocked++; - } + VM_BUG_ON_PAGE(PageTail(page), page); - /* - * We won't be munlocking this page in the next phase - * but we still need to release the follow_page_mask() - * pin. We cannot do it under lru_lock however. If it's - * the last pin, __page_cache_release() would deadlock. - */ - pagevec_add(&pvec_putback, pvec->pages[i]); - pvec->pages[i] = NULL; - } - if (lruvec) { - __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); - unlock_page_lruvec_irq(lruvec); - } else if (delta_munlocked) { - mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); - } + if (TestClearPageMlocked(page)) { + int nr_pages = thp_nr_pages(page); - /* Now we can release pins of pages that we are not munlocking */ - pagevec_release(&pvec_putback); - - /* Phase 2: page munlock */ - for (i = 0; i < nr; i++) { - struct page *page = pvec->pages[i]; - - if (page) { - lock_page(page); - if (!__putback_lru_fast_prepare(page, &pvec_putback, - &pgrescued)) { - /* - * Slow path. We don't want to lose the last - * pin before unlock_page() - */ - get_page(page); /* for putback_lru_page() */ - __munlock_isolated_page(page); - unlock_page(page); - put_page(page); /* from follow_page_mask() */ - } + mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); + if (!isolate_lru_page(page)) { + putback_lru_page(page); + count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages); + } else if (PageUnevictable(page)) { + count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages); } } - - /* - * Phase 3: page putback for pages that qualified for the fast path - * This will also call put_page() to return pin from follow_page_mask() - */ - if (pagevec_count(&pvec_putback)) - __putback_lru_fast(&pvec_putback, pgrescued); -} - -/* - * Fill up pagevec for __munlock_pagevec using pte walk - * - * The function expects that the struct page corresponding to @start address is - * a non-TPH page already pinned and in the @pvec, and that it belongs to @zone. - * - * The rest of @pvec is filled by subsequent pages within the same pmd and same - * zone, as long as the pte's are present and vm_normal_page() succeeds. These - * pages also get pinned. - * - * Returns the address of the next page that should be scanned. This equals - * @start + PAGE_SIZE when no page could be added by the pte walk. - */ -static unsigned long __munlock_pagevec_fill(struct pagevec *pvec, - struct vm_area_struct *vma, struct zone *zone, - unsigned long start, unsigned long end) -{ - pte_t *pte; - spinlock_t *ptl; - - /* - * Initialize pte walk starting at the already pinned page where we - * are sure that there is a pte, as it was pinned under the same - * mmap_lock write op. - */ - pte = get_locked_pte(vma->vm_mm, start, &ptl); - /* Make sure we do not cross the page table boundary */ - end = pgd_addr_end(start, end); - end = p4d_addr_end(start, end); - end = pud_addr_end(start, end); - end = pmd_addr_end(start, end); - - /* The page next to the pinned page is the first we will try to get */ - start += PAGE_SIZE; - while (start < end) { - struct page *page = NULL; - pte++; - if (pte_present(*pte)) - page = vm_normal_page(vma, start, *pte); - /* - * Break if page could not be obtained or the page's node+zone does not - * match - */ - if (!page || page_zone(page) != zone) - break; - - /* - * Do not use pagevec for PTE-mapped THP, - * munlock_vma_pages_range() will handle them. - */ - if (PageTransCompound(page)) - break; - - get_page(page); - /* - * Increase the address that will be returned *before* the - * eventual break due to pvec becoming full by adding the page - */ - start += PAGE_SIZE; - if (pagevec_add(pvec, page) == 0) - break; - } - pte_unmap_unlock(pte, ptl); - return start; } /* @@ -413,75 +136,13 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec, * * Returns with VM_LOCKED cleared. Callers must be prepared to * deal with this. - * - * We don't save and restore VM_LOCKED here because pages are - * still on lru. In unmap path, pages might be scanned by reclaim - * and re-mlocked by page_mlock/try_to_unmap before we unmap and - * free them. This will result in freeing mlocked pages. */ void munlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { vma->vm_flags &= VM_LOCKED_CLEAR_MASK; - while (start < end) { - struct page *page; - unsigned int page_mask = 0; - unsigned long page_increm; - struct pagevec pvec; - struct zone *zone; - - pagevec_init(&pvec); - /* - * Although FOLL_DUMP is intended for get_dump_page(), - * it just so happens that its special treatment of the - * ZERO_PAGE (returning an error instead of doing get_page) - * suits munlock very well (and if somehow an abnormal page - * has sneaked into the range, we won't oops here: great). - */ - page = follow_page(vma, start, FOLL_GET | FOLL_DUMP); - - if (page && !IS_ERR(page)) { - if (PageTransTail(page)) { - VM_BUG_ON_PAGE(PageMlocked(page), page); - put_page(page); /* follow_page_mask() */ - } else if (PageTransHuge(page)) { - lock_page(page); - /* - * Any THP page found by follow_page_mask() may - * have gotten split before reaching - * munlock_vma_page(), so we need to compute - * the page_mask here instead. - */ - page_mask = munlock_vma_page(page); - unlock_page(page); - put_page(page); /* follow_page_mask() */ - } else { - /* - * Non-huge pages are handled in batches via - * pagevec. The pin from follow_page_mask() - * prevents them from collapsing by THP. - */ - pagevec_add(&pvec, page); - zone = page_zone(page); - - /* - * Try to fill the rest of pagevec using fast - * pte walk. This will also update start to - * the next page to process. Then munlock the - * pagevec. - */ - start = __munlock_pagevec_fill(&pvec, vma, - zone, start, end); - __munlock_pagevec(&pvec, zone); - goto next; - } - } - page_increm = 1 + page_mask; - start += page_increm * PAGE_SIZE; -next: - cond_resched(); - } + /* Reimplementation to follow in later commit */ } /* @@ -645,6 +306,18 @@ static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm, return count >> PAGE_SHIFT; } +/* + * convert get_user_pages() return value to posix mlock() error + */ +static int __mlock_posix_error_return(long retval) +{ + if (retval == -EFAULT) + retval = -ENOMEM; + else if (retval == -ENOMEM) + retval = -EAGAIN; + return retval; +} + static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags) { unsigned long locked; diff --git a/mm/rmap.c b/mm/rmap.c index 6a1e8c7f6213..7ce7f1946cff 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1996,76 +1996,6 @@ void try_to_migrate(struct page *page, enum ttu_flags flags) rmap_walk(page, &rwc); } -/* - * Walks the vma's mapping a page and mlocks the page if any locked vma's are - * found. Once one is found the page is locked and the scan can be terminated. - */ -static bool page_mlock_one(struct page *page, struct vm_area_struct *vma, - unsigned long address, void *unused) -{ - struct page_vma_mapped_walk pvmw = { - .page = page, - .vma = vma, - .address = address, - }; - - /* An un-locked vma doesn't have any pages to lock, continue the scan */ - if (!(vma->vm_flags & VM_LOCKED)) - return true; - - while (page_vma_mapped_walk(&pvmw)) { - /* - * Need to recheck under the ptl to serialise with - * __munlock_pagevec_fill() after VM_LOCKED is cleared in - * munlock_vma_pages_range(). - */ - if (vma->vm_flags & VM_LOCKED) { - /* - * PTE-mapped THP are never marked as mlocked; but - * this function is never called on a DoubleMap THP, - * nor on an Anon THP (which may still be PTE-mapped - * after DoubleMap was cleared). - */ - mlock_vma_page(page); - /* - * No need to scan further once the page is marked - * as mlocked. - */ - page_vma_mapped_walk_done(&pvmw); - return false; - } - } - - return true; -} - -/** - * page_mlock - try to mlock a page - * @page: the page to be mlocked - * - * Called from munlock code. Checks all of the VMAs mapping the page and mlocks - * the page if any are found. The page will be returned with PG_mlocked cleared - * if it is not mapped by any locked vmas. - */ -void page_mlock(struct page *page) -{ - struct rmap_walk_control rwc = { - .rmap_one = page_mlock_one, - .done = page_not_mapped, - .anon_lock = page_lock_anon_vma_read, - - }; - - VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page); - VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page); - - /* Anon THP are only marked as mlocked when singly mapped */ - if (PageTransCompound(page) && PageAnon(page)) - return; - - rmap_walk(page, &rwc); -} - #ifdef CONFIG_DEVICE_PRIVATE struct make_exclusive_args { struct mm_struct *mm; @@ -2291,11 +2221,6 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page, * * Find all the mappings of a page using the mapping pointer and the vma chains * contained in the anon_vma struct it points to. - * - * When called from page_mlock(), the mmap_lock of the mm containing the vma - * where the page was found will be held for write. So, we won't recheck - * vm_flags for that VMA. That should be OK, because that vma shouldn't be - * LOCKED. */ static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, bool locked) @@ -2344,11 +2269,6 @@ static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, * * Find all the mappings of a page using the mapping pointer and the vma chains * contained in the address_space struct it points to. - * - * When called from page_mlock(), the mmap_lock of the mm containing the vma - * where the page was found will be held for write. So, we won't recheck - * vm_flags for that VMA. That should be OK, because that vma shouldn't be - * LOCKED. */ static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, bool locked) -- cgit v1.2.3 From b67bf49ce7aae72f63739abee6ac25f64bf20081 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 14 Feb 2022 18:21:52 -0800 Subject: mm/munlock: delete FOLL_MLOCK and FOLL_POPULATE If counting page mlocks, we must not double-count: follow_page_pte() can tell if a page has already been Mlocked or not, but cannot tell if a pte has already been counted or not: that will have to be done when the pte is mapped in (which lru_cache_add_inactive_or_unevictable() already tracks for new anon pages, but there's no such tracking yet for others). Delete all the FOLL_MLOCK code - faulting in the missing pages will do all that is necessary, without special mlock_vma_page() calls from here. But then FOLL_POPULATE turns out to serve no purpose - it was there so that its absence would tell faultin_page() not to faultin page when setting up VM_LOCKONFAULT areas; but if there's no special work needed here for mlock, then there's no work at all here for VM_LOCKONFAULT. Have I got that right? I've not looked into the history, but see that FOLL_POPULATE goes back before VM_LOCKONFAULT: did it serve a different purpose before? Ah, yes, it was used to skip the old stack guard page. And is it intentional that COW is not broken on existing pages when setting up a VM_LOCKONFAULT area? I can see that being argued either way, and have no reason to disagree with current behaviour. Signed-off-by: Hugh Dickins Acked-by: Vlastimil Babka Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/mm.h | 2 -- mm/gup.c | 43 ++++++++----------------------------------- mm/huge_memory.c | 33 --------------------------------- 3 files changed, 8 insertions(+), 70 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 213cc569b192..74ee50c2033b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2925,13 +2925,11 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, #define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */ #define FOLL_NOWAIT 0x20 /* if a disk transfer is needed, start the IO * and return without waiting upon it */ -#define FOLL_POPULATE 0x40 /* fault in pages (with FOLL_MLOCK) */ #define FOLL_NOFAULT 0x80 /* do not fault in pages */ #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ #define FOLL_NUMA 0x200 /* force NUMA hinting page fault */ #define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */ #define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */ -#define FOLL_MLOCK 0x1000 /* lock present pages */ #define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */ #define FOLL_COW 0x4000 /* internal GUP flag */ #define FOLL_ANON 0x8000 /* don't do file mappings */ diff --git a/mm/gup.c b/mm/gup.c index a9d4d724aef7..87fec8a5c10d 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -597,32 +597,6 @@ retry: */ mark_page_accessed(page); } - if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { - /* Do not mlock pte-mapped THP */ - if (PageTransCompound(page)) - goto out; - - /* - * The preliminary mapping check is mainly to avoid the - * pointless overhead of lock_page on the ZERO_PAGE - * which might bounce very badly if there is contention. - * - * If the page is already locked, we don't need to - * handle it now - vmscan will handle it later if and - * when it attempts to reclaim the page. - */ - if (page->mapping && trylock_page(page)) { - lru_add_drain(); /* push cached pages to LRU */ - /* - * Because we lock page here, and migration is - * blocked by the pte's page reference, and we - * know the page is still mapped, we don't even - * need to check for file-cache page truncation. - */ - mlock_vma_page(page); - unlock_page(page); - } - } out: pte_unmap_unlock(ptep, ptl); return page; @@ -945,9 +919,6 @@ static int faultin_page(struct vm_area_struct *vma, unsigned int fault_flags = 0; vm_fault_t ret; - /* mlock all present pages, but do not fault in new pages */ - if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK) - return -ENOENT; if (*flags & FOLL_NOFAULT) return -EFAULT; if (*flags & FOLL_WRITE) @@ -1198,8 +1169,6 @@ retry: case -ENOMEM: case -EHWPOISON: goto out; - case -ENOENT: - goto next_page; } BUG(); } else if (PTR_ERR(page) == -EEXIST) { @@ -1497,9 +1466,14 @@ long populate_vma_page_range(struct vm_area_struct *vma, VM_BUG_ON_VMA(end > vma->vm_end, vma); mmap_assert_locked(mm); - gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK; + /* + * Rightly or wrongly, the VM_LOCKONFAULT case has never used + * faultin_page() to break COW, so it has no work to do here. + */ if (vma->vm_flags & VM_LOCKONFAULT) - gup_flags &= ~FOLL_POPULATE; + return nr_pages; + + gup_flags = FOLL_TOUCH; /* * We want to touch writable mappings with a write fault in order * to break COW, except for shared mappings because these don't COW @@ -1566,10 +1540,9 @@ long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start, * in the page table. * FOLL_HWPOISON: Return -EHWPOISON instead of -EFAULT when we hit * a poisoned page. - * FOLL_POPULATE: Always populate memory with VM_LOCKONFAULT. * !FOLL_FORCE: Require proper access permissions. */ - gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK | FOLL_HWPOISON; + gup_flags = FOLL_TOUCH | FOLL_HWPOISON; if (write) gup_flags |= FOLL_WRITE; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 406a3c28c026..9a34b85ebcf8 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1380,39 +1380,6 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, if (flags & FOLL_TOUCH) touch_pmd(vma, addr, pmd, flags); - if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { - /* - * We don't mlock() pte-mapped THPs. This way we can avoid - * leaking mlocked pages into non-VM_LOCKED VMAs. - * - * For anon THP: - * - * In most cases the pmd is the only mapping of the page as we - * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for - * writable private mappings in populate_vma_page_range(). - * - * The only scenario when we have the page shared here is if we - * mlocking read-only mapping shared over fork(). We skip - * mlocking such pages. - * - * For file THP: - * - * We can expect PageDoubleMap() to be stable under page lock: - * for file pages we set it in page_add_file_rmap(), which - * requires page to be locked. - */ - - if (PageAnon(page) && compound_mapcount(page) != 1) - goto skip_mlock; - if (PageDoubleMap(page) || !page->mapping) - goto skip_mlock; - if (!trylock_page(page)) - goto skip_mlock; - if (page->mapping && !PageDoubleMap(page)) - mlock_vma_page(page); - unlock_page(page); - } -skip_mlock: page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page); -- cgit v1.2.3 From a213e5cf71cbcea4b23caedcb8fe6629a333b275 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 14 Feb 2022 18:23:29 -0800 Subject: mm/munlock: delete munlock_vma_pages_all(), allow oomreap munlock_vma_pages_range() will still be required, when munlocking but not munmapping a set of pages; but when unmapping a pte, the mlock count will be maintained in much the same way as it will be maintained when mapping in the pte. Which removes the need for munlock_vma_pages_all() on mlocked vmas when munmapping or exiting: eliminating the catastrophic contention on i_mmap_rwsem, and the need for page lock on the pages. There is still a need to update locked_vm accounting according to the munmapped vmas when munmapping: do that in detach_vmas_to_be_unmapped(). exit_mmap() does not need locked_vm updates, so delete unlock_range(). And wasn't I the one who forbade the OOM reaper to attack mlocked vmas, because of the uncertainty in blocking on all those page locks? No fear of that now, so permit the OOM reaper on mlocked vmas. Signed-off-by: Hugh Dickins Acked-by: Vlastimil Babka Signed-off-by: Matthew Wilcox (Oracle) --- mm/internal.h | 16 ++-------------- mm/madvise.c | 5 +++++ mm/mlock.c | 4 ++-- mm/mmap.c | 32 ++------------------------------ mm/oom_kill.c | 2 +- 5 files changed, 12 insertions(+), 47 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index e48c486d5ddf..f235aa92e564 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -71,11 +71,6 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte); -static inline bool can_madv_lru_vma(struct vm_area_struct *vma) -{ - return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)); -} - struct zap_details; void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, @@ -398,12 +393,8 @@ extern long populate_vma_page_range(struct vm_area_struct *vma, extern long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, bool write, int *locked); -extern void munlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end); -static inline void munlock_vma_pages_all(struct vm_area_struct *vma) -{ - munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end); -} +extern int mlock_future_check(struct mm_struct *mm, unsigned long flags, + unsigned long len); /* * must be called with vma's mmap_lock held for read or write, and page locked. @@ -411,9 +402,6 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) extern void mlock_vma_page(struct page *page); extern void munlock_vma_page(struct page *page); -extern int mlock_future_check(struct mm_struct *mm, unsigned long flags, - unsigned long len); - /* * Clear the page's PageMlocked(). This can be useful in a situation where * we want to unconditionally remove a page from the pagecache -- e.g., diff --git a/mm/madvise.c b/mm/madvise.c index 5604064df464..ae35d72627ef 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -530,6 +530,11 @@ static void madvise_cold_page_range(struct mmu_gather *tlb, tlb_end_vma(tlb, vma); } +static inline bool can_madv_lru_vma(struct vm_area_struct *vma) +{ + return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)); +} + static long madvise_cold(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start_addr, unsigned long end_addr) diff --git a/mm/mlock.c b/mm/mlock.c index aec4ce7919da..5d7ced8303be 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -137,8 +137,8 @@ void munlock_vma_page(struct page *page) * Returns with VM_LOCKED cleared. Callers must be prepared to * deal with this. */ -void munlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) +static void munlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) { vma->vm_flags &= VM_LOCKED_CLEAR_MASK; diff --git a/mm/mmap.c b/mm/mmap.c index 1e8fdb0b51ed..64b5985b5295 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2674,6 +2674,8 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, vma->vm_prev = NULL; do { vma_rb_erase(vma, &mm->mm_rb); + if (vma->vm_flags & VM_LOCKED) + mm->locked_vm -= vma_pages(vma); mm->map_count--; tail_vma = vma; vma = vma->vm_next; @@ -2778,22 +2780,6 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, return __split_vma(mm, vma, addr, new_below); } -static inline void -unlock_range(struct vm_area_struct *start, unsigned long limit) -{ - struct mm_struct *mm = start->vm_mm; - struct vm_area_struct *tmp = start; - - while (tmp && tmp->vm_start < limit) { - if (tmp->vm_flags & VM_LOCKED) { - mm->locked_vm -= vma_pages(tmp); - munlock_vma_pages_all(tmp); - } - - tmp = tmp->vm_next; - } -} - /* Munmap is split into 2 main parts -- this part which finds * what needs doing, and the areas themselves, which do the * work. This now handles partial unmappings. @@ -2874,12 +2860,6 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, return error; } - /* - * unlock any mlock()ed ranges before detaching vmas - */ - if (mm->locked_vm) - unlock_range(vma, end); - /* Detach vmas from rbtree */ if (!detach_vmas_to_be_unmapped(mm, vma, prev, end)) downgrade = false; @@ -3147,20 +3127,12 @@ void exit_mmap(struct mm_struct *mm) * Nothing can be holding mm->mmap_lock here and the above call * to mmu_notifier_release(mm) ensures mmu notifier callbacks in * __oom_reap_task_mm() will not block. - * - * This needs to be done before calling unlock_range(), - * which clears VM_LOCKED, otherwise the oom reaper cannot - * reliably test it. */ (void)__oom_reap_task_mm(mm); - set_bit(MMF_OOM_SKIP, &mm->flags); } mmap_write_lock(mm); - if (mm->locked_vm) - unlock_range(mm->mmap, ULONG_MAX); - arch_exit_mmap(mm); vma = mm->mmap; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 832fb330376e..6b875acabd1e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -526,7 +526,7 @@ bool __oom_reap_task_mm(struct mm_struct *mm) set_bit(MMF_UNSTABLE, &mm->flags); for (vma = mm->mmap ; vma; vma = vma->vm_next) { - if (!can_madv_lru_vma(vma)) + if (vma->vm_flags & (VM_HUGETLB|VM_PFNMAP)) continue; /* -- cgit v1.2.3 From cea86fe246b694a191804b47378eb9d77aefabec Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 14 Feb 2022 18:26:39 -0800 Subject: mm/munlock: rmap call mlock_vma_page() munlock_vma_page() Add vma argument to mlock_vma_page() and munlock_vma_page(), make them inline functions which check (vma->vm_flags & VM_LOCKED) before calling mlock_page() and munlock_page() in mm/mlock.c. Add bool compound to mlock_vma_page() and munlock_vma_page(): this is because we have understandable difficulty in accounting pte maps of THPs, and if passed a PageHead page, mlock_page() and munlock_page() cannot tell whether it's a pmd map to be counted or a pte map to be ignored. Add vma arg to page_add_file_rmap() and page_remove_rmap(), like the others, and use that to call mlock_vma_page() at the end of the page adds, and munlock_vma_page() at the end of page_remove_rmap() (end or beginning? unimportant, but end was easier for assertions in testing). No page lock is required (although almost all adds happen to hold it): delete the "Serialize with page migration" BUG_ON(!PageLocked(page))s. Certainly page lock did serialize with page migration, but I'm having difficulty explaining why that was ever important. Mlock accounting on THPs has been hard to define, differed between anon and file, involved PageDoubleMap in some places and not others, required clear_page_mlock() at some points. Keep it simple now: just count the pmds and ignore the ptes, there is no reason for ptes to undo pmd mlocks. page_add_new_anon_rmap() callers unchanged: they have long been calling lru_cache_add_inactive_or_unevictable(), which does its own VM_LOCKED handling (it also checks for not VM_SPECIAL: I think that's overcautious, and inconsistent with other checks, that mmap_region() already prevents VM_LOCKED on VM_SPECIAL; but haven't quite convinced myself to change it). Signed-off-by: Hugh Dickins Acked-by: Vlastimil Babka Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/rmap.h | 17 ++++++++------- kernel/events/uprobes.c | 7 ++----- mm/huge_memory.c | 17 +++++++-------- mm/hugetlb.c | 4 ++-- mm/internal.h | 36 ++++++++++++++++++++++++++----- mm/khugepaged.c | 4 ++-- mm/ksm.c | 12 +---------- mm/memory.c | 45 +++++++++++++-------------------------- mm/migrate.c | 9 ++------ mm/mlock.c | 21 +++++++------------ mm/rmap.c | 56 +++++++++++++++++++++++-------------------------- mm/userfaultfd.c | 14 +++++++------ 12 files changed, 113 insertions(+), 129 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index dc48aa8c2c94..ac29b076082b 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -167,18 +167,19 @@ struct anon_vma *page_get_anon_vma(struct page *page); */ void page_move_anon_rmap(struct page *, struct vm_area_struct *); void page_add_anon_rmap(struct page *, struct vm_area_struct *, - unsigned long, bool); + unsigned long address, bool compound); void do_page_add_anon_rmap(struct page *, struct vm_area_struct *, - unsigned long, int); + unsigned long address, int flags); void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, - unsigned long, bool); -void page_add_file_rmap(struct page *, bool); -void page_remove_rmap(struct page *, bool); - + unsigned long address, bool compound); +void page_add_file_rmap(struct page *, struct vm_area_struct *, + bool compound); +void page_remove_rmap(struct page *, struct vm_area_struct *, + bool compound); void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, - unsigned long); + unsigned long address); void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *, - unsigned long); + unsigned long address); static inline void page_dup_rmap(struct page *page, bool compound) { diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 6357c3580d07..eed2f7437d96 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -173,7 +173,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, return err; } - /* For try_to_free_swap() and munlock_vma_page() below */ + /* For try_to_free_swap() below */ lock_page(old_page); mmu_notifier_invalidate_range_start(&range); @@ -201,13 +201,10 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, set_pte_at_notify(mm, addr, pvmw.pte, mk_pte(new_page, vma->vm_page_prot)); - page_remove_rmap(old_page, false); + page_remove_rmap(old_page, vma, false); if (!page_mapped(old_page)) try_to_free_swap(old_page); page_vma_mapped_walk_done(&pvmw); - - if ((vma->vm_flags & VM_LOCKED) && !PageCompound(old_page)) - munlock_vma_page(old_page); put_page(old_page); err = 0; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9a34b85ebcf8..d6477f48a27e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1577,7 +1577,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (pmd_present(orig_pmd)) { page = pmd_page(orig_pmd); - page_remove_rmap(page, true); + page_remove_rmap(page, vma, true); VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); VM_BUG_ON_PAGE(!PageHead(page), page); } else if (thp_migration_supported()) { @@ -1962,7 +1962,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, set_page_dirty(page); if (!PageReferenced(page) && pmd_young(old_pmd)) SetPageReferenced(page); - page_remove_rmap(page, true); + page_remove_rmap(page, vma, true); put_page(page); } add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR); @@ -2096,6 +2096,9 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, } } unlock_page_memcg(page); + + /* Above is effectively page_remove_rmap(page, vma, true) */ + munlock_vma_page(page, vma, true); } smp_wmb(); /* make pte visible before pmd */ @@ -2103,7 +2106,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, if (freeze) { for (i = 0; i < HPAGE_PMD_NR; i++) { - page_remove_rmap(page + i, false); + page_remove_rmap(page + i, vma, false); put_page(page + i); } } @@ -2163,8 +2166,6 @@ repeat: do_unlock_page = true; } } - if (PageMlocked(page)) - clear_page_mlock(page); } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd))) goto out; __split_huge_pmd_locked(vma, pmd, range.start, freeze); @@ -3138,7 +3139,7 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, if (pmd_soft_dirty(pmdval)) pmdswp = pmd_swp_mksoft_dirty(pmdswp); set_pmd_at(mm, address, pvmw->pmd, pmdswp); - page_remove_rmap(page, true); + page_remove_rmap(page, vma, true); put_page(page); } @@ -3168,10 +3169,8 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) if (PageAnon(new)) page_add_anon_rmap(new, vma, mmun_start, true); else - page_add_file_rmap(new, true); + page_add_file_rmap(new, vma, true); set_pmd_at(mm, mmun_start, pvmw->pmd, pmde); - if ((vma->vm_flags & VM_LOCKED) && !PageDoubleMap(new)) - mlock_vma_page(new); update_mmu_cache_pmd(vma, address, pvmw->pmd); } #endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 61895cc01d09..43fb3155298e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5014,7 +5014,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct set_page_dirty(page); hugetlb_count_sub(pages_per_huge_page(h), mm); - page_remove_rmap(page, true); + page_remove_rmap(page, vma, true); spin_unlock(ptl); tlb_remove_page_size(tlb, page, huge_page_size(h)); @@ -5259,7 +5259,7 @@ retry_avoidcopy: /* Break COW */ huge_ptep_clear_flush(vma, haddr, ptep); mmu_notifier_invalidate_range(mm, range.start, range.end); - page_remove_rmap(old_page, true); + page_remove_rmap(old_page, vma, true); hugepage_add_new_anon_rmap(new_page, vma, haddr); set_huge_pte_at(mm, haddr, ptep, make_huge_pte(vma, new_page, 1)); diff --git a/mm/internal.h b/mm/internal.h index f235aa92e564..3d7dfc8bc471 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -395,12 +395,35 @@ extern long faultin_vma_page_range(struct vm_area_struct *vma, bool write, int *locked); extern int mlock_future_check(struct mm_struct *mm, unsigned long flags, unsigned long len); - /* - * must be called with vma's mmap_lock held for read or write, and page locked. + * mlock_vma_page() and munlock_vma_page(): + * should be called with vma's mmap_lock held for read or write, + * under page table lock for the pte/pmd being added or removed. + * + * mlock is usually called at the end of page_add_*_rmap(), + * munlock at the end of page_remove_rmap(); but new anon + * pages are managed in lru_cache_add_inactive_or_unevictable(). + * + * @compound is used to include pmd mappings of THPs, but filter out + * pte mappings of THPs, which cannot be consistently counted: a pte + * mapping of the THP head cannot be distinguished by the page alone. */ -extern void mlock_vma_page(struct page *page); -extern void munlock_vma_page(struct page *page); +void mlock_page(struct page *page); +static inline void mlock_vma_page(struct page *page, + struct vm_area_struct *vma, bool compound) +{ + if (unlikely(vma->vm_flags & VM_LOCKED) && + (compound || !PageTransCompound(page))) + mlock_page(page); +} +void munlock_page(struct page *page); +static inline void munlock_vma_page(struct page *page, + struct vm_area_struct *vma, bool compound) +{ + if (unlikely(vma->vm_flags & VM_LOCKED) && + (compound || !PageTransCompound(page))) + munlock_page(page); +} /* * Clear the page's PageMlocked(). This can be useful in a situation where @@ -487,7 +510,10 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, #else /* !CONFIG_MMU */ static inline void unmap_mapping_folio(struct folio *folio) { } static inline void clear_page_mlock(struct page *page) { } -static inline void mlock_vma_page(struct page *page) { } +static inline void mlock_vma_page(struct page *page, + struct vm_area_struct *vma, bool compound) { } +static inline void munlock_vma_page(struct page *page, + struct vm_area_struct *vma, bool compound) { } static inline void vunmap_range_noflush(unsigned long start, unsigned long end) { } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 131492fd1148..52add1825525 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -774,7 +774,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, */ spin_lock(ptl); ptep_clear(vma->vm_mm, address, _pte); - page_remove_rmap(src_page, false); + page_remove_rmap(src_page, vma, false); spin_unlock(ptl); free_page_and_swap_cache(src_page); } @@ -1513,7 +1513,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) if (pte_none(*pte)) continue; page = vm_normal_page(vma, addr, *pte); - page_remove_rmap(page, false); + page_remove_rmap(page, vma, false); } pte_unmap_unlock(start_pte, ptl); diff --git a/mm/ksm.c b/mm/ksm.c index c20bd4d9a0d9..c5a4403b5dc9 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1177,7 +1177,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, ptep_clear_flush(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, newpte); - page_remove_rmap(page, false); + page_remove_rmap(page, vma, false); if (!page_mapped(page)) try_to_free_swap(page); put_page(page); @@ -1252,16 +1252,6 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, err = replace_page(vma, page, kpage, orig_pte); } - if ((vma->vm_flags & VM_LOCKED) && kpage && !err) { - munlock_vma_page(page); - if (!PageMlocked(kpage)) { - unlock_page(page); - lock_page(kpage); - mlock_vma_page(kpage); - page = kpage; /* for final unlock */ - } - } - out_unlock: unlock_page(page); out: diff --git a/mm/memory.c b/mm/memory.c index c125c4969913..53bd9e5f2e33 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -735,9 +735,6 @@ static void restore_exclusive_pte(struct vm_area_struct *vma, set_pte_at(vma->vm_mm, address, ptep, pte); - if (vma->vm_flags & VM_LOCKED) - mlock_vma_page(page); - /* * No need to invalidate - it was non-present before. However * secondary CPUs may have mappings that need invalidating. @@ -1377,7 +1374,7 @@ again: mark_page_accessed(page); } rss[mm_counter(page)]--; - page_remove_rmap(page, false); + page_remove_rmap(page, vma, false); if (unlikely(page_mapcount(page) < 0)) print_bad_pte(vma, addr, ptent, page); if (unlikely(__tlb_remove_page(tlb, page))) { @@ -1397,10 +1394,8 @@ again: continue; pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); rss[mm_counter(page)]--; - if (is_device_private_entry(entry)) - page_remove_rmap(page, false); - + page_remove_rmap(page, vma, false); put_page(page); continue; } @@ -1753,16 +1748,16 @@ static int validate_page_before_insert(struct page *page) return 0; } -static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte, +static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte, unsigned long addr, struct page *page, pgprot_t prot) { if (!pte_none(*pte)) return -EBUSY; /* Ok, finally just insert the thing.. */ get_page(page); - inc_mm_counter_fast(mm, mm_counter_file(page)); - page_add_file_rmap(page, false); - set_pte_at(mm, addr, pte, mk_pte(page, prot)); + inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); + page_add_file_rmap(page, vma, false); + set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot)); return 0; } @@ -1776,7 +1771,6 @@ static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte, static int insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot) { - struct mm_struct *mm = vma->vm_mm; int retval; pte_t *pte; spinlock_t *ptl; @@ -1785,17 +1779,17 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, if (retval) goto out; retval = -ENOMEM; - pte = get_locked_pte(mm, addr, &ptl); + pte = get_locked_pte(vma->vm_mm, addr, &ptl); if (!pte) goto out; - retval = insert_page_into_pte_locked(mm, pte, addr, page, prot); + retval = insert_page_into_pte_locked(vma, pte, addr, page, prot); pte_unmap_unlock(pte, ptl); out: return retval; } #ifdef pte_index -static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte, +static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte, unsigned long addr, struct page *page, pgprot_t prot) { int err; @@ -1805,7 +1799,7 @@ static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte, err = validate_page_before_insert(page); if (err) return err; - return insert_page_into_pte_locked(mm, pte, addr, page, prot); + return insert_page_into_pte_locked(vma, pte, addr, page, prot); } /* insert_pages() amortizes the cost of spinlock operations @@ -1842,7 +1836,7 @@ more: start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock); for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) { - int err = insert_page_in_batch_locked(mm, pte, + int err = insert_page_in_batch_locked(vma, pte, addr, pages[curr_page_idx], prot); if (unlikely(err)) { pte_unmap_unlock(start_pte, pte_lock); @@ -3098,7 +3092,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * mapcount is visible. So transitively, TLBs to * old page will be flushed before it can be reused. */ - page_remove_rmap(old_page, false); + page_remove_rmap(old_page, vma, false); } /* Free the old page.. */ @@ -3118,16 +3112,6 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) */ mmu_notifier_invalidate_range_only_end(&range); if (old_page) { - /* - * Don't let another task, with possibly unlocked vma, - * keep the mlocked page. - */ - if (page_copied && (vma->vm_flags & VM_LOCKED)) { - lock_page(old_page); /* LRU manipulation */ - if (PageMlocked(old_page)) - munlock_vma_page(old_page); - unlock_page(old_page); - } if (page_copied) free_swap_cache(old_page); put_page(old_page); @@ -3947,7 +3931,8 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR); - page_add_file_rmap(page, true); + page_add_file_rmap(page, vma, true); + /* * deposit and withdraw with pmd lock held */ @@ -3996,7 +3981,7 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) lru_cache_add_inactive_or_unevictable(page, vma); } else { inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); - page_add_file_rmap(page, false); + page_add_file_rmap(page, vma, false); } set_pte_at(vma->vm_mm, addr, vmf->pte, entry); } diff --git a/mm/migrate.c b/mm/migrate.c index c7da064b4781..7c4223ce2500 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -248,14 +248,9 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, if (PageAnon(new)) page_add_anon_rmap(new, vma, pvmw.address, false); else - page_add_file_rmap(new, false); + page_add_file_rmap(new, vma, false); set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); } - if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new)) - mlock_vma_page(new); - - if (PageTransHuge(page) && PageMlocked(page)) - clear_page_mlock(page); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, pvmw.address, pvmw.pte); @@ -2331,7 +2326,7 @@ again: * drop page refcount. Page won't be freed, as we took * a reference just above. */ - page_remove_rmap(page, false); + page_remove_rmap(page, vma, false); put_page(page); if (pte_present(pte)) diff --git a/mm/mlock.c b/mm/mlock.c index 5d7ced8303be..92f28258b4ae 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -78,17 +78,13 @@ void clear_page_mlock(struct page *page) } } -/* - * Mark page as mlocked if not already. - * If page on LRU, isolate and putback to move to unevictable list. +/** + * mlock_page - mlock a page + * @page: page to be mlocked, either a normal page or a THP head. */ -void mlock_vma_page(struct page *page) +void mlock_page(struct page *page) { - /* Serialize with page migration */ - BUG_ON(!PageLocked(page)); - VM_BUG_ON_PAGE(PageTail(page), page); - VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page); if (!TestSetPageMlocked(page)) { int nr_pages = thp_nr_pages(page); @@ -101,14 +97,11 @@ void mlock_vma_page(struct page *page) } /** - * munlock_vma_page - munlock a vma page - * @page: page to be unlocked, either a normal page or THP page head + * munlock_page - munlock a page + * @page: page to be munlocked, either a normal page or a THP head. */ -void munlock_vma_page(struct page *page) +void munlock_page(struct page *page) { - /* Serialize with page migration */ - BUG_ON(!PageLocked(page)); - VM_BUG_ON_PAGE(PageTail(page), page); if (TestClearPageMlocked(page)) { diff --git a/mm/rmap.c b/mm/rmap.c index 7ce7f1946cff..6cc8bf129f18 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1181,17 +1181,17 @@ void do_page_add_anon_rmap(struct page *page, __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr); } - if (unlikely(PageKsm(page))) { + if (unlikely(PageKsm(page))) unlock_page_memcg(page); - return; - } /* address might be in next vma when migration races vma_adjust */ - if (first) + else if (first) __page_set_anon_rmap(page, vma, address, flags & RMAP_EXCLUSIVE); else __page_check_anon_rmap(page, vma, address); + + mlock_vma_page(page, vma, compound); } /** @@ -1232,12 +1232,14 @@ void page_add_new_anon_rmap(struct page *page, /** * page_add_file_rmap - add pte mapping to a file page - * @page: the page to add the mapping to - * @compound: charge the page as compound or small page + * @page: the page to add the mapping to + * @vma: the vm area in which the mapping is added + * @compound: charge the page as compound or small page * * The caller needs to hold the pte lock. */ -void page_add_file_rmap(struct page *page, bool compound) +void page_add_file_rmap(struct page *page, + struct vm_area_struct *vma, bool compound) { int i, nr = 1; @@ -1260,13 +1262,8 @@ void page_add_file_rmap(struct page *page, bool compound) nr_pages); } else { if (PageTransCompound(page) && page_mapping(page)) { - struct page *head = compound_head(page); - VM_WARN_ON_ONCE(!PageLocked(page)); - - SetPageDoubleMap(head); - if (PageMlocked(page)) - clear_page_mlock(head); + SetPageDoubleMap(compound_head(page)); } if (!atomic_inc_and_test(&page->_mapcount)) goto out; @@ -1274,6 +1271,8 @@ void page_add_file_rmap(struct page *page, bool compound) __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr); out: unlock_page_memcg(page); + + mlock_vma_page(page, vma, compound); } static void page_remove_file_rmap(struct page *page, bool compound) @@ -1368,11 +1367,13 @@ static void page_remove_anon_compound_rmap(struct page *page) /** * page_remove_rmap - take down pte mapping from a page * @page: page to remove mapping from + * @vma: the vm area from which the mapping is removed * @compound: uncharge the page as compound or small page * * The caller needs to hold the pte lock. */ -void page_remove_rmap(struct page *page, bool compound) +void page_remove_rmap(struct page *page, + struct vm_area_struct *vma, bool compound) { lock_page_memcg(page); @@ -1414,6 +1415,8 @@ void page_remove_rmap(struct page *page, bool compound) */ out: unlock_page_memcg(page); + + munlock_vma_page(page, vma, compound); } /* @@ -1469,28 +1472,21 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, mmu_notifier_invalidate_range_start(&range); while (page_vma_mapped_walk(&pvmw)) { + /* Unexpected PMD-mapped THP? */ + VM_BUG_ON_PAGE(!pvmw.pte, page); + /* - * If the page is mlock()d, we cannot swap it out. + * If the page is in an mlock()d vma, we must not swap it out. */ if (!(flags & TTU_IGNORE_MLOCK) && (vma->vm_flags & VM_LOCKED)) { - /* - * PTE-mapped THP are never marked as mlocked: so do - * not set it on a DoubleMap THP, nor on an Anon THP - * (which may still be PTE-mapped after DoubleMap was - * cleared). But stop unmapping even in those cases. - */ - if (!PageTransCompound(page) || (PageHead(page) && - !PageDoubleMap(page) && !PageAnon(page))) - mlock_vma_page(page); + /* Restore the mlock which got missed */ + mlock_vma_page(page, vma, false); page_vma_mapped_walk_done(&pvmw); ret = false; break; } - /* Unexpected PMD-mapped THP? */ - VM_BUG_ON_PAGE(!pvmw.pte, page); - subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte); address = pvmw.address; @@ -1668,7 +1664,7 @@ discard: * * See Documentation/vm/mmu_notifier.rst */ - page_remove_rmap(subpage, PageHuge(page)); + page_remove_rmap(subpage, vma, PageHuge(page)); put_page(page); } @@ -1942,7 +1938,7 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma, * * See Documentation/vm/mmu_notifier.rst */ - page_remove_rmap(subpage, PageHuge(page)); + page_remove_rmap(subpage, vma, PageHuge(page)); put_page(page); } @@ -2078,7 +2074,7 @@ static bool page_make_device_exclusive_one(struct page *page, * There is a reference on the page for the swap entry which has * been removed, so shouldn't take another. */ - page_remove_rmap(subpage, false); + page_remove_rmap(subpage, vma, false); } mmu_notifier_invalidate_range_end(&range); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 0780c2a57ff1..15d3e97a6e04 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -95,10 +95,15 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, if (!pte_none(*dst_pte)) goto out_unlock; - if (page_in_cache) - page_add_file_rmap(page, false); - else + if (page_in_cache) { + /* Usually, cache pages are already added to LRU */ + if (newly_allocated) + lru_cache_add(page); + page_add_file_rmap(page, dst_vma, false); + } else { page_add_new_anon_rmap(page, dst_vma, dst_addr, false); + lru_cache_add_inactive_or_unevictable(page, dst_vma); + } /* * Must happen after rmap, as mm_counter() checks mapping (via @@ -106,9 +111,6 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, */ inc_mm_counter(dst_mm, mm_counter(page)); - if (newly_allocated) - lru_cache_add_inactive_or_unevictable(page, dst_vma); - set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); /* No need to invalidate - it was non-present before */ -- cgit v1.2.3 From b109b87050df5438ee745b2bddfa3587970025bb Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 14 Feb 2022 18:28:05 -0800 Subject: mm/munlock: replace clear_page_mlock() by final clearance Placing munlock_vma_page() at the end of page_remove_rmap() shifts most of the munlocking to clear_page_mlock(), since PageMlocked is typically still set when mapcount has fallen to 0. That is not what we want: we want /proc/vmstat's unevictable_pgs_cleared to remain as a useful check on the integrity of of the mlock/munlock protocol - small numbers are not surprising, but big numbers mean the protocol is not working. That could be easily fixed by placing munlock_vma_page() at the start of page_remove_rmap(); but later in the series we shall want to batch the munlocking, and that too would tend to leave PageMlocked still set at the point when it is checked. So delete clear_page_mlock() now: leave it instead to release_pages() (and __page_cache_release()) to do this backstop clearing of Mlocked, when page refcount has fallen to 0. If a pinned page occasionally gets counted as Mlocked and Unevictable until it is unpinned, that's okay. A slightly regrettable side-effect of this change is that, since release_pages() and __page_cache_release() may be called at interrupt time, those places which update NR_MLOCK with interrupts enabled had better use mod_zone_page_state() than __mod_zone_page_state() (but holding the lruvec lock always has interrupts disabled). This change, forcing Mlocked off when refcount 0 instead of earlier when mapcount 0, is not fundamental: it can be reversed if performance or something else is found to suffer; but this is the easiest way to separate the stats - let's not complicate that without good reason. Signed-off-by: Hugh Dickins Acked-by: Vlastimil Babka Signed-off-by: Matthew Wilcox (Oracle) --- mm/internal.h | 12 ------------ mm/mlock.c | 30 ------------------------------ mm/rmap.c | 9 --------- mm/swap.c | 32 ++++++++++++++++++++++++-------- 4 files changed, 24 insertions(+), 59 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 3d7dfc8bc471..a43d79335c16 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -425,17 +425,6 @@ static inline void munlock_vma_page(struct page *page, munlock_page(page); } -/* - * Clear the page's PageMlocked(). This can be useful in a situation where - * we want to unconditionally remove a page from the pagecache -- e.g., - * on truncation or freeing. - * - * It is legal to call this function for any page, mlocked or not. - * If called for a page that is still mapped by mlocked vmas, all we do - * is revert to lazy LRU behaviour -- semantics are not broken. - */ -extern void clear_page_mlock(struct page *page); - extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); /* @@ -509,7 +498,6 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, } #else /* !CONFIG_MMU */ static inline void unmap_mapping_folio(struct folio *folio) { } -static inline void clear_page_mlock(struct page *page) { } static inline void mlock_vma_page(struct page *page, struct vm_area_struct *vma, bool compound) { } static inline void munlock_vma_page(struct page *page, diff --git a/mm/mlock.c b/mm/mlock.c index 92f28258b4ae..3c26473050a3 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -48,36 +48,6 @@ EXPORT_SYMBOL(can_do_mlock); * PageUnevictable is set to indicate the unevictable state. */ -/* - * LRU accounting for clear_page_mlock() - */ -void clear_page_mlock(struct page *page) -{ - int nr_pages; - - if (!TestClearPageMlocked(page)) - return; - - nr_pages = thp_nr_pages(page); - mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); - count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages); - /* - * The previous TestClearPageMlocked() corresponds to the smp_mb() - * in __pagevec_lru_add_fn(). - * - * See __pagevec_lru_add_fn for more explanation. - */ - if (!isolate_lru_page(page)) { - putback_lru_page(page); - } else { - /* - * We lost the race. the page already moved to evictable list. - */ - if (PageUnevictable(page)) - count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages); - } -} - /** * mlock_page - mlock a page * @page: page to be mlocked, either a normal page or a THP head. diff --git a/mm/rmap.c b/mm/rmap.c index 6cc8bf129f18..5442a5c97a85 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1315,9 +1315,6 @@ static void page_remove_file_rmap(struct page *page, bool compound) * pte lock(a spinlock) is held, which implies preemption disabled. */ __mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr); - - if (unlikely(PageMlocked(page))) - clear_page_mlock(page); } static void page_remove_anon_compound_rmap(struct page *page) @@ -1357,9 +1354,6 @@ static void page_remove_anon_compound_rmap(struct page *page) nr = thp_nr_pages(page); } - if (unlikely(PageMlocked(page))) - clear_page_mlock(page); - if (nr) __mod_lruvec_page_state(page, NR_ANON_MAPPED, -nr); } @@ -1398,9 +1392,6 @@ void page_remove_rmap(struct page *page, */ __dec_lruvec_page_state(page, NR_ANON_MAPPED); - if (unlikely(PageMlocked(page))) - clear_page_mlock(page); - if (PageTransCompound(page)) deferred_split_huge_page(compound_head(page)); diff --git a/mm/swap.c b/mm/swap.c index bcf3ac288b56..ff4810e4a4bc 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -74,8 +74,8 @@ static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = { }; /* - * This path almost never happens for VM activity - pages are norm