aboutsummaryrefslogtreecommitdiff
path: root/fs/proc
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2026-06-19 10:14:34 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2026-06-19 10:14:34 -0700
commita552c81ff4a16738ca5a44a177d552eb38d552ce (patch)
tree82800368fc5bc70e728875edb52777521f082ca8 /fs/proc
parentc98d767b34574be82b74d77d02264a830ae1cadd (diff)
parente3d8707358ea76b78bdec9928937bb9a797f2c8f (diff)
Merge tag 'mm-stable-2026-06-18-09-26' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton: - "selftests/mm: clean up build output and verbosity" (Li Wang) Remove some noise from the MM selftests build - "mm: Free contiguous order-0 pages efficiently" (Ryan Roberts) Speed up the freeing of a batch of 0-order pages by first scanning them for coalescing opportunities. This is applicable to vfree() and to the releasing of frozen pages - "mm/damon: introduce DAMOS failed region quota charge ratio" (SeongJae Park) Address a DAMOS usability issue: The DAMOS quota often exhausts prematurely because it charges for all memory attempted, causing slow and inconsistent performance when actions fail on unreclaimable memory. To fix this, a new feature lets users set a smaller, flexible quota charge ratio (via a numerator and denominator) for failed regions. Since failed actions cause less overhead, reducing their quota cost ensures more predictable and efficient DAMOS processing - "selftests/cgroup: improve zswap tests robustness and support large page sizes" (Li Wang) Fix various spurious failures and improve the overall robustness of the cgroup zswap selftests - "fix MAP_DROPPABLE not supported errno" (Anthony Yznaga) Fix an issue in the mlock selftests on arm32 - "mm: huge_memory: clean up defrag sysfs with shared" (Breno Leitao) Some maintenance work in the huge_memory code - "treewide: fixup gfp_t printks" (Brendan Jackman) Use the special vprintf() gfp_t conversion in various places - "mm: Fix vmemmap optimization accounting and initialization" (Muchun Song) Fix several bugs in the vmemmap optimization, mainly around incorrect page accounting and memmap initialization in the DAX and memory hotplug paths. 
It also fixes pageblock migratetype initialization and struct page initialization for ZONE_DEVICE compound pages - "mm/damon: repost non-hotfix reviewed patches in damon/next tree" A sprinkle of unrelated minor bugfixes for DAMON - "mm: remove page_mapped()" (David Hildenbrand) Remove this function from the tree, replacing it with folio_mapped() - "mm/damon: let DAMON be paused and resumed" (SeongJae Park) Allow DAMON to be paused and resumed without losing its current state - "kasan: hw_tags: Disable tagging for stack and page-tables" (Muhammad Usama Anjum) Simplify and speed up kasan by removing its ineffective tagging of stacks and page tables - "mm/damon/reclaim,lru_sort: monitor all system rams by default" (SeongJae Park) Simplify deployment on diverse hardware like NUMA systems by updating DAMON_RECLAIM and DAMON_LRU_SORT to automatically monitor the physical address range covering all System RAM areas by default, replacing the overly restrictive behavior that only targeted the single largest memory block to save on negligible overhead - "mm/damon/sysfs: document filters/ directory as deprecated" (SeongJae Park) Update some DAMON docs - "mm: use spinlock guards for zone lock" (Dmitry Ilvokhin) Switch zone->lock handling over to using the guard() mechanisms - "mm/filemap: tighten mmap_miss hit accounting" (fujunjie) Fix a flaw where the mmap_miss counter over-credited page cache hits during fault-arounds and page-fault retries. 
This results in a significant reduction of redundant synchronous mmap readahead I/O, drastically cutting down execution time and gigabytes read for sparse random or strided memory access workloads - "selftests/cgroup: Fix false positive failures in test_percpu_basic" (Li Wang) Fix a couple of false-positives in the cgroup kmem selftests - "mm/damon/reclaim: support monitoring intervals auto-tuning" (SeongJae Park) Add a new parameter to DAMON permitting DAMON_RECLAIM to automatically tune DAMON's sampling and aggregation intervals - "mm/damon/stat: add kdamond_pid parameter" (SeongJae Park) Change DAMON_STAT to provide the pid of its kdamond - "mm/kmemleak: dedupe verbose scan output" (Breno Leitao) Remove large amounts of duplicated backtraces from the verbose-mode kmemleak output - "mm: remove CONFIG_HAVE_BOOTMEM_INFO_NODE (Part 1)" (David Hildenbrand) Reduce our use of CONFIG_HAVE_BOOTMEM_INFO_NODE, with a view to removing it entirely in a later series - "mm/damon: validate min_region_size to be power of 2" (Liew Rui Yan) Prevent users from passing a non-power-of-2 value of `addr_unit', as this later results in undesirable behavior - "mm: document read_pages and simplify usage" (Frederick Mayle) - "tools/mm/page-types: Fix misc bugs" (Ye Liu) Fix three issues in tools/mm/page-types.c - "mm: misc cleanups from __GFP_UNMAPPED series" (Brendan Jackman) Implement several cleanups in the page allocator and related code - "mm, swap: swap table phase IV: unify allocation" (Kairui Song) Unify the allocation and charging of anon and shmem swap in folios, provides better synchronization, consolidates the metadata management, hence dropping the static array and map, and improves performance - "mm/damon: introduce data attributes monitoring" (SeongJae Park) Extend DAMON to monitor general data attributes other than accesses - "mm/vmalloc: free unused pages on vrealloc() shrink" (Shivam Kalra) Implement the TODO in vrealloc() to unmap and free unused pages when shrinking 
across a page boundary - "mm/damon: documentation and comment fixes" (niecheng) - "remove mmap_action success, error hooks" (Lorenzo Stoakes) Eliminate custom hooks from mmap_action by removing the problematic success_hook which allowed drivers to improperly access uninitialized VMAs. It replaces the error_hook with a simple error-code field and updates the memory char driver accordingly - "mm/damon: minor improvements for code readability and tests" (SeongJae Park) - "mm/damon: fix macro arguments and clarify quota goals doc" (Maksym Shcherba) - "userfaultfd: merge fs/userfaultfd.c into mm/userfaultfd.c" (Mike Rapoport) - "mm/mglru: improve reclaim loop and dirty folio" (Kairui Song and others) Clean up and slightly improve MGLRU's reclaim loop and dirty writeback handling. Large performance improvements are measured - "use vma locks for proc/pid/{smaps|numa_maps} reads" (Suren Baghdasaryan) Use per-vma locks when reading /proc/pid/smaps and numa_maps, similar to /proc/pid/maps, to reduce contention on the central mmap_lock - "refactors thpsize_shmem_enabled_store() and thpsize_shmem_enabled_show()" (Ran Xiaokai) Some cleanup work in the THP code - "selftests/memfd: fix compilation warnings" (Konstantin Khorenko) Fix a few build glitches in the memfd selftest code. - "memcg: shrink obj_stock_pcp and cache multiple objcgs" (Shakeel Butt) Resolve a 68% performance regression caused by NUMA-node cache thrashing around struct obj_stock_pcp by shrinking its existing fields and expanding it into a multi-slot array that caches up to five obj_cgroup pointers per CPU, allowing per-node variants of the same memcg to coexist within a single 64-byte cache line. 
- "zram: writeback fixes" (Sergey Senozhatsky) Address a couple of unrelated zram writeback issues - "mm: switch THP shrinker to list_lru" (Johannes Weiner) Resolve NUMA-awareness issues and streamline callsite interaction by refactoring and extending the list_lru API to completely replace the complex, open-coded deferred split queue for Transparent Huge Pages - "mm: improve large folio readahead for exec memory" (Usama Arif) Improve large-folio readahead on systems like 64K-page arm64 by preventing the mmap_miss check from permanently disabling target-oriented VM_EXEC readahead, and by generalizing the force_thp_readahead gate to support mappings with any usefully large maximum folio order under the cache cap. - "userfaultfd/pagemap: pre-existing fixes" (Kiryl Shutsemau) Fix a bunch of minor issues in the userfaultfd/pagemap, all of which were flagged by Sashiko review of proposed new material - "mm/sparse-vmemmap: Provide generic vmemmap_set_pmd() and vmemmap_check_pmd()" (Muchun Song) Provide generic versions of these two functions so the four arch-specific implementations can be removed. - "mm/swap, PM: hibernate: fix swapoff race in uswsusp by pinning swap device" (Youngjun Park) Address a uswsusp-vs-swapoff race and reduce the swap device reference taking/releasing frequency. 
- "mm/hmm: A fix and a selftest" (Dev Jain) * tag 'mm-stable-2026-06-18-09-26' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (321 commits) selftests/mm/hmm-tests: test pagemap reads of PMD device-private entries fs/proc/task_mmu: do not warn on seeing non-migration pmd entry lib/test_hmm: check alloc_page_vma() return value and handle OOM mm/compaction: cap compact_gap() at COMPACT_CLUSTER_MAX mm/swap: remove redundant swap device reference in alloc/free mm/swap, PM: hibernate: fix swapoff race in uswsusp by pinning swap device mm/filemap: use folio_next_index() for start vmalloc: fix NULL pointer dereference in is_vm_area_hugepages() sparc/mm: drop vmemmap_check_pmd helper and use generic code loongarch/mm: drop vmemmap_check_pmd helper and use generic code riscv/mm: drop vmemmap_pmd helpers and use generic code arm64/mm: drop vmemmap_pmd helpers and use generic code mm/sparse-vmemmap: provide generic vmemmap_set_pmd() and vmemmap_check_pmd() rust: page: mark Page::nid as inline userfaultfd: build __VMA_UFFD_FLAGS from config-gated masks userfaultfd: gate must_wait writability check on pte_present() mm/huge_memory: preserve pmd_swp_uffd_wp on device-private PMD downgrade fs/proc/task_mmu: fix hugetlb self-deadlock in pagemap_scan_pte_hole() fs/proc/task_mmu: use huge_page_size() in pagemap_scan_hugetlb_entry() fs/proc/task_mmu: fix make_uffd_wp_huge_pte() prot-update race ...
Diffstat (limited to 'fs/proc')
-rw-r--r--fs/proc/task_mmu.c269
1 files changed, 223 insertions, 46 deletions
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 751b9ba160fb..d32408f7cd5e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -132,6 +132,22 @@ static void release_task_mempolicy(struct proc_maps_private *priv)
#ifdef CONFIG_PER_VMA_LOCK
+static inline int lock_ctx_mm(struct proc_maps_locking_ctx *lock_ctx)
+{
+ int ret = mmap_read_lock_killable(lock_ctx->mm);
+
+ if (!ret)
+ lock_ctx->mmap_locked = true;
+
+ return ret;
+}
+
+static inline void unlock_ctx_mm(struct proc_maps_locking_ctx *lock_ctx)
+{
+ mmap_read_unlock(lock_ctx->mm);
+ lock_ctx->mmap_locked = false;
+}
+
static void reset_lock_ctx(struct proc_maps_locking_ctx *lock_ctx)
{
lock_ctx->locked_vma = NULL;
@@ -146,25 +162,11 @@ static void unlock_ctx_vma(struct proc_maps_locking_ctx *lock_ctx)
}
}
-static const struct seq_operations proc_pid_maps_op;
-
static inline bool lock_vma_range(struct seq_file *m,
struct proc_maps_locking_ctx *lock_ctx)
{
- /*
- * smaps and numa_maps perform page table walk, therefore require
- * mmap_lock but maps can be read with locking just the vma and
- * walking the vma tree under rcu read protection.
- */
- if (m->op != &proc_pid_maps_op) {
- if (mmap_read_lock_killable(lock_ctx->mm))
- return false;
-
- lock_ctx->mmap_locked = true;
- } else {
- rcu_read_lock();
- reset_lock_ctx(lock_ctx);
- }
+ rcu_read_lock();
+ reset_lock_ctx(lock_ctx);
return true;
}
@@ -172,7 +174,7 @@ static inline bool lock_vma_range(struct seq_file *m,
static inline void unlock_vma_range(struct proc_maps_locking_ctx *lock_ctx)
{
if (lock_ctx->mmap_locked) {
- mmap_read_unlock(lock_ctx->mm);
+ unlock_ctx_mm(lock_ctx);
} else {
unlock_ctx_vma(lock_ctx);
rcu_read_unlock();
@@ -213,17 +215,45 @@ static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv,
return true;
}
+static inline void drop_rcu(struct proc_maps_private *priv)
+{
+ if (priv->lock_ctx.mmap_locked)
+ return;
+
+ rcu_read_unlock();
+}
+
+static inline void reacquire_rcu(struct proc_maps_private *priv)
+{
+ if (priv->lock_ctx.mmap_locked)
+ return;
+
+ rcu_read_lock();
+ /* Reinitialize the iterator. */
+ vma_iter_set(&priv->iter, priv->lock_ctx.locked_vma->vm_end);
+}
+
#else /* CONFIG_PER_VMA_LOCK */
+static inline int lock_ctx_mm(struct proc_maps_locking_ctx *lock_ctx)
+{
+ return mmap_read_lock_killable(lock_ctx->mm);
+}
+
+static inline void unlock_ctx_mm(struct proc_maps_locking_ctx *lock_ctx)
+{
+ mmap_read_unlock(lock_ctx->mm);
+}
+
static inline bool lock_vma_range(struct seq_file *m,
struct proc_maps_locking_ctx *lock_ctx)
{
- return mmap_read_lock_killable(lock_ctx->mm) == 0;
+ return lock_ctx_mm(lock_ctx) == 0;
}
static inline void unlock_vma_range(struct proc_maps_locking_ctx *lock_ctx)
{
- mmap_read_unlock(lock_ctx->mm);
+ unlock_ctx_mm(lock_ctx);
}
static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv,
@@ -238,6 +268,9 @@ static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv,
return false;
}
+static inline void drop_rcu(struct proc_maps_private *priv) {}
+static inline void reacquire_rcu(struct proc_maps_private *priv) {}
+
#endif /* CONFIG_PER_VMA_LOCK */
static struct vm_area_struct *proc_get_vma(struct seq_file *m, loff_t *ppos)
@@ -538,12 +571,10 @@ static int query_vma_setup(struct proc_maps_locking_ctx *lock_ctx)
static void query_vma_teardown(struct proc_maps_locking_ctx *lock_ctx)
{
- if (lock_ctx->mmap_locked) {
- mmap_read_unlock(lock_ctx->mm);
- lock_ctx->mmap_locked = false;
- } else {
+ if (lock_ctx->mmap_locked)
+ unlock_ctx_mm(lock_ctx);
+ else
unlock_ctx_vma(lock_ctx);
- }
}
static struct vm_area_struct *query_vma_find_by_addr(struct proc_maps_locking_ctx *lock_ctx,
@@ -1280,21 +1311,75 @@ static const struct mm_walk_ops smaps_shmem_walk_ops = {
.walk_lock = PGWALK_RDLOCK,
};
+#ifdef CONFIG_PER_VMA_LOCK
+
+static const struct mm_walk_ops smaps_walk_vma_lock_ops = {
+ .pmd_entry = smaps_pte_range,
+ .hugetlb_entry = smaps_hugetlb_range,
+ .walk_lock = PGWALK_VMA_RDLOCK_VERIFY,
+};
+
+static const struct mm_walk_ops smaps_shmem_walk_vma_lock_ops = {
+ .pmd_entry = smaps_pte_range,
+ .hugetlb_entry = smaps_hugetlb_range,
+ .pte_hole = smaps_pte_hole,
+ .walk_lock = PGWALK_VMA_RDLOCK_VERIFY,
+};
+
+static inline const struct mm_walk_ops *
+get_smaps_walk_ops(struct proc_maps_private *priv)
+{
+ if (priv->lock_ctx.mmap_locked)
+ return &smaps_walk_ops;
+ return &smaps_walk_vma_lock_ops;
+}
+
+static inline const struct mm_walk_ops *
+get_smaps_shmem_walk_ops(struct proc_maps_private *priv)
+{
+ if (priv->lock_ctx.mmap_locked)
+ return &smaps_shmem_walk_ops;
+ return &smaps_shmem_walk_vma_lock_ops;
+}
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline const struct mm_walk_ops *
+get_smaps_walk_ops(struct proc_maps_private *priv)
+{
+ return &smaps_walk_ops;
+}
+
+static inline const struct mm_walk_ops *
+get_smaps_shmem_walk_ops(struct proc_maps_private *priv)
+{
+ return &smaps_shmem_walk_ops;
+}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
/*
* Gather mem stats from @vma with the indicated beginning
* address @start, and keep them in @mss.
*
* Use vm_start of @vma as the beginning address if @start is 0.
*/
-static void smap_gather_stats(struct vm_area_struct *vma,
- struct mem_size_stats *mss, unsigned long start)
+static void smap_gather_stats(struct proc_maps_private *priv,
+ struct vm_area_struct *vma,
+ struct mem_size_stats *mss, unsigned long start)
{
- const struct mm_walk_ops *ops = &smaps_walk_ops;
+ const struct mm_walk_ops *ops = get_smaps_walk_ops(priv);
/* Invalid start */
if (start >= vma->vm_end)
return;
+ if (vma == get_gate_vma(priv->lock_ctx.mm))
+ return;
+
+ /* Might sleep. Drop RCU read lock but keep the VMA locked. */
+ drop_rcu(priv);
+
if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
/*
* For shared or readonly shmem mappings we know that all
@@ -1312,15 +1397,16 @@ static void smap_gather_stats(struct vm_area_struct *vma,
!(vma->vm_flags & VM_WRITE))) {
mss->swap += shmem_swapped;
} else {
- ops = &smaps_shmem_walk_ops;
+ ops = get_smaps_shmem_walk_ops(priv);
}
}
- /* mmap_lock is held in m_start */
if (!start)
walk_page_vma(vma, ops, mss);
else
walk_page_range(vma->vm_mm, start, vma->vm_end, ops, mss);
+
+ reacquire_rcu(priv);
}
#define SEQ_PUT_DEC(str, val) \
@@ -1369,10 +1455,11 @@ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss,
static int show_smap(struct seq_file *m, void *v)
{
+ struct proc_maps_private *priv = m->private;
struct vm_area_struct *vma = v;
struct mem_size_stats mss = {};
- smap_gather_stats(vma, &mss, 0);
+ smap_gather_stats(priv, vma, &mss, 0);
show_map_vma(m, vma);
@@ -1413,7 +1500,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
goto out_put_task;
}
- ret = mmap_read_lock_killable(mm);
+ ret = lock_ctx_mm(&priv->lock_ctx);
if (ret)
goto out_put_mm;
@@ -1425,7 +1512,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
vma_start = vma->vm_start;
do {
- smap_gather_stats(vma, &mss, 0);
+ smap_gather_stats(priv, vma, &mss, 0);
last_vma_end = vma->vm_end;
/*
@@ -1434,8 +1521,8 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
*/
if (mmap_lock_is_contended(mm)) {
vma_iter_invalidate(&vmi);
- mmap_read_unlock(mm);
- ret = mmap_read_lock_killable(mm);
+ unlock_ctx_mm(&priv->lock_ctx);
+ ret = lock_ctx_mm(&priv->lock_ctx);
if (ret) {
release_task_mempolicy(priv);
goto out_put_mm;
@@ -1484,14 +1571,14 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
/* Case 1 and 2 above */
if (vma->vm_start >= last_vma_end) {
- smap_gather_stats(vma, &mss, 0);
+ smap_gather_stats(priv, vma, &mss, 0);
last_vma_end = vma->vm_end;
continue;
}
/* Case 4 above */
if (vma->vm_end > last_vma_end) {
- smap_gather_stats(vma, &mss, last_vma_end);
+ smap_gather_stats(priv, vma, &mss, last_vma_end);
last_vma_end = vma->vm_end;
}
}
@@ -1505,7 +1592,7 @@ empty_set:
__show_smap(m, &mss, true);
release_task_mempolicy(priv);
- mmap_read_unlock(mm);
+ unlock_ctx_mm(&priv->lock_ctx);
out_put_mm:
mmput(mm);
@@ -2042,7 +2129,6 @@ static int pagemap_pmd_range_thp(pmd_t *pmdp, unsigned long addr,
flags |= PM_SOFT_DIRTY;
if (pmd_swp_uffd_wp(pmd))
flags |= PM_UFFD_WP;
- VM_WARN_ON_ONCE(!pmd_is_migration_entry(pmd));
page = softleaf_to_page(entry);
}
@@ -2523,12 +2609,16 @@ static void make_uffd_wp_huge_pte(struct vm_area_struct *vma,
if (softleaf_is_hwpoison(entry) || softleaf_is_marker(entry))
return;
- if (softleaf_is_migration(entry))
+ if (softleaf_is_migration(entry)) {
set_huge_pte_at(vma->vm_mm, addr, ptep,
pte_swp_mkuffd_wp(ptent), psize);
- else
- huge_ptep_modify_prot_commit(vma, addr, ptep, ptent,
- huge_pte_mkuffd_wp(ptent));
+ } else {
+ pte_t old_pte, new_pte;
+
+ old_pte = huge_ptep_modify_prot_start(vma, addr, ptep);
+ new_pte = huge_pte_mkuffd_wp(old_pte);
+ huge_ptep_modify_prot_commit(vma, addr, ptep, old_pte, new_pte);
+ }
}
#endif /* CONFIG_HUGETLB_PAGE */
@@ -2869,7 +2959,7 @@ static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask,
if (~categories & PAGE_IS_WRITTEN)
goto out_unlock;
- if (end != start + HPAGE_SIZE) {
+ if (end != start + huge_page_size(hstate_vma(vma))) {
/* Partial HugeTLB page WP isn't possible. */
pagemap_scan_backout_range(p, start, end);
p->arg.walk_end = start;
@@ -2886,8 +2976,62 @@ out_unlock:
return ret;
}
+
+/*
+ * Write-protect the unpopulated hugetlb entries covering [addr, end) by
+ * installing uffd-wp markers inline, exactly as pagemap_scan_hugetlb_entry()
+ * does for populated entries.
+ *
+ * walk_hugetlb_range() currently calls ->pte_hole() once per huge page, so the
+ * loop normally runs a single iteration; it is written to cover the full range
+ * in case the walker ever coalesces adjacent holes.
+ *
+ * The obvious route -- uffd_wp_range() -> hugetlb_change_protection() --
+ * cannot be used here: it takes hugetlb_vma_lock_write(), but the page-table
+ * walker (walk_hugetlb_range()) already holds hugetlb_vma_lock_read() on the
+ * same VMA, so the scanning thread would deadlock against itself. PMD sharing
+ * is disabled on uffd-wp VMAs (hugetlb_unshare_all_pmds() at registration), so
+ * the vma lock guards nothing that matters for these entries anyway.
+ */
+static int pagemap_scan_hugetlb_hole_wp(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ struct hstate *h = hstate_vma(vma);
+ unsigned long psize = huge_page_size(h);
+ struct mm_struct *mm = vma->vm_mm;
+ spinlock_t *ptl;
+ pte_t *ptep;
+ pte_t pte;
+
+ for (addr = ALIGN_DOWN(addr, psize); addr < end; addr += psize) {
+ ptep = huge_pte_alloc(mm, vma, addr, psize);
+ if (!ptep)
+ return -ENOMEM;
+
+ i_mmap_lock_write(vma->vm_file->f_mapping);
+ ptl = huge_pte_lock(h, mm, ptep);
+ pte = huge_ptep_get(mm, addr, ptep);
+ make_uffd_wp_huge_pte(vma, addr, ptep, pte);
+ /*
+ * A none entry has no cached translation, so installing the
+ * marker needs no TLB flush. Flush only if a fault populated
+ * the entry between huge_pte_alloc() and the page table lock.
+ */
+ if (!huge_pte_none(pte))
+ flush_hugetlb_tlb_range(vma, addr, addr + psize);
+ spin_unlock(ptl);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+ }
+
+ return 0;
+}
#else
#define pagemap_scan_hugetlb_entry NULL
+static int pagemap_scan_hugetlb_hole_wp(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ return 0;
+}
#endif
static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end,
@@ -2907,7 +3051,10 @@ static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end,
if (~p->arg.flags & PM_SCAN_WP_MATCHING)
return ret;
- err = uffd_wp_range(vma, addr, end - addr, true);
+ if (is_vm_hugetlb_page(vma))
+ err = pagemap_scan_hugetlb_hole_wp(vma, addr, end);
+ else
+ err = uffd_wp_range(vma, addr, end - addr, true);
if (err < 0)
ret = err;
@@ -3291,6 +3438,31 @@ static const struct mm_walk_ops show_numa_ops = {
.walk_lock = PGWALK_RDLOCK,
};
+#ifdef CONFIG_PER_VMA_LOCK
+static const struct mm_walk_ops show_numa_vma_lock_ops = {
+ .hugetlb_entry = gather_hugetlb_stats,
+ .pmd_entry = gather_pte_stats,
+ .walk_lock = PGWALK_VMA_RDLOCK_VERIFY,
+};
+
+static inline const struct mm_walk_ops *
+get_show_numa_ops(struct proc_maps_private *priv)
+{
+ if (priv->lock_ctx.mmap_locked)
+ return &show_numa_ops;
+ return &show_numa_vma_lock_ops;
+}
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline const struct mm_walk_ops *
+get_show_numa_ops(struct proc_maps_private *priv)
+{
+ return &show_numa_ops;
+}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
/*
* Display pages allocated per node and memory policy via /proc.
*/
@@ -3335,8 +3507,13 @@ static int show_numa_map(struct seq_file *m, void *v)
if (is_vm_hugetlb_page(vma))
seq_puts(m, " huge");
- /* mmap_lock is held by m_start */
- walk_page_vma(vma, &show_numa_ops, md);
+ /* Skip walking pages if gate VMA */
+ if (vma != get_gate_vma(proc_priv->lock_ctx.mm)) {
+ /* Might sleep. Drop RCU read lock but keep the VMA locked. */
+ drop_rcu(proc_priv);
+ walk_page_vma(vma, get_show_numa_ops(proc_priv), md);
+ reacquire_rcu(proc_priv);
+ }
if (!md->pages)
goto out;