aboutsummaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2021-07-02 12:08:10 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2021-07-02 12:08:10 -0700
commit71bd9341011f626d692aabe024f099820f02c497 (patch)
treea1c27fd8f17daff36e380800c5b69769d0d9cc99 /mm
parent3dbdb38e286903ec220aaf1fb29a8d94297da246 (diff)
parentb869d5be0acf0e125e69adcffdca04000dc5b17c (diff)
Merge branch 'akpm' (patches from Andrew)
Merge more updates from Andrew Morton: "190 patches. Subsystems affected by this patch series: mm (hugetlb, userfaultfd, vmscan, kconfig, proc, z3fold, zbud, ras, mempolicy, memblock, migration, thp, nommu, kconfig, madvise, memory-hotplug, zswap, zsmalloc, zram, cleanups, kfence, and hmm), procfs, sysctl, misc, core-kernel, lib, lz4, checkpatch, init, kprobes, nilfs2, hfs, signals, exec, kcov, selftests, compress/decompress, and ipc" * emailed patches from Andrew Morton <akpm@linux-foundation.org>: (190 commits) ipc/util.c: use binary search for max_idx ipc/sem.c: use READ_ONCE()/WRITE_ONCE() for use_global_lock ipc: use kmalloc for msg_queue and shmid_kernel ipc sem: use kvmalloc for sem_undo allocation lib/decompressors: remove set but not used variabled 'level' selftests/vm/pkeys: exercise x86 XSAVE init state selftests/vm/pkeys: refill shadow register after implicit kernel write selftests/vm/pkeys: handle negative sys_pkey_alloc() return code selftests/vm/pkeys: fix alloc_random_pkey() to make it really, really random kcov: add __no_sanitize_coverage to fix noinstr for all architectures exec: remove checks in __register_bimfmt() x86: signal: don't do sas_ss_reset() until we are certain that sigframe won't be abandoned hfsplus: report create_date to kstat.btime hfsplus: remove unnecessary oom message nilfs2: remove redundant continue statement in a while-loop kprobes: remove duplicated strong free_insn_page in x86 and s390 init: print out unknown kernel parameters checkpatch: do not complain about positive return values starting with EPOLL checkpatch: improve the indented label test checkpatch: scripts/spdxcheck.py now requires python3 ...
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig16
-rw-r--r--mm/Makefile2
-rw-r--r--mm/bootmem_info.c127
-rw-r--r--mm/compaction.c20
-rw-r--r--mm/debug_vm_pgtable.c109
-rw-r--r--mm/gup.c58
-rw-r--r--mm/hmm.c12
-rw-r--r--mm/huge_memory.c265
-rw-r--r--mm/hugetlb.c361
-rw-r--r--mm/hugetlb_vmemmap.c298
-rw-r--r--mm/hugetlb_vmemmap.h45
-rw-r--r--mm/internal.h29
-rw-r--r--mm/kfence/core.c4
-rw-r--r--mm/khugepaged.c20
-rw-r--r--mm/madvise.c66
-rw-r--r--mm/mapping_dirty_helpers.c2
-rw-r--r--mm/memblock.c28
-rw-r--r--mm/memcontrol.c4
-rw-r--r--mm/memory-failure.c38
-rw-r--r--mm/memory.c235
-rw-r--r--mm/memory_hotplug.c159
-rw-r--r--mm/mempolicy.c303
-rw-r--r--mm/migrate.c268
-rw-r--r--mm/mlock.c12
-rw-r--r--mm/mmap_lock.c59
-rw-r--r--mm/mprotect.c18
-rw-r--r--mm/nommu.c5
-rw-r--r--mm/oom_kill.c2
-rw-r--r--mm/page_alloc.c5
-rw-r--r--mm/page_vma_mapped.c15
-rw-r--r--mm/rmap.c628
-rw-r--r--mm/shmem.c123
-rw-r--r--mm/sparse-vmemmap.c354
-rw-r--r--mm/sparse.c1
-rw-r--r--mm/swap.c2
-rw-r--r--mm/swapfile.c2
-rw-r--r--mm/userfaultfd.c225
-rw-r--r--mm/util.c40
-rw-r--r--mm/vmalloc.c37
-rw-r--r--mm/vmscan.c20
-rw-r--r--mm/workingset.c10
-rw-r--r--mm/z3fold.c39
-rw-r--r--mm/zbud.c235
-rw-r--r--mm/zsmalloc.c3
-rw-r--r--mm/zswap.c26
45 files changed, 2919 insertions, 1411 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index ded98fb859ab..a02498c0e13d 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -96,6 +96,9 @@ config HAVE_FAST_GUP
depends on MMU
bool
+config HOLES_IN_ZONE
+ bool
+
# Don't discard allocated memory used to track "memory" and "reserved" memblocks
# after early boot, so it can still be used to test for validity of memory.
# Also, memblocks are updated with memory hot(un)plug.
@@ -671,6 +674,7 @@ config ZPOOL
config ZBUD
tristate "Low (Up to 2x) density storage for compressed pages"
+ depends on ZPOOL
help
A special purpose allocator for storing compressed pages.
It is designed to store up to two compressed pages per physical
@@ -757,6 +761,18 @@ config ARCH_HAS_CACHE_LINE_SIZE
config ARCH_HAS_PTE_DEVMAP
bool
+config ARCH_HAS_ZONE_DMA_SET
+ bool
+
+config ZONE_DMA
+ bool "Support DMA zone" if ARCH_HAS_ZONE_DMA_SET
+ default y if ARM64 || X86
+
+config ZONE_DMA32
+ bool "Support DMA32 zone" if ARCH_HAS_ZONE_DMA_SET
+ depends on !X86_32
+ default y if ARM64
+
config ZONE_DEVICE
bool "Device memory (pmem, HMM, etc...) hotplug support"
depends on MEMORY_HOTPLUG
diff --git a/mm/Makefile b/mm/Makefile
index bf71e295e9f6..74b47c354682 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -75,6 +75,7 @@ obj-$(CONFIG_FRONTSWAP) += frontswap.o
obj-$(CONFIG_ZSWAP) += zswap.o
obj-$(CONFIG_HAS_DMA) += dmapool.o
obj-$(CONFIG_HUGETLBFS) += hugetlb.o
+obj-$(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP) += hugetlb_vmemmap.o
obj-$(CONFIG_NUMA) += mempolicy.o
obj-$(CONFIG_SPARSEMEM) += sparse.o
obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
@@ -125,3 +126,4 @@ obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o
obj-$(CONFIG_PTDUMP_CORE) += ptdump.o
obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o
obj-$(CONFIG_IO_MAPPING) += io-mapping.o
+obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
diff --git a/mm/bootmem_info.c b/mm/bootmem_info.c
new file mode 100644
index 000000000000..5b152dba7344
--- /dev/null
+++ b/mm/bootmem_info.c
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Bootmem core functions.
+ *
+ * Copyright (c) 2020, Bytedance.
+ *
+ * Author: Muchun Song <songmuchun@bytedance.com>
+ *
+ */
+#include <linux/mm.h>
+#include <linux/compiler.h>
+#include <linux/memblock.h>
+#include <linux/bootmem_info.h>
+#include <linux/memory_hotplug.h>
+
+void get_page_bootmem(unsigned long info, struct page *page, unsigned long type)
+{
+ page->freelist = (void *)type;
+ SetPagePrivate(page);
+ set_page_private(page, info);
+ page_ref_inc(page);
+}
+
+void put_page_bootmem(struct page *page)
+{
+ unsigned long type;
+
+ type = (unsigned long) page->freelist;
+ BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
+ type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);
+
+ if (page_ref_dec_return(page) == 1) {
+ page->freelist = NULL;
+ ClearPagePrivate(page);
+ set_page_private(page, 0);
+ INIT_LIST_HEAD(&page->lru);
+ free_reserved_page(page);
+ }
+}
+
+#ifndef CONFIG_SPARSEMEM_VMEMMAP
+static void register_page_bootmem_info_section(unsigned long start_pfn)
+{
+ unsigned long mapsize, section_nr, i;
+ struct mem_section *ms;
+ struct page *page, *memmap;
+ struct mem_section_usage *usage;
+
+ section_nr = pfn_to_section_nr(start_pfn);
+ ms = __nr_to_section(section_nr);
+
+ /* Get section's memmap address */
+ memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
+
+ /*
+ * Get page for the memmap's phys address
+ * XXX: need more consideration for sparse_vmemmap...
+ */
+ page = virt_to_page(memmap);
+ mapsize = sizeof(struct page) * PAGES_PER_SECTION;
+ mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT;
+
+ /* remember memmap's page */
+ for (i = 0; i < mapsize; i++, page++)
+ get_page_bootmem(section_nr, page, SECTION_INFO);
+
+ usage = ms->usage;
+ page = virt_to_page(usage);
+
+ mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT;
+
+ for (i = 0; i < mapsize; i++, page++)
+ get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
+
+}
+#else /* CONFIG_SPARSEMEM_VMEMMAP */
+static void register_page_bootmem_info_section(unsigned long start_pfn)
+{
+ unsigned long mapsize, section_nr, i;
+ struct mem_section *ms;
+ struct page *page, *memmap;
+ struct mem_section_usage *usage;
+
+ section_nr = pfn_to_section_nr(start_pfn);
+ ms = __nr_to_section(section_nr);
+
+ memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
+
+ register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);
+
+ usage = ms->usage;
+ page = virt_to_page(usage);
+
+ mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT;
+
+ for (i = 0; i < mapsize; i++, page++)
+ get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
+}
+#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
+
+void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
+{
+ unsigned long i, pfn, end_pfn, nr_pages;
+ int node = pgdat->node_id;
+ struct page *page;
+
+ nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
+ page = virt_to_page(pgdat);
+
+ for (i = 0; i < nr_pages; i++, page++)
+ get_page_bootmem(node, page, NODE_INFO);
+
+ pfn = pgdat->node_start_pfn;
+ end_pfn = pgdat_end_pfn(pgdat);
+
+ /* register section info */
+ for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+ /*
+ * Some platforms can assign the same pfn to multiple nodes - on
+ * node0 as well as nodeN. To avoid registering a pfn against
+ * multiple nodes we check that this pfn does not already
+ * reside in some other nodes.
+ */
+ if (pfn_valid(pfn) && (early_pfn_to_nid(pfn) == node))
+ register_page_bootmem_info_section(pfn);
+ }
+}
diff --git a/mm/compaction.c b/mm/compaction.c
index 3a509fbf2bea..621508e0ecd5 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1297,8 +1297,7 @@ move_freelist_head(struct list_head *freelist, struct page *freepage)
if (!list_is_last(freelist, &freepage->lru)) {
list_cut_before(&sublist, freelist, &freepage->lru);
- if (!list_empty(&sublist))
- list_splice_tail(&sublist, freelist);
+ list_splice_tail(&sublist, freelist);
}
}
@@ -1315,8 +1314,7 @@ move_freelist_tail(struct list_head *freelist, struct page *freepage)
if (!list_is_first(freelist, &freepage->lru)) {
list_cut_position(&sublist, freelist, &freepage->lru);
- if (!list_empty(&sublist))
- list_splice_tail(&sublist, freelist);
+ list_splice_tail(&sublist, freelist);
}
}
@@ -1380,7 +1378,7 @@ static int next_search_order(struct compact_control *cc, int order)
static unsigned long
fast_isolate_freepages(struct compact_control *cc)
{
- unsigned int limit = min(1U, freelist_scan_limit(cc) >> 1);
+ unsigned int limit = max(1U, freelist_scan_limit(cc) >> 1);
unsigned int nr_scanned = 0;
unsigned long low_pfn, min_pfn, highest = 0;
unsigned long nr_isolated = 0;
@@ -1492,11 +1490,11 @@ fast_isolate_freepages(struct compact_control *cc)
spin_unlock_irqrestore(&cc->zone->lock, flags);
/*
- * Smaller scan on next order so the total scan ig related
+ * Smaller scan on next order so the total scan is related
* to freelist_scan_limit.
*/
if (order_scanned >= limit)
- limit = min(1U, limit >> 1);
+ limit = max(1U, limit >> 1);
}
if (!page) {
@@ -2722,9 +2720,9 @@ int sysctl_compaction_handler(struct ctl_table *table, int write,
}
#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
-static ssize_t sysfs_compact_node(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t count)
+static ssize_t compact_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
{
int nid = dev->id;
@@ -2737,7 +2735,7 @@ static ssize_t sysfs_compact_node(struct device *dev,
return count;
}
-static DEVICE_ATTR(compact, 0200, NULL, sysfs_compact_node);
+static DEVICE_ATTR_WO(compact);
int compaction_register_node(struct node *node)
{
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 92bfc37300df..1c922691aa61 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -91,7 +91,7 @@ static void __init pte_advanced_tests(struct mm_struct *mm,
unsigned long pfn, unsigned long vaddr,
pgprot_t prot)
{
- pte_t pte = pfn_pte(pfn, prot);
+ pte_t pte;
/*
* Architectures optimize set_pte_at by avoiding TLB flush.
@@ -248,29 +248,6 @@ static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot)
WARN_ON(!pmd_leaf(pmd));
}
-#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
-static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
-{
- pmd_t pmd;
-
- if (!arch_vmap_pmd_supported(prot))
- return;
-
- pr_debug("Validating PMD huge\n");
- /*
- * X86 defined pmd_set_huge() verifies that the given
- * PMD is not a populated non-leaf entry.
- */
- WRITE_ONCE(*pmdp, __pmd(0));
- WARN_ON(!pmd_set_huge(pmdp, __pfn_to_phys(pfn), prot));
- WARN_ON(!pmd_clear_huge(pmdp));
- pmd = READ_ONCE(*pmdp);
- WARN_ON(!pmd_none(pmd));
-}
-#else /* CONFIG_HAVE_ARCH_HUGE_VMAP */
-static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot) { }
-#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
-
static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot)
{
pmd_t pmd;
@@ -395,30 +372,6 @@ static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot)
pud = pud_mkhuge(pud);
WARN_ON(!pud_leaf(pud));
}
-
-#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
-static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
-{
- pud_t pud;
-
- if (!arch_vmap_pud_supported(prot))
- return;
-
- pr_debug("Validating PUD huge\n");
- /*
- * X86 defined pud_set_huge() verifies that the given
- * PUD is not a populated non-leaf entry.
- */
- WRITE_ONCE(*pudp, __pud(0));
- WARN_ON(!pud_set_huge(pudp, __pfn_to_phys(pfn), prot));
- WARN_ON(!pud_clear_huge(pudp));
- pud = READ_ONCE(*pudp);
- WARN_ON(!pud_none(pud));
-}
-#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
-static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) { }
-#endif /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
-
#else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int idx) { }
static void __init pud_advanced_tests(struct mm_struct *mm,
@@ -428,9 +381,6 @@ static void __init pud_advanced_tests(struct mm_struct *mm,
{
}
static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) { }
-static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
-{
-}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
static void __init pmd_basic_tests(unsigned long pfn, int idx) { }
@@ -449,14 +399,51 @@ static void __init pud_advanced_tests(struct mm_struct *mm,
}
static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot) { }
static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
{
+ pmd_t pmd;
+
+ if (!arch_vmap_pmd_supported(prot))
+ return;
+
+ pr_debug("Validating PMD huge\n");
+ /*
+ * X86 defined pmd_set_huge() verifies that the given
+ * PMD is not a populated non-leaf entry.
+ */
+ WRITE_ONCE(*pmdp, __pmd(0));
+ WARN_ON(!pmd_set_huge(pmdp, __pfn_to_phys(pfn), prot));
+ WARN_ON(!pmd_clear_huge(pmdp));
+ pmd = READ_ONCE(*pmdp);
+ WARN_ON(!pmd_none(pmd));
}
+
static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
{
+ pud_t pud;
+
+ if (!arch_vmap_pud_supported(prot))
+ return;
+
+ pr_debug("Validating PUD huge\n");
+ /*
+ * X86 defined pud_set_huge() verifies that the given
+ * PUD is not a populated non-leaf entry.
+ */
+ WRITE_ONCE(*pudp, __pud(0));
+ WARN_ON(!pud_set_huge(pudp, __pfn_to_phys(pfn), prot));
+ WARN_ON(!pud_clear_huge(pudp));
+ pud = READ_ONCE(*pudp);
+ WARN_ON(!pud_none(pud));
}
-static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot) { }
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
+static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot) { }
+static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
static void __init p4d_basic_tests(unsigned long pfn, pgprot_t prot)
{
@@ -791,12 +778,12 @@ static void __init pmd_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
WARN_ON(!pmd_swp_soft_dirty(pmd_swp_mksoft_dirty(pmd)));
WARN_ON(pmd_swp_soft_dirty(pmd_swp_clear_soft_dirty(pmd)));
}
-#else /* !CONFIG_ARCH_HAS_PTE_DEVMAP */
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
static void __init pmd_soft_dirty_tests(unsigned long pfn, pgprot_t prot) { }
static void __init pmd_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
{
}
-#endif /* CONFIG_ARCH_HAS_PTE_DEVMAP */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
static void __init pte_swap_tests(unsigned long pfn, pgprot_t prot)
{
@@ -856,17 +843,17 @@ static void __init swap_migration_tests(void)
* locked, otherwise it stumbles upon a BUG_ON().
*/
__SetPageLocked(page);
- swp = make_migration_entry(page, 1);
+ swp = make_writable_migration_entry(page_to_pfn(page));
WARN_ON(!is_migration_entry(swp));
- WARN_ON(!is_write_migration_entry(swp));
+ WARN_ON(!is_writable_migration_entry(swp));
- make_migration_entry_read(&swp);
+ swp = make_readable_migration_entry(swp_offset(swp));
WARN_ON(!is_migration_entry(swp));
- WARN_ON(is_write_migration_entry(swp));
+ WARN_ON(is_writable_migration_entry(swp));
- swp = make_migration_entry(page, 0);
+ swp = make_readable_migration_entry(page_to_pfn(page));
WARN_ON(!is_migration_entry(swp));
- WARN_ON(is_write_migration_entry(swp));
+ WARN_ON(is_writable_migration_entry(swp));
__ClearPageLocked(page);
__free_page(page);
}
diff --git a/mm/gup.c b/mm/gup.c
index 8651309f8ec3..728d996767cb 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1501,6 +1501,64 @@ long populate_vma_page_range(struct vm_area_struct *vma,
}
/*
+ * faultin_vma_page_range() - populate (prefault) page tables inside the
+ * given VMA range readable/writable
+ *
+ * This takes care of mlocking the pages, too, if VM_LOCKED is set.
+ *
+ * @vma: target vma
+ * @start: start address
+ * @end: end address
+ * @write: whether to prefault readable or writable
+ * @locked: whether the mmap_lock is still held
+ *
+ * Returns either number of processed pages in the vma, or a negative error
+ * code on error (see __get_user_pages()).
+ *
+ * vma->vm_mm->mmap_lock must be held. The range must be page-aligned and
+ * covered by the VMA.
+ *
+ * If @locked is NULL, it may be held for read or write and will be unperturbed.
+ *
+ * If @locked is non-NULL, it must held for read only and may be released. If
+ * it's released, *@locked will be set to 0.
+ */
+long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, bool write, int *locked)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long nr_pages = (end - start) / PAGE_SIZE;
+ int gup_flags;
+
+ VM_BUG_ON(!PAGE_ALIGNED(start));
+ VM_BUG_ON(!PAGE_ALIGNED(end));
+ VM_BUG_ON_VMA(start < vma->vm_start, vma);
+ VM_BUG_ON_VMA(end > vma->vm_end, vma);
+ mmap_assert_locked(mm);
+
+ /*
+ * FOLL_TOUCH: Mark page accessed and thereby young; will also mark
+ * the page dirty with FOLL_WRITE -- which doesn't make a
+ * difference with !FOLL_FORCE, because the page is writable
+ * in the page table.
+ * FOLL_HWPOISON: Return -EHWPOISON instead of -EFAULT when we hit
+ * a poisoned page.
+ * FOLL_POPULATE: Always populate memory with VM_LOCKONFAULT.
+ * !FOLL_FORCE: Require proper access permissions.
+ */
+ gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK | FOLL_HWPOISON;
+ if (write)
+ gup_flags |= FOLL_WRITE;
+
+ /*
+ * See check_vma_flags(): Will return -EFAULT on incompatible mappings
+ * or with insufficient permissions.
+ */
+ return __get_user_pages(mm, start, nr_pages, gup_flags,
+ NULL, NULL, locked);
+}
+
+/*
* __mm_populate - populate and/or mlock pages within a range of address space.
*
* This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
diff --git a/mm/hmm.c b/mm/hmm.c
index 943cb2ba4442..fad6be2bf072 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -26,6 +26,8 @@
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>
+#include "internal.h"
+
struct hmm_vma_walk {
struct hmm_range *range;
unsigned long last;
@@ -214,7 +216,7 @@ static inline bool hmm_is_device_private_entry(struct hmm_range *range,
swp_entry_t entry)
{
return is_device_private_entry(entry) &&
- device_private_entry_to_page(entry)->pgmap->owner ==
+ pfn_swap_entry_to_page(entry)->pgmap->owner ==
range->dev_private_owner;
}
@@ -255,10 +257,9 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
*/
if (hmm_is_device_private_entry(range, entry)) {
cpu_flags = HMM_PFN_VALID;
- if (is_write_device_private_entry(entry))
+ if (is_writable_device_private_entry(entry))
cpu_flags |= HMM_PFN_WRITE;
- *hmm_pfn = device_private_entry_to_pfn(entry) |
- cpu_flags;
+ *hmm_pfn = swp_offset(entry) | cpu_flags;
return 0;
}
@@ -272,6 +273,9 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
if (!non_swap_entry(entry))
goto fault;
+ if (is_device_exclusive_entry(entry))
+ goto fault;
+
if (is_migration_entry(entry)) {
pte_unmap(ptep);
hmm_vma_walk->last = addr;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 6d2a0119fc58..8b731d53e9f4 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -64,7 +64,14 @@ static atomic_t huge_zero_refcount;
struct page *huge_zero_page __read_mostly;
unsigned long huge_zero_pfn __read_mostly = ~0UL;
-bool transparent_hugepage_enabled(struct vm_area_struct *vma)
+static inline bool file_thp_enabled(struct vm_area_struct *vma)
+{
+ return transhuge_vma_enabled(vma, vma->vm_flags) && vma->vm_file &&
+ !inode_is_open_for_write(vma->vm_file->f_inode) &&
+ (vma->vm_flags & VM_EXEC);
+}
+
+bool transparent_hugepage_active(struct vm_area_struct *vma)
{
/* The addr is used to check if the vma size fits */
unsigned long addr = (vma->vm_end & HPAGE_PMD_MASK) - HPAGE_PMD_SIZE;
@@ -75,6 +82,8 @@ bool transparent_hugepage_enabled(struct vm_area_struct *vma)
return __transparent_hugepage_enabled(vma);
if (vma_is_shmem(vma))
return shmem_huge_enabled(vma);
+ if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS))
+ return file_thp_enabled(vma);
return false;
}
@@ -1017,7 +1026,7 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
- struct vm_area_struct *vma)
+ struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
spinlock_t *dst_ptl, *src_ptl;
struct page *src_page;
@@ -1026,7 +1035,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
int ret = -ENOMEM;
/* Skip if can be re-fill on fault */
- if (!vma_is_anonymous(vma))
+ if (!vma_is_anonymous(dst_vma))
return 0;
pgtable = pte_alloc_one(dst_mm);
@@ -1040,29 +1049,26 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
ret = -EAGAIN;
pmd = *src_pmd;
- /*
- * Make sure the _PAGE_UFFD_WP bit is cleared if the new VMA
- * does not have the VM_UFFD_WP, which means that the uffd
- * fork event is not enabled.
- */
- if (!(vma->vm_flags & VM_UFFD_WP))
- pmd = pmd_clear_uffd_wp(pmd);
-
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
if (unlikely(is_swap_pmd(pmd))) {
swp_entry_t entry = pmd_to_swp_entry(pmd);
VM_BUG_ON(!is_pmd_migration_entry(pmd));
- if (is_write_migration_entry(entry)) {
- make_migration_entry_read(&entry);
+ if (is_writable_migration_entry(entry)) {
+ entry = make_readable_migration_entry(
+ swp_offset(entry));
pmd = swp_entry_to_pmd(entry);
if (pmd_swp_soft_dirty(*src_pmd))
pmd = pmd_swp_mksoft_dirty(pmd);
+ if (pmd_swp_uffd_wp(*src_pmd))
+ pmd = pmd_swp_mkuffd_wp(pmd);
set_pmd_at(src_mm, addr, src_pmd, pmd);
}
add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
mm_inc_nr_ptes(dst_mm);
pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
+ if (!userfaultfd_wp(dst_vma))
+ pmd = pmd_swp_clear_uffd_wp(pmd);
set_pmd_at(dst_mm, addr, dst_pmd, pmd);
ret = 0;
goto out_unlock;
@@ -1079,17 +1085,13 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* a page table.
*/
if (is_huge_zero_pmd(pmd)) {
- struct page *zero_page;
/*
* get_huge_zero_page() will never allocate a new page here,
* since we already have a zero page to copy. It just takes a
* reference.
*/
- zero_page = mm_get_huge_zero_page(dst_mm);
- set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
- zero_page);
- ret = 0;
- goto out_unlock;
+ mm_get_huge_zero_page(dst_mm);
+ goto out_zero_page;
}
src_page = pmd_page(pmd);
@@ -1102,21 +1104,23 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* best effort that the pinned pages won't be replaced by another
* random page during the coming copy-on-write.
*/
- if (unlikely(page_needs_cow_for_dma(vma, src_page))) {
+ if (unlikely(page_needs_cow_for_dma(src_vma, src_page))) {
pte_free(dst_mm, pgtable);
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
- __split_huge_pmd(vma, src_pmd, addr, false, NULL);
+ __split_huge_pmd(src_vma, src_pmd, addr, false, NULL);
return -EAGAIN;
}
get_page(src_page);
page_dup_rmap(src_page, true);
add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+out_zero_page:
mm_inc_nr_ptes(dst_mm);
pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
-
pmdp_set_wrprotect(src_mm, addr, src_pmd);
+ if (!userfaultfd_wp(dst_vma))
+ pmd = pmd_clear_uffd_wp(pmd);
pmd = pmd_mkold(pmd_wrprotect(pmd));
set_pmd_at(dst_mm, addr, dst_pmd, pmd);
@@ -1254,11 +1258,12 @@ unlock:
}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
-void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd)
+void huge_pmd_set_accessed(struct vm_fault *vmf)
{
pmd_t entry;
unsigned long haddr;
bool write = vmf->flags & FAULT_FLAG_WRITE;
+ pmd_t orig_pmd = vmf->orig_pmd;